Projekt:Immerwährender Datenlaube-Kalender
Projekt | |
---|---|
Projekttitel | Immerwährender Datenlaube-Kalender |
Ansprechpartner | DieDatenlaube: Jens Bemme, Christian Erlinger, Matthias Erfurth |
Laufzeit | 2024... |
Kurzbeschreibung | |
Query-Sammlung | |
Der Immerwährende Datenlaube-Kalender bietet Queries für die Daten der Gartenlaube und Varianten, die mit 'unseren' Methoden andere Datenbestände und Themen befragen. Wir ergänzen hier. Die Datenlaube ist das Citizen Science-Projekt für offene Kulturdaten der Gartenlaube. Seit 2008 transkribiert und ocr-korrigiert eine wachsende Gemeinschaft in Wikisource diese Illustrierte aus Leipzig, d.h. insgesamt circa 18.500 Artikel des ersten deutschen Massenblatts seit 1853, vorerst bis 1899. Vergleiche den Datenlaube-Kalender für das Jahr 2023.
Queries zum 12. Wikidata-Geburtstag
Werke von Gartenlauben-Illustratoren, die im Boetticher (als Maler) gelistet wurden
Image-Grid
#defaultView:ImageGrid
# Works by Gartenlaube illustrators who are listed (as painters) in Boetticher.
# Returns every item published in wd:Q655617 that has an image (P18) and an
# illustrator (P110) whose "described by source" (P1343) points to wd:Q72628185.
SELECT DISTINCT ?item ?itemLabel ?illustrator ?illustratorLabel ?image
WHERE {
  # The work: published in the target periodical, with an image and an illustrator.
  ?item wdt:P1433 wd:Q655617 ;
        wdt:P18   ?image ;
        wdt:P110  ?illustrator .

  # Keep only illustrators described by the reference work.
  ?illustrator wdt:P1343 wd:Q72628185 .

  # Resolve ?itemLabel / ?illustratorLabel (UI language first, German as fallback).
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],de" . }
}
Viewpoint
Wie können wir Viewpoint für Die Gartenlaube variieren? Vgl. d:User:Jeb/Pesterwitz
n-grams from a random set of publications missing main subjects
This page is used in conjunction with m:Special:UrlShortener as a workaround for https://phabricator.wikimedia.org/T220703 . URL shortening can also be triggered via the MediaWiki API. Another option for URL shortening is Query Chest.
# Most frequent n-grams from a random set of publications in the Gartenlaube which are missing main subject tags
#
# Pipeline of named subqueries (each one is pulled in with INCLUDE):
#   %items   - sampled publications (P1433 = wd:Q655617) without any main subject (P921)
#   %titles  - their German titles, cleaned, lower-cased and padded with start/stop codons
#   %regexes - one row per number 1..150, each carrying four regexes that pick words by position
#   %ngrams  - n-gram candidates extracted from the padded titles, with count, length and score
# The outer query drops n-grams that start or end with a stop word and attaches an example title.
SELECT
  DISTINCT ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle
WITH
{ # Generating a list of entities to be analyzed
  SELECT ?Publication
  {
    # bd:sample draws a sample of matching triples; the limit caps the sample size.
    SERVICE bd:sample { ?Publication wdt:P1433 wd:Q655617 . bd:serviceParam bd:sample.limit 10000 }
    # Keep only publications that have no main subject statement yet.
    FILTER NOT EXISTS { ?Publication wdt:P921 ?Schlagwort. }
  }
} AS %items
WITH
{ # Preprocessing the titles
  SELECT ?Title ?Publication ?Seeds ?ClearTitleLength
  {
    INCLUDE %items
    ?Publication wdt:P1476 ?Title.
    BIND (REPLACE(STR(?Title),"[\\.:,;\\[\\]\\?()$]","") AS ?ClearTitle) # remove some frequent special characters, including colons and semicolons
    # Length of the cleaned title; used later as an upper bound for the n-gram length.
    BIND(STRLEN(?ClearTitle) AS ?ClearTitleLength)
    FILTER(LANG(?Title)="de")
    # Basic processing of the titles
    BIND ("::: ::: ::: ::: ::: ::: ::: ::: " AS ?StartCodon)
    BIND (" ;;; ;;; ;;; ;;; ;;; ;;; ;;; ;;;" AS ?StopCodon)
    BIND (LCASE(CONCAT(?StartCodon , # add start codon of colons to assist with processing of n-grams at beginning of title
                       ?ClearTitle,
                       ?StopCodon)) # add stop codon of semicolons to assist with processing of n-grams at end of title
          AS ?Seeds )
  }
} AS %titles
WITH
{ # Generating a list of regexes to look for the NumericValue-th word in a string
  # Based on https://w.wiki/KG$ by Jura1
  SELECT ?Regex1 ?Regex2 ?Regex3 ?Regex4 ?NumericValue
  {
    # Number items (those with a P5176 statement) supply the integers via P1181.
    ?NumberItem wdt:P5176 []; wdt:P1181 ?NumericValue .
    FILTER( ?NumericValue > 0 )
    FILTER( ?NumericValue < 151)
    # Each regex skips a fixed number of space-terminated tokens, then captures the next token.
    # $1 is the last skipped token, $2 the token after it.
    BIND("^([^ ]+ ){" AS ?RegexStart)
    BIND("}([^ ]+) .*" AS ?RegexEnd)
    BIND( CONCAT( ?RegexStart , STR( ?NumericValue - 1 ), ?RegexEnd ) AS ?Regex1)
    BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 1 ), ?RegexEnd ) AS ?Regex2)
    BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 3 ), ?RegexEnd ) AS ?Regex3)
    BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 5 ), ?RegexEnd ) AS ?Regex4)
  }
} AS %regexes
WITH
{ # Applying the regexes to the titles to extract ngrams (for n <= 8), and counting occurrences of the ngrams across titles
  SELECT
    DISTINCT ?Ngram
    ?N
    (COUNT(DISTINCT ?Title) AS ?Count)
    ?Length
    ?Dashes
    # Score favours frequent, long and hyphenated n-grams and penalises high word counts.
    (( ?Count * ?Length * ( (?Dashes +1) / ?N)
     ) AS ?Score)
    (SAMPLE(DISTINCT ?Publication) AS ?ExamplePub)
  {
    INCLUDE %regexes
    INCLUDE %titles
    # Two captures from each of the four regexes give up to eight consecutive tokens.
    BIND(
      (CONCAT(
        REPLACE(?Seeds, ?Regex1, "$1"), " ",
        REPLACE(?Seeds, ?Regex1, "$2"), " ",
        REPLACE(?Seeds, ?Regex2, "$1"), " ",
        REPLACE(?Seeds, ?Regex2, "$2"), " ",
        REPLACE(?Seeds, ?Regex3, "$1"), " ",
        REPLACE(?Seeds, ?Regex3, "$2"), " ",
        REPLACE(?Seeds, ?Regex4, "$1"), " ",
        REPLACE(?Seeds, ?Regex4, "$2")
      )
      ) AS ?NgramCandidate)
    # Strip the codon characters, trim both ends, and collapse runs of spaces.
    BIND(
      (REPLACE
        (REPLACE
          (REPLACE
            (REPLACE
              (STR(?NgramCandidate),"([;:])",""),
            "(^\\s+)",""),
          "(\\s+$)",""),
        "([ ]{2,})"," ")
      ) AS ?Ngram)
    BIND(STRLEN(?Ngram) AS ?Length)
    FILTER (?Length > 3 )
    FILTER (?Length <= ?ClearTitleLength )
    # ?N = number of whitespace characters + 1, i.e. the word count of the n-gram.
    BIND(STRLEN(REPLACE(?Ngram, "\\S", "")) + 1 as ?N)
    # ?Dashes = number of hyphens in the n-gram.
    BIND((STRLEN(?Ngram) - STRLEN(REPLACE(?Ngram, "-", ""))) as ?Dashes)
  }
  # NOTE(review): this GROUP BY also lists the projected aggregate aliases (?Count, ?Score, ?ExamplePub).
  # Left unchanged; verify against the endpoint before simplifying it.
  GROUP BY ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub
  # Optional: uncomment to keep only n-grams that occur in more than one title.
  # HAVING(?Count > 1)
} AS %ngrams
WHERE {
  INCLUDE %ngrams
  # Exclude Ngrams starting or ending with any of a set of blacklisted words
  # The titles are German (LANG = "de") and lower-cased, so the list carries common
  # German function words in addition to the original English ones.
  BIND("(a|and|between|during|for|from|in|of|on|or|the|to|with|am|an|auf|aus|bei|das|dem|den|der|des|die|durch|ein|eine|einem|einen|einer|eines|für|im|mit|nach|oder|um|und|unter|vom|von|vor|zu|zum|zur|über)" AS ?blacklist)
  BIND( CONCAT( "(^", ?blacklist ,")+( )+") AS ?RegexBlackStart)
  BIND( CONCAT( "( )+(", ?blacklist ,")+$") AS ?RegexBlackEnd)
  FILTER (!REGEX(?Ngram, ?RegexBlackStart))
  FILTER (!REGEX(?Ngram, ?RegexBlackEnd))
  # Optional template: exclude Ngrams too similar to a given target term
  # FILTER (!CONTAINS(?Ngram, "climate"))
  # FILTER (!CONTAINS(?Ngram, "change"))
  # Attach the German title of the sampled example publication.
  ?ExamplePub wdt:P1476 ?ExamplePubTitle.
  FILTER(LANG(?ExamplePubTitle)="de")
}
GROUP BY ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle
ORDER BY DESC(?Score) DESC(?Count) DESC(?Length)
LIMIT 200