Projekt Diskussion:Datenlaube-Kalender 2023
Letzter Kommentar: vor 8 Monaten von Jeb in Abschnitt 2024 by Daniel
2024 by Daniel
Bearbeiten
This page is used in conjunction with m:Special:UrlShortener as a workaround for https://phabricator.wikimedia.org/T220703 . URL shortening can also be triggered via the MediaWiki API. Another option for URL shortening is Query Chest.
# Ranks the most frequent title n-grams found in a random sample of
# Gartenlaube publications that do not yet carry a main-subject statement.
SELECT DISTINCT
  ?Ngram
  ?N
  ?Count
  ?Length
  ?Dashes
  ?Score
  ?ExamplePub
  ?ExamplePubTitle
WITH
{ # Entities to analyse: a random sample (limit 10000) of items linked to
  # wd:Q655617 via wdt:P1433, keeping only those with no wdt:P921 statement
  SELECT ?Publication
  WHERE
  {
    SERVICE bd:sample
    {
      ?Publication wdt:P1433 wd:Q655617 .
      bd:serviceParam bd:sample.limit 10000
    }
    FILTER NOT EXISTS { ?Publication wdt:P921 ?MainSubject . }
  }
} AS %items
WITH
{ # Title preprocessing: strip punctuation, lower-case, and pad both ends so that
  # n-grams touching the beginning or end of a title can be extracted as well
  SELECT ?Title ?Publication ?Seeds ?ClearTitleLength
  WHERE
  {
    INCLUDE %items
    ?Publication wdt:P1476 ?Title .
    FILTER( LANG(?Title) = "de" )  # German-language titles only

    # Remove some frequent special characters, including colons and semicolons,
    # which the padding markers below are built from
    BIND( REPLACE(STR(?Title), "[\\.:,;\\[\\]\\?()$]", "") AS ?CleanTitle )
    BIND( STRLEN(?CleanTitle) AS ?ClearTitleLength )

    # Padding: eight ":::" tokens in front of the title, eight ";;;" tokens behind it
    BIND( "::: ::: ::: ::: ::: ::: ::: ::: " AS ?LeadPad )
    BIND( " ;;; ;;; ;;; ;;; ;;; ;;; ;;; ;;;" AS ?TailPad )

    # Lower-cased, padded title from which the n-grams are cut
    BIND( LCASE(CONCAT(?LeadPad, ?CleanTitle, ?TailPad)) AS ?Seeds )
  }
} AS %titles
WITH
{ # Regex generator: one row per numeric value above 0 and below 151, each row
  # carrying four patterns that pick out word pairs at increasing offsets
  # Based on https://w.wiki/KG$ by Jura1
  SELECT ?Regex1 ?Regex2 ?Regex3 ?Regex4 ?NumericValue
  WHERE
  {
    ?Number wdt:P5176 [] ;
            wdt:P1181 ?NumericValue .
    FILTER( ?NumericValue > 0 && ?NumericValue < 151 )

    # Pattern skeleton "^([^ ]+ ){k}([^ ]+) .*": skip k space-terminated words,
    # then capture the following word. On a match, $1 holds the k-th word
    # (with its trailing space; empty when k = 0) and $2 the (k+1)-th word.
    BIND( "^([^ ]+ ){" AS ?Head )
    BIND( "}([^ ]+) .*" AS ?Tail )

    # k = value-1, value+1, value+3, value+5: together the four patterns cover
    # eight consecutive word positions
    BIND( CONCAT(?Head, STR(?NumericValue - 1), ?Tail) AS ?Regex1 )
    BIND( CONCAT(?Head, STR(?NumericValue + 1), ?Tail) AS ?Regex2 )
    BIND( CONCAT(?Head, STR(?NumericValue + 3), ?Tail) AS ?Regex3 )
    BIND( CONCAT(?Head, STR(?NumericValue + 5), ?Tail) AS ?Regex4 )
  }
} AS %regexes
WITH
{ # N-gram extraction and scoring (n <= 8).
  # Every combination of a padded title (?Seeds) and a regex row yields a window of
  # eight consecutive tokens; the padding markers are then stripped, so windows that
  # overlap the start or end of a title produce the shorter n-grams.
  # Rows are grouped by the n-gram and the values derived from it; ?Count, ?Score and
  # ?ExamplePub are per-group results and therefore do not belong in GROUP BY.
  SELECT
    ?Ngram
    ?N
    (COUNT(DISTINCT ?Title) AS ?Count)   # number of distinct titles containing the n-gram
    ?Length
    ?Dashes
    # Score favours frequent, long n-grams, boosts hyphenated ones and penalises word count
    (( ?Count * ?Length * ( (?Dashes + 1) / ?N ) ) AS ?Score)
    (SAMPLE(?Publication) AS ?ExamplePub)
  WHERE
  {
    INCLUDE %regexes
    INCLUDE %titles

    # Concatenate the eight captured tokens (two per regex) into one candidate string
    BIND(
      CONCAT(
        REPLACE(?Seeds, ?Regex1, "$1"), " ",
        REPLACE(?Seeds, ?Regex1, "$2"), " ",
        REPLACE(?Seeds, ?Regex2, "$1"), " ",
        REPLACE(?Seeds, ?Regex2, "$2"), " ",
        REPLACE(?Seeds, ?Regex3, "$1"), " ",
        REPLACE(?Seeds, ?Regex3, "$2"), " ",
        REPLACE(?Seeds, ?Regex4, "$1"), " ",
        REPLACE(?Seeds, ?Regex4, "$2")
      ) AS ?NgramCandidate
    )

    # Clean-up, innermost first: drop the padding characters, trim leading and
    # trailing whitespace, collapse runs of spaces into a single space
    BIND(
      REPLACE(
        REPLACE(
          REPLACE(
            REPLACE(STR(?NgramCandidate), "([;:])", ""),
            "(^\\s+)", ""),
          "(\\s+$)", ""),
        "([ ]{2,})", " ")
      AS ?Ngram
    )

    BIND( STRLEN(?Ngram) AS ?Length )
    FILTER( ?Length > 3 )
    # A regex that does not match leaves ?Seeds unchanged, which inflates the
    # candidate; presumably this upper bound is what discards such rows
    FILTER( ?Length <= ?ClearTitleLength )

    # Word count: number of whitespace characters in the n-gram, plus one
    BIND( STRLEN(REPLACE(?Ngram, "\\S", "")) + 1 AS ?N )
    # Number of "-" characters in the n-gram
    BIND( (STRLEN(?Ngram) - STRLEN(REPLACE(?Ngram, "-", ""))) AS ?Dashes )
  }
  GROUP BY ?Ngram ?N ?Length ?Dashes
  # Optional: uncomment to drop n-grams that occur in only one title
  # HAVING(?Count > 1)
} AS %ngrams
WHERE {
  INCLUDE %ngrams

  # Exclude n-grams that start or end with a stop word.
  # The analysed titles are German (LANG = "de") and the n-grams are lower-cased,
  # so the list holds lower-case German articles, conjunctions and prepositions.
  BIND("(der|die|das|den|dem|des|ein|eine|einem|einen|einer|eines|und|oder|an|am|auf|aus|bei|beim|durch|für|im|in|mit|nach|über|um|unter|von|vom|vor|während|zu|zum|zur|zwischen)" AS ?blacklist)
  # Stop word as the first token (followed by a space) ...
  BIND( CONCAT( "^", ?blacklist, " " ) AS ?RegexBlackStart )
  # ... or as the last token (preceded by a space)
  BIND( CONCAT( " ", ?blacklist, "$" ) AS ?RegexBlackEnd )
  FILTER( !REGEX(?Ngram, ?RegexBlackStart) )
  FILTER( !REGEX(?Ngram, ?RegexBlackEnd) )

  # German title of the example publication, for display next to the n-gram
  ?ExamplePub wdt:P1476 ?ExamplePubTitle .
  FILTER( LANG(?ExamplePubTitle) = "de" )
}
# No aggregates are projected here; grouping on every output variable only deduplicates rows
GROUP BY ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle
ORDER BY DESC(?Score) DESC(?Count) DESC(?Length)
LIMIT 200