User:GreenC/software/search wikipedia
Method to accurately search Wikipedia
Find all articles which contain the string "sportsillustrated.cnn.com" AND a {{dead}} template AND < whatever > ... Solving complicated Wikipedia searches like this is trivial: download the Wikipedia database (dumps.wikimedia.org) and search it using whatever tool you prefer. Here are two plug-and-play solutions.
Awk
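Awk is probably the simplest language available, though it trades away speed because it lacks a real XML parser. Nevertheless, no additional software is required beyond awk itself (awk is a POSIX tool). Note that the script below relies on GNU awk (gawk) extensions, such as the three-argument match() and a regular-expression record separator, so run it with gawk.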
- To run: awk -f search-wp.awk > out
#!/bin/awk -f

# Search entire Wikipedia database.
# Download: https://en.wikipedia.org/wiki/Wikipedia:Database_download#English-language_Wikipedia
#
# Prints the title of every page whose wikitext matches MySearch.
# Requires GNU awk (regex RS and the three-argument form of match()).

# Decode the XML character entities handled by this script in string s
# and return the decoded text.
function xml_unescape(s) {
  gsub(/&lt;/, "<", s)
  gsub(/&gt;/, ">", s)
  gsub(/&quot;/, "\"", s)
  # "&amp;" is decoded last so that "&amp;lt;" becomes the literal text
  # "&lt;" rather than "<". "\\&" yields a literal ampersand in gsub().
  gsub(/&amp;/, "\\&", s)
  return s
}

BEGIN {

  MySearch = "archive.org/w?e?b?/?[0-9]{1,14}/"
  WPdump = "/f/t/wikipedia-dump/enwiki-20150515-pages-articles.xml"

  # Each record is either the content of one <page> element or the
  # whitespace between two pages.
  RS = "<page|</page>"

  while ((status = (getline rawstr < WPdump)) > 0) {

    # Skip blank content
    if (rawstr !~ /[^[:space:]]/)
      continue

    # Start from a clean slate so a record lacking <title> or <text>
    # is never searched against the previous page's data.
    title = ""
    body = ""

    # Extract from the still-escaped XML: element content cannot contain
    # a literal "<" there, so "[^<]*" ends exactly at the closing tag even
    # when the article itself talks about "</text>" or "</title>".

    # Get article title
    if (match(rawstr, /<title>([^<]*)<\/title>/, m))
      title = xml_unescape(m[1])

    # Get article body. Any attributes on <text> are accepted
    # (e.g. xml:space="preserve", bytes="...").
    if (match(rawstr, /<text[^>]*>([^<]*)<\/text>/, m))
      body = xml_unescape(m[1])

    # ---------- Search -----

    if (match(body, MySearch, matched_text)) {
      print title
      # print matched_text[0] # uncomment to print
    }
  }

  # getline returns -1 on a read/open error; report it instead of
  # silently producing empty output.
  if (status < 0) {
    print "search-wp.awk: cannot read " WPdump > "/dev/stderr"
    exit 1
  }

  close(WPdump)
}
Note: when redirecting large output, send to a different disk (ramdisk or other physical volume) otherwise it could slow reading the XML file.
Nim
For a faster solution, here is a Nim example. Nim compiles to optimized C code, which is then compiled by gcc into an executable binary. In a test between Awk and Nim, Awk took 3m31s to complete a search; the same search in Nim took 0m43s. The code below is pretty much copy-paste, compile and run: just set mySearchRe to your Perl-compatible regular expression (or plain text). Example regex strings:
- mySearchRe = re"djvu[.]txt"
- mySearchRe = re"http[:][^ ]*[^ ]"
- (the regex string is wrapped by re"" )
Then download the Nim compiler (the choosenim method is easiest), and compile the source with: nim c -d:release --opt:speed -d:danger --passC:"-flto" --passL:"-flto" search.nim
#
# Search wikipedia dump for a string and print the article title (or matched text) if located
# Credit: Copyright User:Green_Cardamom, April 2016, MIT License
# Language: Nim
# Additional code credits: Rob Speer (https://github.com/rspeer/wiki2text)
#

import re, options, strutils, os, streams, parsexml

var # configuration variables
  mySearchRe = re"djvu[.]txt"
  wpDump = "/mnt/WindowsFdriveTdir/wikipedia-dump/enwiki-20150901-pages-articles.xml"
  maxCount = 0  # Stop after this many pages have been scanned (countAllArticle), for speed testing. Set to 0 to scan all.

var
  countAllArticle = 0  # All article count
  countArticle = 0     # Article titles containing a match (any number of matches)
  countHits = 0        # Number of matches of search pattern (running total)

type
  TagType = enum
    TITLE, TEXT, REDIRECT, NS
  ArticleData = array[TagType, string]

#
# Print the three running totals
#
proc printSummary() =
  echo "Articles all: ", countAllArticle
  echo "Articles with a match: ", countArticle
  echo "Number of pattern matches: ", countHits

#
# Search text
#
# Counts every match of mySearchRe in article[TEXT], updates the global
# counters and echoes article[TITLE] when at least one match was found.
# Returns true when the page matched. Quits the program once maxCount
# pages have been scanned (if maxCount > 0).
#
proc searchText(article: ArticleData): bool {.discardable.} =
  var
    artcount = 0
    pos = -1

  inc countAllArticle

  # Restart the search one character past the previous match start so
  # every match position is counted.
  while pos < article[TEXT].len:
    pos = find(article[TEXT], mySearchRe, pos + 1)
    if pos == -1:
      break
    inc artcount

  if artcount > 0:
    inc countArticle        # number of article titles matching
    countHits += artcount   # number of matches of search pattern
    echo article[TITLE]
    result = true

  if maxCount > 0 and countAllArticle >= maxCount:
    echo ""
    printSummary()
    quit()

const RELEVANT_XML_TAGS = ["title", "text", "ns"]

var
  textBuffer = ""
  s = newFileStream(wpDump, fmRead)
  gettingText = false
  gettingAttribute = false
  article: ArticleData
  xml: XmlParser

if s == nil:
  quit("cannot open the file " & wpDump)

for tag in TITLE..NS:
  article[tag] = ""

xml.open(s, wpDump, options={reportWhitespace})

while true:
  # Scan through the XML, handling each token as it arrives.
  xml.next()
  case xml.kind
  of xmlElementStart, xmlElementOpen:
    if RELEVANT_XML_TAGS.contains(xml.elementName):
      # If this is a "title", "text", or "ns" tag, prepare to get its
      # text content. Move our writing pointer to the beginning of
      # the text buffer, so we can overwrite what was there.
      textBuffer.setLen(0)
      gettingText = true
    elif xml.elementName == "page":
      # A new <page> begins: clear every field so that a page missing
      # one of the elements (e.g. no <text>) is never searched against
      # data left over from the previous page.
      for tag in low(TagType)..high(TagType):
        article[tag].setLen(0)
    elif xml.elementName == "redirect":
      # If this is the start of a redirect tag, prepare to get its
      # attribute value. The buffer still holds whatever the last swap
      # left in it, so empty it first.
      textBuffer.setLen(0)
      gettingAttribute = true
  of xmlAttribute:
    # If we're looking for an attribute value, and we found one, add it
    # to the buffer.
    if gettingAttribute:
      textBuffer.add(xml.attrValue)
  of xmlCharData, xmlWhitespace:
    # If we're looking for text, and we found it, add it to the buffer.
    if gettingText:
      textBuffer.add(xml.charData)
  of xmlElementEnd:
    # When we reach the end of an element we care about, take the text
    # we've found and store it in the 'article' data structure. We can
    # accomplish this quickly by simply swapping their references.
    case xml.elementName
    of "title":
      swap article[TITLE], textBuffer
    of "text":
      swap article[TEXT], textBuffer
    of "redirect":
      swap article[REDIRECT], textBuffer
    of "ns":
      swap article[NS], textBuffer
    of "page":
      # When we reach the end of the <page> tag, send the article
      # data to searchText().
      searchText(article)
    else:
      discard

    # Now that we've reached the end of an element, stop extracting
    # text. (We'll never need to extract text from elements that can
    # have other XML elements nested inside them.)
    gettingText = false
    gettingAttribute = false
  of xmlEof:
    break
  else:
    discard

xml.close

echo "Search Wikipedia completed"
echo "----"
printSummary()
Note: when redirecting large output, send to a different disk (ramdisk or other physical volume) otherwise it could slow reading the XML file.