Wikipedia:Bots/Requests for approval/BareRefBot/Code2
// Assumed setup (not shown in the original excerpt): Node.js, with Playwright providing
// the WebKit browser; "sql" is an already-configured database handle whose $1,$2,...
// placeholders match node-postgres; CeL is the CeJS wikitext parser.
const fs = require("fs")
const { webkit } = require("playwright")

async function uploadentry(url, title, work = "", metatitle = "", isdead = false) {
    // Store what was fetched for this URL so the ref-filling pass can look it up later.
    const inquery = "INSERT INTO web (url, title, work, metatitle, isdead) VALUES ($1,$2,$3,$4,$5)"
    var insertarray = [url, title, work, metatitle, isdead]
    var res = await sql.query(inquery, insertarray)
}

async function getAndTagTitle(geturl) {
    var isDead = false
    var browser = await webkit.launch() // Open up a web browser. WebKit (Safari) is relatively fast, light on memory / processing power, and works on all of the major operating systems.
    const page = await browser.newPage() // Open up a new page.
    const result = await page.goto(geturl) // Go to the page.
    var statusnum = result.status() // Get the status code. See https://developer.mozilla.org/en-US/docs/Web/HTTP/Status for more info.
    if (statusnum == 404 || statusnum == 410) {
        isDead = true // Gone for good: record the link as dead rather than skipping it.
    }
    if (statusnum >= 400 && statusnum < 500 && !isDead) {
        await browser.close()
        return // Some other client error. Do nothing and ignore it; when this URL comes up later there will be no match and it will be left alone.
    }
    if (statusnum >= 500 && statusnum < 600) {
        await browser.close()
        return // Same as above, but for server errors.
    }
    const title = await page.title()
    await browser.close()
    var additionalinfo = {}
    additionalinfo.metatitle = title // Upgrade 1/28/2022, see bot page discussion.
    additionalinfo.work = new URL(geturl).hostname // If the website is "www.website.notexist/ffsdf", the "work" will be "www.website.notexist".
    await uploadentry(geturl, title, additionalinfo.work, additionalinfo.metatitle, isDead)
    return // Entry has been uploaded.
}

function traverse(refitem) {
    var traversedcount = -1
    var removebaretemp = false // Tracking category for multiple bare refs.
    for (const refobj of refitem) { // Iterate over parser "objects" in the <ref></ref> in question.
        traversedcount = traversedcount + 1 // Count of objects traversed.
        if (typeof refobj == "string") {
            // This is a recursive function, so sometimes it gets called on a string.
            // A string can not be iterated over; if what was passed in is a string we have gone too deep, so step out.
            return
        }
        if (refobj.type == "url" && refobj.is_bare == true) {
            var usethisurl = refobj[0].toString()
            if (usethisurl.indexOf("archive.") >= 0 || // archive.org, archive.today, etc. (note the . at the end)
                usethisurl.indexOf("webcit") >= 0 || // WebCite
                usethisurl.indexOf("youtube.com") >= 0 ||
                usethisurl.indexOf("twitter.com") >= 0 ||
                usethisurl.indexOf("facebook.com") >= 0 ||
                usethisurl.indexOf("instagram.com") >= 0) {
                // Skip these, because they should either be in |archive-url= (out of scope) or the fixes for them haven't been integrated yet.
                continue
            }
            var shoulddo = true
            for (const refobj2 of refitem) { // Iterate through the whole thing again to check for undesirables.
                if (typeof refobj2 == "string" && refobj2.trim() != "") {
                    // Let's not fix URLs with surrounding prose. For example, <ref>https://website.website is an amazing site</ref> is not something that should be filled.
                    shoulddo = false
                    break
                }
                if (refobj2.type == "transclusion" && refobj2.name.toLowerCase() != "bare url inline") {
                    // If there is some sort of transclusion in the <ref></ref> that is not recognized, skip the ref as it might be out of scope.
                    shoulddo = false
                    break
                }
            }
            if (!shoulddo) {
                continue
            }
            usethisurl = usethisurl.replaceAll("|", "%7C") // Escape pipes for CS1.
            parsethis = parsethis + " |url=" + usethisurl // parsethis (defined elsewhere in the bot) accumulates the citation's parameters.
            if (usethisurl.indexOf(".pdf") >= 0) {
                continue // Don't try to fetch an HTML title for a PDF.
            }
            getAndTagTitle(usethisurl) // Fire and forget; the fetch and database insert run in the background.
        }
        if (refobj.type == "tag_inner") {
            traverse(refobj) // Deal with nested refs, and other parser strangeness.
        }
    }
}
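uploadentry assumes a "web" table with the five columns named in its INSERT already exists; the original excerpt does not show the schema. Below is a minimal sketch of what it might look like, reusing the same sql handle: the column names come from the query, while the types and the choice of url as primary key are assumptions.

async function createtable() {
    // Hypothetical schema, inferred from uploadentry's INSERT; not the bot's actual DDL.
    await sql.query(
        "CREATE TABLE IF NOT EXISTS web (" +
        " url TEXT PRIMARY KEY," +
        " title TEXT," +
        " work TEXT," +
        " metatitle TEXT," +
        " isdead BOOLEAN DEFAULT FALSE" +
        ")")
}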
function main(filename) {
    var wikitxt = fs.readFileSync(filename).toString()
    var page_data = CeL.net.wiki.parser(wikitxt)
    var parsed_data = page_data.parse()
    parsed_data.each("tag_inner", function refprocess(token, index, parent) {
        if (!parent || parent.tag != "ref") {
            return // We don't want to convert non-ref bares (e.g. URLs out of nowhere, or external-links sections).
        }
        traverse(token)
    })
}
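For reference, a hypothetical invocation. The filename is a placeholder for a locally saved copy of an article's wikitext, and since getAndTagTitle is asynchronous, the browser fetches may still be completing after main returns.

main("Example_article.wikitext") // Hypothetical file name, not from the original.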