Jump to content

Wikipedia:Bots/Requests for approval/BareRefBot/Code2

From Wikipedia, the free encyclopedia
 /**
  * Insert one scraped page record into the `web` table.
  *
  * The values are passed as bound parameters ($1..$5), never interpolated
  * into the SQL text.
  *
  * @param {string} url - URL the record describes.
  * @param {string} title - Title to store for the URL.
  * @param {string} [work=""] - Value for the `work` column.
  * @param {string} [metatitle=""] - Value for the `metatitle` column.
  * @param {boolean} [isdead=false] - Value for the `isdead` column.
  * @returns {*} Whatever `sql.query` returns, so a caller can await the
  *   insert or handle its failure.
  */
 function uploadentry(url, title, work = "", metatitle = "", isdead = false) {
     const inquery = "INSERT INTO web (url, title, work, metatitle, isdead) VALUES ($1,$2,$3,$4,$5)";
     const insertarray = [url, title, work, metatitle, isdead];
     // NOTE(review): `sql` is not declared in this block; presumably a database
     // client whose query() returns a promise - confirm against the setup code.
     return sql.query(inquery, insertarray);
 }
/**
 * Load a URL in a WebKit browser and store the page's title via uploadentry().
 *
 * HTTP 404 and 410 responses are stored with the dead flag set. Any other
 * 4xx/5xx response is ignored, so nothing is stored for that URL.
 *
 * @param {string} geturl - Absolute URL to load.
 * @returns {Promise<void>} Settles once the entry has been stored or the URL
 *   has been skipped; rejects if navigation or the upload fails.
 */
async function getAndTagTitle(geturl) {
    // Open up a web browser. Webkit (Safari) is relatively fast, light on
    // memory / processing power, and works on all of the major operating systems.
    // NOTE(review): `webkit` is not declared in this block; the calls below
    // assume Playwright's browser-type API - confirm against the imports.
    const browser = await webkit.launch();
    try {
        const page = await browser.newPage(); // Open up a new page
        const response = await page.goto(geturl); // Go to the page.
        if (response == null) {
            return; // No response object to read a status code from.
        }

        // https://developer.mozilla.org/en-US/docs/Web/HTTP/Status for more info.
        const statusnum = response.status();
        const isDead = statusnum === 404 || statusnum === 410;

        // Some other sort of client/server error: do nothing, so the URL gets
        // no database entry.
        // NOTE(review): 404/410 are deliberately let through so that isDead can
        // reach the database as true - confirm this matches the bot's intent.
        if (!isDead && statusnum >= 400 && statusnum < 600) {
            return;
        }

        const title = await page.title();
        // If the website is "www.website.notexist/ffsdf", the "work" will be
        // "www.website.notexist".
        const work = new URL(geturl).hostname;
        // The title is stored in both the title and metatitle columns
        // (upgrade 1/28/2022, see bot page discussion).
        await uploadentry(geturl, title, work, title, isDead);
    } finally {
        // Always release the browser, including on the early returns above.
        await browser.close();
    }
}

/**
 * Walk the parser tokens inside one <ref></ref> and request a title lookup
 * for every bare URL that is the ref's only meaningful content.
 *
 * A bare URL is skipped when:
 *  - it contains one of the archive / social-media markers listed below,
 *  - the ref also contains non-whitespace text, or
 *  - the ref contains a transclusion other than "bare url inline".
 * URLs containing ".pdf" are appended to `parsethis` but not looked up.
 * Nested "tag_inner" tokens are traversed recursively.
 *
 * @param {*} refitem - Iterable of parser tokens (the inner content of a ref).
 *   Strings and null/undefined are ignored.
 * @returns {void}
 */
function traverse(refitem) {
    // This is a recursive function, so sometimes it is called on a string.
    // A string has no tokens to inspect, so step out.
    if (refitem == null || typeof refitem === "string") {
        return;
    }

    // Skip these, because they should either be in archive-url (out of scope)
    // or the fixes for them are not integrated yet.
    const skippedUrlMarkers = [
        "archive.", // every archive host (note the . at the end)
        "webcit", // webcite
        "youtube.com",
        "twitter.com",
        "facebook.com",
        "instagram.com",
    ];

    for (const refobj of refitem) { // iterate over parser "objects" in the <ref></ref> in question
        if (refobj == null || typeof refobj === "string") {
            // Plain text between tokens; the sibling check below decides
            // whether it disqualifies a neighbouring URL.
            continue;
        }

        if (refobj.type === "tag_inner") {
            traverse(refobj); // Deal with nested refs, and other parser strangeness.
            continue;
        }

        // Loose comparison kept on purpose: how the parser represents is_bare
        // is not visible from this file.
        if (refobj.type !== "url" || !(refobj.is_bare == true)) {
            continue;
        }

        const rawUrl = refobj[0].toString();
        if (skippedUrlMarkers.some((marker) => rawUrl.includes(marker))) {
            continue;
        }

        // Iterate through the whole ref again to check for undesirables.
        let shoulddo = true;
        for (const sibling of refitem) {
            if (sibling == null) {
                continue;
            }
            if (typeof sibling === "string") {
                if (sibling.trim() !== "") {
                    // Let's not fix middle ones. For example
                    // <ref>https://website.website is an amazing site</ref>
                    // is not something that should be filled.
                    shoulddo = false;
                    break;
                }
                continue;
            }
            if (sibling.type === "transclusion" && sibling.name.toLowerCase() !== "bare url inline") {
                // An unrecognized transclusion in the <ref></ref> might be out of scope.
                shoulddo = false;
                break;
            }
        }
        if (!shoulddo) {
            continue;
        }

        const escapedUrl = rawUrl.replaceAll("|", "%7C"); // escape for CS1
        // NOTE(review): `parsethis` is not declared in this function; presumably
        // a module-level string accumulator defined elsewhere - confirm.
        parsethis = parsethis + " |url=" + escapedUrl;
        if (escapedUrl.includes(".pdf")) {
            continue;
        }
        // The return value is not awaited here; traverse itself stays synchronous.
        getAndTagTitle(escapedUrl);
    }
}
/**
 * Read a file of wikitext, parse it, and run traverse() over the inner
 * content of every <ref> tag.
 *
 * @param {string} filename - Path of the wikitext file to read.
 * @returns {void}
 */
function main(filename) {
    const wikitxt = fs.readFileSync(filename).toString();
    const page_data = CeL.net.wiki.parser(wikitxt);
    const parsed_data = page_data.parse();
    parsed_data.each("tag_inner", function refprocess(token, index, parent) {
        if (!parent || parent.tag != "ref") {
            // We don't want to convert non-ref bares (e.g. URLs out of nowhere
            // and external link sections).
            return;
        }
        // traverse() takes only the token; the previously passed `datetype`
        // was never declared, so merely evaluating it threw a ReferenceError.
        traverse(token);
    });
}