Jump to content

User:Opencooper/showKanji.js

From Wikipedia, the free encyclopedia
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
// This script shows, if found, the kanji and kana for an article
// It then calls another script, bindKana.js, to clean up the display of ruby
// For configuration, please see the documentation

// License: CC0

// Entry point. When the current page is a normal article view with a linked
// Wikidata item, adds an empty #kanjiInfo placeholder to the page heading and
// requests the item's Japanese label from Wikidata; parseJaLabel handles the
// response.
function setup() {
    // If we're not reading an article, do nothing
    if (mw.config.get( 'wgAction' ) !== 'view') {
        return;
    }
    if (!mw.config.get( 'wgIsArticle' )) {
        return;
    }
    // Old revisions are requested with an oldid= query parameter
    if (location.search.split('oldid=')[1]) {
        return;
    }
    if (mw.config.get("wgIsMainPage")) {
        return;
    }
    if (mw.config.get("wgContentLanguage") === "ja") {
        return;
    }

    // Assuming that if there's no wikidata, there're no 1:1 interlanguage links,
    // and we don't want cases where a page links to a subsection of a jawiki
    // article
    if (wikidataId === null) {
        return;
    }

    // Locate the heading: Vector's first, then Minerva's
    var header = $('#firstHeading');
    if (!header.length) {
        header = $('.page-heading');
    }
    if (!header.length) {
        console.error("showKanji.js: Couldn't find a page heading. This skin ("
                      + mw.config.get( 'skin' ) + ") might not be supported.");
        return;
    }

    // Placeholder so other elements don't push it down later
    header.append("<div id='kanjiInfo' lang='ja' dir='ltr'></div>");

    // Get the Japanese label from wikidata
    // API docs: https://www.wikidata.org/w/api.php?action=help&modules=wbgetentities
    var labelQuery = {
        action: "wbgetentities",
        ids: wikidataId,
        props: "labels",
        languages: "ja",
        format: "json",
        origin: "*"
    };
    $.ajax({
        url: "https://www.wikidata.org/w/api.php",
        data: labelQuery,
        success: parseJaLabel
    });
}

// Success handler for the wbgetentities request: pulls the Japanese label out
// of the response, builds the regexes for it and displays it. A reading is
// then requested unless the label consists only of kana.
function parseJaLabel(response) {
    var entity = response.entities[wikidataId];
    var labelEntry = entity.labels.ja;
    var jaLabel = jQuery.isEmptyObject(labelEntry) ? undefined : labelEntry.value;

    // Nothing to show without a Japanese label
    if (!jaLabel) {
        return;
    }

    jaLabel = jaLabel.toHalfWidth();
    buildRegexes(jaLabel);
    displayKanji(jaLabel);

    // If the japanese title is just kana, it needs no reading
    if (kanjiRegexes.kanaOnly.test(jaLabel)) {
        return;
    }
    requestKana();
}

// Fills kanjiRegexes with the regexes used by the rest of the script:
//   latinOnly / kanaOnly / hiraganaOnly / katakanaOnly: classify a string
//   lead: matches the label followed by an opening parenthesis and captures
//         the reading that comes right after it
// Characters of the label that are neither kanji nor kana are also allowed in
// the generated character classes. They are escaped first, because a label
// containing e.g. "]" or "\" would otherwise end the class early and yield a
// regex that no longer matches the label itself.
function buildRegexes(kanji) {
    // Escape the characters that are special inside a regex character class
    function escapeForClass(chars) {
        return chars.replace(/[\\\]\[^-]/g, "\\$&");
    }

    // Strip $kanji of all kanji and kana, adding whatever is left to the regex
    var reKanjiKana = /[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6Aぁ-ゔァ-ヴー-]/g;
    var leftovers = kanji.replace(reKanjiKana, "");
    // A space and a hyphen are always allowed; the hyphen is added escaped
    // since it has special behavior in regex classes
    var alwaysAllowed = " \\-";
    var kanjiStripped = escapeForClass(leftovers) + alwaysAllowed;
    // Same as above, minus word characters (Latin letters, digits, underscore)
    var kanjiAuxillary = escapeForClass(leftovers.replace(/\w/g, "")) + alwaysAllowed;

    kanjiRegexes.latinOnly = /^[A-Za-z0-9\-.?!/,:;@#$%&+=*'"・ ]+$/;
    kanjiRegexes.kanaOnly = new RegExp("^[ぁ-ゔァ-ヴー" + kanjiAuxillary + "]+$");
    kanjiRegexes.hiraganaOnly = new RegExp("^[ぁ-ゔーA-Za-z" + kanjiAuxillary + "]+$");
    kanjiRegexes.katakanaOnly = new RegExp("^[ァ-ヴーA-Za-z" + kanjiAuxillary + "]+$");

    // Add midpoint for Latin in titles
    if (/\w/.test(kanji)) { kanjiStripped += "・"; }

    var leadReBase = "([ぁ-ゔァ-ヴー" + kanjiStripped + "]+)";
    var kanjiEscaped = mw.util.escapeRegExp(kanji);
    // Account for spaces, but ignore backslash and other misc characters
    var reKanjiKanaLatin = /([\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6Aぁ-ゔァ-ヴーA-Za-z0-9])/g;
    var kanjiSpaced = kanjiEscaped.replace(/ /g, " ?");
    kanjiSpaced = kanjiSpaced.replace(reKanjiKanaLatin, "$1 ?");

    // Add kanji to regex to make sure we're not getting the reading of some
    // other term
    kanjiRegexes.lead = new RegExp(kanjiSpaced + "[^(\n)]*?\\(" + leadReBase, "i"); // brittle
}

// Stores the label in wikidataKanji and shows it inside #kanjiInfo as a <ruby>
// element. Also tags #kanjiInfo with a class and a title describing the
// script of the label.
function displayKanji(kanji) {
    wikidataKanji = kanji;

    var info = $("#kanjiInfo");
    // The label comes from an external API, so insert it as text rather than
    // concatenating it into an HTML string that would get parsed as markup
    info.append($("<ruby>").text(kanji));

    // Add some classes so users can choose to not display for example
    // katakana-only kanji in their CSS
    if (kanjiRegexes.latinOnly.test(kanji)) {
        info.addClass("kanjiInfo-latin-only");
        info.prop("title", "Japanese title in Latin script");
        // Latin-only titles are hidden by default
        info.css("display", "none");
    } else if (kanjiRegexes.hiraganaOnly.test(kanji)) {
        info.addClass("kanjiInfo-hiragana-only");
        info.prop("title", "Japanese title in hiragana");
    } else if (kanjiRegexes.katakanaOnly.test(kanji)) {
        info.addClass("kanjiInfo-katakana-only");
        info.prop("title", "Japanese title in katakana");
    } else {
        info.prop("title", "Japanese title in kanji");
    }
}

// Requests the claims of the Wikidata item; parseKanaClaim looks for the kana
// reading in the response.
function requestKana() {
    // API docs: https://www.wikidata.org/w/api.php?action=help&modules=wbgetclaims
    // We have to wholesale get all the claims instead of just one because the
    // kana might be present as a qualifier to another claim
    var claimsQuery = {
        action: "wbgetclaims",
        entity: wikidataId,
        format: "json",
        origin: "*"
    };

    $.ajax({
        url: "https://www.wikidata.org/w/api.php",
        data: claimsQuery,
        success: parseKanaClaim
    });
}

// Success handler for the wbgetclaims request. Looks for a "name in kana"
// (P1814) value, first as a qualifier on the first statement of a few name
// properties whose text equals the displayed label (ignoring spaces), then as
// a claim of its own. If found, the reading is displayed and #kanjiInfo is
// tagged with where it came from; otherwise getInterlanguage() is tried.
//
// Statements without a datavalue (e.g. "no value"/"unknown value" snaks) and
// responses without claims are skipped rather than throwing, so the fallbacks
// still run.
function parseKanaClaim(response) {
    var properties = {
        title: "P1476",
        nativeLabel: "P1705",
        officialName: "P1448",
        nameInNativeLanguage: "P1559"
    };
    var nameInKana = "P1814";
    var claims = (response && response.claims) || {};
    var kana;
    // Name of the source the reading was taken from; used for the CSS class
    var source;

    // Try getting nameInKana as a qualifier to some properties
    for (var prop in properties) {
        var statements = claims[properties[prop]];
        var first = statements && statements[0];
        var nameValue = first && first.mainsnak && first.mainsnak.datavalue;
        var kanaQualifiers = first && first.qualifiers && first.qualifiers[nameInKana];

        if (!nameValue || !nameValue.value || typeof nameValue.value.text !== "string"
            || !kanaQualifiers) {
            continue;
        }
        // Only trust the reading if it belongs to the label we're showing
        if (nameValue.value.text.replace(/ /g, "") !== wikidataKanji.replace(/ /g, "")) {
            continue;
        }

        var qualifierValue = kanaQualifiers[0] && kanaQualifiers[0].datavalue;
        if (qualifierValue && typeof qualifierValue.value === "string" && qualifierValue.value) {
            kana = qualifierValue.value;
            source = prop;
            break;
        }
    }

    // Try getting nameInKana as a general claim
    if (!kana && claims[nameInKana]) {
        var general = claims[nameInKana][0];
        var generalValue = general && general.mainsnak && general.mainsnak.datavalue;
        if (generalValue && typeof generalValue.value === "string") {
            kana = generalValue.value;
            source = "nameInKana";
        }
    }

    // We couldn't find nameInKana
    if (!kana) {
        getInterlanguage();
        return;
    }

    kana = kana.toHalfWidth();
    displayKana(kana);
    $("#kanjiInfo").addClass("kanjiInfo-wikidata");
    $("#kanjiInfo").addClass("kanjiInfo-wikidata-" + source);
}

// Fallback when Wikidata has no reading: asks the local wiki for the page's
// Japanese interlanguage link and hands the linked title to scrapeKana(). If
// there is no such link, or the response has no usable page data,
// getWiktionary() is tried instead.
function getInterlanguage() {
    var apiUrl = location.origin + "/w/api.php";
    // Documentation: https://en.wikipedia.org/w/api.php?action=help&modules=query%2Blanglinks
    $.ajax({
        url: apiUrl,
        data: {
            action: "query",
            format: "json",
            prop: "langlinks",
            lllang: "ja",
            // wgPageName includes the namespace; wgTitle does not, so it
            // would look up a different page for anything outside the main
            // namespace and the result would never match wgArticleId below
            titles: mw.config.get( 'wgPageName' )
        },
        success: function(response) {
            var pageId = mw.config.get( 'wgArticleId' );
            // An error response carries no "query" member
            var pages = response && response.query && response.query.pages;
            var page = pages ? pages[pageId] : undefined;
            var langlinks = page ? page.langlinks : undefined;
            if (!langlinks || !langlinks.length) {
                getWiktionary();
                return;
            }

            var jaLabel = langlinks[0]["*"];
            jaLabel = jaLabel.replace(/(.*)#.*/, "$1"); // rm anchors
            scrapeKana(jaLabel);
        }
    });
}

// Requests a plain-text extract of the intro of the Japanese Wikipedia article
// titled jaLabel; getFirstSentence searches the response for a reading.
function scrapeKana(jaLabel) {
    // API docs: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bextracts
    var extractQuery = {
        action: "query",
        prop: "extracts",
        format: "json",
        redirects: true,
        // Intro only, first two sentences, as plain text
        exintro: true,
        exsentences: 2,
        exlimit: 1,
        explaintext: true,
        titles: jaLabel,
        origin: "*"
    };

    $.ajax({
        url: "https://ja.wikipedia.org/w/api.php",
        data: extractQuery,
        success: getFirstSentence
    });
}

// Success handler for the jawiki extract request: searches the intro text for
// the label followed by a parenthesised reading (kanjiRegexes.lead). A usable
// reading is displayed and #kanjiInfo is tagged as sourced from jawiki; in
// every other case getWiktionary() is tried.
function getFirstSentence(response) {
    // The jawiki pageid is unknown, so just take the first page returned
    var pages = response.query.pages;
    var firstKey = Object.keys(pages)[0];
    var lead = pages[firstKey].extract;

    if (!lead) {
        console.error("showKanji.js: TextExtracts failed to get a lead for the Japanese article.");
        getWiktionary();
        return;
    }

    // Expect exactly the whole match plus the one captured reading
    var found = lead.toHalfWidth().match(kanjiRegexes.lead);
    if (!found || found.length != 2) {
        getWiktionary();
        return;
    }

    // Rm trailing characters
    var kana = found[1].replace(/[・、 ]$/, "");

    // Abort if our reading is only katakana (for non-Latin) or Latin
    var titleIsLatin = kanjiRegexes.latinOnly.test(wikidataKanji);
    var readingIsKatakana = kanjiRegexes.katakanaOnly.test(kana);
    var readingIsLatin = kanjiRegexes.latinOnly.test(kana);
    if (readingIsLatin || (!titleIsLatin && readingIsKatakana)) {
        getWiktionary();
        return;
    }

    displayKana(kana);
    $("#kanjiInfo").addClass("kanjiInfo-jawiki");
}

// Adapted from:
//     http://ilog4.blogspot.com/2015/09/javascript-convert-full-width-and-half.html
//     https://stackoverflow.com/a/20488304/1995949
//     https://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms
// Returns a copy of the string with fullwidth ASCII variants (U+FF01-U+FF5E)
// replaced by their ASCII counterparts and ideographic spaces (U+3000)
// replaced by regular spaces.
String.prototype.toHalfWidth = function() {
    // The fullwidth forms sit at a fixed offset of 0xFEE0 from ASCII
    var halfWidth = this.replace(/[\uff01-\uff5e]/g, function(s) {
        return String.fromCharCode(s.charCodeAt(0) - 0xFEE0);
    });
    // The ideographic space is outside that range. It's written as an escape
    // because the literal character looks like a regular space and is easily
    // lost when the file is copied or reformatted
    return halfWidth.replace(/\u3000/g, " ");
};

// We use the English Wiktionary because it has more terms and better structure
// Last fallback: requests the section list of the English Wiktionary entry
// named after the displayed label; findJapaneseSection handles the response.
function getWiktionary() {
    // API docs: https://en.wikipedia.org/w/api.php?action=help&modules=parse
    var sectionsQuery = {
        action: "parse",
        format: "json",
        page: wikidataKanji,
        prop: "sections",
        origin: "*"
    };

    $.ajax({
        url: "https://en.wiktionary.org/w/api.php",
        data: sectionsQuery,
        success: findJapaneseSection
    });
}

// Success handler for the Wiktionary section list: finds the first section
// titled "Japanese" and requests that section's rendered text, which
// parseWiktionary then handles. Does nothing on an API error or when there is
// no such section.
function findJapaneseSection(response) {
    if (response.error) {
        return;
    }

    var japanese = response.parse.sections.find(function(section) {
        return section.line == "Japanese";
    });
    if (!japanese || japanese.index == null) {
        return;
    }

    // API docs: https://en.wikipedia.org/w/api.php?action=help&modules=parse
    var textQuery = {
        action: "parse",
        format: "json",
        page: wikidataKanji,
        prop: "text",
        section: japanese.index,
        origin: "*"
    };
    $.ajax({
        url: "https://en.wiktionary.org/w/api.php",
        data: textQuery,
        success: parseWiktionary
    });
}

// Success handler for the Wiktionary section text: extracts the reading from
// the furigana markup of the Japanese section and displays it, tagging
// #kanjiInfo as sourced from Wiktionary. Does nothing on an API error or when
// no reading can be extracted.
function parseWiktionary(response) {
    // Same guard as findJapaneseSection: an error response has no parsed text
    if (response.error || !response.parse || !response.parse.text) {
        return;
    }

    var html = response.parse.text["*"];
    var parsed = $($.parseHTML(html));

    // Wiktionary adds readings as furigana
    var headword = parsed.find(".headword:lang(ja)").first();
    var seeTable = parsed.find(".Jpan ruby").first();

    var kanji = "";
    var kana = "";
    if (headword.length) {
        // Wiktionary already binds their kana, so we have to undo the process to get
        // the constituent parts, at least with the current markup
        var childNodes = headword[0].childNodes;
        for (let i = 0; i < childNodes.length; i++) {
            if (childNodes[i].nodeName == "RUBY") {
                var ruby = $(childNodes[i]); // convert back to JQuery for convenience
                ruby.children("rp").remove();
                // Detach the reading first so that what's left is the base text
                kana += ruby.children("rt").detach().text();
                kanji += ruby.text();
            } else if (childNodes[i].nodeType == 3) { // "#text"
                // Plain text counts towards both the term and its reading
                kanji += childNodes[i].nodeValue;
                kana += childNodes[i].nodeValue;
            }
        }

        // The headword has to be the very term we're displaying
        if (kanji != wikidataKanji) { return; }
    } else if (seeTable.length) {
        kanji = seeTable.children("rb").text();
        kana = seeTable.children("rt").text();
    } else {
        return;
    }

    if (kana) {
        displayKana(kana);
        $("#kanjiInfo").addClass("kanjiInfo-wiktionary");
    }
}

// Adds the reading as an <rt> element to the <ruby> inside #kanjiInfo. Unless
// the displayed label consists only of kanji, bindKana.js is then loaded to
// clean up the display.
function displayKana(kana) {
    // The reading comes from external sources, so insert it as text rather
    // than concatenating it into an HTML string that would get parsed as markup
    $("#kanjiInfo ruby").append($("<rt>").text(kana));

    // Cleanup redundant furigana with another script
    var kanjiOnlyRe = /^[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A]+$/;
    if (!kanjiOnlyRe.test(wikidataKanji)) {
       mw.loader.load( '//en.wikipedia.org/w/index.php?title=User:Opencooper/bindKana.js&action=raw&ctype=text/javascript' );
    }
}

// Shared state used by the functions above

// ID of the Wikidata item linked to the current page, as reported by the
// MediaWiki config; setup() stops when it is null
var wikidataId = mw.config.get( 'wgWikibaseItemId' );
// The Japanese label being displayed; assigned by displayKanji()
var wikidataKanji;
// Regexes built for the current label; filled in by buildRegexes()
var kanjiRegexes = {};
// Hand setup() to jQuery so that it runs once the DOM is ready
$(setup);