Jump to content

User:Trey314159/homoglyphHunter.js

From Wikipedia, the free encyclopedia
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
// Latin-to-Cyrillic mapping
// Keys are Latin characters; values are the look-alike Cyrillic strings they
// should be replaced with. Some values (the acute-accented vowels) are longer
// than one character, so they have no entry in the inverted Cyr2LatMap below.
var Lat2CyrMap = {
	'a':'а', 'A':'А', 'ă':'ӑ', 'Ă':'Ӑ', 'ä':'ӓ', 'Ä':'Ӓ', 'æ':'ӕ', 'Æ':'Ӕ', 'B':'В', 'c':'с', 'C':'С', 'ç':'ҫ', 'Ç':'Ҫ', 'e':'е', 'E':'Е', 'è':'ѐ', 'È':'Ѐ', 'ë':'ё', 'Ë':'Ё', 'ĕ':'ӗ', 'Ĕ':'Ӗ', 'ə':'ә', 'Ə':'Ә', 'H':'Н', 'i':'і', 'I':'І', 'ï':'ї', 'Ï':'Ї', 'j':'ј', 'J':'Ј', 'k':'к', 'K':'К', 'M':'М', 'o':'о', 'O':'О', 'ö':'ӧ', 'Ö':'Ӧ', 'p':'р', 'P':'Р', 'Q':'Ԛ', 's':'ѕ', 'S':'Ѕ', 'T':'Т', 'W':'Ԝ', 'x':'х', 'X':'Х', 'y':'у', 'Y':'У', 'ȳ':'ӯ', 'ÿ':'ӱ', 'á':'а́', 'é':'е́', 'í':'і́', 'ó':'о́', 'ý':'у́', 'ħ':'ћ', 'ɜ':'з' };

// Cyrillic-to-Latin substitutions offered as the "fix-enc" (encoding error) option.
// NOTE(review): presumably these pairs share a byte value in two 8-bit code pages — confirm.
var EncErrMap = {'ц':'ö', 'ч':'ç', 'у':'ã', 'б':'á', 'ж':'æ'};

// Cyrillic-to-Latin mapping; starts empty and is filled by the call below.
var Cyr2LatMap = {};

// invert Lat2CyrMap to Cyr2LatMap and strip keys of length > 1 in both directions
// (invertAndLengthFilter is a function declaration further down, so it is hoisted
// and callable here; it mutates both arguments in place.)
invertAndLengthFilter(Lat2CyrMap, Cyr2LatMap);

// define patterns and regexes for matching all chars in script, or just homoglyphs
// The *Pat strings are character-class bodies (no surrounding brackets) so they can
// be concatenated into larger classes. All regexes below carry the "g" flag and are
// shared globals, so they keep lastIndex state if used with exec()/test().

// Latin: A-Z, a-z, then U+00C0..U+024F and U+0250..U+02AF.
var LatAllPat = 'A-Za-zÀ-ɏɐ-ʯ';
// Every Latin character that has a Cyrillic look-alike (the keys of Lat2CyrMap).
var LatHomoglyphPat = Object.keys(Lat2CyrMap).join('');
var LatAllRegex = new RegExp("[" + LatAllPat + "]+", "g");       // runs of Latin characters
var LatOneRegex = new RegExp("[" + LatAllPat + "]", "g");        // single Latin characters
var LatHomoglyphRegex = new RegExp("[" + LatHomoglyphPat + "]+", "g"); // runs of Latin homoglyphs

// Cyrillic: U+0400..U+052F.
var CyrAllPat = 'Ѐ-ԯ';
// Every Cyrillic character that has a single-character Latin look-alike (the keys of Cyr2LatMap).
var CyrHomoglyphPat = Object.keys(Cyr2LatMap).join('');
var CyrAllRegex = new RegExp("[" + CyrAllPat + "]+", "g");       // runs of Cyrillic characters
var CyrOneRegex = new RegExp("[" + CyrAllPat + "]", "g");        // single Cyrillic characters
var CyrHomoglyphRegex = new RegExp("[" + CyrHomoglyphPat + "]+", "g"); // runs of Cyrillic homoglyphs

// Slash-delimited regex (as text) appended to "insource:" in findHomoglyphs: a run of
// Latin/Cyrillic characters containing at least one Cyrillic-Latin or Latin-Cyrillic
// adjacent pair, i.e. a word that mixes the two scripts.
var insourcePat = "/[" + CyrAllPat + LatAllPat + "]*([" + CyrAllPat + "][" + LatAllPat + "]|[" + LatAllPat + "][" + CyrAllPat + "])["  + CyrAllPat + LatAllPat + "]*/";

// Config
// viceversa and sortbyscore are toggled between 0 and 1 by the inline onclick
// handlers in the letsGo HTML, so they must stay reachable as globals.
var viceversa = 1;     // 1: also list predominantly Cyrillic words; 0: skip them
var sortbyscore = 1;   // 1: order results by the computed score; 0: by raw result count
var limitresults = 50; // not referenced anywhere in the visible part of this file

// Busy flag: set to 1 while a search chain is running; findHomoglyphs and
// getHHTitles return immediately while it is non-zero.
var slowFetch = 0;
// Server timestamp (curtimestamp) captured by getHHSnippets and sent as
// starttimestamp when fixHHArticle saves an edit.
var startTime = '';

// Initial HTML for the #HHStatus area: the options list plus the "Let's go" link.
// The inline onclick handlers flip the viceversa / sortbyscore globals and update
// the #FHOptViceVersa / #FHOptSort labels; the "[Wait for it...]" placeholders are
// overwritten by initialize_HHunter right after this markup is inserted.
// The string is built with backslash line continuations, so the leading tabs on
// each continued line are part of the string.
var letsGo = "\
	<b>Options:</b><br>\
	<ul><li>Looking for <b>Latin</b> words with <i>Cyrillic</i> characters.</li>\
	<li><a href='#' onclick='viceversa=1-viceversa; $(\"#FHOptViceVersa\").html(viceversa?\"Also show\":\"Skip\");'>Vice Versa</a>: <b><div style='display:inline' id='FHOptViceVersa'>[Wait for it...]</div></b> predominantly Cyrillic words.</li>\
	<li><a href='#' onclick='sortbyscore=1-sortbyscore; $(\"#FHOptSort\").html(sortbyscore?\"magic score\":\"raw results count\");'>Sort</a>: Sort by <b><div style='display:inline' id='FHOptSort'>[wait for it...]</div></b>. (Magic score puts impactful, more obviously correctable results first.)</li>\
	</ul><br>\
	<a href='#' onclick='findHomoglyphs();'>Let's go</a>!";


/**
 * Open the Homoglyph Hunter overlay.
 *
 * Creates the fixed-position #HHContainer element the first time it is called,
 * then (on every call) makes it visible, replaces its contents with the initial
 * UI (title, close button, options from letsGo, empty result areas) and syncs
 * the option labels with the current viceversa / sortbyscore settings.
 */
function initialize_HHunter() {
	// Build the (initially hidden) container only if it is not in the page yet.
	if (!$("#HHContainer").length) {
		var container = document.createElement('div');
		container.setAttribute('id', 'HHContainer');
		Object.assign(container.style, {
			position: 'fixed',
			width: "90%",
			height: "90%",
			top: "3%",
			left: "5%",
			margin: "0",
			zIndex: "1000000",
			backgroundColor: "#fefefe",
			border: "1px solid #aaa",
			overflow: "scroll",
			display: "none"
		});
		document.body.append(container);
	}

	var markupBeforeOptions = "<div style='padding:0.75em; direction:ltr' id='HHContent'> <div style='float:right; margin:0; padding:0; font-family:sans-serif; cursor:pointer; color:#999; text-align:center; padding:1px' onclick='closeHH();'>ⓧ</div> <h4 style='text-align:center'>Homoglyph Hunter</h4> <div id=HHStatus>";
	var markupAfterOptions = "</div> <div id=HHMixedWords></div><br><br><div id=HHSnippets></div> </div><br><br><br><br><br><br>";

	var $overlay = $('#HHContainer');
	$overlay.css('display', 'inline');
	$overlay.html(markupBeforeOptions + letsGo + markupAfterOptions);

	// Replace the placeholders in letsGo with the labels for the current settings.
	var viceVersaLabel = viceversa ? "Also show" : "Skip";
	var sortLabel = sortbyscore ? "magic score" : "raw results count";
	$("#FHOptViceVersa").html(viceVersaLabel);
	$("#FHOptSort").html(sortLabel);
}

/** Hide the overlay; the element and its contents stay in the page. */
function closeHH() {
	var $overlay = $('#HHContainer');
	$overlay.css({ display: 'none' });
}

/**
 * Fetch the wikitext of one page and, if it contains mixedWord, append a result
 * entry to #HHSnippets: fix links (Latin / Cyrillic / encoding), open and edit
 * links, the title, and up to 75 characters of context around each occurrence.
 *
 * Also stores the server timestamp of the fetch in the global startTime, which
 * fixHHArticle later sends as starttimestamp.
 *
 * @param {string} mixedWord - the mixed-script word to look for (matched literally)
 * @param {string} theTitle - title of the page to inspect
 */
function getHHSnippets (mixedWord, theTitle) {
	// Escape regex metacharacters so the word is always matched literally and can
	// never produce an invalid pattern.
	var literalWord = mixedWord.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

	var regexSearch = new mw.Api().get( {
		action: 'query',
		prop: 'revisions',
		titles: theTitle,
		rvprop: 'content',
		format: 'json',
		curtimestamp: '1',
		} );
	$.when( regexSearch ).then(function(article) {
		startTime = article.curtimestamp;

		// The response keys pages by page id; take the first (only) entry.
		var pages = (article.query && article.query.pages) || {};
		var page;
		for (var prop in pages) {
			if (Object.prototype.hasOwnProperty.call(pages, prop)) {
				page = pages[prop];
				break;
				}
			}

		// A missing or unreadable page has no revisions; there is nothing to show.
		if (!page || !page.revisions || !page.revisions.length) {
			return;
			}
		var articleText = page.revisions[0]["*"];
		if (typeof articleText !== 'string') {
			return;
			}

		// "." does not cross newlines, so context never extends past the current line.
		var contextRegex = new RegExp(".{0,75}" + literalWord + ".{0,75}", "g");
		var myMatches = articleText.match(contextRegex);
		if (!myMatches) {
			return;
			}

		var mixedWordRegex = new RegExp(literalWord, "g");
		var coloredWord = colorizeString(mixedWord);
		// Function replacer: the replacement HTML is inserted verbatim.
		var displayTitle = theTitle.replace(mixedWordRegex, function () { return coloredWord; });

		// One candidate fix per conversion map; the last field is the direction
		// code understood by fixHHArticle.
		var fixes = [
			{ label: 'fix-latn', color: 'blue',  direction: 1, text: convertScript(mixedWord, Cyr2LatMap) },
			{ label: 'fix-cyrl', color: 'red',   direction: 2, text: convertScript(mixedWord, Lat2CyrMap) },
			{ label: 'fix-enc',  color: 'black', direction: 3, text: convertScript(mixedWord, EncErrMap) }
			];
		var escTitle = quoteEsc(theTitle);

		var resultHTML = '<font size=-1>';
		for (var f = 0; f < fixes.length; f++) {
			var fix = fixes[f];
			// Only offer a fix that would actually change the word.
			if (fix.text != mixedWord) {
				resultHTML += '(<a style="color:' + fix.color + '" href=# onclick=\'fixHHArticle(this, "' + mixedWord + '","' + escTitle + '", ' + fix.direction + ')\'><b>' + fix.label + ':</b> ' + colorizeString(fix.text) + '</a>) ';
				}
			}
		// mw.util.getUrl URL-encodes the title, so titles containing "?", "&" or "%"
		// still link to the right page.
		resultHTML += '(<a href="' + mw.util.getUrl(theTitle) + '" target=_blank>open</a>) (<a href="' + mw.util.getUrl(theTitle, { action: 'edit' }) + '" target=_blank>edit</a>)</font> <b>' + displayTitle + '</b> <ol>';

		for (var i = 0; i < myMatches.length; i++) {
			// Escape "&" before "<" so wikitext entities are shown as typed.
			var display = myMatches[i].replace(/&/g, "&amp;").replace(/</g, "&lt;");
			// Yellow: link targets, template parameter names, tags, URLs and file
			// extensions — places where changing text may break something.
			display = display.replace(/\[\[[^\]|]+]?]?|([^\s=|]+\s*=)|(&lt;[^\s|>]*>?)|https?:\/\/[^\s|]*|(\.(jpe?g|gif|png|svg|tiff|xcf|mp3|mid|ogg|flac|wav|djvu?|pdf|tab))/ig, "<span style='background-color:#FFFF99;'>$&</span>");
			// Green: the mixed-script word itself.
			display = display.replace(mixedWordRegex, "<span style='background-color:#CFC'>$&</span>");
			resultHTML += '<li style="font-family:monospace">...' + display + '...</li>';
			}
		resultHTML += '</ol><br>';
		$('#HHSnippets').append(resultHTML);
		});
	return;
	}

/**
 * First step of the per-word lookup chain: search page titles for target, show
 * snippets for every hit, then continue with getHHTemplates.
 *
 * Does nothing while another search is running (slowFetch is set). The flag is
 * taken here and released at the end of the chain, or here if the request fails.
 *
 * @param {string} target - the mixed-script word the user clicked
 */
function getHHTitles( target ) {

	if (slowFetch) {
		return;
		}
	slowFetch = 1;

	var title_target = target;

	// Targets longer than two characters are wrapped in slashes, i.e. searched as a regex.
	if (target.length > 2) {
		title_target='/' + target + '/';
		}

	$('#HHSnippets').html('<i>Be careful changing text in links!</i><br><br>');

	var titlesearch = new mw.Api().get( {
		action: 'query',
		list: 'search',
		format: 'json',
		srlimit: '50',
		srsearch: 'intitle:' + title_target
		} ).fail( function( code, result ) {
		// The chain stops here, so release the busy flag; otherwise every later
		// click would be ignored until the page is reloaded.
		slowFetch = 0;
		if ( code === "http" ) {
			alert( "HTTP error: " + result.textStatus ); // result.xhr contains the jqXHR object
		} else if ( code === "ok-but-empty" ) {
			alert( "Error: Got an empty response from the server" );
		} else {
			alert( "API error: " + code );
		}
		return;
		} );
	$.when( titlesearch ).then(function(results) {
		// Local variable: assigning without "var" would create a global.
		var searches = results.query.search;

		if (searches.length !== 0) {
			$('#HHSnippets').append('<h4>Titles (' + searches.length + ') for ' + colorizeString(target) + '</h4>');
			for (var i = 0; i < searches.length; i++) {
				getHHSnippets(target, searches[i].title);
				}
			}
		getHHTemplates(target);
		});
	return;
	}

/**
 * Second step of the per-word lookup chain: run the 'template:"<target>"'
 * search, show snippets for every hit, then continue with getHHFullText.
 *
 * Keeps slowFetch set while running; releases it if the request fails, since
 * the rest of the chain (which normally releases it) will not run.
 *
 * @param {string} target - the mixed-script word being looked up
 */
function getHHTemplates( target ) {

	slowFetch = 1;

	var templatesearch = new mw.Api().get( {
		action: 'query',
		list: 'search',
		format: 'json',
		srlimit: '50',
		srsearch: 'template:"' + target + '"'
		} ).fail( function( code, result ) {
		// Release the busy flag so the tool stays usable after an error.
		slowFetch = 0;
		if ( code === "http" ) {
			alert( "HTTP error: " + result.textStatus ); // result.xhr contains the jqXHR object
		} else if ( code === "ok-but-empty" ) {
			alert( "Error: Got an empty response from the server" );
		} else {
			alert( "API error: " + code );
		}
		return;
		} );
	$.when( templatesearch ).then(function(results) {
		// Local variable: assigning without "var" would create a global.
		var searches = results.query.search;

		if (searches.length !== 0) {
			$('#HHSnippets').append('<h4>Templates (' + searches.length + ') for ' + colorizeString(target) + '</h4>');
			for (var i = 0; i < searches.length; i++) {
				getHHSnippets(target, searches[i].title);
				}
			}
		getHHFullText(target);
		});
	return;
	}

/**
 * Last step of the per-word lookup chain: run an "insource:" search for target
 * and show snippets for every hit.
 *
 * Releases slowFetch when the search has been processed, and also when the
 * request fails, so a new lookup can be started either way.
 *
 * @param {string} target - the mixed-script word being looked up
 */
function getHHFullText( target ) {

	slowFetch = 1;

	var fulltextsearch = new mw.Api().get( {
		action: 'query',
		list: 'search',
		format: 'json',
		srlimit: '50',
		srsearch: 'insource:' + target
		} ).fail( function( code, result ) {
		// Release the busy flag so the tool stays usable after an error.
		slowFetch = 0;
		if ( code === "http" ) {
			alert( "HTTP error: " + result.textStatus ); // result.xhr contains the jqXHR object
		} else if ( code === "ok-but-empty" ) {
			alert( "Error: Got an empty response from the server" );
		} else {
			alert( "API error: " + code );
		}
		return;
		} );
	$.when( fulltextsearch ).then(function(results) {
		// Local variable: assigning without "var" would create a global.
		var searches = results.query.search;

		if (searches.length !== 0) {
			$('#HHSnippets').append('<h4>Full-Text Results (' + searches.length + ') for ' + colorizeString(target) + '</h4>');
			for (var i = 0; i < searches.length; i++) {
				getHHSnippets(target, searches[i].title);
				}
			}

		// End of the chain started by getHHTitles.
		slowFetch = 0;

		});

	return;
	}

/**
 * Replace every occurrence of mixedWord in a page with its converted form and
 * save the page as a minor edit. Invoked from the fix links built by
 * getHHSnippets.
 *
 * The clicked link is disabled and hidden immediately; a "FIXED" or "ERROR"
 * marker is inserted after it once the outcome is known.
 *
 * @param {Element} linkElem - the fix link that was clicked
 * @param {string} mixedWord - the mixed-script word to replace (matched literally)
 * @param {string} theTitle - page title as encoded by quoteEsc
 * @param {number} direction - 3: apply EncErrMap; 2: Latin to Cyrillic;
 *     anything else: Cyrillic to Latin
 */
function fixHHArticle( linkElem, mixedWord, theTitle, direction ) {
	var pageTitle = quoteUnesc(theTitle);
	$(linkElem).attr('onclick','');
	$(linkElem).css('display', 'none');

	function markError() {
		$(linkElem).after("<b style='font-size:80%'>ERROR</b>");
		}

	// Shared handler for a failed fetch or a failed save.
	function reportApiFailure( code, result ) {
		if ( code === "http" ) {
			alert( "HTTP error: " + result.textStatus ); // result.xhr contains the jqXHR object
		} else if ( code === "ok-but-empty" ) {
			alert( "Error: Got an empty response from the server" );
		} else {
			alert( "API error: " + code );
		}
		markError();
		}

	//Get content of article
	new mw.Api().get( {
		action: 'query',
		titles: pageTitle,
		prop: [ 'revisions', 'info' ],
		rvprop: 'content',
		indexpageids: 1,
		rawcontinue: ''
	} ).done( function( result ) {
		// pageids is an array; pick its single element explicitly instead of
		// relying on array-to-string coercion when indexing pages.
		var artID = result.query.pageids[ 0 ];
		var page = result.query.pages[ artID ];

		// A page that no longer exists comes back without revisions.
		if ( !page || !page.revisions || !page.revisions.length ) {
			markError();
			return;
			}
		var artContents = page.revisions[ 0 ][ '*' ];

		// Match the word literally even if it contains regex metacharacters.
		var literalWord = mixedWord.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
		var mixedWordRegex = new RegExp (literalWord, "g");

		var replacement;
		var fixMsg;

		if (direction == 3) {
			// Encoding Error
			replacement = convertScript(mixedWord, EncErrMap);
			fixMsg = 'fix encoding error: ' + mixedWord + ' → ' + replacement;
			}
		else if (direction == 2) {
			// Latin to Cyrillic; the summary brackets the Latin characters being converted
			replacement = convertScript(mixedWord, Lat2CyrMap);
			fixMsg = 'fix homoglyphs: convert Latin characters in ' + mixedWord.replace(LatHomoglyphRegex, "[$&]") + ' to Cyrillic';
			}
		else {
			// Cyrillic to Latin; the summary brackets the Cyrillic characters being converted
			replacement = convertScript(mixedWord, Cyr2LatMap);
			fixMsg = 'fix homoglyphs: convert Cyrillic characters in ' + mixedWord.replace(CyrHomoglyphRegex, "[$&]") + ' to Latin';
			}

		// Function replacer: the replacement is inserted verbatim, with no "$" expansion.
		artContents = artContents.replace(mixedWordRegex, function () { return replacement; });

		new mw.Api().postWithToken( 'edit', {
			action: 'edit',
			title: pageTitle,
			text: artContents,
			summary: fixMsg,
			minor: '1',
			starttimestamp: startTime,
		} ).done( function() {
			$(linkElem).after("<b style='font-size:80%'>FIXED</b>");
		} ).fail( reportApiFailure );

	} ).fail( reportApiFailure );
	}

/**
 * Run the regex search for mixed Latin/Cyrillic words (insourcePat), collect
 * the distinct highlighted words from the result snippets, look up a hit count
 * for each, and render them in #HHMixedWords as links that start the per-word
 * lookup (getHHTitles).
 *
 * Ordering and filtering follow the viceversa / sortbyscore settings. Does
 * nothing while another search is running; the slowFetch flag is released once
 * the list has been rendered, when nothing was found, or when a request fails.
 */
function findHomoglyphs() {
	if (slowFetch) {
		return;
		}
	slowFetch = 1;

	$('#HHStatus').html("<b>Fetching data... this can take 30 seconds or more.</b>");

	// Common failure path: clear the progress message and release the busy flag
	// so the tool is not left permanently locked.
	function searchFailed( code ) {
		slowFetch = 0;
		$('#HHStatus').html('');
		alert( "API error: " + code );
		}

	var regexSearch = new mw.Api().get( {
		action: 'query',
		list: 'search',
		format: 'json',
		srlimit: '10000',
		srsearch: 'insource:' + insourcePat
		} );

	$.when( regexSearch ).then(function(x) {
		var hits = x.query.search;

		if (hits.length === 0) {
			slowFetch = 0;
			$('#HHStatus').html('');
			$('#HHMixedWords').html("Nothing found.");
			return;
			}

		// matches[word] = share of the word's characters that are Latin, or (when
		// both options are on and the word is mostly non-Latin) that are Cyrillic.
		var matches = {};
		var re = /<span class="searchmatch">(.*?)<\/span>/g;
		var m;

		for (var h = 0; h < hits.length; h++) {
			var snip = hits[h].snippet;

			while ((m = re.exec(snip)) !== null) {
				var word = m[1];
				// Skip empty highlights (they would score NaN) and words already seen.
				if (word.length === 0 || typeof matches[word] != 'undefined') {
					continue;
					}
				matches[word] = (word.match(LatOneRegex) || []).length / word.length;
				if (viceversa == 1 && sortbyscore == 1 && matches[word] < 0.5) {
					matches[word] = (word.match(CyrOneRegex) || []).length / word.length;
					}
				}
			}
		var terms = Object.keys(matches).sort(function(a, b) {
			return matches[b] - matches[a];
			});

		// One count query per term. Each promise is reduced to the bare hit count
		// so that $.when delivers the same shape whether it waits on one promise
		// or many (with a single promise it passes that promise's own arguments
		// through instead of wrapping them in an array).
		var mwapi = new mw.Api();
		var artCountPromises = terms.map(function(term) {
			return mwapi.get( { action: 'query', list: 'search', format: 'json', srlimit: '1', srsearch: 'insource:' + term } )
				.then(function(data) {
					return data.query.searchinfo.totalhits;
					});
			});

		// Plain objects: these are keyed by term, not by index.
		var count = {};
		var score = {};
		$.when ( ...artCountPromises ).then(function(...hitCounts) {
			var resultHTML = '';
			for (var i = 0; i < hitCounts.length; i++) {
				count[terms[i]] = hitCounts[i];
				// give some weight to score, but more to Latin-ness, with a small x/1000 addition to sort 0-count items properly
				score[terms[i]] = Math.log10(count[terms[i]] + 1) * matches[terms[i]] * matches[terms[i]] + (matches[terms[i]]/1000);
				}

			terms = terms.sort(function(a, b) {
				if (sortbyscore) {
					return score[b] - score[a];
					}
				return count[b] - count[a];
				});

			for (var j = 0; j < terms.length; j++) {
				if (score[terms[j]] <= 0 && viceversa == 0) {
					continue;
					}
				if (viceversa == 1 || matches[terms[j]] >= 0.5) {
					var display = colorizeString(terms[j]);
					if (resultHTML) {
						resultHTML += ' — ';
						}
					resultHTML += "<a href='#' style='color:black' onclick='copyToClipboard(\"" + terms[j] + "\"); getHHTitles(\"" + terms[j] + "\")'>" + display + "</a>&nbsp;(" + count[terms[j]] + ")";
					}
				}

			if ('' === resultHTML) {
				resultHTML = "Nothing found.";
				}

			$('#HHStatus').html('');
			$('#HHMixedWords').html(resultHTML);

			// Only now is the whole search finished.
			slowFetch = 0;
			}, searchFailed );
		}, searchFailed );

	return;
	}

/**
 * Wrap each character of str in a colored <span> according to its script:
 * Latin homoglyphs #00F, other Latin #AAF, Cyrillic homoglyphs #F00, other
 * Cyrillic #FAA. Characters of neither script are passed through unchanged.
 *
 * @param {string} str - text to colorize
 * @returns {string} HTML markup
 */
function colorizeString(str) {
	// Checked in order: Latin first, then Cyrillic.
	var palettes = [
		{ script: LatOneRegex, homoglyph: LatHomoglyphRegex, strong: '#00F', faint: '#AAF' },
		{ script: CyrOneRegex, homoglyph: CyrHomoglyphRegex, strong: '#F00', faint: '#FAA' }
	];

	return str.split('').map(function (ch) {
		for (var p = 0; p < palettes.length; p++) {
			var palette = palettes[p];
			// match() (not test()) so the shared global regexes carry no lastIndex state.
			if (ch.match(palette.script)) {
				var colour = ch.match(palette.homoglyph) ? palette.strong : palette.faint;
				return "<span style='color:" + colour + "'>" + ch + "</span>";
			}
		}
		// Neither Latin nor Cyrillic: leave as is.
		return ch;
	}).join('');
}

// Copy a string to the clipboard by selecting it inside a temporary <input>
// element, issuing the "copy" command, and removing the element again.
function copyToClipboard(string) {
	var $scratch = $("<input>").appendTo("body");
	$scratch.val(string).select();
	document.execCommand("copy");
	$scratch.remove();
}

// Fill dest with the reverse of src (value -> key) for every value that is exactly
// one character long, and delete from src every key longer than one character.
// Both objects are modified in place; nothing is returned.
function invertAndLengthFilter(src, dest) {
	Object.keys(src).forEach(function (original) {
		var counterpart = src[original];
		if (counterpart.length === 1) {
			dest[counterpart] = original;
		}
		if (original.length > 1) {
			delete src[original];
		}
	});
}

// Translate a string character by character: each character that has a (truthy)
// entry in map is replaced by that entry, every other character is kept.
function convertScript(str, map) {
	return str.split('').map(function (ch) {
		return map[ch] || ch;
	}).join('');
}

// Percent-encode quote characters (' -> %27, " -> %22) so a string can be placed
// inside quoted HTML attribute values; quoteUnesc reverses this.
function quoteEsc (theString) {
	return theString.replace(/['"]/g, function (quote) {
		return quote === "'" ? "%27" : "%22";
	});
}

// Reverse quoteEsc: turn %27 back into ' and %22 back into ".
function quoteUnesc (theString) {
	return theString.replace(/%2[27]/g, function (code) {
		return code === "%27" ? "'" : '"';
	});
}


// Once the mediawiki.util module is loaded and the document is ready, add a
// "Homoglyph Hunter" link to the p-tb portlet; clicking it opens the overlay
// instead of following the "#" href.
$.when( mw.loader.using( ['mediawiki.util'] ), $.ready ).then( function() {
	var toolLink = mw.util.addPortletLink( 'p-tb', '#', 'Homoglyph Hunter' );
	$( toolLink ).on( 'click', function ( event ) {
		event.preventDefault();
		initialize_HHunter();
	} );
} );