User:V111P/js/wikiParserV.js
Appearance
< User:V111P | js
Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump. This code will be executed when previewing this page. |
This user script seems to have a documentation page at User:V111P/js/wikiParserV. |
/*
* wikiParserV.js
* ver. 2013-11-02
* Home: http://en.wikipedia.org/wiki/User:V111P/js/wikiParserV
*
* This is a library of useful functions, mostly for working with wiki code.
* Includes functions for removing html tags.
*
* You can use the code in this script under the
* Creative Commons Attribution 3.0 Unported License (CC-BY 3.0)
* http://creativecommons.org/licenses/by/3.0/
* If you do use it, please let me know. Thanks.
*/
mediaWiki.libs.wikiParserV = window.wikiParser = (function () {
"use strict";
// Library version number; exposed as the `version` property of the returned API object.
var version = 1000;
// Shared regular expressions. Several functions in this file also cache their
// lazily built regexes as additional properties of this object.
var re = {
// characters that escapeForRegExp() prefixes with a backslash
escForRegExpG: /[.*+?^$|()[\]{\\^$]/g,
// probe used by checkRegexSupport(): a "<" that is not the start of <a>, </a>, <b> or </b>
testRe: /<(?!\/?(a|b)>)/g,
// every character that is not an ASCII letter, digit, underscore or hyphen
nonAlphanumericAndHyphenCharsG: /[^A-Za-z0-9_-]/g,
// an HTML comment; when it is wrapped in newlines on both sides, both are matched
// and group 1 holds one of them, so that replacing with $1 keeps a single newline
htmlCommentsG: /(\n)?<!--[\S\s]*?-->\1?/g // replace it with $1
};
// Namespace-name arrays (specialNsArr, fileNsArr, categoryNsArr) filled by saveNsNames()
var locale = {}; // used in removeElements()
// Detached scratch element, created once when the library is loaded
var $tempDiv = $('<div/>'); // used in unescapeCharEntities()
// Value of the wgScriptPath config variable; (re)assigned by formatUrl() on every call
var wgScriptPath;
// Replacement list built on first use by encodeSectionNameForUrl()
var sectionNameUriEncodingAdditionalReplacements;
// Decodes HTML character references in str (named or numeric) and returns
// the resulting plain text.
// The decoding is delegated to the browser: the string is assigned as the
// HTML of the shared scratch element $tempDiv and read back as text.
// str - the string to decode. Returns the decoded string.
function unescapeCharEntities(str) {
  // Escape EVERY angle bracket first. Otherwise anything looking like a tag
  // would be parsed as markup by .html() - it would be dropped from the
  // result and could even run inline event handlers.
  var withoutMarkup = str.replace(/</g, '&lt;').replace(/>/g, '&gt;');
  return $tempDiv.html(withoutMarkup).text();
}
// Builds a site-relative URL for a wiki page.
// article - page title, optionally followed by "#section name"
// noredir - if truthy, return an index.php URL with redirect=no
// edit    - if truthy (and noredir is falsy), return the edit URL of the page;
//           the section part is dropped
// Otherwise a "/wiki/Title#section" URL is returned.
// Spaces are replaced with underscores. The title and the section are
// URI-encoded only when the title contains a section part ("Page#Section");
// in the section, "%" is then replaced with "." (anchor encoding).
// A title without a section part is used as is.
function formatUrl(article, noredir, edit) {
  wgScriptPath = mw.config.get('wgScriptPath');
  article = article.replace(/ /g, '_');
  var pagePlusHash = article.match(/(.+)#(.+)/);
  if (pagePlusHash)
    article = encodeURIComponent(pagePlusHash[1]) + '#'
      + encodeURIComponent(pagePlusHash[2]).replace(/%/g, '.');
  if (noredir) {
    // The fragment has to come AFTER the query string. If it stays attached
    // to the title, "&redirect=no" becomes a part of the fragment and is
    // never sent to the server.
    var hashAt = article.indexOf('#');
    var title = (hashAt == -1) ? article : article.slice(0, hashAt);
    var fragment = (hashAt == -1) ? '' : article.slice(hashAt);
    return wgScriptPath + '/index.php?title=' + title + '&redirect=no' + fragment;
  }
  else if (edit)
    return wgScriptPath + '/index.php?title='
      + article.replace(/#.*/, '') + '&action=edit';
  else
    return '/wiki/' + article;
} // formatUrl
// Encodes a section name for use as the fragment (the part after "#") of a URL.
// Spaces become underscores, the result is URI-encoded, a few characters
// that encodeURIComponent() leaves alone are encoded by hand, an encoded
// colon is restored, and finally every "%" is turned into ".".
// str - the section name. Returns the encoded name.
function encodeSectionNameForUrl(str) {
  // the replacement list is built once and kept for later calls
  if (!sectionNameUriEncodingAdditionalReplacements) {
    sectionNameUriEncodingAdditionalReplacements = [
      {re: /~/g, newVal: '.7E'},
      {re: /!/g, newVal: '.21'},
      {re: /\*/g, newVal: '.2A'},
      {re: /\(/g, newVal: '.28'},
      {re: /\)/g, newVal: '.29'},
      {re: /\'/g, newVal: '.27'},
      {re:/%3A/g, newVal: ':'}
    ];
  }
  var rules = sectionNameUriEncodingAdditionalReplacements;
  var encoded = encodeURIComponent(str.replace(/ /g, '_'));
  for (var k = 0; k < rules.length; k++) {
    encoded = encoded.replace(rules[k].re, rules[k].newVal);
  }
  return encoded.replace(/%/g, '.');
} // encodeSectionNameForUrl
// Encodes a section name into a string that consists only of ASCII letters,
// digits, underscores and hyphens.
// Dots are first rewritten as "_46", the name is then encoded with
// encodeSectionNameForUrl(), every colon becomes "_3A" and every other
// remaining special character (including the dots of the ".XX" escapes)
// becomes "_".
// str - the section name. Returns the encoded string.
function encodeSectionNameForId(str) {
  str = encodeSectionNameForUrl(str.replace(/\./g, '_46'))
    .replace(/:/g, '_3A') // global: all colons, not only the first one
    .replace(re.nonAlphanumericAndHyphenCharsG, '_');
  return str;
} // encodeSectionNameForId
// Returns str with a backslash inserted in front of every character matched
// by re.escForRegExpG, so that the string can be embedded in a RegExp source.
function escapeForRegExp(str) {
  return str.replace(re.escForRegExpG, function (specialChar) {
    return '\\' + specialChar;
  });
} // escapeForRegExp
// Builds the regular expressions used by removeEls() to strip elements
// delimited by startTag ... endTag.
// Returns {pretreat: RegExp|null, main: RegExp}, both case-insensitive and global:
//  main     - matches an innermost element (one with no start or end tag
//             inside it) together with the newlines around it; group 1 is
//             the leading newline, if any
//  pretreat - only if startTagOfEmbededEl is given: matches an element up to
//             and including the first embedded element that shares the
//             same end tag; group 1 is the part that precedes the embedded element
// In startTag the markers <<<, @@@ and >>> are turned into "(", "|" and ")"
// AFTER the escaping (see removeElRegExpStartArr()).
function removeElRegExp(startTag, endTag, startTagOfEmbededEl) {
  var ANY_CHAR = '[\\S\\s]';
  var open = escapeForRegExp(startTag)
    .replace(/<<</g, '(')
    .replace(/@@@/g, '|')
    .replace(/>>>/g, ')');
  var close = escapeForRegExp(endTag);
  var pretreat = null;

  if (startTagOfEmbededEl) {
    var embeddedOpen = escapeForRegExp(startTagOfEmbededEl);
    pretreat = new RegExp([
      '(', open, '(?:(?!', close, ')', ANY_CHAR, ')*?)',
      embeddedOpen, '(?:(?!', embeddedOpen, ')', ANY_CHAR, ')*?',
      close
    ].join(''), 'gi');
  }

  var main = new RegExp([
    '(\\n)?', open,
    '((?!', open, '|', close, ')', ANY_CHAR, ')*',
    close, '\\1?'
  ].join(''), 'gi');

  return {pretreat: pretreat, main: main};
} // removeElRegExp
// Same as removeElRegExp(), but for elements that can start in several ways:
// the start tag is startTagPre + (any string of startTagArr) + startTagPost.
// startTagOfEmbededEl - needed because, for example, files and wiki links
// have the same closing tag. So to remove files, the start tag has to be
// '[[File:' and startTagOfEmbededEl has to be '[['.
function removeElRegExpStartArr(startTagPre, startTagArr, startTagPost,
    endTag, startTagOfEmbededEl) {
  // <<<, @@@ and >>> are placeholders which removeElRegExp() converts
  // into a regex group with alternatives once the tag has been escaped
  var alternatives = '<<<' + startTagArr.join('@@@') + '>>>';
  return removeElRegExp(startTagPre + alternatives + startTagPost,
    endTag, startTagOfEmbededEl);
} // removeElRegExpStartArr
// Removes from data all elements described by res, an object returned by
// removeElRegExp() / removeElRegExpStartArr().
// Every regex is applied again and again (each match being replaced with
// its group 1) until the text stops changing, which peels nested elements
// off from the inside out. res.pretreat, if present, is run before res.main.
// iterationLimit - max. number of passes per regex (default: 1000)
// Returns the resulting string.
function removeEls(data, res, iterationLimit) {
  var maxPasses = iterationLimit || 1000;

  function replaceUntilStable(text, regex) {
    var passesLeft = maxPasses; // guards against an endless loop, just in case
    var previous;
    for (;;) {
      passesLeft--;
      previous = text;
      text = text.replace(regex, '$1');
      if (text == previous || passesLeft <= 0)
        return text;
    }
  }

  if (res.pretreat)
    data = replaceUntilStable(data, res.pretreat);
  return replaceUntilStable(data, res.main);
} // removeEls
// Collects all names (keys of the wgNamespaceIds config object) of some
// namespaces and stores them in the shared locale object:
//   locale.specialNsArr  - names with id -1
//   locale.fileNsArr     - names with id 6 or -2
//   locale.categoryNsArr - names with id 14
// The three arrays are created anew on every call.
function saveNsNames() {
  var specialNames = locale.specialNsArr = [];
  var fileNames = locale.fileNsArr = [];
  var categoryNames = locale.categoryNsArr = [];

  function addOnce(list, name) {
    if ($.inArray(name, list) == -1)
      list.push(name);
  }

  $.each(mw.config.get('wgNamespaceIds'), function (name, id) {
    switch (String(id)) {
      case '-1': // 'special'
        addOnce(specialNames, name);
        break;
      case '6': // 'file'/'image'
      case '-2': // 'media'
        addOnce(fileNames, name);
        break;
      case '14': // 'category'
        addOnce(categoryNames, name);
        break;
    }
  });
} // saveNsNames
// Replaces nowiki markup with HTML character references, so that the text
// it protects is no longer seen as wiki/HTML syntax. Won't work in all cases.
// Two forms are handled:
//  1. an empty nowiki tag (<nowiki/>, <nowiki /> or <nowiki></nowiki>) placed
//     between two characters: the tag is removed and one of the two
//     neighbouring characters is escaped, if it is a special one
//  2. <nowiki>...</nowiki> and <pre>...</pre>: the tags are removed and all
//     special characters of the contents are escaped
// See https://en.wikipedia.org/wiki/Help:Nowiki#WP:NOWIKI
// data - the wiki code. Returns the converted wiki code.
function escCharsForNowikiTags(data) {
  var nowikiCharTranslMap = {
    '[': '&#91;', ']': '&#93;', '{': '&#123;', '}': '&#125;',
    '<': '&lt;', '>': '&gt;', ':': '&#58;', '*': '&#42;', '#': '&#35;'
  };
  // group 1: the character before the empty tag (or the start of the text),
  // group 2: the character after it
  var singleCharEscReG = re.singleCharEscG
    || (re.singleCharEscG = /(.|^)<(?:nowiki ?\/|nowiki><\/nowiki)>(.)/g);
  data = data.replace(singleCharEscReG, function (m, $1, $2) {
    // "<" + tag + "pre>" is the usual trick for showing a literal tag
    if ($1 == '<') return '&lt;' + $2;
    else if (nowikiCharTranslMap[$2]) return $1 + nowikiCharTranslMap[$2];
    else if (nowikiCharTranslMap[$1]) return nowikiCharTranslMap[$1] + $2;
    // nothing to escape: only drop the tag. (Without this return the
    // callback would yield undefined and "undefined" would be inserted.)
    return $1 + $2;
  });
  var noWikiElReG = re.noWikiElG || (re.noWikiElG = /<(nowiki|pre)>([\S\s]*?)<\/\1>/g);
  var noWikiReplaceCharsReG = re.noWikiReplG || (re.noWikiReplG = /\[|]|\{|}|<|>|:|\*|#/g);
  data = data.replace(noWikiElReG, function (match, $1, $2) {
    // $2 is the whole contents of the element;
    // each special character in it is looked up individually
    return $2.replace(noWikiReplaceCharsReG, function (specialChar) {
      return nowikiCharTranslMap[specialChar];
    });
  });
  return data;
} // escCharsForNowikiTags
// Removes whole classes of elements from the wiki code in data.
// elStr - a comma+space-separated list of what to remove. Known names:
//   'comments', 'tables', 'templates', 'references', 'files', 'categories',
//   'bold/italic', 'behavior switches', 'others' (timelines).
//   Unknown names are ignored.
// The removals are always done in the order listed above, no matter how the
// names are ordered in elStr. The regular expressions are built on first
// use and cached in the shared re object.
// Returns the resulting wiki code.
function removeElements(data, elStr) {
  var requested = elStr.split(', ');

  function isRequested(name) {
    return $.inArray(name, requested) > -1;
  }

  if (isRequested('comments')) {
    data = data.replace(re.htmlCommentsG, '$1');
  }

  if (isRequested('tables')) {
    if (!re.wikiTable)
      re.wikiTable = removeElRegExp('{|', '|}');
    data = removeEls(data, re.wikiTable);
    if (!re.htmlTable)
      re.htmlTable = removeElRegExp('<table', '</table>');
    data = removeEls(data, re.htmlTable);
  }

  if (isRequested('templates')) {
    if (!re.templates)
      re.templates = removeElRegExp('{{', '}}');
    data = removeEls(data, re.templates);
  }

  if (isRequested('references')) {
    if (!re.refs)
      re.refs = /<ref[^>]*?(\/>|>[\S\s]*?<\/ref\s*>)/ig;
    data = data.replace(re.refs, '');
  }

  if (isRequested('files')) {
    // the localized namespace names are needed for building the regex
    if (!locale.fileNsArr)
      saveNsNames();
    if (!re.files)
      re.files = removeElRegExpStartArr('[[', locale.fileNsArr, ':', ']]', '[[');
    data = removeEls(data, re.files);
    if (!re.gallery)
      re.gallery = /(\n)?<gallery[^>]*>[\S\s]*?<\/gallery>\1?/gi;
    data = data.replace(re.gallery, '$1');
  }

  if (isRequested('categories')) {
    if (!locale.categoryNsArr)
      saveNsNames();
    if (!re.category)
      re.category = removeElRegExpStartArr('[[', locale.categoryNsArr, ':', ']]');
    data = removeEls(data, re.category);
  }

  if (isRequested('bold/italic')) {
    if (!re.boldItalicG)
      re.boldItalicG = /<\/?(i|b|strong|em)>|'''?|('){2,3}/gi;
    data = data.replace(re.boldItalicG, '');
  }

  if (isRequested('behavior switches')) {
    if (!re.behaviorSwitchesG)
      re.behaviorSwitchesG = /(\n)?__[^\s]+?__\1?/g;
    data = data.replace(re.behaviorSwitchesG, '$1');
  }

  if (isRequested('others')) {
    if (!re.timelineG)
      re.timelineG = /(\n)?<timeline>[\S\s]*?<\/timeline>\1?/gi;
    data = data.replace(re.timelineG, '$1');
  }

  return data;
} // removeElements;
// Converts all wikilinks in data to plain text: [[Target|label]] becomes
// "label" and [[Target]] becomes "Target".
// All files ([[File:...]]) must be removed BEFORE calling this function.
// Returns the resulting string.
function unlink(data) {
  var prev, cntr = 1000;
  // the cache key has to be the same for reading and for writing,
  // otherwise the regex is re-created on every call
  var remAddrReG = re.remAddrG || (re.remAddrG = /\[\[[^|\]]*\|/g);
  var unlinkLinksReG = re.unlinkLinksReG || (re.unlinkLinksReG = /\[\[([^\]\[]+)\]\]/g);
  do {
    cntr--; // anti infinite-loop counter
    prev = data;
    // remove addresses from all links ("[[Target|" -> "[["); repeated
    // because a link can contain more than one pipe character
    data = data.replace(remAddrReG, '[[');
  } while (data != prev && cntr > 0);
  // unlink all links:
  data = data.replace(unlinkLinksReG, '$1');
  return data;
} // unlink
// Converts wiki bold/italic markup to HTML: '''text''' becomes <b>text</b>
// and ''text'' becomes <i>text</i>.
// A run that is not closed on its own line ends at the end of that line.
// Runs of four, six or more apostrophes are removed beforehand.
// data - the wiki code. Returns the converted string.
function boldAndItalicToHtml(data) {
  if (!re.boldAndItalicToHtml1) {
    // the first regex removes four, six, or more apostrophes
    re.boldAndItalicToHtml1 = /(^|[^'])''''('{2,})?([^']|$)/g;
    re.boldAndItalicToHtml2 = /'''([^'\n][^\n]*?)('''|\n)/g;
    re.boldAndItalicToHtml3 = /''([^\n]+?)(''|\n)/g;
  }
  // Wraps the text in the given tag. A newline which terminated the run
  // is not a part of the markup, so it is put back after the closing tag.
  function wrapIn(tag) {
    return function (match, text, terminator) {
      return '<' + tag + '>' + text + '</' + tag + '>'
        + (terminator == '\n' ? '\n' : '');
    };
  }
  // groups 1 and 3 of the first regex are the characters AROUND the
  // apostrophes; they are only used as delimiters and have to be kept
  return data.replace(re.boldAndItalicToHtml1, '$1$3')
    .replace(re.boldAndItalicToHtml2, wrapIn('b'))
    .replace(re.boldAndItalicToHtml3, wrapIn('i'));
} // boldAndItalicToHtml
// Returns the part of data that precedes the first section heading.
// A heading is a line of the form "==Title==": it starts with one or more
// "=" characters and ends with the same number of them, optionally followed
// by spaces/tabs. The newline in front of the heading is not included in
// the result. If the text starts with a heading, or only a newline precedes
// the first heading, the result is the empty string.
// If there are no headings at all, the whole text is returned - with the
// categories removed (see removeElements()) if removeCategories is truthy.
function beforeTheFirstSection(data, removeCategories) {
  // (?:^|\n) - a heading on the very first line counts as well
  var beforeFirstSectRe = re.beforeFirstSect
    || (re.beforeFirstSect = /^([\S\s]*?)(?=(?:^|\n)(=+).+?\2[^\S\n]*(?:\n|$))/);
  var found = beforeFirstSectRe.exec(data);
  // "a heading was found" must be told apart from "the text before it is
  // empty": an empty lead section is a valid result
  if (found)
    return found[1];
  return removeCategories ? removeElements(data, 'categories') : data;
} // beforeTheFirstSection
// Splits wiki code into its sections.
// Returns an array of objects {eq, level, heading, contents}:
//   eq       - the run of "=" characters that delimits the heading
//   level    - the length of eq
//   heading  - the trimmed heading text
//   contents - the trimmed text between this heading and the next one
// The first element always describes the text before the first heading
// (eq: '', level: 0, heading: '', contents: that text, NOT trimmed; the
// whole of data if there are no headings).
// At most 1000 headings are processed.
function divideSections(data) {
  var lead = {
    eq: '',
    level: 0,
    heading: '',
    contents: data
  };
  var sections = [lead];
  var match;
  // a heading may be on the first line ((^|\n)) and on the last line,
  // even if not followed by a newline ((?=\n|$))
  var regex = re.divSectionsG ||
    (re.divSectionsG = /(^|\n)(=+)(.+?)\2[^\S\n]*(?=\n|$)([\S\s]*?)(?=\n(=+).+?\5[^\S\n]*(?:\n|$)|$)/g);
  var firstHeadingAt = -1;
  var cntr = 1000;
  // The regex is global and shared between calls: always start from the
  // beginning, whatever state a previous call has left behind.
  regex.lastIndex = 0;
  while (cntr > 0 && (match = regex.exec(data))) {
    cntr--;
    if (firstHeadingAt == -1)
      firstHeadingAt = match.index;
    sections.push({
      eq: match[2],
      level: match[2].length,
      heading: $.trim(match[3]),
      contents: $.trim(match[4])
    });
  }
  regex.lastIndex = 0; // may be non-zero if the heading limit was reached
  // The lead is everything in front of the first heading found by the very
  // regex that splits the sections, so the two can never disagree.
  if (firstHeadingAt > -1)
    lead.contents = data.slice(0, firstHeadingAt);
  return sections;
} // divideSections
// Checks that the regex engine handles negative lookahead correctly
// (sanitizeHtml() depends on it).
// In the probe string only the "<" characters which do not start an
// <a>/<b> tag (opening or closing) may be escaped.
// Returns true if the result is the expected one.
function checkRegexSupport() {
  // the replacement has to differ from the matched text, otherwise
  // the comparison below could never fail
  return ('<a><bd</e></b>'.replace(re.testRe, '&lt;') == '<a>&lt;bd&lt;/e></b>');
}
// Removes HTML comments and HTML tags from data, except for the tags in the
// comma+space-separated whiteListTagsStr list (for example 'b, i, br').
// Removes all the attributes from the white-listed tags.
// Unless leaveSpecialChars is truthy, the special characters left in the
// text are then escaped:
//   < and > which are not a part of a white-listed tag -> &lt; and &gt;
//   & which is obviously not a part of a character reference -> &amp;
//   " ' and ` -> &quot; &#39; and &#96;
// Both phases are repeated until the text stops changing.
// Throws 1 if the regex engine fails checkRegexSupport().
// Throws 2 if the text has not stabilized after 1000 passes.
// Returns the sanitized string.
function sanitizeHtml(data, whiteListTagsStr, leaveSpecialChars) {
  if (!checkRegexSupport())
    throw 1; // no (lookahead) regex support
  var whiteList = (whiteListTagsStr || '').split(', ').join('|');
  var nonWhiteListedTagsReG, allTagsG;
  var lessThanNotBeforeWLTagG;
  var grThanNotAndAfterWLTagG;
  var tagAttributesReG;
  var oldData, cntr;
  if (whiteList !== '') {
    // the regexes which depend on the white list are cached per list
    var byAll = re.resByWhitelist = (re.resByWhitelist || {});
    var by = byAll[whiteListTagsStr] || (byAll[whiteListTagsStr] = {});
    nonWhiteListedTagsReG = by.nonWhiteListedTagsG
      || (by.nonWhiteListedTagsG = new RegExp('<(?!/?(' + whiteList + ')(\\b|/))[^>]*>', 'gi'));
    lessThanNotBeforeWLTagG = by.lessThanNotBeforeWLTagG
      || (by.lessThanNotBeforeWLTagG = new RegExp('<(?!/?(' + whiteList + ')/?>)', 'gi'));
    grThanNotAndAfterWLTagG = by.grThanNotAndAfterWLTagG
      || (by.grThanNotAndAfterWLTagG = new RegExp('(</?(' + whiteList + ')/?)?>', 'gi'));
    tagAttributesReG = re.tagAttributesG
      || (re.tagAttributesG = /<(\/?[a-z][a-z0-9]*)[^>]*?(\/)?>/gi);
  }
  else
    allTagsG = re.allTagsG || (re.allTagsG = /<(\b|\/)[^>]*>/g);
  // Phase 1: remove comments and tags. Repeated, because removing one
  // piece can glue the text around it into a new tag or comment.
  cntr = 1000;
  do {
    oldData = data;
    cntr--;
    // remove comments:
    data = data.replace(re.htmlCommentsG, '$1');
    // remove all tags except the white-listed ones
    if (whiteList !== '') {
      data = data.replace(nonWhiteListedTagsReG, '');
      // remove all attributes from the remaining tags:
      data = data.replace(tagAttributesReG, '<$1$2>');
    }
    else
      data = data.replace(allTagsG, '');
  } while (oldData != data && cntr > 0);
  if (cntr <= 0) throw 2;
  // Phase 2: escape the special characters
  if (!leaveSpecialChars) {
    var ampNotInCharRefReG = re.ampReG || (re.ampReG = /&(?!#?[xX]?[a-zA-Z0-9]+;)/g);
    var ltReG = /</g;
    var gtReG = />/g;
    var quoteReG = /"/g;
    var aposReG = /'/g;
    var graveReG = /`/g;
    cntr = 1000;
    do {
      oldData = data;
      cntr--;
      if (whiteList !== '') {
        // html-escape all < and > except if part of a whitelisted tag
        data = data.replace(lessThanNotBeforeWLTagG, '&lt;');
        data = data.replace(grThanNotAndAfterWLTagG, function ($0, $1) {
          // $1 is set only if the > closes a white-listed tag
          return $1 ? $0 : '&gt;';
        });
      }
      else { // html-escape all < and > chars
        data = data.replace(ltReG, '&lt;').replace(gtReG, '&gt;');
      }
      // escape & to &amp; if obviously not a part of a char ref:
      data = data.replace(ampNotInCharRefReG, '&amp;');
      // escape all quotes (` is used in old IE)
      data = data.replace(quoteReG, '&quot;').replace(aposReG, '&#39;')
        .replace(graveReG, '&#96;');
    } while (oldData != data && cntr > 0);
    if (cntr <= 0) throw 2;
  }
  return data;
} // sanitizeHtml
// Finds the segment (of one of the given kinds) that surrounds the
// selection/cursor. Works only for wikilinks right now.
// bsa - an array: [text_before_the_selection/cursor, selection, text_after]
// segmentNames - an array of segment names or a comma+space-separated
//   string of them. Only the name 'wikilink' is recognized.
// Returns the result of focusedCustomSegment() for wikilinks if 'wikilink'
// is among the names; undefined otherwise.
function focusedSegment(bsa, segmentNames) {
  var names = segmentNames;
  if (typeof names != 'object')
    names = names.split(', ');
  var idx = 0;
  while (idx < names.length) {
    if (names[idx] == 'wikilink') {
      return focusedCustomSegment(bsa, '[[', ']]', '', '[]<>{}');
    }
    idx++;
  }
}
// Expands the selection (or the cursor position) to the whole segment,
// delimited by startChars and endChars, that the selection/cursor is in.
// Incomplete implementation.
// bsa - an array with 3 elements: [text_before_the_selection/cursor, selection, text_after]
// startChars, endChars - the char(s) indicating the start/end of the segment
// otherStartChars (optional) - start chars of other segments with the same endChars,
//   needed only for some elements, for example if startChars is [[File:,
//   otherStartChars needs to be [[ because links can be embedded in file elements.
// invalidBeforePipe (optional) - a string with individual illegal characters. Illegal only
//   if before the first pipe character "|" (or anywhere, if there is no pipe character).
// Returns a new array [before, segment, after], or undefined if no segment
// was found or the segment contains an illegal character.
function focusedCustomSegment(bsa, startChars, endChars, otherStartChars, invalidBeforePipe) {
  function endMatches(str, endChars) {
    return (str.slice(-endChars.length) === endChars);
  }
  function startMatches(str, startChars) {
    return (str.slice(0, startChars.length) === startChars);
  }
  var before = bsa[0];
  var selection = bsa[1]; // the selection
  var after = bsa[2];
  var spaces;
  var i;
  if (!startChars || !endChars)
    return;
  if (selection) { // there is some selected text
    spaces = selection.match(/^\s+/);
    if (spaces) { // spaces at the beginning of the selected text
      if (endMatches(before, startChars)) {
        selection = startChars + selection;
        before = before.slice(0, -startChars.length);
      }
      else {
        // move the spaces to the end of the text-before-the-selection:
        before += spaces[0];
        selection = selection.slice(spaces[0].length);
        // check for startChars at beginning of selection:
        if (!startMatches(selection, startChars))
          return;
      }
    }
    else {
      // while no (complete) startChars string at beginning of selection:
      // move a char from the end of textBefore to the beginning of selection
      var startCharsFound = false;
      for (i = 0; i <= startChars.length; i++) {
        if (startMatches(selection, startChars)) {
          startCharsFound = true;
          break;
        }
        if (before.length == 0)
          break;
        selection = before.slice(before.length - 1) + selection;
        before = before.slice(0, before.length - 1);
      }
      if (!startCharsFound)
        return;
      // TODO: check if selection contains only one outer element,
      // and the start-end chars are balanced
    }
    spaces = selection.match(/\s+$/);
    if (spaces) { // spaces at the end of the selected text
      if (startMatches(after, endChars)) {
        selection = selection + endChars;
        after = after.slice(endChars.length);
      }
      else {
        // move the spaces to the beginning of the text-after-the-selection:
        after = spaces[0] + after;
        selection = selection.slice(0, -spaces[0].length);
        if (!endMatches(selection, endChars))
          return;
      }
    }
    else {
      // while no (complete) endChars string found at end of selection:
      // move a char from the beginning of textAfter to the end of selection
      var endCharsFound = false;
      for (i = 0; i <= endChars.length; i++) {
        if (endMatches(selection, endChars)) {
          endCharsFound = true;
          break;
        }
        if (after.length == 0)
          break;
        selection = selection + after.charAt(0);
        after = after.slice(1);
      }
      if (!endCharsFound)
        return;
    }
  } // if (selection)
  else { // no text selected
    var text = before + after;
    // TODO: add a loop to allow the cursor to be after an embedded element
    // NOTE(review): the "- 3" looks tuned for 2-char start tags (the search
    // then starts one char before the cursor) - confirm for other lengths
    var startCharsAt = text.lastIndexOf(startChars, before.length + startChars.length - 3);
    if (startCharsAt == -1)
      return;
    var closing = startCharsAt;
    var opening = startCharsAt;
    var openingOther;
    var segmentFound = false;
    i = 0;
    // Walk over the end tags after the start tag until one is found which
    // is not preceded by another (embedded) start tag. At most 10 tries.
    while (i++ < 10) {
      closing = text.indexOf(endChars, closing + 1);
      if (closing == -1) {
        return;
      }
      if (otherStartChars) {
        openingOther = text.indexOf(otherStartChars, opening);
      }
      opening = text.indexOf(startChars, opening + 1);
      if (opening == -1)
        opening = text.length;
      if (otherStartChars) {
        if (openingOther > -1)
          opening = (openingOther < opening ? openingOther : opening);
      }
      if (closing < opening) {
        if (closing < before.length - endChars.length) {
          return; // the segment ends before the cursor
        }
        // the segment ends where its END tag ends, so the length of
        // endChars (not of startChars) has to be added
        var segmentEnd = closing + endChars.length;
        selection = text.slice(startCharsAt, segmentEnd);
        before = text.slice(0, startCharsAt);
        after = text.slice(segmentEnd);
        segmentFound = true;
        break;
      }
    }
    // all tries used up: report "no segment", not an empty one
    if (!segmentFound)
      return;
  }
  if (invalidBeforePipe) {
    var invalidEscForRe = escapeForRegExp(invalidBeforePipe);
    var beforePipe = selection.slice(startChars.length, -endChars.length).match(/[^|]*/)[0];
    if (beforePipe.match('[' + invalidEscForRe + ']'))
      return;
  }
  return [before, selection, after];
} // focusedCustomSegment
// The public API of the library: the value returned here is what the
// enclosing function expression evaluates to. Every property refers to the
// function (or variable) of the same name defined above.
return {
version: version,
unescapeCharEntities: unescapeCharEntities,
formatUrl: formatUrl,
encodeSectionNameForUrl: encodeSectionNameForUrl,
encodeSectionNameForId: encodeSectionNameForId,
checkRegexSupport: checkRegexSupport,
escCharsForNowikiTags: escCharsForNowikiTags,
removeElRegExp: removeElRegExp,
removeElRegExpStartArr: removeElRegExpStartArr,
removeElements: removeElements,
unlink: unlink,
sanitizeHtml: sanitizeHtml,
boldAndItalicToHtml: boldAndItalicToHtml,
beforeTheFirstSection: beforeTheFirstSection,
divideSections: divideSections,
focusedCustomSegment: focusedCustomSegment, // incomplete implementation
focusedSegment: focusedSegment // works only for wikilinks right now
};
})();