User:Verdy p/tablesort.js
Appearance
Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump. This code will be executed when previewing this page. |
Documentation for this user script can be added at User:Verdy p/tablesort. |
/* Please DON'T delete, this is only in MY own user space, and it is NOT a test. See rule: [[Wikipedia:CSD#G2]] */
/* <pre> */
/* =====================================================================
Support for sorting text cells of tables in a much more consistent way.
(c) 2010 Philippe Verdy : [[fr:Utilisateur:Verdy_p]] (http://fr.wikipedia.org/wiki/Utilisateur:Verdy_p).
Available under the following licences: {{LGPL}} {{GFDL}} {{CC-BY-SA}} {{CeCILL}}
(see details about these licence templates on http://commons.mediawiki.org)
This implements a 2-level collation, nearly compatible with UCA (see UTS#10 and CLDR for tailoring), but:
* It is still using a the binary ordering of characters used in subkeys at each level
(instead of their preferred collation weight).
* Ternary differences are all left in the last default level that uses the binary ordering of characters
(actually it uses the ordering of UTF-16 code units, instead of code point values of UTF-32).
This has the effect of sorting "a-a" before "aa", because they have the same primary and secundary
subkeys, and they differ only on the ternary level.
* It is incomplete in the decomposition mappings of graphemes.
* It is almost completely locale-neutral (not tailored for any different locales), except that it contains
a very limited support for French reverse ordering of the ternary level.
But despite these defects, it gives much better results than the default binary order of UTF-16 code units
(the only option natively implemented in Javascript which makes it a level-0 collation), or than a
simple 1-level collation with case mapping only (currently used in wikibits.js).
A complete UCA implementation, using true weights from the Unicode DUCET and possible per-locale
tailoring would require much more work (but would produce more compact sort keys). This implementation
is very fast and its code is simple enough and small, that it can be a full replacement of the current
ts_toLowerCase() function used to generate sort keys for text cells.
It does not impact the existing non-text (number, currency, date) ordering types of Mediawiki tables.
======================================================================= */
/* Utility */
function ts_toOtherCase(s) {
var c, d;
s = s.split('');
for (var i = s.length; --i >= 0;) s[i] = (c = s[i]) == (d = c.toLowerCase()) ? c.toUpperCase() : d;
return s.join('');
}
/* ===================================================================== */
ts_textToKeyReversed = mw.config.get("wgContentLanguage") == 'fr';
/* Grapheme_Cluster_Break property of Unicode:
(see UAX #29 3.1 Default Grapheme Cluster Boundary Specification)
This includes Grapheme_Extend = true, i.e.:
* General_Category = Nonspacing_Mark
* General_Category = Enclosing_Mark
* U+200C ZERO WIDTH NON-JOINER (ZWNJ) // TODO: can't be unsupported here
* U+200D ZERO WIDTH JOINER (ZWJ) // TODO: can't be unsupported here
* plus a few Spacing Marks needed for canonical equivalence.
as well as the following prepended characters (that don't follow the logical order):
* U+0E30 ( ะ ) THAI CHARACTER SARA A
* U+0E32 ( า ) THAI CHARACTER SARA AA
* U+0E33 ( ำ ) THAI CHARACTER SARA AM
* U+0E45 ( ๅ ) THAI CHARACTER LAKKHANGYAO
* U+0EB0 ( ະ ) LAO VOWEL SIGN A
* U+0EB2 ( າ ) LAO VOWEL SIGN AA
* U+0EB3 ( ຳ ) LAO VOWEL SIGN AM
FIXME: need support for graphemes longer than 2 characters (notably with ZWJ/ZWNJ).
FIXME: check Unicode 5/6 completeness.
*/
ts_textToKeyGraphemeExtends =
'\u0300-\u036F' +
'\u0483-\u0489' +
'\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7' +
'\u0610-\u061A\u064B-\u065E\u0670\u06D6-\u06DC\u06DE-\u06E4\u06E7\u06E8\u06EA-\u06ED' +
'\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3' +
'\u0901-\u0903\u093C\u093E-\u094D\u0951-\u0954\u0962\u0963' +
'\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3' +
'\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75' +
'\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3' +
'\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B56\u0B57\u0B62\u0B63' +
'\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7' +
'\u0C01-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63' +
'\u0C82\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3' +
'\u0D02\u0D03\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63' +
'\u0D82\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3' +
'\u0E30-\u0E3A\u0E45\u0E47-\u0E4E' + /* including '\u0E30\u0E32\u0E33\u0E45' (see notice above) */
'\u0EB0-\u0EB9\u0EBB\u0EBC\u0EC8-\u0ECD' + /* including '\u0EB0\u0EB2\u0EB3' (see notice above) */
'\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F90-\u0F97\u0F99-\u0FBC\u0FC6' +
'\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F' +
'\u135F' +
'\u1712-\u1714\u1732-\u1734\u1752\u1753\u1772\u1773\u17B6-\u17D3\u17DD' +
'\u180B-\u180D\u18A9' +
'\u1920-\u192B\u1930-\u193B\u19B0-\u19C0\u19C8\u19C9' +
'\u1A17-\u1A1B' +
'\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAA' +
'\u1C24-\u1C37' +
'\u1DC0-\u1DE6\u1DFE\u1DFF' +
'\u20D0-\u20F0' +
'\u2DE0-\u2DFF' +
'\u302A-\u302F\u3099\u309A' +
'\uA66F-\uA672\uA67C\uA67D' +
'\uA802\uA806\uA80B\uA823-\uA827\uA880\uA881\uA8B4-\uA8C4' +
'\uA926-\uA92D\uA947-\uA953' +
'\uAA29-\uAA36\uAA43\uAA4C\uAA4D' +
'\uFB1E' +
'\uFE00-\uFE0F\uFE20-\uFE26';
/*
The "Prepend" characters (not obeying the logical structure) should be handled.
(see UAX #29 3.1 Default Grapheme Cluster Boundary Specification).
*/
ts_textToKeyGraphemePrepends =
'\u0E40-\u0E44\u0EC0-\u0EC4';
/*
FIXME: check the missing currency symbols or non-separating punctuations to ignore.
TODO: add the missing format controls to ignore.
*/
ts_textToKeyDropped = new RegExp(
'[-\'\x7F-\xBF\xAD' +
'⁃·´`‚‘’~^†‡¶§©®$£€¢₩฿₨₧₪₣₡₢' +
ts_textToKeyGraphemeExtends +
']+', 'g' );
/*
FIXME: add the missing whitespaces and word-separating punctuation signs.
*/
ts_textToKeyBlanked = new RegExp(
'[\\s\\"\\[\\]\\\\ \xA0' +
'{}()|+*/±×÷%<=>°„“”«».,;:¡!‼¿?…•_–—#&' +
'ƧƨƻƼƽ' + /* tone letters ignored for sort */
'ʔʡʕʢʖƾ' + /* glottal letters */
'ǀǁǂǃʘ' + /* click letters */
']+', 'g' );
/*
Note: For performance reason in browsers, the values in this map MUST be precompiled
regexps, ready for use as parameters of "string.replace(regexp, subst)" in loops.
TODO: add more mappings for more languages...
This mapping is nearly complete for modern languages written in Latin.
Should add mappings for precomposed characters in Greek, Cyrillic...
Hangul precomposed syllables should be decomposed algorithmically into jamos.
*/
ts_textToKeyRemapped = {
'0': /[⁰₀]/g, '1': /[¹₁]/g, '2': /[²₂]/g, '3': /[³₃]/g, '4': /[⁴₄]/g, '5': /[⁵₅]/g, '6': /[⁶₆]/g, '7': /[⁷₇]/g, '8': /[⁸₈]/g, '9': /[⁹₉]/g,
'a': /[áàãâấầẫẩäǟȁảǎăắằẵẳȃāǡåǻẚạậặḁąɐɑɒª@]/g, 'A': /[ÁÀÃÂẤẦẪẨÄǞȀẢǍĂẮẰẴẲȂĀǠÅǺẠẬẶḀĄ]/g,
'ae': /[æǽǣ]/g, 'AE': /[ÆǼǢ]/g,
'b': /[ḃḅḇƀƃƅɓʙ]/g, 'B': /[ḂḄḆƁƂƄ]/g,
'c': /[ćĉčċçḉƈɕʗɔ]/g, 'C': /[ĆĈČĊÇḈƇƆ]/g,
'd': /[ďḋḍḑḏḓƌɗɖ]/g, 'D': /[ĎḊḌḐḎḒƉƊƋ]/g,
'dh': /[ðđ]/g, 'DH': /[ÐĐ]/g,
'dz': /[ʣdzdžʤʥ]/g, 'Dz': /[DzDž]/g, 'DZ': /[DZDŽ]/g,
'e': /[éèẽêếềễểëȅẻěĕȇḗḕḛḙḝėẹệęɘəɚɛʚɜɝɞ]/g, 'E': /[ÉÈẼÊẾỀỄỂËȄẺĚĔȆḖḔĖẸỆḘḚḜĘƏƎƐ]/g,
'f': /[ḟƒ]/g, 'F': /[ḞƑ]/g,
'fi': /[fi]/g,
'fl': /[fl]/g,
'g': /[ǵĝǧğḡġģǥɠɡɢʛɣɤ/g, 'G': /[ǴĜǦĞḠĠĢǤƓƔ]/g,
'gh': /[ƣ]/g, 'GH': /[Ƣ]/g,
'h': /[ĥḧḣḫḥẖḩħɥɦɧʜ]/g, 'H': /[ĤḦḢḪḤḨĦ]/g,
'hv': /[ƕ]/g,
'i': /[íìĩîïḯǐĭȋīȉỉıḭịįɨɩɪ]/g, 'I': /[ÍÌĨÎÏḮǏĬȊĪȈỈİḬỊĮƗƖ]/g,
'ij': /[ij]/g, 'IJ': /[IJ]/g,
'j': /[ĵǰɟʝ]/g, 'J': /[Ĵ]/g,
'k': /[ḱǩḳķḵƙĸʞ]/g, 'K': /[ḰǨḲĶḴƘ]/g,
'l': /[ĺḻḷḹḽļľŀłƚɫɬɭʟƛ]/g, 'L': /[ĽḺḶḸḼĻĿŁ]/g,
'lj': /[lj]/g, 'Lj': /[Lj]/g, 'LJ': /[LJ]/g,
'lz': /[ɮ]/g,
'm': /[ḿṁṃɯɰɱ]/g, 'M': /[ḾṀṂƜ]/g,
'n': /[ńñňṅṇņṉṋʼnɲŋɳƞɴ]/g, 'N': /[ŃÑŇṄṆŅṈṊƝŊ]/g,
'nj': /[nj]/g, 'Nj': /[Nj]/g, 'NJ': /[NJ]/g,
'o': /[óòõṍṏôốồỗổǒöǒŏȏōṓṑőȍỏọộǫǭơớờỡởợøǿɵɷº]/g, 'O': /[ÓÒÕṌṎÔỐỒỖỔÖǑŎȎŌṒṐŐȌỎỌỘǪǬƠỚỜỠỞỢØǾƟ]/g,
'oe': /[œɶ]/g, 'OE': /[Œ]/g,
'p': /[ṕṗƥ]/g, 'P': /[ṔṖƤ]/g,
'ph': /[ɸ]/g,
'q': /[ʠ]/g,
'r': /[ŕřȓṙȑŗṛṝṟɼɽɾɹɺɻɿʅʀʁ]/g, 'R': /[ŔŘȒṘȐŖṚṜṞƦ]/g,
's': /[śṥŝšṧṡṣṩşʂſẛʃʄʆƪ]/g, 'S': /[ŚṤŜŠṦṠṢṨŞƩ]/g,
'ss': /[ß]/g,
't': /[ẗťṫṭṯṱţƫƭŧʈʇ]/g, 'T': /[ŤṪṬṮṰŢƮƬŦ]/g,
'tc': /[ʨ]/g,
'ts': /[ʦʧ]/g,
'th': /[þ]/g, 'TH': /[Þ]/g,
'u': /[úùũṹûǔüǘǜǖǚŭȗūṻůűȕủṵṳụṷųưứừữửựʉʊ]/g, 'U': /[ÚÙŨṸÛǓÜǗǛǕǙŬȖŪṺŮŰȔỦṴṲỤṶŲƯỨỪỮỬỰƱ]/g,
'v': /[ṽṿʋʌ]/g, 'V': /[ṼṾƲ]/g,
'w': /[ẃẁŵẅẇẘẉʍƿ]/g, 'W': /[ẂẀŴẄẆẈ]/g,
'x': /[ẍẋ]/g, 'X': /[ẌẊ]/g,
'y': /[ýỳŷỹÿẏỷẙỵƴʎʏ]/g, 'Y': /[ÝỲŶỸŸẎỶỴƳ]/g,
'z': /[źẑžżẓẕƶʐʑʒǯƺʓƹ]/g, 'Z': /[ŹẐŽŻẒẔƵƷǮƸ]/g,
};
/*
Utility for reversing the ternary key containing differences such as accents (mostly for French).
Note that with a full UCA implementation, this should be the secondary level, before case differences.
However we currently don't convert letters to sortable collation weights, so this would not work as
expected, and letters with all accent would sort after all letters without accents.
So this implementation swaps levels 2 and 3, to make sure that the same base letters with the same
case will be sorted together. Difference of accents are left in the final level, binary-ordered here
according to their UTF-16 code units, along with other ignorable differences.
Ideally, this should correctly use the Grapheme Cluster Boundary Rules of UAX#29, i.e. all legacy grapheme clusters
plus rule GB9b of extended grapheme clusters (but probably not rule GB9a here for string reversal,
because spacing marks such as '^' or '~' should be treated isolately, just like apostrophes; note that
these spacing marks are ignored here in the primary and secondary subkeys).
BUG: cannot handle string reversal so that it will detect and not break all extended grapheme clusters
that are longer than 2 characters (especially Hangul L+V+T* clusters), or longer than 1 character in
in supplementary planes. If Hangul is handled later, these clusters should be preserved
As this is just used here for French, this bug should not cause any practical
problem for sorting actual tables.
Don't use this function for meaningful display ! A true grapheme cluster breaker, needed for rendering
and text layout, would have to handle all these cases (including rule GB9a or extended grapheme
clusters)
A more precise detection of grapheme clusters for collation would require a huge and complex map of
collation weights (at least the DUCET).
*/
ts_textToKeyReverseGraphemeExtends = new RegExp(
'(.)([' + '\uDC00-\uDFFF' + /* include trailing surrogates */
ts_textToKeyGraphemeExtends + '])', 'g' );
ts_textToKeyReverseGraphemeExtends = new RegExp(
'([' + ts_textToKeyGraphemePrepends + '])(.)', 'g' );
function ts_textToKeyReverse(s) {
return s
.replace(ts_textToKeyReverseGraphemeExtends, '$2$1')
.replace(ts_textToKeyReverseGraphemePrepends, '$2$1')
.split('').reverse().join('');
}
function ts_textToKeyFilter(s) {
var t = s.replace(ts_textToKeyDropped, '').replace(ts_textToKeyBlanked, ' ').replace(/^ /, '').replace(/ $/, '');
for (var k in ts_textToKeyRemapped) t = t.replace(ts_textToKeyRemapped[k], k);
/* TODO: Hangul precomposed syllables should be decomposed algorithmically into jamos. */
return t;
}
/*
Main function.
*/
function ts_textToKey(s) {
var f = ts_toOtherCase(ts_textToKeyFilter(s)), k = f.toLowerCase(), x = '\t';
if (ts_textToKeyReversed) s = ts_textToKeyReverse(s); /* mainly for French */
return f != s ? k + x + f + x + s : k != f ? k + x + f : k;
}
/*
Override the existing "wikibits.js" for table sorts, when processing text cells into sort keys.
*/
ts_toLowerCase = ts_textToKey;
/* ===================================================================== */
/* </pre> */