You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
309 lines
8.1 KiB
309 lines
8.1 KiB
/* eslint no-bitwise: "off", max-statements: "off", max-lines: "off" */ |
|
|
|
// Taken from: https://github.com/walling/unorm/blob/master/lib/unorm.js |
|
|
|
/* |
|
* UnicodeNormalizer 1.0.0 |
|
* Copyright (c) 2008 Matsuza |
|
* Dual licensed under the MIT (MIT-LICENSE.txt) and |
|
* GPL (GPL-LICENSE.txt) licenses. |
|
* $Date: 2008-06-05 16:44:17 +0200 (Thu, 05 Jun 2008) $ |
|
* $Rev: 13309 $ |
|
*/ |
|
|
|
"use strict"; |
|
|
|
var primitiveSet = require("../../../object/primitive-set") |
|
, validValue = require("../../../object/valid-value") |
|
, data = require("./_data"); |
|
|
|
// Local alias for Math.floor.
var floor = Math.floor;

// Set of the normalization form names accepted by the exported function.
var forms = primitiveSet("NFC", "NFD", "NFKC", "NFKD");

// Feature triple given to characters that have no entry of their own:
// index 0 is the decomposition, index 1 the flags / combining class,
// index 2 the composition map (see the UChar.prototype getters).
var DEFAULT_FEATURE = [null, 0, {}];

// A looked-up character is cached only once the hit counter of its
// 256-code-point block exceeds this value (see fromCache).
var CACHE_THRESHOLD = 10;

// Constants used by the rule-based Hangul strategy (fromRuleBasedJamo).
var SBase = 0xac00;
var LBase = 0x1100;
var VBase = 0x1161;
var TBase = 0x11a7;
var LCount = 19;
var VCount = 21;
var TCount = 28;
var NCount = VCount * TCount;
var SCount = LCount * NCount;

// Cache of resolved characters keyed by code point, and per-block hit counters.
var cache = {};
var cacheCounter = [];

// Assigned further down in this module.
var UChar;
var fromCache;
var fromData;
var fromCpOnly;
var fromRuleBasedJamo;
var fromCpFilter;
var strategies;
var UCharIterator;
var RecursDecompIterator;
var DecompIterator;
var CompIterator;
var createIterator;
var normalize;
|
|
|
/**
 * Pairs a code point with its normalization feature data.
 *
 * @constructor
 * @param {number} cp - the code point
 * @param {Array|null} feature - feature triple, or a falsy value when the
 *   feature has not been resolved yet (prepFeature fills it in on demand)
 */
UChar = function (cp, feature) {
  this.codepoint = cp;
  this.feature = feature;
};
|
|
|
// Strategies |
|
// Zero the 256 per-block hit counters that fromCache increments.
(function () {
  var block = 0;
  while (block <= 0xff) {
    cacheCounter[block] = 0;
    ++block;
  }
})();
|
|
|
/**
 * Strategy that serves a character from `cache` when present, otherwise
 * delegates to the next strategy. A delegated result that carries a feature
 * bumps the hit counter of its 256-code-point block, and is stored in the
 * cache once that counter exceeds CACHE_THRESHOLD.
 *
 * @param {Function} nextStep - following strategy, called as nextStep(cp, needFeature)
 * @param {number} cp - code point to resolve
 * @param {boolean} needFeature - forwarded to nextStep
 * @returns {UChar} the cached or freshly resolved character
 */
fromCache = function (nextStep, cp, needFeature) {
  var cached = cache[cp], resolved, block;
  if (cached) return cached;

  resolved = nextStep(cp, needFeature);
  // Feature-less results are neither counted nor cached.
  if (!resolved.feature) return resolved;

  block = (cp >> 8) & 0xff;
  cacheCounter[block] += 1;
  if (cacheCounter[block] > CACHE_THRESHOLD) cache[cp] = resolved;
  return resolved;
};
|
|
|
/**
 * Terminal strategy: reads the feature for `cp` from UChar.udata, which is
 * keyed first by `cp & 0xff00` and then by the code point itself. Falls back
 * to DEFAULT_FEATURE when there is no entry.
 *
 * @param {Function} next - unused; present so every strategy has the same shape
 * @param {number} cp - code point to resolve
 * @returns {UChar}
 */
fromData = function (next, cp) {
  var bucket = UChar.udata[cp & 0xff00];
  var entry = bucket ? bucket[cp] : undefined;
  return new UChar(cp, entry || DEFAULT_FEATURE);
};
|
/**
 * Strategy that avoids any feature lookup unless one was requested: without
 * `needFeature` it returns a UChar whose feature is null.
 *
 * @param {Function} next - following strategy, called as next(cp, needFeature)
 * @param {number} cp - code point to resolve
 * @param {boolean} needFeature - when truthy, delegate to `next`
 * @returns {UChar}
 */
fromCpOnly = function (next, cp, needFeature) {
  if (needFeature) return next(cp, needFeature);
  return new UChar(cp, null);
};
|
|
|
/**
 * Strategy that computes Hangul features arithmetically instead of reading
 * them from the data tables.
 *
 * - Leading consonants (LBase .. LBase + LCount - 1) get a composition map
 *   from every vowel to the resulting LV syllable.
 * - Precomposed syllables (SBase .. SBase + SCount - 1) get a two-element
 *   decomposition; LV syllables (no trailing consonant) additionally get a
 *   composition map from every trailing consonant to the LVT syllable.
 * - Every other code point is delegated to `next`.
 *
 * @param {Function} next - following strategy, called as next(cp, needFeature)
 * @param {number} cp - code point to resolve
 * @param {boolean} needFeature - forwarded to `next`
 * @returns {UChar} a UChar built here, or whatever `next` returns
 */
fromRuleBasedJamo = function (next, cp, needFeature) {
  var isLeadingJamo = LBase <= cp && cp < LBase + LCount;
  // The upper bound is exclusive: SBase + SCount is the first code point
  // AFTER the syllable block and must not be decomposed as a syllable.
  var isSyllable = SBase <= cp && cp < SBase + SCount;
  var feature, base, i, SIndex, TIndex, j;

  if (!isLeadingJamo && !isSyllable) return next(cp, needFeature);

  // Indices 0 and 1 are left unset unless assigned below.
  feature = [];

  if (isLeadingJamo) {
    base = (cp - LBase) * VCount;
    feature[2] = {};
    for (i = 0; i < VCount; ++i) {
      feature[2][VBase + i] = SBase + TCount * (i + base);
    }
    return new UChar(cp, feature);
  }

  SIndex = cp - SBase;
  TIndex = SIndex % TCount;
  if (TIndex === 0) {
    // LV syllable: decomposes to L + V and may still take a trailing consonant.
    feature[0] = [LBase + floor(SIndex / NCount), VBase + floor((SIndex % NCount) / TCount)];
    feature[2] = {};
    for (j = 1; j < TCount; ++j) {
      feature[2][TBase + j] = cp + j;
    }
  } else {
    // LVT syllable: decomposes to its LV syllable + T.
    feature[0] = [SBase + SIndex - TIndex, TBase + TIndex];
  }
  return new UChar(cp, feature);
};
|
|
|
/**
 * First strategy in the chain: code points below 60, and those strictly
 * between 13311 and 42607, are answered immediately with DEFAULT_FEATURE
 * without consulting any later strategy.
 * NOTE(review): presumably these ranges hold no normalization data in the
 * tables - confirm against ./_data before changing the bounds.
 *
 * @param {Function} next - following strategy, called as next(cp, needFeature)
 * @param {number} cp - code point to resolve
 * @param {boolean} needFeature - forwarded to `next`
 * @returns {UChar}
 */
fromCpFilter = function (next, cp, needFeature) {
  var skipsLookup = cp < 60 || (cp > 13311 && cp < 42607);
  if (skipsLookup) return new UChar(cp, DEFAULT_FEATURE);
  return next(cp, needFeature);
};
|
|
|
// Lookup strategies in the order they are consulted.
strategies = [fromCpFilter, fromCache, fromCpOnly, fromRuleBasedJamo, fromData];

/**
 * Resolves a code point to a UChar by running it through the strategy chain.
 * The chain is assembled from the last strategy to the first, so each
 * strategy receives the one after it as `next`; the last one receives null.
 *
 * @param {number} cp - code point to resolve
 * @param {boolean} [needFeature] - passed through to every strategy
 * @returns {UChar}
 */
UChar.fromCharCode = (function () {
  var chain = null, index;
  var link = function (strategy, next) {
    return function (cp, needFeature) { return strategy(next, cp, needFeature); };
  };
  for (index = strategies.length - 1; index >= 0; --index) {
    chain = link(strategies[index], chain);
  }
  return chain;
})();
|
|
|
/**
 * Tells whether a UTF-16 code unit lies in the range 0xD800..0xDBFF.
 *
 * @param {number} cp - code unit to test
 * @returns {boolean}
 */
UChar.isHighSurrogate = function (cp) {
  return 0xd800 <= cp && cp <= 0xdbff;
};
|
/**
 * Tells whether a UTF-16 code unit lies in the range 0xDC00..0xDFFF.
 *
 * @param {number} cp - code unit to test
 * @returns {boolean}
 */
UChar.isLowSurrogate = function (cp) {
  return 0xdc00 <= cp && cp <= 0xdfff;
};
|
|
|
/**
 * Makes sure `this.feature` is populated. When it is still falsy, the code
 * point is resolved again with needFeature = true and that feature is kept.
 */
UChar.prototype.prepFeature = function () {
  if (this.feature) return;
  this.feature = UChar.fromCharCode(this.codepoint, true).feature;
};
|
|
|
/**
 * Converts the code point back to a string. Code points below 0x10000 become
 * a single UTF-16 unit; anything else is encoded as a surrogate pair.
 *
 * @returns {string}
 */
UChar.prototype.toString = function () {
  var cp = this.codepoint, offset, high, low;
  if (cp < 0x10000) return String.fromCharCode(cp);

  offset = cp - 0x10000;
  high = floor(offset / 0x400) + 0xd800;
  low = (offset % 0x400) + 0xdc00;
  return String.fromCharCode(high, low);
};
|
|
|
/**
 * Returns the decomposition stored at feature index 0.
 *
 * @returns {Array|null} the decomposition, or null when that slot is falsy
 */
UChar.prototype.getDecomp = function () {
  var decomposition;
  this.prepFeature();
  decomposition = this.feature[0];
  return decomposition ? decomposition : null;
};
|
|
|
/**
 * Tells whether bit 8 of the flags word (feature index 1) is set. The only
 * visible caller uses it to skip a character's decomposition when canonical
 * decomposition was requested.
 *
 * Always returns a real boolean; a missing flags word counts as "not set".
 *
 * @returns {boolean}
 */
UChar.prototype.isCompatibility = function () {
  this.prepFeature();
  // A missing flags word coerces to 0 under `&`, so the test yields false.
  return (this.feature[1] & (1 << 8)) !== 0;
};
|
/**
 * Tells whether bit 9 of the flags word (feature index 1) is set.
 * NOTE(review): presumably this bit marks composition exclusions - confirm
 * against the data tables; nothing in this file calls the method.
 *
 * Always returns a real boolean; a missing flags word counts as "not set".
 *
 * @returns {boolean}
 */
UChar.prototype.isExclude = function () {
  this.prepFeature();
  // A missing flags word coerces to 0 under `&`, so the test yields false.
  return (this.feature[1] & (1 << 9)) !== 0;
};
|
/**
 * Returns the low 8 bits of the flags word (feature index 1), which hold the
 * canonical combining class; 0 when the flags word is missing or zero.
 *
 * @returns {number}
 */
UChar.prototype.getCanonicalClass = function () {
  var flags;
  this.prepFeature();
  flags = this.feature[1];
  if (!flags) return 0;
  return flags & 0xff;
};
|
/**
 * Looks up the character that this one forms when followed by `following`,
 * using the composition map at feature index 2 (keyed by code point).
 *
 * @param {UChar} following - the character that comes next
 * @returns {UChar|null} the composed character, or null when there is no
 *   map or no entry for `following`
 */
UChar.prototype.getComposite = function (following) {
  var pairs, composed;
  this.prepFeature();
  pairs = this.feature[2];
  if (!pairs) return null;

  composed = pairs[following.codepoint];
  if (!composed) return null;
  return UChar.fromCharCode(composed);
};
|
|
|
/**
 * Iterator that walks a string code point by code point.
 *
 * @constructor
 * @param {string} str - the string to iterate
 */
UCharIterator = function (str) {
  this.str = str;
  // Index of the next UTF-16 code unit to read.
  this.cursor = 0;
};
|
/**
 * Reads the next code point. A high surrogate immediately followed by a low
 * surrogate is combined into one supplementary code point; a surrogate
 * without its partner is returned as-is.
 *
 * @returns {UChar|null} the next character, or null once the string is
 *   exhausted (the string reference is dropped at that point)
 */
UCharIterator.prototype.next = function () {
  var source = this.str, lead, trail;
  if (!source || !(this.cursor < source.length)) {
    this.str = null;
    return null;
  }

  lead = source.charCodeAt(this.cursor);
  ++this.cursor;
  if (!UChar.isHighSurrogate(lead) || !(this.cursor < source.length)) {
    return UChar.fromCharCode(lead);
  }

  trail = source.charCodeAt(this.cursor);
  if (!UChar.isLowSurrogate(trail)) return UChar.fromCharCode(lead);

  // Valid pair: consume the low surrogate as well.
  ++this.cursor;
  return UChar.fromCharCode((lead - 0xd800) * 0x400 + (trail - 0xdc00) + 0x10000);
};
|
|
|
/**
 * Iterator that fully decomposes every character supplied by `it`.
 *
 * @constructor
 * @param {Object} it - upstream iterator; its next() yields a UChar or a falsy value
 * @param {boolean} cano - when truthy, characters flagged as compatibility
 *   are left undecomposed
 */
RecursDecompIterator = function (it, cano) {
  this.it = it;
  this.canonical = cano;
  // Decomposed characters waiting to be handed out.
  this.resBuf = [];
};
|
|
|
/**
 * Returns the next fully decomposed character. When the buffer is empty, one
 * character is pulled from upstream and expanded recursively into the buffer.
 *
 * @returns {UChar|null} null when the upstream iterator is exhausted
 */
RecursDecompIterator.prototype.next = function () {
  var canonicalOnly = this.canonical, source;

  // Expands one character into the list of characters it ultimately
  // decomposes to; a character with no applicable decomposition maps to itself.
  var expand = function (uchar) {
    var mapping = uchar.getDecomp(), pieces;
    if (!mapping || (canonicalOnly && uchar.isCompatibility())) return [uchar];

    pieces = [];
    mapping.forEach(function (cp) {
      // Array#concat returns a new array instead of extending `pieces`,
      // so the expansion is appended in place.
      pieces.push.apply(pieces, expand(UChar.fromCharCode(cp)));
    });
    return pieces;
  };

  if (this.resBuf.length === 0) {
    source = this.it.next();
    if (!source) return null;
    this.resBuf = expand(source);
  }
  return this.resBuf.shift();
};
|
|
|
/**
 * Iterator that reorders the characters supplied by `it` by canonical
 * combining class (see DecompIterator.prototype.next).
 *
 * @constructor
 * @param {Object} it - upstream iterator; its next() yields a UChar or a falsy value
 */
DecompIterator = function (it) {
  this.it = it;
  // Reordered characters waiting to be handed out.
  this.resBuf = [];
};
|
|
|
/**
 * Returns the next character in reordered sequence.
 *
 * When the buffer is empty, upstream characters are read up to and including
 * the next one whose combining class is 0 (or until input ends). Each
 * character with a non-zero class is inserted before any buffered characters
 * of strictly greater class, so equal classes keep their input order.
 *
 * @returns {UChar|undefined} undefined once both buffer and upstream are empty
 */
DecompIterator.prototype.next = function () {
  var pending = this.resBuf, incoming, incomingClass, slot;
  if (pending.length > 0) return pending.shift();

  while ((incoming = this.it.next())) {
    incomingClass = incoming.getCanonicalClass();
    if (incomingClass === 0) {
      // A class-0 character ends the run and always goes last.
      pending.push(incoming);
      break;
    }
    slot = pending.length;
    while (slot > 0 && pending[slot - 1].getCanonicalClass() > incomingClass) {
      --slot;
    }
    pending.splice(slot, 0, incoming);
  }
  return pending.shift();
};
|
|
|
/**
 * Iterator that recomposes the characters supplied by `it`.
 *
 * @constructor
 * @param {Object} it - upstream iterator; its next() yields a UChar or a falsy value
 */
CompIterator = function (it) {
  this.it = it;
  // Characters of the run currently being composed; index 0 is its first character.
  this.procBuf = [];
  // Finished characters waiting to be handed out.
  this.resBuf = [];
  // Combining class of the character most recently pushed onto procBuf.
  this.lastClass = null;
};
|
|
|
/**
 * Returns the next composed character.
 *
 * Upstream characters are collected in procBuf. Each new character is offered
 * to the first buffered character: when a composite exists and the previously
 * kept character had class 0 or a lower class than the new one, the composite
 * replaces the first character. Otherwise the new character is kept, and if
 * its class is 0 the characters buffered so far are released first.
 *
 * @returns {UChar|undefined} undefined once everything has been handed out
 */
CompIterator.prototype.next = function () {
  var self = this, incoming, merged, incomingClass;

  // Moves the run collected so far to the output buffer.
  var flush = function () {
    self.resBuf = self.procBuf;
    self.procBuf = [];
  };

  while (this.resBuf.length === 0) {
    incoming = this.it.next();
    if (!incoming) {
      flush();
      break;
    }

    if (this.procBuf.length === 0) {
      this.lastClass = incoming.getCanonicalClass();
      this.procBuf.push(incoming);
      continue;
    }

    merged = this.procBuf[0].getComposite(incoming);
    incomingClass = incoming.getCanonicalClass();
    if (merged && (this.lastClass === 0 || this.lastClass < incomingClass)) {
      this.procBuf[0] = merged;
      continue;
    }

    if (incomingClass === 0) flush();
    this.lastClass = incomingClass;
    this.procBuf.push(incoming);
  }
  return this.resBuf.shift();
};
|
|
|
/**
 * Builds the iterator pipeline for a normalization form. Every form starts
 * with UCharIterator -> RecursDecompIterator -> DecompIterator; "NFD" and
 * "NFC" request canonical decomposition only, and "NFC" / "NFKC" add a
 * CompIterator on top.
 *
 * @param {string} mode - "NFD", "NFKD", "NFC" or "NFKC"
 * @param {string} str - the string to normalize
 * @returns {Object} the outermost iterator of the pipeline
 * @throws {Error} when `mode` is none of the four forms
 */
createIterator = function (mode, str) {
  var canonicalOnly, decomposed;
  if (mode === "NFD" || mode === "NFC") canonicalOnly = true;
  else if (mode === "NFKD" || mode === "NFKC") canonicalOnly = false;
  else throw new Error(mode + " is invalid");

  decomposed = new DecompIterator(
    new RecursDecompIterator(new UCharIterator(str), canonicalOnly)
  );
  if (mode === "NFC" || mode === "NFKC") return new CompIterator(decomposed);
  return decomposed;
};
|
/**
 * Normalizes `str` to the given form by draining the iterator pipeline and
 * joining the string form of every character it yields.
 *
 * @param {string} mode - normalization form, passed to createIterator
 * @param {string} str - the string to normalize
 * @returns {string}
 */
normalize = function (mode, str) {
  var source = createIterator(mode, str), parts = [], current;
  for (current = source.next(); current; current = source.next()) {
    parts.push(current.toString());
  }
  return parts.join("");
};
|
|
|
/* Unicode data: the tables loaded from ./_data, read by fromData through UChar.udata */
UChar.udata = data;
|
|
|
module.exports = function (/* Form*/) { |
|
var str = String(validValue(this)), form = arguments[0]; |
|
if (form === undefined) form = "NFC"; |
|
else form = String(form); |
|
if (!forms[form]) throw new RangeError("Invalid normalization form: " + form); |
|
return normalize(form, str); |
|
};
|
|
|