forked from bbloomf/jgabc
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Using hypher.js for Polish syllabification
- Loading branch information
Benjamin Bloomfield
committed
Apr 18, 2016
1 parent
b8277a8
commit a27b987
Showing
9 changed files
with
348 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,216 @@ | ||
(function () { | ||
|
||
// Local stand-in for a CommonJS-style `module` object, so that the
// `module.exports = ...` assignment later in this closure has a target.
var module = {
    exports: null
};
/**
 * Hyphenation engine for a single language.
 *
 * Builds the pattern trie, stores the minimum fragment sizes and compiles the
 * optional comma-separated exception list (words with U+2027 marking the
 * allowed break points) into one capturing RegExp per word.
 *
 * @constructor
 * @param {!{patterns: !Object, leftmin: !number, rightmin: !number}} language The language pattern file. Compatible with Hyphenator.js.
 */
function Hypher(language) {
    var exceptions = [],
        i = 0,
        entry,
        escaped;

    /**
     * @type {!Hypher.TrieNode}
     */
    this.trie = this.createTrie(language['patterns']);

    /**
     * @type {!number}
     * @const
     */
    this.leftMin = language['leftmin'];

    /**
     * @type {!number}
     * @const
     */
    this.rightMin = language['rightmin'];

    /**
     * Maps a lower-cased exception word (break marks removed) to a RegExp
     * with one capture group per fragment of that word.
     *
     * @type {!Object.<string, !RegExp>}
     */
    this.exceptions = {};

    if (language['exceptions']) {
        exceptions = language['exceptions'].split(/,\s?/g);

        for (; i < exceptions.length; i += 1) {
            entry = exceptions[i];

            // A trailing or doubled comma yields an empty entry; skip it
            // rather than registering a meaningless '' exception.
            if (!entry) {
                continue;
            }

            // Escape regex metacharacters so that an exception word is always
            // matched literally and can never produce an invalid pattern.
            escaped = entry.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

            this.exceptions[entry.replace(/\u2027/g, '').toLowerCase()] =
                new RegExp('(' + escaped.split('\u2027').join(')(') + ')', 'i');
        }
    }
}

/**
 * @typedef {{_points: !Array.<number>}}
 */
Hypher.TrieNode;
|
||
/**
 * Creates a trie from a language pattern.
 *
 * Each key of `patternObject` is a pattern length and each value is a string
 * of patterns of that length concatenated together. Every pattern is split
 * into its non-digit characters (the trie path, keyed by UTF-16 code unit)
 * and its digits (the hyphenation weights stored in `_points` on the final
 * node of the path).
 *
 * @private
 * @param {!Object} patternObject An object with language patterns.
 * @return {!Hypher.TrieNode} An object trie.
 */
Hypher.prototype.createTrie = function (patternObject) {
    var size,
        i,
        c,
        p,
        chars,
        points,
        codePoint,
        node,
        patterns,
        tree = {
            _points: []
        };

    for (size in patternObject) {
        if (Object.prototype.hasOwnProperty.call(patternObject, size)) {
            // Cut the concatenated string into chunks of `size` characters.
            // `match` returns null for an empty string, so fall back to [].
            patterns = patternObject[size].match(new RegExp('.{1,' + (+size) + '}', 'g')) || [];

            for (i = 0; i < patterns.length; i += 1) {
                chars = patterns[i].replace(/[0-9]/g, '').split('');
                // Splitting on non-digits leaves one (possibly empty) slot
                // before, between and after the characters.
                points = patterns[i].split(/\D/);
                node = tree;

                for (c = 0; c < chars.length; c += 1) {
                    codePoint = chars[c].charCodeAt(0);

                    if (!node[codePoint]) {
                        node[codePoint] = {};
                    }
                    node = node[codePoint];
                }

                node._points = [];

                for (p = 0; p < points.length; p += 1) {
                    // Store real numbers (empty slot -> 0) so `_points`
                    // matches its declared Array.<number> type.
                    node._points[p] = points[p] ? parseInt(points[p], 10) : 0;
                }
            }
        }
    }
    return tree;
};
|
||
/**
 * Hyphenates a text.
 *
 * The text is split into alternating runs of non-word and word characters.
 * Word runs longer than `minLength` are hyphenated with soft hyphens
 * (U+00AD). A non-word run containing a slash gets a zero width space
 * (U+200B) appended so the line may break after it.
 *
 * @param {!string} str The text to hyphenate.
 * @param {number=} minLength Only words strictly longer than this are
 *     hyphenated. Defaults to 4 when omitted or not a valid number.
 * @return {!string} The same text with soft hyphens inserted in the right positions.
 */
Hypher.prototype.hyphenateText = function (str, minLength) {
    var words,
        i;

    // Honour an explicit 0; only fall back to the default when no usable
    // number was supplied.
    if (typeof minLength !== 'number' || isNaN(minLength)) {
        minLength = 4;
    }

    // JavaScript's \b only knows ASCII word characters, so the word
    // characters of the supported scripts are enumerated explicitly. The
    // capture group keeps the word runs in the result of split(), which
    // therefore alternates separator, word, separator, ...
    words = str.split(/([a-zA-Z0-9_\u0027\u00DF-\u00EA\u00EC-\u00EF\u00F1-\u00F6\u00F8-\u00FD\u0101\u0103\u0105\u0107\u0109\u010D\u010F\u0111\u0113\u0117\u0119\u011B\u011D\u011F\u0123\u0125\u012B\u012F\u0131\u0135\u0137\u013C\u013E\u0142\u0144\u0146\u0148\u0151\u0153\u0155\u0159\u015B\u015D\u015F\u0161\u0165\u016B\u016D\u016F\u0171\u0173\u017A\u017C\u017E\u017F\u0219\u021B\u02BC\u0390\u03AC-\u03CE\u03F2\u0401\u0410-\u044F\u0451\u0454\u0456\u0457\u045E\u0491\u0531-\u0556\u0561-\u0587\u0902\u0903\u0905-\u090B\u090E-\u0910\u0912\u0914-\u0928\u092A-\u0939\u093E-\u0943\u0946-\u0948\u094A-\u094D\u0982\u0983\u0985-\u098B\u098F\u0990\u0994-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BE-\u09C3\u09C7\u09C8\u09CB-\u09CD\u09D7\u0A02\u0A03\u0A05-\u0A0A\u0A0F\u0A10\u0A14-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A82\u0A83\u0A85-\u0A8B\u0A8F\u0A90\u0A94-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABE-\u0AC3\u0AC7\u0AC8\u0ACB-\u0ACD\u0B02\u0B03\u0B05-\u0B0B\u0B0F\u0B10\u0B14-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B3E-\u0B43\u0B47\u0B48\u0B4B-\u0B4D\u0B57\u0B82\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB5\u0BB7-\u0BB9\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C02\u0C03\u0C05-\u0C0B\u0C0E-\u0C10\u0C12\u0C14-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C3E-\u0C43\u0C46-\u0C48\u0C4A-\u0C4D\u0C82\u0C83\u0C85-\u0C8B\u0C8E-\u0C90\u0C92\u0C94-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBE-\u0CC3\u0CC6-\u0CC8\u0CCA-\u0CCD\u0D02\u0D03\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D28\u0D2A-\u0D39\u0D3E-\u0D43\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D60\u0D61\u0D7A-\u0D7F\u1F00-\u1F07\u1F10-\u1F15\u1F20-\u1F27\u1F30-\u1F37\u1F40-\u1F45\u1F50-\u1F57\u1F60-\u1F67\u1F70-\u1F7D\u1F80-\u1F87\u1F90-\u1F97\u1FA0-\u1FA7\u1FB2-\u1FB4\u1FB6\u1FB7\u1FBD\u1FBF\u1FC2-\u1FC4\u1FC6\u1FC7\u1FD2\u1FD3\u1FD6\u1FD7\u1FE2-\u1FE7\u1FF2-\u1FF4\u1FF6\u1FF7\u200D\u2019]+)/g);

    for (i = 0; i < words.length; i += 1) {
        if (words[i].indexOf('/') !== -1) {
            // Don't insert a zero width space if the slash is at the beginning or end
            // of the text, or right after or before a space.
            if (i !== 0 && i !== words.length - 1 && !(/\s+\/|\/\s+/.test(words[i]))) {
                words[i] += '\u200B';
            }
        } else if (words[i].length > minLength) {
            words[i] = this.hyphenate(words[i]).join('\u00AD');
        }
    }
    return words.join('');
};
|
||
/**
 * Hyphenates a word.
 *
 * Exception words are split exactly as registered. Words that already
 * contain a soft hyphen (U+00AD) are returned untouched. Otherwise the word
 * is wrapped in `_` boundary markers, every pattern in the trie that matches
 * at any offset contributes its weights (keeping the maximum per position),
 * and a break is allowed before each character whose weight is odd, subject
 * to `leftMin` / `rightMin`.
 *
 * @param {!string} word The word to hyphenate
 * @return {!Array.<!string>} An array of word fragments indicating valid hyphenation points.
 */
Hypher.prototype.hyphenate = function (word) {
    var characters,
        characterPoints = [],
        originalCharacters,
        i,
        j,
        k,
        node,
        points = [],
        wordLength,
        lowerCaseWord = word.toLowerCase(),
        nodePoints,
        nodePointsLength,
        exceptionMatch,
        trie = this.trie,
        result = [''];

    if (Object.prototype.hasOwnProperty.call(this.exceptions, lowerCaseWord)) {
        exceptionMatch = word.match(this.exceptions[lowerCaseWord]);
        // If the exception pattern unexpectedly fails to match, fall through
        // to pattern-based hyphenation instead of throwing on null.
        if (exceptionMatch) {
            return exceptionMatch.slice(1);
        }
    }

    if (word.indexOf('\u00AD') !== -1) {
        return [word];
    }

    // `_` marks the word boundaries in the patterns.
    originalCharacters = ('_' + word + '_').split('');
    characters = ('_' + lowerCaseWord + '_').split('');
    wordLength = originalCharacters.length;

    // Lower-casing can change the length of a string (e.g. U+0130 becomes
    // two code units). The two arrays must stay index-aligned, otherwise
    // fragments are cut at the wrong place and the trailing `_` leaks into
    // the result. Re-derive the lower-case form one code unit at a time.
    if (characters.length !== wordLength) {
        characters = [];
        for (i = 0; i < wordLength; i += 1) {
            characters[i] = originalCharacters[i].toLowerCase().charAt(0);
        }
    }

    for (i = 0; i < wordLength; i += 1) {
        points[i] = 0;
        characterPoints[i] = characters[i].charCodeAt(0);
    }

    // Walk the trie from every start offset, merging pattern weights.
    for (i = 0; i < wordLength; i += 1) {
        node = trie;
        for (j = i; j < wordLength; j += 1) {
            node = node[characterPoints[j]];

            if (!node) {
                break;
            }

            nodePoints = node._points;
            if (nodePoints) {
                for (k = 0, nodePointsLength = nodePoints.length; k < nodePointsLength; k += 1) {
                    points[i + k] = Math.max(points[i + k], nodePoints[k]);
                }
            }
        }
    }

    // Skip the two `_` markers (index 0 and wordLength - 1). An odd weight
    // at index i permits a break before character i.
    for (i = 1; i < wordLength - 1; i += 1) {
        if (i > this.leftMin && i < (wordLength - this.rightMin) && points[i] % 2) {
            result.push(originalCharacters[i]);
        } else {
            result[result.length - 1] += originalCharacters[i];
        }
    }

    return result;
};
|
||
// Publish the constructor on the local module shim and as a browser global.
module.exports = Hypher;
window['Hypher'] = module.exports;

// Registry of Hypher instances keyed by language name; the jQuery plugin
// looks languages up here.
window['Hypher']['languages'] = {};
}());(function ($) { | ||
/**
 * jQuery plugin: hyphenates the direct text-node children of every matched
 * element using the Hypher instance registered under `language` in
 * `window.Hypher.languages`.
 *
 * @param {string} language Key of the language in `window.Hypher.languages`.
 * @return {!jQuery} The jQuery set it was called on, so calls can be chained
 *     even when the language is not registered (in which case nothing is
 *     changed).
 */
$.fn.hyphenate = function (language) {
    var hypher = window['Hypher']['languages'][language];

    // Unknown language: leave the DOM untouched but keep the chain intact.
    if (!hypher) {
        return this;
    }

    return this.each(function () {
        var nodes = this.childNodes,
            len = nodes.length,
            i;

        for (i = 0; i < len; i += 1) {
            // nodeType 3 is a text node; element children are not descended into.
            if (nodes[i].nodeType === 3) {
                nodes[i].nodeValue = hypher.hyphenateText(nodes[i].nodeValue);
            }
        }
    });
};
}(jQuery)); |
Oops, something went wrong.