Add ASCII fold snippet
This commit is contained in:
160
javascript/ascii-fold/ascii-fold.js
Normal file
160
javascript/ascii-fold/ascii-fold.js
Normal file
@ -0,0 +1,160 @@
|
||||
/**
 * @typedef {Object} AsciiFoldOptions
 * @property {"plain"|"paren"} [marksStyle="plain"] How to render © ® ™ ℠ (plain: "c", "r", "tm", "sm"; paren: "(c)", "(r)", "(tm)", "(sm)")
 * @property {boolean} [asciiOnly=true] If true, removes any remaining non-ASCII chars after mapping
 * @property {string} [unknown="?"] Replacement for still-unknown non-ASCII chars when asciiOnly=false
 */
|
||||
|
||||
const ligaturesMap = new Map([
|
||||
// Common typographic ligatures
|
||||
["\uFB00", "ff"], ["\uFB01", "fi"], ["\uFB02", "fl"],
|
||||
["\uFB03", "ffi"], ["\uFB04", "ffl"],
|
||||
["\uFB05", "st"], ["\uFB06", "st"],
|
||||
// Latin letters that don't decompose via NFKD the way we want
|
||||
["Æ", "AE"], ["æ", "ae"],
|
||||
["Œ", "OE"], ["œ", "oe"],
|
||||
["ß", "ss"],
|
||||
["Þ", "Th"], ["þ", "th"],
|
||||
["Ð", "D"], ["ð", "d"],
|
||||
["Ł", "L"], ["ł", "l"],
|
||||
["Ø", "O"], ["ø", "o"],
|
||||
["Đ", "D"], ["đ", "d"]
|
||||
]);
|
||||
|
||||
/** Symbols, punctuation, and marks with sensible ASCII fallbacks */
|
||||
function makeSymbolsMap(marksStyle = "plain") {
|
||||
const tm = marksStyle === "paren" ? "(tm)" : "tm";
|
||||
const sm = marksStyle === "paren" ? "(sm)" : "sm";
|
||||
const r = marksStyle === "paren" ? "(r)" : "r";
|
||||
const c = marksStyle === "paren" ? "(c)" : "c";
|
||||
|
||||
return new Map([
|
||||
// Legal & marks
|
||||
["\u00A9", c], // ©
|
||||
["\u00AE", r], // ®
|
||||
["\u2122", tm], // ™
|
||||
["\u2120", sm], // ℠
|
||||
|
||||
// Quotes → straight ASCII
|
||||
["\u2018", "'"],
|
||||
["\u2019", "'"],
|
||||
["\u201A", "'"],
|
||||
["\u201B", "'"],
|
||||
["\u201C", "\""],
|
||||
["\u201D", "\""],
|
||||
["\u201E", "\""],
|
||||
["\u201F", "\""],
|
||||
|
||||
// Dashes & ellipsis
|
||||
["\u2013", "-"], // – en dash
|
||||
["\u2014", "-"], // — em dash
|
||||
["\u2212", "-"], // − minus
|
||||
["\u2026", "..."],// … ellipsis
|
||||
|
||||
// Misc symbols
|
||||
["\u00B0", "deg"],// °
|
||||
["\u00D7", "x"], // ×
|
||||
["\u00F7", "/"], // ÷
|
||||
["\u2022", "*"], // •
|
||||
["\u00A0", " "], // non-breaking space
|
||||
["\u2007", " "], // figure space
|
||||
["\u202F", " "], // narrow no-break space
|
||||
["\u2009", " "], // thin space
|
||||
["\u2002", " "], ["\u2003", " "], ["\u2004", " "], ["\u2005", " "], ["\u2006", " "], ["\u2008", " "], ["\u200A", " "], ["\u205F", " "], ["\u3000", " "], // spaces
|
||||
["\u200B", ""], // zero-width space
|
||||
["\u200C", ""], ["\u200D", ""], ["\uFEFF", ""], // zero-width non-joiner/joiner & BOM
|
||||
// Fractions (basic)
|
||||
["\u00BD", "1/2"], ["\u00BC", "1/4"], ["\u00BE", "3/4"]
|
||||
]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a string to a best-effort ASCII equivalent.
|
||||
* 1) Normalize to NFKD (compatibility decomposition)
|
||||
* 2) Remove combining marks \p{M}
|
||||
* 3) Apply explicit ligature/letter & symbol maps
|
||||
* 4) Optionally force pure ASCII
|
||||
*
|
||||
* @param {string} input
|
||||
* @param {AsciiFoldOptions} [options]
|
||||
* @returns {string}
|
||||
*/
|
||||
export function toASCII(input, options = {}) {
|
||||
const {
|
||||
marksStyle = "plain",
|
||||
asciiOnly = true,
|
||||
unknown = "?"
|
||||
} = options;
|
||||
|
||||
if(input == null) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Step 1/2: Normalize + strip combining marks
|
||||
let output = input
|
||||
.normalize("NFKD")
|
||||
.replace(/\p{M}+/gu, ""); // remove diacritical combining marks
|
||||
|
||||
// Step 3a: known ligatures/letters that NFKD doesn't map as desired
|
||||
output = [...output].map(_char => ligaturesMap.get(_char) ?? _char).join("");
|
||||
|
||||
// Step 3b: symbols & punctuation fallbacks
|
||||
const symbolsMap = makeSymbolsMap(marksStyle);
|
||||
output = [...output].map(_char => symbolsMap.get(_char) ?? _char).join("");
|
||||
|
||||
// Step 4: optionally enforce ASCII-only
|
||||
if(asciiOnly) {
|
||||
// Replace anything outside U+0000..U+007F with nothing
|
||||
output = output.replace(/[^\x00-\x7F]+/g, "");
|
||||
} else {
|
||||
// Keep but flag unknowns
|
||||
output = output.replace(/[^\x00-\x7F]/g, unknown);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a string into a slug-style ASCII string.
|
||||
*
|
||||
* @param {string} input
|
||||
* @param {{
|
||||
* separator?: string,
|
||||
* caseStyle?: "lower" | "upper" | "title" | "camel" | "pascal" | "none",
|
||||
* strict?: boolean,
|
||||
* toASCIIOptions?: AsciiFoldOptions
|
||||
* }} [options]
|
||||
* @returns {string}
|
||||
*/
|
||||
export function toSlug(input, options = {}) {
|
||||
const {
|
||||
separator = "-",
|
||||
caseStyle = "lower",
|
||||
strict = false,
|
||||
toASCIIOptions = {
|
||||
marksStyle: "plain",
|
||||
asciiOnly: true,
|
||||
unknown: "?"
|
||||
}
|
||||
} = options;
|
||||
|
||||
let slug = toASCII(input, toASCIIOptions);
|
||||
|
||||
slug = slug
|
||||
.replace(/[^A-Za-z0-9]+/g, separator)
|
||||
.replace(new RegExp(`${separator}{2,}`, "g"), separator)
|
||||
.replace(new RegExp(`^${separator}|${separator}$`, "g"), "");
|
||||
|
||||
switch(caseStyle) {
|
||||
case "none": break; // Keep whatever case remains after toASCII()
|
||||
case "upper": slug = slug.toUpperCase(); break;
|
||||
case "lower":
|
||||
default: slug = slug.toLowerCase(); break;
|
||||
}
|
||||
|
||||
if (strict) {
|
||||
slug = slug.replace(new RegExp(`[^A-Za-z0-9${separator}]`, "g"), "");
|
||||
}
|
||||
|
||||
return slug;
|
||||
}
|
||||
Reference in New Issue
Block a user