161 lines
5.0 KiB
JavaScript
161 lines
5.0 KiB
JavaScript
/**
 * @typedef {Object} AsciiFoldOptions
 * @property {"plain"|"paren"} [marksStyle="plain"] How to render © ® ™ ℠ (plain: "c", "r", "tm", "sm"; paren: "(c)", "(r)", "(tm)", "(sm)")
 * @property {boolean} [asciiOnly=true] If true, removes any remaining non-ASCII characters after mapping
 * @property {string} [unknown="?"] Replacement for each still-unknown non-ASCII character when asciiOnly=false
 */
|
||
|
||
/**
 * Single-character → ASCII replacements for ligatures and Latin letters.
 *
 * Keys are single code points; values are plain ASCII strings. The letter
 * entries cover characters that Unicode NFKD normalization leaves untouched
 * (they have no canonical or compatibility decomposition), so without an
 * explicit entry they would survive normalization as non-ASCII.
 */
const ligaturesMap = new Map([
    // Common typographic ligatures
    ["\uFB00", "ff"], ["\uFB01", "fi"], ["\uFB02", "fl"],
    ["\uFB03", "ffi"], ["\uFB04", "ffl"],
    ["\uFB05", "st"], ["\uFB06", "st"],

    // Latin letters that don't decompose via NFKD the way we want
    ["Æ", "AE"], ["æ", "ae"],
    ["Œ", "OE"], ["œ", "oe"],
    ["\u1E9E", "SS"], ["ß", "ss"], // ẞ capital sharp s, ß small sharp s
    ["Þ", "Th"], ["þ", "th"],
    ["Ð", "D"], ["ð", "d"],
    ["Ł", "L"], ["ł", "l"],
    ["Ø", "O"], ["ø", "o"],
    ["Đ", "D"], ["đ", "d"],
    ["\u0126", "H"], ["\u0127", "h"], // Ħ / ħ (h with stroke)
    ["\u0131", "i"] // ı dotless i
]);
|
||
|
||
/**
 * Build the table of symbols, punctuation, and marks with sensible ASCII fallbacks.
 *
 * Keys are single code points; values are ASCII strings (possibly empty, for
 * zero-width characters that should simply disappear).
 *
 * @param {"plain"|"paren"} [marksStyle="plain"] How to render © ® ™ ℠:
 *   "paren" wraps the letters in parentheses ("(c)", "(tm)", ...); any other
 *   value yields the bare letters ("c", "tm", ...).
 * @returns {Map<string, string>} A fresh map on every call.
 */
function makeSymbolsMap(marksStyle = "plain") {
    const useParens = marksStyle === "paren";
    const mark = (letters) => (useParens ? `(${letters})` : letters);

    return new Map([
        // Legal & marks
        ["\u00A9", mark("c")], // ©
        ["\u00AE", mark("r")], // ®
        ["\u2122", mark("tm")], // ™
        ["\u2120", mark("sm")], // ℠

        // Quotes → straight ASCII
        ["\u2018", "'"],
        ["\u2019", "'"],
        ["\u201A", "'"],
        ["\u201B", "'"],
        ["\u2032", "'"], // ′ prime
        ["\u201C", "\""],
        ["\u201D", "\""],
        ["\u201E", "\""],
        ["\u201F", "\""],
        ["\u2033", "\""], // ″ double prime

        // Hyphens, dashes & ellipsis
        ["\u2010", "-"], // ‐ hyphen (also what NFKD produces for U+2011)
        ["\u2011", "-"], // ‑ non-breaking hyphen
        ["\u2012", "-"], // ‒ figure dash
        ["\u2013", "-"], // – en dash
        ["\u2014", "-"], // — em dash
        ["\u2015", "-"], // ― horizontal bar
        ["\u2212", "-"], // − minus
        ["\u2026", "..."], // … ellipsis

        // Misc symbols
        ["\u00B0", "deg"], // °
        ["\u00D7", "x"], // ×
        ["\u00F7", "/"], // ÷
        ["\u2044", "/"], // ⁄ fraction slash (NFKD emits it for vulgar fractions)
        ["\u2022", "*"], // •

        // Spaces
        ["\u00A0", " "], // non-breaking space
        ["\u2007", " "], // figure space
        ["\u202F", " "], // narrow no-break space
        ["\u2009", " "], // thin space
        ["\u2002", " "], ["\u2003", " "], ["\u2004", " "], ["\u2005", " "],
        ["\u2006", " "], ["\u2008", " "], ["\u200A", " "], ["\u205F", " "],
        ["\u3000", " "],

        // Zero-width characters are dropped entirely
        ["\u200B", ""], // zero-width space
        ["\u200C", ""], ["\u200D", ""], ["\uFEFF", ""], // zero-width non-joiner/joiner & BOM

        // Fractions (basic)
        ["\u00BD", "1/2"], ["\u00BC", "1/4"], ["\u00BE", "3/4"]
    ]);
}
|
||
|
||
/**
 * Convert a string to a best-effort ASCII equivalent.
 *  1) Apply the symbol map to the raw input. This must happen before
 *     normalization: NFKD itself rewrites ™ to "TM", ℠ to "SM" and ½ to
 *     "1⁄2" (with U+2044 FRACTION SLASH), which would bypass `marksStyle`
 *     and the fraction entries.
 *  2) Normalize to NFKD (compatibility decomposition), remove combining
 *     marks \p{M}, and turn the fraction slash NFKD emits into "/".
 *  3) Apply the explicit ligature/letter map, plus the symbol map again for
 *     symbols that only appear as a result of normalization.
 *  4) Remove remaining non-ASCII (asciiOnly=true) or replace each remaining
 *     non-ASCII code point with `unknown` (asciiOnly=false).
 *
 * @param {string} input Text to fold; null/undefined yield "", other
 *   non-string values are converted with String().
 * @param {AsciiFoldOptions} [options]
 * @returns {string}
 */
export function toASCII(input, options = {}) {
    const {
        marksStyle = "plain",
        asciiOnly = true,
        unknown = "?"
    } = options ?? {};

    if(input == null) {
        return "";
    }

    const symbolsMap = makeSymbolsMap(marksStyle);

    // Step 1: symbols on the raw text, iterating by code point.
    let preMapped = "";
    for(const char of String(input)) {
        preMapped += symbolsMap.get(char) ?? char;
    }

    // Step 2: normalize, strip diacritical combining marks, keep fractions readable.
    const normalized = preMapped
        .normalize("NFKD")
        .replace(/\p{M}+/gu, "")
        .replace(/\u2044/g, "/");

    // Step 3: letters NFKD doesn't map as desired, and symbols exposed by NFKD.
    let output = "";
    for(const char of normalized) {
        output += ligaturesMap.get(char) ?? symbolsMap.get(char) ?? char;
    }

    // Step 4: enforce ASCII.
    if(asciiOnly) {
        // Drop anything outside U+0000..U+007F.
        return output.replace(/[^\x00-\x7F]+/g, "");
    }

    // Flag unknowns: the `u` flag makes an astral character count once (not
    // once per surrogate), and the function replacer keeps `unknown` literal
    // even if it contains "$" replacement patterns.
    return output.replace(/[^\x00-\x7F]/gu, () => unknown);
}
|
||
|
||
/**
 * Convert a string into a slug-style ASCII string.
 *
 * The input is folded with toASCII(), split into words at every run of
 * characters outside [A-Za-z0-9], and the non-empty words are joined again.
 * Because the separator is only ever used as a join string (never as a
 * regular-expression fragment), any separator is safe: ".", "+", "--", "".
 *
 * Case styles:
 *  - "lower" (default, also used for unrecognised values): joined with
 *    `separator`, then lower-cased.
 *  - "upper": joined with `separator`, then upper-cased.
 *  - "none": joined with `separator`, case left as produced by toASCII().
 *  - "title": each word capitalized, joined with `separator` ("Hello-World").
 *  - "pascal": each word capitalized, joined without a separator ("HelloWorld").
 *  - "camel": like "pascal" but the first word is lower-cased ("helloWorld").
 *
 * @param {string} input
 * @param {{
 *   separator?: string,
 *   caseStyle?: "lower" | "upper" | "title" | "camel" | "pascal" | "none",
 *   strict?: boolean,
 *   toASCIIOptions?: AsciiFoldOptions
 * }} [options] `strict` is accepted for backward compatibility; the result
 *   already consists solely of [A-Za-z0-9] words and the separator, so the
 *   flag has no additional effect.
 * @returns {string}
 */
export function toSlug(input, options = {}) {
    const {
        separator = "-",
        caseStyle = "lower",
        toASCIIOptions = {
            marksStyle: "plain",
            asciiOnly: true,
            unknown: "?"
        }
    } = options ?? {};

    // Splitting + filtering collapses repeated delimiters and trims the ends
    // in one step, without building a RegExp from the caller's separator.
    const words = toASCII(input, toASCIIOptions)
        .split(/[^A-Za-z0-9]+/)
        .filter((word) => word.length > 0);

    const capitalize = (word) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase();

    switch(caseStyle) {
        case "none":
            // Keep whatever case remains after toASCII()
            return words.join(separator);
        case "upper":
            return words.join(separator).toUpperCase();
        case "title":
            return words.map(capitalize).join(separator);
        case "pascal":
            return words.map(capitalize).join("");
        case "camel":
            return words
                .map((word, index) => (index === 0 ? word.toLowerCase() : capitalize(word)))
                .join("");
        case "lower":
        default:
            return words.join(separator).toLowerCase();
    }
}
|