diff --git a/javascript/ascii-fold/README.md b/javascript/ascii-fold/README.md new file mode 100644 index 0000000..b24d42c --- /dev/null +++ b/javascript/ascii-fold/README.md @@ -0,0 +1,86 @@ +# ascii-fold (JavaScript) + +Best-effort ASCII folding and slug generation utilities extracted from a reusable snippet. It focuses on practical, predictable results: + +- Removes diacritics by applying Unicode NFKD normalization and stripping combining marks +- Handles common ligatures and special Latin letters (Æ/æ → AE/ae, ß → ss, etc.) +- Maps typographic quotes, dashes, ellipsis, spaces, and a few symbols to sensible ASCII +- Optionally drops any remaining non-ASCII characters, or replaces each of them with a placeholder +- Includes a small `toSlug` helper built on top of `toASCII` + +## Quick usage + +Minimal examples showing what the functions do. Adjust to your environment as needed. + +```js +// Assume you have the functions available in scope + +const input = "Café™ — 50 °C"; + +// Basic ASCII folding (default: asciiOnly=true, marksStyle="plain") +const ascii = toASCII(input); +// => "Cafetm - 50 degC" + +// Replace leftover non-ASCII with a placeholder instead of dropping it +const kept = toASCII("Emoji: 😀", { asciiOnly: false, unknown: "?" }); +// => "Emoji: ?" + +// Slugify +const slug = toSlug("Hello, World! © 2025"); +// => "hello-world-c-2025" +``` + +## Functions + +### `toASCII(input, options)` +Converts a string to a best-effort ASCII equivalent: +1) NFKD normalize, 2) strip combining marks, 3) map ligatures/letters and symbols, 4) optionally enforce ASCII-only. 
+ +Options (`AsciiFoldOptions`): +- `marksStyle`: `"plain" | "paren"` (default `"plain"`) + - `"plain"`: © ® ™ ℠ → `c r tm sm` + - `"paren"`: © ® ™ ℠ → `(c) (r) (tm) (sm)` +- `asciiOnly`: `boolean` (default `true`) + - When `true`, removes any remaining non-ASCII after mapping + - When `false`, replaces each remaining non-ASCII character with `unknown` instead of removing it +- `unknown`: `string` (default `"?"`) + - Placeholder for non-ASCII characters that remain when `asciiOnly=false` + +Examples: +```js +toASCII("Äffin – ½ kg", { marksStyle: "paren" }); +// => "Affin - 1/2 kg" + +toASCII("naïve façade", {}); +// => "naive facade" +``` + +### `toSlug(input, options)` +Builds on `toASCII` and normalizes to a URL-friendly slug. + +Options: +- `separator`: string (default `"-"`) +- `caseStyle`: `"lower" | "upper" | "none"` (default `"lower"`) +- `strict`: `boolean` (default `false`) + - When `true`, removes everything except `A–Z a–z 0–9` and the chosen separator +- `toASCIIOptions`: `AsciiFoldOptions` (passed to `toASCII` first) + +Examples: +```js +toSlug("Crème brûlée — ©", { separator: "-" }); +// => "creme-brulee-c" + +toSlug("Über cool", { caseStyle: "upper", separator: "_" }); +// => "UBER_COOL" +``` + +## Notes on behavior + +- Ligatures and special letters handled explicitly: ff/fi/fl/ffi/ffl/st, Æ/æ, Œ/œ, ß, Þ/þ, Ð/ð, Ł/ł, Ø/ø, Đ/đ +- Typographic punctuation mapped to ASCII: curly quotes → straight quotes, en/em dashes → `-`, ellipsis → `...`, non-breaking and thin/figure spaces → normal space +- Some miscellaneous symbols mapped: `° → deg`, `× → x`, `÷ → /`, `• → *`, and the fraction glyphs `½ ¼ ¾` → `1/2 1/4 3/4` +- Zero-width characters (ZWSP/ZWNJ/ZWJ/BOM) are removed + +## License + +See the repository-level `LICENSE` file. 
// javascript/ascii-fold/ascii-fold.js
// Best-effort ASCII folding (`toASCII`) and slug generation (`toSlug`).

/**
 * @typedef {Object} AsciiFoldOptions
 * @property {"plain"|"paren"} [marksStyle="plain"] How to render © ® ™ ℠ (plain: "c", "r", "tm", "sm"; paren: "(c)", "(r)", "(tm)", "(sm)")
 * @property {boolean} [asciiOnly=true] If true, removes any remaining non-ASCII chars after mapping
 * @property {string} [unknown="?"] Replacement for each remaining non-ASCII code point when asciiOnly=false
 */

/** Ligatures and Latin letters that need an explicit ASCII spelling. */
const ligaturesMap = new Map([
  // Common typographic ligatures
  ["\uFB00", "ff"], ["\uFB01", "fi"], ["\uFB02", "fl"],
  ["\uFB03", "ffi"], ["\uFB04", "ffl"],
  ["\uFB05", "st"], ["\uFB06", "st"],
  // Latin letters that don't decompose via NFKD the way we want
  ["Æ", "AE"], ["æ", "ae"],
  ["Œ", "OE"], ["œ", "oe"],
  ["ß", "ss"],
  ["Þ", "Th"], ["þ", "th"],
  ["Ð", "D"], ["ð", "d"],
  ["Ł", "L"], ["ł", "l"],
  ["Ø", "O"], ["ø", "o"],
  ["Đ", "D"], ["đ", "d"],
]);

/**
 * Build the lookup table of symbols, punctuation, and marks with their
 * ASCII fallbacks.
 *
 * @param {"plain"|"paren"} [marksStyle="plain"] "paren" wraps the legal marks
 *   in parentheses; any other value yields the plain spelling.
 * @returns {Map<string, string>} Single-character keys mapped to ASCII text.
 */
function makeSymbolsMap(marksStyle = "plain") {
  const isParen = marksStyle === "paren";
  const wrap = (text) => (isParen ? `(${text})` : text);

  return new Map([
    // Legal & marks
    ["\u00A9", wrap("c")], // ©
    ["\u00AE", wrap("r")], // ®
    ["\u2122", wrap("tm")], // ™
    ["\u2120", wrap("sm")], // ℠

    // Quotes → straight ASCII
    ["\u2018", "'"],
    ["\u2019", "'"],
    ["\u201A", "'"],
    ["\u201B", "'"],
    ["\u201C", "\""],
    ["\u201D", "\""],
    ["\u201E", "\""],
    ["\u201F", "\""],

    // Dashes & ellipsis
    ["\u2013", "-"], // – en dash
    ["\u2014", "-"], // — em dash
    ["\u2212", "-"], // − minus
    ["\u2026", "..."], // … ellipsis

    // Misc symbols
    ["\u00B0", "deg"], // °
    ["\u00D7", "x"], // ×
    ["\u00F7", "/"], // ÷
    ["\u2022", "*"], // •
    ["\u00A0", " "], // non-breaking space
    ["\u2007", " "], // figure space
    ["\u202F", " "], // narrow no-break space
    ["\u2009", " "], // thin space
    ["\u2002", " "], ["\u2003", " "], ["\u2004", " "], ["\u2005", " "],
    ["\u2006", " "], ["\u2008", " "], ["\u200A", " "], ["\u205F", " "],
    ["\u3000", " "], // other Unicode spaces
    ["\u200B", ""], // zero-width space
    ["\u200C", ""], ["\u200D", ""], ["\uFEFF", ""], // zero-width non-joiner/joiner & BOM

    // Fractions (basic)
    ["\u00BD", "1/2"], ["\u00BC", "1/4"], ["\u00BE", "3/4"],
    // NFKD rewrites other fraction glyphs (e.g. ⅓) as digit + U+2044 + digit;
    // mapping the fraction slash keeps them readable instead of fusing digits.
    ["\u2044", "/"],
  ]);
}

/**
 * Replace every code point of `text` that has an entry in `charMap`.
 *
 * @param {string} text
 * @param {Map<string, string>} charMap
 * @returns {string}
 */
function replaceMappedChars(text, charMap) {
  let result = "";
  // for...of walks code points, so surrogate pairs are never split.
  for (const char of text) {
    result += charMap.get(char) ?? char;
  }
  return result;
}

/**
 * Escape `text` so it matches literally inside a RegExp, both as a pattern
 * and inside a character class.
 *
 * @param {string} text
 * @returns {string}
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\-]/g, "\\$&");
}

/**
 * Convert a string to a best-effort ASCII equivalent.
 * 1) Apply the symbol map to the raw input
 * 2) Normalize to NFKD (compatibility decomposition) and remove combining marks \p{M}
 * 3) Apply the ligature/letter map, then the symbol map again
 * 4) Either drop the remaining non-ASCII or replace it with `unknown`
 *
 * The symbol map has to run before NFKD: normalization rewrites ™ as "TM",
 * ℠ as "SM" and ½ as "1" + U+2044 + "2", which would bypass `marksStyle`
 * and turn fractions into fused digits. It runs again afterwards to catch
 * characters that only appear once compatibility forms are decomposed.
 *
 * @param {string} input Text to fold; null/undefined yields "", other
 *   non-string values are converted with String().
 * @param {AsciiFoldOptions} [options]
 * @returns {string}
 */
export function toASCII(input, options = {}) {
  const {
    marksStyle = "plain",
    asciiOnly = true,
    unknown = "?",
  } = options ?? {};

  if (input == null) {
    return "";
  }

  const symbolsMap = makeSymbolsMap(marksStyle);

  // Step 1: explicit symbol fallbacks on the untouched input.
  let output = replaceMappedChars(String(input), symbolsMap);

  // Step 2: decompose, then strip diacritical combining marks.
  output = output.normalize("NFKD").replace(/\p{M}+/gu, "");

  // Step 3: letters NFKD leaves alone, plus symbols exposed by decomposition.
  output = replaceMappedChars(output, ligaturesMap);
  output = replaceMappedChars(output, symbolsMap);

  // Step 4: the `u` flag makes the class match whole code points, so an
  // astral character (e.g. an emoji) produces exactly one placeholder.
  if (asciiOnly) {
    return output.replace(/[^\x00-\x7F]+/gu, "");
  }
  // A function replacement keeps `unknown` literal even if it contains "$&".
  return output.replace(/[^\x00-\x7F]/gu, () => unknown);
}

/**
 * Convert a string into a slug-style ASCII string.
 *
 * The input is folded with toASCII(), every run of characters outside
 * A-Z a-z 0-9 becomes one separator, leading/trailing separators are
 * trimmed, and the requested case style is applied to the whole slug.
 *
 * @param {string} input
 * @param {{
 *   separator?: string,
 *   caseStyle?: "lower" | "upper" | "none",
 *   strict?: boolean,
 *   toASCIIOptions?: AsciiFoldOptions
 * }} [options] Any `caseStyle` other than "upper" or "none" lower-cases the
 *   slug. `strict` removes everything except A-Z a-z 0-9 and the separator's
 *   characters as a final pass.
 * @returns {string}
 */
export function toSlug(input, options = {}) {
  const {
    separator = "-",
    caseStyle = "lower",
    strict = false,
    toASCIIOptions = {
      marksStyle: "plain",
      asciiOnly: true,
      unknown: "?",
    },
  } = options ?? {};

  const sep = String(separator);
  // The separator is user-supplied text, not a pattern: escape it so values
  // such as ".", "+" or "|" neither throw nor match unrelated characters.
  const escapedSep = escapeRegExp(sep);
  const insertSep = () => sep; // literal insert, immune to "$&"-style patterns

  let slug = toASCII(input, toASCIIOptions).replace(/[^A-Za-z0-9]+/g, insertSep);

  // An empty separator has nothing to collapse or trim, and an empty
  // quantified group is not a useful pattern.
  if (sep !== "") {
    slug = slug
      .replace(new RegExp(`(?:${escapedSep}){2,}`, "g"), insertSep)
      .replace(new RegExp(`^(?:${escapedSep})|(?:${escapedSep})$`, "g"), "");
  }

  switch (caseStyle) {
    case "none":
      break; // Keep whatever case remains after toASCII()
    case "upper":
      slug = slug.toUpperCase();
      break;
    case "lower":
    default:
      slug = slug.toLowerCase();
      break;
  }

  if (strict) {
    slug = slug.replace(new RegExp(`[^A-Za-z0-9${escapedSep}]`, "g"), "");
  }

  return slug;
}