2025-10-31 13:00:04 +00:00
2 changed files with 246 additions and 0 deletions
--- a/javascript/ascii-fold/README.md
+++ b/javascript/ascii-fold/README.md
@ -0,0 +1,86 @@
+# ascii-fold (JavaScript)
+
+Best-effort ASCII folding and slug generation utilities extracted from a reusable snippet. It focuses on practical, predictable results:
+
+- Removes diacritics using Unicode NFKD normalization and strips combining marks
+- Handles common ligatures and special Latin letters (Æ/æ → AE/ae, ß → ss, etc.)
+- Maps typographic quotes, dashes, ellipsis, spaces, and a few symbols to sensible ASCII
+- Either strips any remaining non-ASCII characters (default) or replaces each of them with a placeholder
+- Includes a small `toSlug` helper built on top of `toASCII`
+
+## Quick usage
+
+Minimal examples showing what the functions do. Adjust to your environment as needed.
+
+```js
+// Assume you have the functions available in scope
+
+const input = "Café™ — 50 °C";
+
+// Basic ASCII folding (default: asciiOnly=true, marksStyle="plain")
+const ascii = toASCII(input);
+// => "Cafetm - 50 degC"
+
+// Keep non-ASCII by substituting unknowns
+const kept = toASCII("Emoji: 😀", { asciiOnly: false, unknown: "?" });
+// => "Emoji: ?"
+
+// Slugify
+const slug = toSlug("Hello, World! © 2025");
+// => "hello-world-c-2025"
+```
+
+## Functions
+
+### `toASCII(input, options)`
+Converts a string to a best-effort ASCII equivalent:
+1) NFKD normalize, 2) strip combining marks, 3) map ligatures/letters and symbols, 4) optionally enforce ASCII-only.
+
+Options (`AsciiFoldOptions`):
+- `marksStyle`: `"plain" | "paren"` (default `"plain"`)
+  - `"plain"`: © ® ™ ℠ → `c r tm sm`
+  - `"paren"`: © ® ™ ℠ → `(c) (r) (tm) (sm)`
+- `asciiOnly`: `boolean` (default `true`)
+  - When `true`, removes any remaining non-ASCII after mapping
+  - When `false`, replaces each remaining non-ASCII character with `unknown` instead of removing it
+- `unknown`: `string` (default `"?"`)
+  - Placeholder for non-ASCII characters that remain when `asciiOnly=false`
+
+Examples:
+```js
+toASCII("Äffin – ½ kg", { marksStyle: "paren" });
+// => "Affin - 1/2 kg"
+
+toASCII("naïve façade", {});
+// => "naive facade"
+```
+
+### `toSlug(input, options)`
+Builds on `toASCII` and normalizes to a URL-friendly slug.
+
+Options:
+- `separator`: string (default `"-"`)
+- `caseStyle`: `"lower" | "upper" | "none"` (default `"lower"`)
+- `strict`: `boolean` (default `false`)
+  - When `true`, removes everything except `A-Z a-z 0-9` and the chosen separator
+- `toASCIIOptions`: `AsciiFoldOptions` (passed to `toASCII` first)
+
+Examples:
+```js
+toSlug("Crème brûlée — ©", { separator: "-" });
+// => "creme-brulee-c"
+
+toSlug("Über cool", { caseStyle: "upper", separator: "_" });
+// => "UBER_COOL"
+```
+
+## Notes on behavior
+
+- Ligatures and special letters handled explicitly: ff/fi/fl/ffi/ffl, Æ/æ, Œ/œ, ß, Þ/þ, Ð/ð, Ł/ł, Ø/ø, Đ/đ
+- Typographic punctuation mapped to ASCII: curly quotes → straight quotes, en/em dashes → `-`, ellipsis → `...`, non-breaking and thin/figure spaces → normal space
+- Some miscellaneous symbols mapped: `° → deg`, `× → x`, `÷ → /`, `• → *`, simple fraction glyphs like `½ ¼ ¾`
+- Zero-width characters (ZWSP/ZWNJ/ZWJ/BOM) are removed
+
+## License
+
+See the repository-level `LICENSE` file.
--- a/javascript/ascii-fold/ascii-fold.js
+++ b/javascript/ascii-fold/ascii-fold.js
@ -0,0 +1,160 @@
+/**
+ * @typedef {Object} AsciiFoldOptions
+ * @property {"plain"|"paren"} [marksStyle="plain"]  How to render © ® ™ ℠ (plain: "c", "r", "tm", "sm"; paren: "(c)", "(r)", "(tm)", "(sm)")
+ * @property {boolean} [asciiOnly=true]              If true, removes any remaining non-ASCII chars after mapping
+ * @property {string}  [unknown="?"]                 Replacement for still-unknown non-ASCII when asciiOnly=false
+ */
+
+const ligaturesMap = new Map([
+    // Common typographic ligatures
+    ["\uFB00", "ff"], ["\uFB01", "fi"], ["\uFB02", "fl"],
+    ["\uFB03", "ffi"], ["\uFB04", "ffl"],
+    ["\uFB05", "st"], ["\uFB06", "st"],
+    // Latin letters that don't decompose via NFKD the way we want
+    ["Æ", "AE"], ["æ", "ae"],
+    ["Œ", "OE"], ["œ", "oe"],
+    ["ß", "ss"],
+    ["Þ", "Th"], ["þ", "th"],
+    ["Ð", "D"], ["ð", "d"],
+    ["Ł", "L"], ["ł", "l"],
+    ["Ø", "O"], ["ø", "o"],
+    ["Đ", "D"], ["đ", "d"]
+]);
+
+/** Symbols, punctuation, and marks with sensible ASCII fallbacks */
+function makeSymbolsMap(marksStyle = "plain") {
+    const tm = marksStyle === "paren" ? "(tm)" : "tm";
+    const sm = marksStyle === "paren" ? "(sm)" : "sm";
+    const r = marksStyle === "paren" ? "(r)" : "r";
+    const c = marksStyle === "paren" ? "(c)" : "c";
+
+    return new Map([
+        // Legal & marks
+        ["\u00A9", c],   // ©
+        ["\u00AE", r],   // ®
+        ["\u2122", tm],  // ™
+        ["\u2120", sm],  // ℠
+
+        // Quotes → straight ASCII
+        ["\u2018", "'"],
+        ["\u2019", "'"],
+        ["\u201A", "'"],
+        ["\u201B", "'"],
+        ["\u201C", "\""],
+        ["\u201D", "\""],
+        ["\u201E", "\""],
+        ["\u201F", "\""],
+
+        // Dashes & ellipsis
+        ["\u2013", "-"],  // – en dash
+        ["\u2014", "-"],  // — em dash
+        ["\u2212", "-"],  // − minus
+        ["\u2026", "..."],// … ellipsis
+
+        // Misc symbols
+        ["\u00B0", "deg"],// °
+        ["\u00D7", "x"],  // ×
+        ["\u00F7", "/"],  // ÷
+        ["\u2022", "*"],  // •
+        ["\u00A0", " "],  // non-breaking space
+        ["\u2007", " "],  // figure space
+        ["\u202F", " "],  // narrow no-break space
+        ["\u2009", " "],  // thin space
+        ["\u2002", " "], ["\u2003", " "], ["\u2004", " "], ["\u2005", " "], ["\u2006", " "], ["\u2008", " "], ["\u200A", " "], ["\u205F", " "], ["\u3000", " "], // spaces
+        ["\u200B", ""],   // zero-width space
+        ["\u200C", ""], ["\u200D", ""], ["\uFEFF", ""], // zero-width non-joiner/joiner & BOM
+        // Fractions (basic)
+        ["\u00BD", "1/2"], ["\u00BC", "1/4"], ["\u00BE", "3/4"]
+    ]);
+}
+
+/**
+ * Convert a string to a best-effort ASCII equivalent.
+ * 1) Normalize to NFKD (compatibility decomposition)
+ * 2) Remove combining marks \p{M}
+ * 3) Apply explicit ligature/letter & symbol maps
+ * 4) Optionally force pure ASCII
+ *
+ * @param {string} input
+ * @param {AsciiFoldOptions} [options]
+ * @returns {string}
+ */
+export function toASCII(input, options = {}) {
+    const {
+        marksStyle = "plain",
+        asciiOnly = true,
+        unknown = "?"
+    } = options;
+
+    if(input == null) {
+        return "";
+    }
+
+    // Step 1/2: Normalize + strip combining marks
+    let output = input
+        .normalize("NFKD")
+        .replace(/\p{M}+/gu, ""); // remove diacritical combining marks
+
+    // Step 3a: known ligatures/letters that NFKD doesn't map as desired
+    output = [...output].map(_char => ligaturesMap.get(_char) ?? _char).join("");
+
+    // Step 3b: symbols & punctuation fallbacks
+    const symbolsMap = makeSymbolsMap(marksStyle);
+    output = [...output].map(_char => symbolsMap.get(_char) ?? _char).join("");
+
+    // Step 4: optionally enforce ASCII-only
+    if(asciiOnly) {
+        // Replace anything outside U+0000..U+007F with nothing
+        output = output.replace(/[^\x00-\x7F]+/g, "");
+    } else {
+        // Keep but flag unknowns
+        output = output.replace(/[^\x00-\x7F]/g, unknown);
+    }
+
+    return output;
+}
+
+/**
+ * Convert a string into a slug-style ASCII string.
+ *
+ * @param {string} input
+ * @param {{
+ *   separator?: string,
+ *   caseStyle?: "lower" | "upper" | "title" | "camel" | "pascal" | "none",
+ *   strict?: boolean,
+ *   toASCIIOptions?: AsciiFoldOptions
+ * }} [options]
+ * @returns {string}
+ */
+export function toSlug(input, options = {}) {
+    const {
+        separator = "-",
+        caseStyle = "lower",
+        strict = false,
+        toASCIIOptions = {
+            marksStyle: "plain",
+            asciiOnly: true,
+            unknown: "?"
+        }
+    } = options;
+
+    let slug = toASCII(input, toASCIIOptions);
+
+    slug = slug
+        .replace(/[^A-Za-z0-9]+/g, separator)
+        .replace(new RegExp(`${separator}{2,}`, "g"), separator)
+        .replace(new RegExp(`^${separator}|${separator}$`, "g"), "");
+
+    switch(caseStyle) {
+        case "none": break; // Keep whatever case remains after toASCII()
+        case "upper": slug = slug.toUpperCase(); break;
+        case "lower":
+        default: slug = slug.toLowerCase(); break;
+    }
+
+    if (strict) {
+        slug = slug.replace(new RegExp(`[^A-Za-z0-9${separator}]`, "g"), "");
+    }
+
+    return slug;
+}