// code-snippets/javascript/ascii-fold/ascii-fold.js

/**
 * @typedef {Object} AsciiFoldOptions
 * @property {"plain"|"paren"} [marksStyle="plain"]  How to render © ® ™ ℠ (plain: "c", "r", "tm", "sm"; paren: "(c)", "(r)", "(tm)", "(sm)")
 * @property {boolean} [asciiOnly=true]              If true, removes any remaining non-ASCII chars after mapping
 * @property {string}  [unknown="?"]                 Replacement for still-unknown non-ASCII when asciiOnly=false
 */

/**
 * Explicit single-character → ASCII replacements for ligatures and for Latin
 * letters that have no Unicode decomposition (so NFKD + mark stripping alone
 * leaves them non-ASCII).
 *
 * Keys are single characters; values are the ASCII text to substitute.
 */
const ligaturesMap = new Map([
    // Common typographic ligatures (U+FB00..U+FB06)
    ["\uFB00", "ff"], ["\uFB01", "fi"], ["\uFB02", "fl"],
    ["\uFB03", "ffi"], ["\uFB04", "ffl"],
    ["\uFB05", "st"], ["\uFB06", "st"],
    // Latin letters that don't decompose via NFKD the way we want
    ["Æ", "AE"], ["æ", "ae"],
    ["Œ", "OE"], ["œ", "oe"],
    ["ß", "ss"], ["\u1E9E", "SS"],   // ß and its capital form ẞ
    ["Þ", "Th"], ["þ", "th"],
    ["Ð", "D"], ["ð", "d"],
    ["Ł", "L"], ["ł", "l"],
    ["Ø", "O"], ["ø", "o"],
    ["Đ", "D"], ["đ", "d"],
    // Stroked / dotless letters, likewise without a decomposition
    ["\u0126", "H"], ["\u0127", "h"], // Ħ ħ
    ["\u0166", "T"], ["\u0167", "t"], // Ŧ ŧ
    ["\u0131", "i"]                   // ı dotless i
]);

/**
 * Build the lookup table of symbols, punctuation, and marks with sensible
 * ASCII fallbacks. A fresh Map is returned on every call.
 *
 * @param {"plain"|"paren"} [marksStyle="plain"]  "paren" wraps the legal marks
 *   (© ® ™ ℠) in parentheses; any other value yields the bare letters.
 * @returns {Map<string, string>} single character → ASCII replacement
 */
function makeSymbolsMap(marksStyle = "plain") {
    const mark = marksStyle === "paren"
        ? letters => `(${letters})`
        : letters => letters;

    return new Map([
        // Legal & marks
        ["\u00A9", mark("c")],   // ©
        ["\u00AE", mark("r")],   // ®
        ["\u2122", mark("tm")],  // ™
        ["\u2120", mark("sm")],  // ℠

        // Quotes → straight ASCII
        ["\u2018", "'"],
        ["\u2019", "'"],
        ["\u201A", "'"],
        ["\u201B", "'"],
        ["\u201C", "\""],
        ["\u201D", "\""],
        ["\u201E", "\""],
        ["\u201F", "\""],

        // Hyphens, dashes & ellipsis
        ["\u2010", "-"],  // ‐ hyphen
        ["\u2011", "-"],  // ‑ non-breaking hyphen
        ["\u2012", "-"],  // ‒ figure dash
        ["\u2013", "-"],  // – en dash
        ["\u2014", "-"],  // — em dash
        ["\u2015", "-"],  // ― horizontal bar
        ["\u2212", "-"],  // − minus
        ["\u2026", "..."],// … ellipsis

        // Misc symbols
        ["\u00B0", "deg"],// °
        ["\u00D7", "x"],  // ×
        ["\u00F7", "/"],  // ÷
        // NFKD decomposes vulgar fractions (½, ⅓, …) into digit + U+2044 + digit;
        // without this entry the slash is lost and "½" folds to "12".
        ["\u2044", "/"],  // ⁄ fraction slash
        ["\u2215", "/"],  // ∕ division slash
        ["\u2022", "*"],  // •

        // Spaces
        ["\u00A0", " "],  // non-breaking space
        ["\u2007", " "],  // figure space
        ["\u202F", " "],  // narrow no-break space
        ["\u2009", " "],  // thin space
        ["\u2002", " "], ["\u2003", " "], ["\u2004", " "], ["\u2005", " "],
        ["\u2006", " "], ["\u2008", " "], ["\u200A", " "], ["\u205F", " "],
        ["\u3000", " "],

        // Invisible characters are dropped entirely
        ["\u200B", ""],   // zero-width space
        ["\u200C", ""], ["\u200D", ""], ["\uFEFF", ""], // zero-width non-joiner/joiner & BOM

        // Fractions (basic)
        ["\u00BD", "1/2"], ["\u00BC", "1/4"], ["\u00BE", "3/4"]
    ]);
}

/**
 * Convert a string to a best-effort ASCII equivalent.
 * 1) Apply the symbol map to the raw input, so symbols that NFKD would
 *    otherwise rewrite (™ → "TM", ½ → "1⁄2" with a non-ASCII fraction slash)
 *    get their intended fallback and honour `marksStyle`
 * 2) Normalize to NFKD (compatibility decomposition)
 * 3) Remove combining marks \p{M}
 * 4) Apply the explicit ligature/letter map, then the symbol map again for
 *    characters that only appear after decomposition
 * 5) Strip (asciiOnly=true) or flag (asciiOnly=false) whatever is still non-ASCII
 *
 * @param {string} input  Text to fold; null/undefined yield "", other
 *   non-string values are converted with String()
 * @param {AsciiFoldOptions} [options]
 * @returns {string}
 */
export function toASCII(input, options = {}) {
    const {
        marksStyle = "plain",
        asciiOnly = true,
        unknown = "?"
    } = options ?? {};

    if(input == null) {
        return "";
    }

    const symbolsMap = makeSymbolsMap(marksStyle);

    // Step 1: symbols first, before NFKD has a chance to decompose them.
    // Spreading iterates by code point, so surrogate pairs stay intact.
    let output = [...String(input)]
        .map(_char => symbolsMap.get(_char) ?? _char)
        .join("");

    // Step 2/3: Normalize + strip combining marks
    output = output
        .normalize("NFKD")
        .replace(/\p{M}+/gu, ""); // remove diacritical combining marks

    // Step 4: letters NFKD doesn't map as desired (e.g. Ǽ → Æ + accent → "AE"),
    // plus any mapped symbol that decomposition exposed.
    output = [...output]
        .map(_char => ligaturesMap.get(_char) ?? symbolsMap.get(_char) ?? _char)
        .join("");

    // Step 5: enforce ASCII. The `u` flag makes the class match whole code
    // points, so an astral character produces one `unknown`, not two.
    if(asciiOnly) {
        // Replace anything outside U+0000..U+007F with nothing
        return output.replace(/[^\x00-\x7F]+/gu, "");
    }

    // Flag unknowns. A replacer function keeps `unknown` literal even when it
    // contains "$" sequences that a replacement string would interpret.
    return output.replace(/[^\x00-\x7F]/gu, () => unknown);
}

/**
 * Convert a string into a slug-style ASCII string.
 *
 * The input is folded with toASCII(), split on every run of characters that
 * are not ASCII letters or digits, and the non-empty pieces are joined with
 * `separator`. Because the separator is only ever joined in (never compiled
 * into a regular expression), any string works: ".", "+", "--", or "".
 *
 * Options:
 * - separator:      text placed between words (default "-")
 * - caseStyle:      "lower" (default), "upper", or "none" (keep the case left
 *                   by toASCII()). NOTE(review): "title", "camel" and "pascal"
 *                   are accepted by the type but not implemented; like any
 *                   unrecognised value they fall back to lowercase.
 * - strict:         after casing, drop every character that is neither an
 *                   ASCII letter/digit nor one of the separator's characters
 * - toASCIIOptions: forwarded to toASCII()
 *
 * @param {string} input
 * @param {{
 *   separator?: string,
 *   caseStyle?: "lower" | "upper" | "title" | "camel" | "pascal" | "none",
 *   strict?: boolean,
 *   toASCIIOptions?: AsciiFoldOptions
 * }} [options]
 * @returns {string}
 */
export function toSlug(input, options = {}) {
    const {
        separator = "-",
        caseStyle = "lower",
        strict = false,
        toASCIIOptions = {
            marksStyle: "plain",
            asciiOnly: true,
            unknown: "?"
        }
    } = options ?? {};

    const joiner = String(separator);

    // Splitting and dropping empty pieces collapses repeated delimiters and
    // trims leading/trailing ones in a single step.
    let slug = toASCII(input, toASCIIOptions)
        .split(/[^A-Za-z0-9]+/)
        .filter(word => word !== "")
        .join(joiner);

    switch(caseStyle) {
        case "none": break; // Keep whatever case remains after toASCII()
        case "upper": slug = slug.toUpperCase(); break;
        case "lower":
        default: slug = slug.toLowerCase(); break;
    }

    if(strict) {
        // Compare against the separator's characters as given: case conversion
        // above may have altered a separator that contains letters.
        const separatorChars = new Set(joiner);
        slug = [...slug]
            .filter(_char => /^[A-Za-z0-9]$/.test(_char) || separatorChars.has(_char))
            .join("");
    }

    return slug;
}