Add ASCII fold snippet

This commit is contained in:
2025-10-31 13:49:24 +01:00
parent 7bde13a63b
commit ec62075ba7
2 changed files with 246 additions and 0 deletions

View File

@ -0,0 +1,86 @@
# ascii-fold (JavaScript)
Best-effort ASCII folding and slug generation utilities extracted from a reusable snippet. It focuses on practical, predictable results:
- Removes diacritics using Unicode NFKD normalization and strips combining marks
- Handles common ligatures and special Latin letters (Æ/æ → AE/ae, ß → ss, etc.)
- Maps typographic quotes, dashes, ellipsis, spaces, and a few symbols to sensible ASCII
- Optional strict ASCII-only output or keep-non-ASCII-with-placeholder
- Includes a small `toSlug` helper built on top of `toASCII`
## Quick usage
Minimal examples showing what the functions do. Adjust to your environment as needed.
```js
// Assume you have the functions available in scope
const input = "Café © — 50°C";
// Basic ASCII folding (default: asciiOnly=true, marksStyle="plain")
const ascii = toASCII(input);
// => "Cafe c - 50degC"
// Replace leftover non-ASCII characters with a placeholder instead of dropping them
const kept = toASCII("Emoji: 😀", { asciiOnly: false, unknown: "?" });
// => "Emoji: ?"
// Slugify
const slug = toSlug("Hello, World! © 2025");
// => "hello-world-c-2025"
```
## Functions
### `toASCII(input, options)`
Converts a string to a best-effort ASCII equivalent:
1) NFKD normalize, 2) strip combining marks, 3) map ligatures/letters and symbols, 4) optionally enforce ASCII-only.
Options (`AsciiFoldOptions`):
- `marksStyle`: `"plain" | "paren"` (default `"plain"`)
- `"plain"`: © ® ™ ℠ → `c r tm sm`
- `"paren"`: © ® ™ ℠ → `(c) (r) (tm) (sm)`
- `asciiOnly`: `boolean` (default `true`)
- When `true`, removes any remaining non-ASCII after mapping
- When `false`, replaces each non-ASCII character that is still present after mapping with `unknown` instead of removing it
- `unknown`: `string` (default `"?"`)
- Placeholder for non-ASCII characters that remain when `asciiOnly=false`
Examples:
```js
toASCII("Äffin © 2025", { marksStyle: "paren" });
// => "Affin (c) 2025"
toASCII("naïve façade", {});
// => "naive facade"
```
### `toSlug(input, options)`
Builds on `toASCII` and normalizes to a URL-friendly slug.
Options:
- `separator`: string (default `"-"`)
- `caseStyle`: `"lower" | "upper" | "none"` (default `"lower"`)
- `strict`: `boolean` (default `false`)
- When `true`, removes everything except `A-Z a-z 0-9` and the chosen separator
- `toASCIIOptions`: `AsciiFoldOptions` (passed to `toASCII` first)
Examples:
```js
toSlug("Crème brûlée — ©", { separator: "-" });
// => "creme-brulee-c"
toSlug("Über cool", { caseStyle: "upper", separator: "_" });
// => "UBER_COOL"
```
## Notes on behavior
- Ligatures and special letters handled explicitly: ff/fi/fl/ffi/ffl, Æ/æ, Œ/œ, ß, Þ/þ, Ð/ð, Ł/ł, Ø/ø, Đ/đ
- Typographic punctuation mapped to ASCII: curly quotes → straight quotes, en/em dashes → `-`, ellipsis → `...`, non-breaking and thin/figure spaces → normal space
- Some miscellaneous symbols mapped: `° → deg`, `× → x`, `÷ → /`, `• → *`, simple fraction glyphs like `½ ¼ ¾`
- Zero-width marks (ZWNJ/ZWJ/BOM) are removed
## License
See the repository-level `LICENSE` file.

View File

@ -0,0 +1,160 @@
/**
 * @typedef {Object} AsciiFoldOptions
 * @property {"plain"|"paren"} [marksStyle="plain"] How to render © ® ™ ℠ (plain: "c", "r", "tm", "sm"; paren: "(c)", "(r)", "(tm)", "(sm)")
 * @property {boolean} [asciiOnly=true] If true, removes any remaining non-ASCII chars after mapping
 * @property {string} [unknown="?"] Replacement for each non-ASCII char left after mapping when asciiOnly=false
 */

/**
 * Single-character lookup table for ligatures and Latin letters.
 *
 * Keys are exactly one code point; values are their ASCII spelling. The
 * letters in the second group have no Unicode decomposition, so stripping
 * combining marks after NFKD leaves them untouched and they need an explicit
 * entry to avoid being dropped as non-ASCII.
 *
 * @type {Map<string, string>}
 */
const ligaturesMap = new Map([
  // Common typographic ligatures
  ["\uFB00", "ff"], ["\uFB01", "fi"], ["\uFB02", "fl"],
  ["\uFB03", "ffi"], ["\uFB04", "ffl"],
  ["\uFB05", "st"], ["\uFB06", "st"],
  // Latin letters that don't decompose via NFKD the way we want
  ["Æ", "AE"], ["æ", "ae"],
  ["Œ", "OE"], ["œ", "oe"],
  ["ß", "ss"],
  ["\u1E9E", "SS"], // ẞ capital sharp s
  ["Þ", "Th"], ["þ", "th"],
  ["Ð", "D"], ["ð", "d"],
  ["Ł", "L"], ["ł", "l"],
  ["Ø", "O"], ["ø", "o"],
  ["Đ", "D"], ["đ", "d"],
  ["\u0126", "H"], ["\u0127", "h"], // Ħ / ħ (h with stroke)
  ["\u0166", "T"], ["\u0167", "t"], // Ŧ / ŧ (t with stroke)
  ["\u0131", "i"] // ı dotless i
]);
/**
 * Build the lookup table of symbols, punctuation, and whitespace with
 * sensible ASCII fallbacks.
 *
 * @param {"plain"|"paren"} [marksStyle="plain"] "paren" wraps the © ® ™ ℠
 *   replacements in parentheses; any other value yields the bare letters.
 * @returns {Map<string, string>} A fresh Map on every call. Keys are single
 *   characters; values are ASCII strings (possibly empty).
 */
function makeSymbolsMap(marksStyle = "plain") {
  const mark = (letters) => (marksStyle === "paren" ? `(${letters})` : letters);
  return new Map([
    // Legal & marks
    ["\u00A9", mark("c")], // ©
    ["\u00AE", mark("r")], // ®
    ["\u2122", mark("tm")], // ™
    ["\u2120", mark("sm")], // ℠
    // Quotes → straight ASCII
    ["\u2018", "'"],
    ["\u2019", "'"],
    ["\u201A", "'"],
    ["\u201B", "'"],
    ["\u201C", "\""],
    ["\u201D", "\""],
    ["\u201E", "\""],
    ["\u201F", "\""],
    // Hyphens, dashes & ellipsis
    ["\u2010", "-"], // hyphen (also what NFKD turns U+2011 into)
    ["\u2011", "-"], // non-breaking hyphen
    ["\u2012", "-"], // figure dash
    ["\u2013", "-"], // en dash
    ["\u2014", "-"], // — em dash
    ["\u2015", "-"], // horizontal bar
    ["\u2212", "-"], // minus
    ["\u2026", "..."], // … ellipsis
    // Misc symbols
    ["\u00B0", "deg"], // °
    ["\u00D7", "x"], // ×
    ["\u00F7", "/"], // ÷
    ["\u2022", "*"], // •
    // Spaces
    ["\u00A0", " "], // non-breaking space
    ["\u2007", " "], // figure space
    ["\u202F", " "], // narrow no-break space
    ["\u2009", " "], // thin space
    ["\u2002", " "], ["\u2003", " "], ["\u2004", " "], ["\u2005", " "],
    ["\u2006", " "], ["\u2008", " "], ["\u200A", " "], ["\u205F", " "],
    ["\u3000", " "],
    // Invisible characters are dropped
    ["\u200B", ""], // zero-width space
    ["\u200C", ""], ["\u200D", ""], ["\uFEFF", ""], // zero-width non-joiner/joiner & BOM
    // Fractions (basic)
    ["\u00BD", "1/2"], ["\u00BC", "1/4"], ["\u00BE", "3/4"],
    // NFKD expands vulgar fractions (½, ⅓, ⅞, …) to digit + U+2044 + digit;
    // without this entry the slash is non-ASCII and "½" would fold to "12".
    ["\u2044", "/"] // ⁄ fraction slash
  ]);
}
/**
 * Convert a value to a best-effort ASCII string.
 *
 * Steps:
 * 1) Apply the explicit ligature/letter and symbol maps to the raw text.
 *    This must happen before normalization: NFKD rewrites some mapped
 *    characters itself (™ → "TM", ½ → "1" + U+2044 + "2"), which would bypass
 *    `marksStyle` and the fraction mappings.
 * 2) Normalize to NFKD (compatibility decomposition).
 * 3) Remove combining marks (\p{M}).
 * 4) Apply the maps again for characters that only surface after
 *    decomposition (e.g. "ǽ" → "æ" + accent → "ae").
 * 5) Either drop every remaining non-ASCII character, or replace each one
 *    with `unknown`.
 *
 * @param {string} input Text to fold. `null`/`undefined` yield ""; other
 *   non-string values are converted with `String()`.
 * @param {AsciiFoldOptions} [options]
 * @returns {string}
 */
export function toASCII(input, options = {}) {
  const {
    marksStyle = "plain",
    asciiOnly = true,
    unknown = "?"
  } = options ?? {};
  if (input == null) {
    return "";
  }

  const symbolsMap = makeSymbolsMap(marksStyle);
  // Array.from iterates by code point, so astral characters stay intact.
  const applyMaps = (text) =>
    Array.from(text, (ch) => ligaturesMap.get(ch) ?? symbolsMap.get(ch) ?? ch).join("");

  const folded = applyMaps(
    applyMaps(String(input))
      .normalize("NFKD")
      .replace(/\p{M}+/gu, "")
  );

  if (asciiOnly) {
    // Drop anything outside U+0000..U+007F
    return folded.replace(/[^\x00-\x7F]+/g, "");
  }
  // The `u` flag matches whole code points, so a character outside the BMP
  // (such as an emoji) produces one placeholder rather than two.
  return folded.replace(/[^\x00-\x7F]/gu, unknown);
}
/**
 * Convert a string into a slug-style ASCII string.
 *
 * The input is folded with `toASCII`, every run of characters other than
 * A-Z, a-z, 0-9 is replaced by `separator`, repeated separators are
 * collapsed, one leading and one trailing separator are trimmed, and the
 * requested case is applied.
 *
 * @param {string} input
 * @param {{
 *   separator?: string,
 *   caseStyle?: "lower" | "upper" | "none",
 *   strict?: boolean,
 *   toASCIIOptions?: AsciiFoldOptions
 * }} [options]
 *   - `separator` (default "-") is inserted literally; regex metacharacters
 *     and the empty string are allowed.
 *   - `caseStyle` (default "lower"): any value other than "upper" or "none"
 *     is treated as "lower".
 *   - `strict` (default false): as a final pass, removes every character that
 *     is neither A-Z, a-z, 0-9 nor one of the separator's characters.
 *   - `toASCIIOptions` is passed through to `toASCII`.
 * @returns {string}
 */
export function toSlug(input, options = {}) {
  const {
    separator = "-",
    caseStyle = "lower",
    strict = false,
    toASCIIOptions = {
      marksStyle: "plain",
      asciiOnly: true,
      unknown: "?"
    }
  } = options ?? {};

  const sep = String(separator);
  // Escape the separator so characters such as "." or "+" match literally
  // instead of being interpreted as regex syntax (or throwing SyntaxError).
  const escapedSep = sep.replace(/[.*+?^${}()|[\]\\\/-]/g, "\\$&");

  // Function replacers keep "$" sequences in the separator literal.
  let slug = toASCII(input, toASCIIOptions).replace(/[^A-Za-z0-9]+/g, () => sep);

  // An empty separator has nothing to collapse or trim, and an empty
  // quantified pattern would be invalid.
  if (sep !== "") {
    // Group the separator so the quantifier covers all of it, not just its
    // last character.
    const repeated = new RegExp(`(?:${escapedSep}){2,}`, "g");
    const edges = new RegExp(`^(?:${escapedSep})|(?:${escapedSep})$`, "g");
    slug = slug.replace(repeated, () => sep).replace(edges, "");
  }

  switch (caseStyle) {
    case "none":
      break; // Keep whatever case remains after toASCII()
    case "upper":
      slug = slug.toUpperCase();
      break;
    case "lower":
    default:
      slug = slug.toLowerCase();
      break;
  }

  if (strict) {
    slug = slug.replace(new RegExp(`[^A-Za-z0-9${escapedSep}]`, "g"), "");
  }
  return slug;
}