Add ASCII fold snippet
This commit is contained in:
86
javascript/ascii-fold/README.md
Normal file
86
javascript/ascii-fold/README.md
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
# ascii-fold (JavaScript)
|
||||||
|
|
||||||
|
Best-effort ASCII folding and slug generation utilities extracted from a reusable snippet. They focus on practical, predictable results:
|
||||||
|
|
||||||
|
- Removes diacritics using Unicode NFKD normalization and strips combining marks
|
||||||
|
- Handles common ligatures and special Latin letters (Æ/æ → AE/ae, ß → ss, etc.)
|
||||||
|
- Maps typographic quotes, dashes, ellipsis, spaces, and a few symbols to sensible ASCII
|
||||||
|
- Optionally drops any remaining non-ASCII characters (the default) or replaces each of them with a placeholder
|
||||||
|
- Includes a small `toSlug` helper built on top of `toASCII`
|
||||||
|
|
||||||
|
## Quick usage
|
||||||
|
|
||||||
|
Minimal examples showing what the functions do. Adjust to your environment as needed.
|
||||||
|
|
||||||
|
```js
|
||||||
|
// Assume you have the functions available in scope
|
||||||
|
|
||||||
|
const input = "Café™ — 50 °C";
|
||||||
|
|
||||||
|
// Basic ASCII folding (default: asciiOnly=true, marksStyle="plain")
|
||||||
|
const ascii = toASCII(input);
|
||||||
|
// => "Cafetm - 50 degC"
|
||||||
|
|
||||||
|
// Replace leftover non-ASCII characters with a placeholder instead of dropping them
|
||||||
|
const kept = toASCII("Emoji: 😀", { asciiOnly: false, unknown: "?" });
|
||||||
|
// => "Emoji: ?"
|
||||||
|
|
||||||
|
// Slugify
|
||||||
|
const slug = toSlug("Hello, World! © 2025");
|
||||||
|
// => "hello-world-c-2025"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Functions
|
||||||
|
|
||||||
|
### `toASCII(input, options)`
|
||||||
|
Converts a string to a best-effort ASCII equivalent:
|
||||||
|
1) NFKD normalize, 2) strip combining marks, 3) map ligatures/letters and symbols, 4) optionally enforce ASCII-only.
|
||||||
|
|
||||||
|
Options (`AsciiFoldOptions`):
|
||||||
|
- `marksStyle`: `"plain" | "paren"` (default `"plain"`)
|
||||||
|
- `"plain"`: © ® ™ ℠ → `c r tm sm`
|
||||||
|
- `"paren"`: © ® ™ ℠ → `(c) (r) (tm) (sm)`
|
||||||
|
- `asciiOnly`: `boolean` (default `true`)
|
||||||
|
- When `true`, removes any remaining non-ASCII after mapping
|
||||||
|
- When `false`, replaces each remaining non-ASCII character with `unknown` instead of dropping it
|
||||||
|
- `unknown`: `string` (default `"?"`)
|
||||||
|
- Placeholder for non-ASCII characters that remain when `asciiOnly=false`
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
```js
|
||||||
|
toASCII("Äffin – ½ kg", { marksStyle: "paren" });
|
||||||
|
// => "Affin - 1/2 kg"
|
||||||
|
|
||||||
|
toASCII("naïve façade", {});
|
||||||
|
// => "naive facade"
|
||||||
|
```
|
||||||
|
|
||||||
|
### `toSlug(input, options)`
|
||||||
|
Builds on `toASCII` and normalizes to a URL-friendly slug.
|
||||||
|
|
||||||
|
Options:
|
||||||
|
- `separator`: string (default `"-"`)
|
||||||
|
- `caseStyle`: `"lower" | "upper" | "none"` (default `"lower"`)
|
||||||
|
- `strict`: `boolean` (default `false`)
|
||||||
|
- When `true`, removes everything except `A–Z a–z 0–9` and the chosen separator
|
||||||
|
- `toASCIIOptions`: `AsciiFoldOptions` (passed to `toASCII` first)
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
```js
|
||||||
|
toSlug("Crème brûlée — ©", { separator: "-" });
|
||||||
|
// => "creme-brulee-c"
|
||||||
|
|
||||||
|
toSlug("Über cool", { caseStyle: "upper", separator: "_" });
|
||||||
|
// => "UBER_COOL"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes on behavior
|
||||||
|
|
||||||
|
- Ligatures and special letters handled explicitly: ff/fi/fl/ffi/ffl, Æ/æ, Œ/œ, ß, Þ/þ, Ð/ð, Ł/ł, Ø/ø, Đ/đ
|
||||||
|
- Typographic punctuation mapped to ASCII: curly quotes → straight quotes, en/em dashes → `-`, ellipsis → `...`, non-breaking and thin/figure spaces → normal space
|
||||||
|
- Some miscellaneous symbols mapped: `° → deg`, `× → x`, `÷ → /`, `• → *`, simple fraction glyphs like `½ ¼ ¾`
|
||||||
|
- Zero-width marks (ZWNJ/ZWJ/BOM) are removed
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
See the repository-level `LICENSE` file.
|
||||||
160
javascript/ascii-fold/ascii-fold.js
Normal file
160
javascript/ascii-fold/ascii-fold.js
Normal file
@ -0,0 +1,160 @@
|
|||||||
|
/**
|
||||||
|
* @typedef {Object} AsciiFoldOptions
|
||||||
|
* @property {"plain"|"paren"} [marksStyle="plain"] How to render © ® ™ ℠ (plain: "c", "r", "tm", "sm"; paren: "(c)", "(r)", "(tm)", "(sm)")
|
||||||
|
* @property {boolean} [asciiOnly=true] If true, removes any remaining non-ASCII chars after mapping
|
||||||
|
* @property {string} [unknown="?"] Replacement for still-unknown non-ASCII when asciiOnly=false
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Explicit replacements for ligatures and special Latin letters.
 *
 * Keys are single code points; values are the ASCII text substituted for
 * them. The table exists because Unicode normalization alone does not fold
 * these characters the way an ASCII reader expects.
 */
const ligaturesMap = new Map([
  // Common typographic ligatures
  ["\uFB00", "ff"], ["\uFB01", "fi"], ["\uFB02", "fl"],
  ["\uFB03", "ffi"], ["\uFB04", "ffl"],
  ["\uFB05", "st"], ["\uFB06", "st"],

  // Latin letters that don't decompose via NFKD the way we want
  ["Æ", "AE"], ["æ", "ae"],
  ["Œ", "OE"], ["œ", "oe"],
  ["ß", "ss"], ["\u1E9E", "SS"], // ß and its capital form ẞ
  ["Þ", "Th"], ["þ", "th"],
  ["Ð", "D"], ["ð", "d"],
  ["Ł", "L"], ["ł", "l"],
  ["Ø", "O"], ["ø", "o"],
  ["Đ", "D"], ["đ", "d"],
  ["\u0126", "H"], ["\u0127", "h"], // Ħ / ħ
  ["\u0166", "T"], ["\u0167", "t"], // Ŧ / ŧ
  ["\u0131", "i"], // ı (dotless i)
]);
|
||||||
|
|
||||||
|
/**
 * Build the lookup table of symbols, punctuation, and marks with sensible
 * ASCII fallbacks.
 *
 * @param {"plain"|"paren"} [marksStyle="plain"] How to render © ® ™ ℠:
 *   "paren" yields "(c)", "(r)", "(tm)", "(sm)"; any other value yields the
 *   bare letters "c", "r", "tm", "sm".
 * @returns {Map<string, string>} A freshly built Map (one per call) from a
 *   single character to its ASCII replacement; "" means "delete".
 */
function makeSymbolsMap(marksStyle = "plain") {
  const useParens = marksStyle === "paren";
  const mark = (letters) => (useParens ? `(${letters})` : letters);

  return new Map([
    // Legal & marks
    ["\u00A9", mark("c")],  // ©
    ["\u00AE", mark("r")],  // ®
    ["\u2122", mark("tm")], // ™
    ["\u2120", mark("sm")], // ℠

    // Quotes → straight ASCII
    ["\u2018", "'"],
    ["\u2019", "'"],
    ["\u201A", "'"],
    ["\u201B", "'"],
    ["\u201C", "\""],
    ["\u201D", "\""],
    ["\u201E", "\""],
    ["\u201F", "\""],

    // Hyphens, dashes & ellipsis
    ["\u2010", "-"],   // ‐ hyphen
    ["\u2011", "-"],   // ‑ non-breaking hyphen
    ["\u2012", "-"],   // ‒ figure dash
    ["\u2013", "-"],   // – en dash
    ["\u2014", "-"],   // — em dash
    ["\u2015", "-"],   // ― horizontal bar
    ["\u2212", "-"],   // − minus
    ["\u2026", "..."], // … ellipsis

    // Misc symbols
    ["\u00B0", "deg"], // °
    ["\u00D7", "x"],   // ×
    ["\u00F7", "/"],   // ÷
    ["\u2022", "*"],   // •

    // Spaces → plain space
    ["\u00A0", " "], // non-breaking space
    ["\u2007", " "], // figure space
    ["\u202F", " "], // narrow no-break space
    ["\u2009", " "], // thin space
    ["\u2002", " "], ["\u2003", " "], ["\u2004", " "], ["\u2005", " "],
    ["\u2006", " "], ["\u2008", " "], ["\u200A", " "], ["\u205F", " "],
    ["\u3000", " "],

    // Invisible characters → removed
    ["\u00AD", ""], // soft hyphen
    ["\u200B", ""], // zero-width space
    ["\u200C", ""], ["\u200D", ""], ["\uFEFF", ""], // zero-width non-joiner/joiner & BOM

    // Fractions (basic)
    ["\u00BD", "1/2"], ["\u00BC", "1/4"], ["\u00BE", "3/4"],
    // NFKD rewrites fraction glyphs as digit + U+2044 + digit, so the slash
    // itself needs an ASCII fallback or "½" degrades to "12".
    ["\u2044", "/"], // ⁄ fraction slash
  ]);
}
|
||||||
|
|
||||||
|
/**
 * Convert a string to a best-effort ASCII equivalent.
 *
 * 1) Apply the symbol map to the raw input
 * 2) Normalize to NFKD (compatibility decomposition) and remove combining marks \p{M}
 * 3) Apply the explicit ligature/letter map, then the symbol map once more
 * 4) Either drop or replace whatever is still non-ASCII
 *
 * The symbol map has to run before NFKD because normalization rewrites
 * several mapped characters into other text: "™" becomes "TM", "℠" becomes
 * "SM", and "½" becomes "1" + U+2044 + "2". Looked up only afterwards, those
 * entries would never match and `marksStyle` would have no effect on ™/℠.
 *
 * @param {string} input Text to fold. `null`/`undefined` yield ""; other
 *   non-string values are converted with `String()`.
 * @param {AsciiFoldOptions} [options]
 * @returns {string}
 */
export function toASCII(input, options = {}) {
  const {
    marksStyle = "plain",
    asciiOnly = true,
    unknown = "?"
  } = options ?? {};

  if (input == null) {
    return "";
  }

  const symbolsMap = makeSymbolsMap(marksStyle);
  // Array.from iterates by code point, so astral characters stay intact.
  const foldSymbols = (text) =>
    Array.from(text, (ch) => symbolsMap.get(ch) ?? ch).join("");

  // Step 1: symbols & punctuation, while they are still single code points
  let output = foldSymbols(String(input));

  // Step 2: normalize + strip diacritical combining marks
  output = output.normalize("NFKD").replace(/\p{M}+/gu, "");

  // Step 3a: known ligatures/letters that NFKD doesn't map as desired
  output = Array.from(output, (ch) => ligaturesMap.get(ch) ?? ch).join("");

  // Step 3b: symbols that only appear after decomposition (e.g. a small em
  // dash decomposing to an em dash), plus the fraction slash NFKD emits for
  // fraction glyphs such as "⅓".
  output = foldSymbols(output).replace(/\u2044/g, "/");

  // Step 4: drop, or flag, anything outside U+0000..U+007F
  if (asciiOnly) {
    return output.replace(/[^\x00-\x7F]+/g, "");
  }

  // The `u` flag makes one astral character (e.g. an emoji) count as ONE
  // match instead of two surrogate halves; the function replacer inserts
  // `unknown` literally, so "$&"-style sequences are not expanded.
  return output.replace(/[^\x00-\x7F]/gu, () => unknown);
}
|
||||||
|
|
||||||
|
/**
 * Convert a string into a slug-style ASCII string.
 *
 * The input is folded with `toASCII`, every run of non-alphanumeric
 * characters becomes one separator, leading/trailing separators are removed,
 * and finally the requested letter case is applied.
 *
 * @param {string} input
 * @param {{
 *   separator?: string,
 *   caseStyle?: "lower" | "upper" | "none",
 *   strict?: boolean,
 *   toASCIIOptions?: AsciiFoldOptions
 * }} [options]
 *   - `separator` (default "-") is always treated as literal text, never as
 *     a regular-expression fragment; "" joins the words directly.
 *   - `caseStyle` (default "lower"): "none" keeps the case left by
 *     `toASCII`; any unrecognised value lower-cases.
 *   - `strict` (default false): finally remove anything that is not
 *     A–Z, a–z, 0–9 or a separator character.
 *   - `toASCIIOptions` is passed to `toASCII` unchanged.
 * @returns {string}
 */
export function toSlug(input, options = {}) {
  const {
    separator = "-",
    caseStyle = "lower",
    strict = false,
    toASCIIOptions = {
      marksStyle: "plain",
      asciiOnly: true,
      unknown: "?"
    }
  } = options ?? {};

  const sep = String(separator);
  // Escape regex metacharacters so separators such as "." or "+" match
  // themselves instead of changing (or breaking) the patterns built below.
  const sepPattern = sep.replace(/[.*+?^${}()|[\]\\-]/g, "\\$&");

  // Function replacers insert `sep` verbatim ("$&" etc. are not expanded).
  let slug = toASCII(input, toASCIIOptions)
    .replace(/[^A-Za-z0-9]+/g, () => sep);

  if (sep !== "") {
    // Group the separator so the quantifier covers all of it, not just its
    // last character (matters for multi-character separators).
    slug = slug
      .replace(new RegExp(`(?:${sepPattern}){2,}`, "g"), () => sep)
      .replace(new RegExp(`^(?:${sepPattern})+|(?:${sepPattern})+$`, "g"), "");
  }

  switch (caseStyle) {
    case "none":
      break; // Keep whatever case remains after toASCII()
    case "upper":
      slug = slug.toUpperCase();
      break;
    case "lower":
    default:
      slug = slug.toLowerCase();
      break;
  }

  if (strict) {
    // At this point the slug is already alphanumerics + separator; this
    // pass only catches characters altered by the case conversion above.
    slug = slug.replace(new RegExp(`[^A-Za-z0-9${sepPattern}]`, "g"), "");
  }

  return slug;
}
|
||||||
Reference in New Issue
Block a user