Initial commit.

+ Add anyascii function from its C implementation (`2023-08-01` version).
+ Add utf8ToAscii helper function.
+ Add unit tests.
This commit is contained in:
Madeorsk 2025-01-08 19:52:15 +01:00
commit 6da1ed2e4c
Signed by: Madeorsk
GPG key ID: 677E51CA765BB79F
7 changed files with 1486 additions and 0 deletions

7
.gitignore vendored Normal file
View file

@ -0,0 +1,7 @@
# IntelliJ IDEA
*.iml
.idea/
# Zig
.zig-cache/
zig-out/

37
README.md Normal file
View file

@ -0,0 +1,37 @@
# anyascii.zig
This repository lets you use the anyascii C implementation from Zig, and provides a helper function `utf8ToAscii` to easily convert any UTF-8 encoded string into an ASCII-only string.
Current version of anyascii: `2023-08-01`.
## How to use
### `anyascii`
```zig
const std = @import("std");
const anyascii = @import("anyascii").anyascii;
// A single UTF-8 codepoint to its ASCII equivalent.
const ascii = try anyascii(allocator, try std.unicode.utf8Decode("Φ"));
defer allocator.free(ascii);
std.debug.print("{s}", .{ascii}); // Output: "F".
```
### `utf8ToAscii`
```zig
const std = @import("std");
const utf8ToAscii = @import("anyascii").utf8ToAscii;
// A full string of UTF-8 characters to ASCII characters.
const ascii = try utf8ToAscii(allocator, "Blöße");
defer allocator.free(ascii);
std.debug.print("{s}", .{ascii}); // Output: "Blosse".
```
## What is anyascii?
Taken from the [official _anyascii_ description](https://github.com/anyascii/anyascii/tree/master#description).
AnyAscii provides ASCII-only replacement strings for practically all Unicode characters. Text is converted character-by-character without considering the context. The mappings for each script are based on popular existing romanization systems. Symbolic characters are converted based on their meaning or appearance. All ASCII characters in the input are left unchanged, every other character is replaced with printable ASCII characters.

1252
anyascii/anyascii.c Normal file

File diff suppressed because one or more lines are too long

34
anyascii/anyascii.h Normal file
View file

@ -0,0 +1,34 @@
/*
ISC License
Copyright (c) 2020-2023, Hunter WB <hunterwb.com>
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifndef ANYASCII_H
#define ANYASCII_H
#include <stddef.h>
#include <stdint.h>
/**
* Gets the ASCII transliteration of a Unicode code point
*
* @param utf32 A Unicode code point
* @param ascii A pointer for the result to be written to; not null-terminated,
*              so read exactly as many chars as the return value indicates
* @return The number of chars in *ascii
*
* NOTE(review): presumably *ascii points into storage owned by the library
* (it is const-qualified and no free function is declared here) - confirm
* against anyascii.c before relying on its lifetime.
*/
size_t anyascii(uint_least32_t utf32, const char **ascii);
#endif

37
build.zig Normal file
View file

@ -0,0 +1,37 @@
const std = @import("std");
// Declares the build graph for this package; nothing is compiled here, an
// external build runner executes the graph afterwards.
//
// Two things are registered:
//   * the `anyascii` module (Zig wrapper + vendored C implementation);
//   * a `test` step that builds and runs the unit tests of `src/lib.zig`.
pub fn build(b: *std.Build) void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});

    // Paths shared by the module and by the test executable.
    const root_source = b.path("src/lib.zig");
    const c_include_dir = b.path("anyascii");
    const c_source = b.path("anyascii/anyascii.c");

    // Module exposed to dependents under the name "anyascii".
    const module = b.addModule("anyascii", .{
        .root_source_file = root_source,
        .target = target,
        .optimize = optimize,
    });
    module.link_libc = true;
    module.addIncludePath(c_include_dir);
    module.addCSourceFile(.{ .file = c_source });

    // Test executable: same root file, same C source and include path.
    const unit_tests = b.addTest(.{
        .root_source_file = root_source,
        .target = target,
        .optimize = optimize,
    });
    unit_tests.linkLibC();
    unit_tests.addIncludePath(c_include_dir);
    unit_tests.addCSourceFile(.{ .file = c_source });

    // `zig build test` runs the compiled test executable.
    const run_unit_tests = b.addRunArtifact(unit_tests);
    b.step("test", "Run unit tests").dependOn(&run_unit_tests.step);
}

12
build.zig.zon Normal file
View file

@ -0,0 +1,12 @@
// Package manifest.
.{
    .name = "anyascii.zig",
    .version = "1.0.0",
    .dependencies = .{},
    // Files and directories that make up the package when it is fetched as a
    // dependency. Anything not listed here is excluded from the package.
    .paths = .{
        "build.zig",
        "build.zig.zon",
        "src",
        // Vendored C implementation (anyascii.c / anyascii.h) compiled by
        // build.zig; without it a dependent build cannot find the C sources.
        "anyascii",
        "README.md",
    },
}

107
src/lib.zig Normal file
View file

@ -0,0 +1,107 @@
const std = @import("std");
const c = @cImport({
@cInclude("anyascii.h");
});
/// Convert a Unicode codepoint to its ASCII equivalent.
///
/// The characters reported by the C `anyascii` function are copied into a
/// buffer allocated with `allocator`. The caller owns the returned slice and
/// must free it with the same allocator. The slice is empty when the C
/// function reports zero characters for `codepoint`.
///
/// Fails with `error.OutOfMemory` when the result buffer cannot be allocated.
pub fn anyascii(allocator: std.mem.Allocator, codepoint: u21) ![]const u8 {
    // Declared with the pointer type matching the C out-parameter
    // (`const char **`), so no `@ptrCast` is required and constness is kept.
    // Initialized to null so that it never holds an undefined value.
    var chars: [*c]const u8 = null;
    const count = c.anyascii(codepoint, &chars);

    // With a zero count the pointer may not have been written by the C side:
    // return an empty, allocator-owned slice without touching it.
    if (count == 0) return try allocator.alloc(u8, 0);

    // The C result is not null-terminated: copy exactly `count` bytes.
    return try allocator.dupe(u8, chars[0..count]);
}
/// Convert a given UTF-8 string to its ASCII equivalent using anyascii.
///
/// Every codepoint of `str` is transliterated independently and the results
/// are concatenated in order. The caller owns the returned slice and must free
/// it with `allocator`.
///
/// Fails with `error.InvalidUtf8` when `str` is not valid UTF-8 and with
/// `error.OutOfMemory` when the output buffer cannot be allocated or grown.
pub fn utf8ToAscii(allocator: std.mem.Allocator, str: []const u8) ![]const u8 {
    // Validates the whole input up front, then iterates codepoint by codepoint.
    var iterator = (try std.unicode.Utf8View.init(str)).iterator();

    // Output buffer. `| 15` guarantees an initial capacity of at least 15 bytes
    // even for very short inputs.
    var out = try std.ArrayList(u8).initCapacity(allocator, str.len | 15);
    // Only release the buffer on failure: on success its memory is handed to
    // the caller by `toOwnedSlice`.
    errdefer out.deinit();

    while (iterator.nextCodepoint()) |codepoint| {
        // Call the C function directly and append its result straight into the
        // output buffer, instead of allocating and freeing a temporary slice
        // for every single codepoint.
        var chars: [*c]const u8 = null;
        const count = c.anyascii(codepoint, &chars);
        // With a zero count the pointer may not have been written: skip it.
        if (count == 0) continue;
        try out.appendSlice(chars[0..count]);
    }

    // Return the built full ASCII equivalent.
    return try out.toOwnedSlice();
}
test anyascii {
    // Each case pairs the expected ASCII output with a single-codepoint input.
    const Case = struct { expected: []const u8, input: []const u8 };
    const cases = [_]Case{
        .{ .expected = "a", .input = "a" },
        .{ .expected = "o", .input = "ø" },
        .{ .expected = "e", .input = "ë" },
        .{ .expected = "s", .input = "ŝ" },
        .{ .expected = "F", .input = "Φ" },
        .{ .expected = ":crown:", .input = "👑" },
    };
    for (cases) |case| {
        try testAnyascii(case.expected, case.input);
    }
}
/// Check that the single codepoint encoded in `utf8str` is converted by
/// `anyascii` to exactly `expectedAscii`.
fn testAnyascii(expectedAscii: []const u8, utf8str: []const u8) !void {
    const allocator = std.testing.allocator;

    // Decode the UTF-8 bytes into the codepoint handed to `anyascii`.
    const codepoint = try std.unicode.utf8Decode(utf8str);

    const actual = try anyascii(allocator, codepoint);
    defer allocator.free(actual);

    try std.testing.expectEqualStrings(expectedAscii, actual);
}
test utf8ToAscii {
    // These examples are taken from anyascii examples, see https://github.com/anyascii/anyascii/tree/master#examples
    // Each call is `testUtf8ToAscii(expected ASCII, UTF-8 input)`.
    try testUtf8ToAscii("Rene Francois Lacote", "René François Lacôte");
    try testUtf8ToAscii("Blosse", "Blöße");
    try testUtf8ToAscii("Tran Hung Dao", "Trần Hưng Đạo");
    try testUtf8ToAscii("Naeroy", "Nærøy");
    try testUtf8ToAscii("Feidippidis", "Φειδιππίδης");
    try testUtf8ToAscii("Dimitris Fotopoylos", "Δημήτρης Φωτόπουλος");
    try testUtf8ToAscii("Boris Nikolaevich El'tsin", "Борис Николаевич Ельцин");
    try testUtf8ToAscii("Volodimir Gorbulin", "Володимир Горбулін");
    try testUtf8ToAscii("T'rgovishche", "Търговище");
    try testUtf8ToAscii("ShenZhen", "深圳");
    try testUtf8ToAscii("ShenShuiBu", "深水埗");
    try testUtf8ToAscii("HwaSeongSi", "화성시");
    try testUtf8ToAscii("HuaChengShi", "華城市");
    try testUtf8ToAscii("saitama", "さいたま");
    try testUtf8ToAscii("QiYuXian", "埼玉県");
    try testUtf8ToAscii("debre zeyt", "ደብረ ዘይት");
    try testUtf8ToAscii("dek'emhare", "ደቀምሓረ");
    try testUtf8ToAscii("dmnhwr", "دمنهور");
    try testUtf8ToAscii("Abovyan", "Աբովյան");
    try testUtf8ToAscii("samt'redia", "სამტრედია");
    try testUtf8ToAscii("'vrhm hlvy frnkl", "אברהם הלוי פרנקל");
    try testUtf8ToAscii("+say x ag", "⠠⠎⠁⠽⠀⠭⠀⠁⠛");
    try testUtf8ToAscii("mymnsimh", "ময়মনসিংহ");
    try testUtf8ToAscii("thntln", "ထန်တလန်");
    try testUtf8ToAscii("porbmdr", "પોરબંદર");
    try testUtf8ToAscii("mhasmumd", "महासमुंद");
    try testUtf8ToAscii("bemgluru", "ಬೆಂಗಳೂರು");
    try testUtf8ToAscii("siemrab", "សៀមរាប");
    try testUtf8ToAscii("sahvannaekhd", "ສະຫວັນນະເຂດ");
    try testUtf8ToAscii("klmsseri", "കളമശ്ശേരി");
    try testUtf8ToAscii("gjpti", "ଗଜପତି");
    try testUtf8ToAscii("jlmdhr", "ਜਲੰਧਰ");
    try testUtf8ToAscii("rtnpur", "රත්නපුර");
    try testUtf8ToAscii("knniyakumri", "கன்னியாகுமரி");
    try testUtf8ToAscii("srikakulm", "శ్రీకాకుళం");
    try testUtf8ToAscii("sngkhla", "สงขลา");
    try testUtf8ToAscii(":crown: :palm_tree:", "👑 🌴");
    try testUtf8ToAscii("* # + 5 X", "☆ ♯ ♰ ⚄ ⛌");
    // Input restored to the full upstream example: the numero sign and the
    // script capital M are what produce the leading "No M" of the expectation.
    try testUtf8ToAscii("No M & A/S", "№ ℳ ⅋ ⅍");
}
/// Check that `utf8ToAscii` converts the UTF-8 string `utf8str` to exactly
/// `expectedAscii`.
fn testUtf8ToAscii(expectedAscii: []const u8, utf8str: []const u8) !void {
    const allocator = std.testing.allocator;

    const actual = try utf8ToAscii(allocator, utf8str);
    defer allocator.free(actual);

    try std.testing.expectEqualStrings(expectedAscii, actual);
}