Initial commit.

+ Add anyascii function from its C implementation (`2023-08-01` version). + Add utf8ToAscii helper function. + Add unit tests.
2025-01-08 19:52:15 +01:00 · 2025-01-08 19:52:15 +01:00 · 6da1ed2e4c
commit 6da1ed2e4c
7 changed files with 1486 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,7 @@
+# IntelliJ IDEA
+*.iml
+.idea/
+
+# Zig
+.zig-cache/
+zig-out/
--- a/README.md
+++ b/README.md
@ -0,0 +1,37 @@
+# anyascii.zig
+
+This repository lets you use the anyascii C implementation from Zig, and provides a helper function, `utf8ToAscii`, to easily convert any UTF-8 encoded string into an ASCII-only string.
+
+Current version of anyascii: `2023-08-01`.
+
+## How to use
+
+### `anyascii`
+
+```zig
+const std = @import("std");
+const anyascii = @import("anyascii").anyascii;
+
+// A single UTF-8 codepoint to its ASCII equivalent.
+const ascii = try anyascii(allocator, try std.unicode.utf8Decode("Φ"));
+defer allocator.free(ascii);
+std.debug.print("{s}", .{ascii}); // Output: "F".
+```
+
+### `utf8ToAscii`
+
+```zig
+const std = @import("std");
+const utf8ToAscii = @import("anyascii").utf8ToAscii;
+
+// A full string of UTF-8 characters to ASCII characters.
+const ascii = try utf8ToAscii(allocator, "Blöße");
+defer allocator.free(ascii);
+std.debug.print("{s}", .{ascii}); // Output: "Blosse".
+```
+
+## What is anyascii?
+
+Taken from the [official _anyascii_ description](https://github.com/anyascii/anyascii/tree/master#description).
+
+AnyAscii provides ASCII-only replacement strings for practically all Unicode characters. Text is converted character-by-character without considering the context. The mappings for each script are based on popular existing romanization systems. Symbolic characters are converted based on their meaning or appearance. All ASCII characters in the input are left unchanged, every other character is replaced with printable ASCII characters.
--- a/anyascii/anyascii.c
+++ b/anyascii/anyascii.c
--- a/anyascii/anyascii.h
+++ b/anyascii/anyascii.h
@ -0,0 +1,34 @@
+/*
+ISC License
+
+Copyright (c) 2020-2023, Hunter WB <hunterwb.com>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#ifndef ANYASCII_H
+#define ANYASCII_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/**
+ * Gets the ASCII transliteration of a Unicode code point
+ * 
+ * @param utf32 A Unicode code point
+ * @param ascii A pointer for the result to be written to; not null-terminated
+ * @return The number of chars in *ascii
+ */
+size_t anyascii(uint_least32_t utf32, const char **ascii);
+
+#endif
--- a/build.zig
+++ b/build.zig
@ -0,0 +1,37 @@
+const std = @import("std");
+
+// Declares the build graph: the `anyascii` module exposed to dependents, and a
+// `test` step that runs the unit tests of `src/lib.zig`. Both are configured
+// with libc, the `anyascii` include directory and the bundled C source.
+pub fn build(b: *std.Build) void {
+	const target = b.standardTargetOptions(.{});
+	const optimize = b.standardOptimizeOption(.{});
+
+	// Paths shared by the module and by its unit tests.
+	const rootSource = b.path("src/lib.zig");
+	const cIncludeDir = b.path("anyascii");
+	const cSource = b.path("anyascii/anyascii.c");
+
+	// Module made available under the name "anyascii".
+	const module = b.addModule("anyascii", .{
+		.root_source_file = rootSource,
+		.target = target,
+		.optimize = optimize,
+	});
+	module.link_libc = true;
+	module.addIncludePath(cIncludeDir);
+	module.addCSourceFile(.{ .file = cSource });
+
+	// Test executable built from the same root file, with the same C setup.
+	const unitTests = b.addTest(.{
+		.root_source_file = rootSource,
+		.target = target,
+		.optimize = optimize,
+	});
+	unitTests.linkLibC();
+	unitTests.addIncludePath(cIncludeDir);
+	unitTests.addCSourceFile(.{ .file = cSource });
+
+	// `zig build test` runs the compiled test executable.
+	const runUnitTests = b.addRunArtifact(unitTests);
+	const testStep = b.step("test", "Run unit tests");
+	testStep.dependOn(&runUnitTests.step);
+}
--- a/build.zig.zon
+++ b/build.zig.zon
@ -0,0 +1,12 @@
+.{
+	.name = "anyascii.zig",
+	.version = "1.0.0",
+	.dependencies = .{},
+
+	// Files and directories that make up the package. `build.zig` compiles
+	// `anyascii/anyascii.c` and adds `anyascii` as an include path, so that
+	// directory has to be listed here as well.
+	.paths = .{
+		"build.zig",
+		"build.zig.zon",
+		"src",
+		"anyascii",
+		"README.md",
+	},
+}
--- a/src/lib.zig
+++ b/src/lib.zig
@ -0,0 +1,107 @@
+const std = @import("std");
+
+const c = @cImport({
+	@cInclude("anyascii.h");
+});
+
+/// Convert a Unicode code point to its ASCII equivalent.
+///
+/// Calls the C `anyascii` function and copies the bytes it reports into memory
+/// obtained from `allocator`. The caller owns the returned slice and must
+/// release it with `allocator.free`. The slice is empty when the C function
+/// reports a length of zero.
+///
+/// Fails only when the allocation fails.
+pub fn anyascii(allocator: std.mem.Allocator, codepoint: u21) ![]const u8 {
+	// `anyascii.h` declares the out-parameter as `const char **`, so a const C
+	// pointer matches it directly and no `@ptrCast` is needed. Starting from
+	// null avoids holding an undefined pointer should the C function not
+	// write to it.
+	var asciiPtr: [*c]const u8 = null;
+	const asciiLen = c.anyascii(codepoint, &asciiPtr);
+
+	// Nothing to copy: return an empty slice without touching the pointer.
+	if (asciiLen == 0) return try allocator.alloc(u8, 0);
+
+	// The C result is not null-terminated, so it is sliced by the reported
+	// length before being copied into caller-owned memory.
+	return try allocator.dupe(u8, asciiPtr[0..asciiLen]);
+}
+
+/// Convert a given UTF-8 string to its ASCII equivalent using anyascii.
+///
+/// Every code point of `str` is converted on its own and the results are
+/// concatenated in order. The caller owns the returned slice and must release
+/// it with `allocator.free`.
+///
+/// Fails with the error of `std.unicode.Utf8View.init` when `str` is rejected
+/// by it, or with an allocation error.
+pub fn utf8ToAscii(allocator: std.mem.Allocator, str: []const u8) ![]const u8 {
+	// Validate the input once, then walk it code point by code point.
+	const view = try std.unicode.Utf8View.init(str);
+	var iterator = view.iterator();
+
+	// OR-ing the length with 15 reserves at least 15 bytes and never less than
+	// the input length.
+	var outStr = try std.ArrayList(u8).initCapacity(allocator, str.len | 15);
+	// Only release the buffer on failure; on success it is handed to the caller.
+	errdefer outStr.deinit();
+
+	while (iterator.nextCodepoint()) |codepoint| {
+		// Append straight from the pointer returned by the C function, so no
+		// temporary slice is allocated and freed for each code point.
+		var asciiPtr: [*c]const u8 = null;
+		const asciiLen = c.anyascii(codepoint, &asciiPtr);
+		if (asciiLen > 0) try outStr.appendSlice(asciiPtr[0..asciiLen]);
+	}
+
+	// Return the built full ASCII equivalent.
+	return try outStr.toOwnedSlice();
+}
+
+test anyascii {
+	// Each entry pairs the expected ASCII output with a single UTF-8 encoded character.
+	const cases = [_]struct { expected: []const u8, input: []const u8 }{
+		.{ .expected = "a", .input = "a" },
+		.{ .expected = "o", .input = "ø" },
+		.{ .expected = "e", .input = "ë" },
+		.{ .expected = "s", .input = "ŝ" },
+		.{ .expected = "F", .input = "Φ" },
+		.{ .expected = ":crown:", .input = "👑" },
+	};
+
+	for (cases) |case| {
+		try testAnyascii(case.expected, case.input);
+	}
+}
+
+/// Decode the single character held in `utf8str`, convert it with `anyascii`
+/// and check that the result equals `expectedAscii`.
+fn testAnyascii(expectedAscii: []const u8, utf8str: []const u8) !void {
+	const testAllocator = std.testing.allocator;
+	const codepoint = try std.unicode.utf8Decode(utf8str);
+
+	const actual = try anyascii(testAllocator, codepoint);
+	defer testAllocator.free(actual);
+
+	try std.testing.expectEqualStrings(expectedAscii, actual);
+}
+
+test utf8ToAscii {
+	// Checks whole-string conversion: every call passes the expected ASCII output
+	// first and the UTF-8 input second.
+	// These examples are taken from anyascii examples, see https://github.com/anyascii/anyascii/tree/master#examples
+
+	try testUtf8ToAscii("Rene Francois Lacote", "René François Lacôte");
+	try testUtf8ToAscii("Blosse", "Blöße");
+	try testUtf8ToAscii("Tran Hung Dao", "Trần Hưng Đạo");
+	try testUtf8ToAscii("Naeroy", "Nærøy");
+	try testUtf8ToAscii("Feidippidis", "Φειδιππίδης");
+	try testUtf8ToAscii("Dimitris Fotopoylos", "Δημήτρης Φωτόπουλος");
+	try testUtf8ToAscii("Boris Nikolaevich El'tsin", "Борис Николаевич Ельцин");
+	try testUtf8ToAscii("Volodimir Gorbulin", "Володимир Горбулін");
+	try testUtf8ToAscii("T'rgovishche", "Търговище");
+	try testUtf8ToAscii("ShenZhen", "深圳");
+	try testUtf8ToAscii("ShenShuiBu", "深水埗");
+	try testUtf8ToAscii("HwaSeongSi", "화성시");
+	try testUtf8ToAscii("HuaChengShi", "華城市");
+	try testUtf8ToAscii("saitama", "さいたま");
+	try testUtf8ToAscii("QiYuXian", "埼玉県");
+	try testUtf8ToAscii("debre zeyt", "ደብረ ዘይት");
+	try testUtf8ToAscii("dek'emhare", "ደቀምሓረ");
+	try testUtf8ToAscii("dmnhwr", "دمنهور");
+	try testUtf8ToAscii("Abovyan", "Աբովյան");
+	try testUtf8ToAscii("samt'redia", "სამტრედია");
+	try testUtf8ToAscii("'vrhm hlvy frnkl", "אברהם הלוי פרנקל");
+	try testUtf8ToAscii("+say x ag", "⠠⠎⠁⠽⠀⠭⠀⠁⠛");
+	try testUtf8ToAscii("mymnsimh", "ময়মনসিংহ");
+	try testUtf8ToAscii("thntln", "ထန်တလန်");
+	try testUtf8ToAscii("porbmdr", "પોરબંદર");
+	try testUtf8ToAscii("mhasmumd", "महासमुंद");
+	try testUtf8ToAscii("bemgluru", "ಬೆಂಗಳೂರು");
+	try testUtf8ToAscii("siemrab", "សៀមរាប");
+	try testUtf8ToAscii("sahvannaekhd", "ສະຫວັນນະເຂດ");
+	try testUtf8ToAscii("klmsseri", "കളമശ്ശേരി");
+	try testUtf8ToAscii("gjpti", "ଗଜପତି");
+	try testUtf8ToAscii("jlmdhr", "ਜਲੰਧਰ");
+	try testUtf8ToAscii("rtnpur", "රත්නපුර");
+	try testUtf8ToAscii("knniyakumri", "கன்னியாகுமரி");
+	try testUtf8ToAscii("srikakulm", "శ్రీకాకుళం");
+	try testUtf8ToAscii("sngkhla", "สงขลา");
+
+	// Emoji and other symbols.
+	try testUtf8ToAscii(":crown: :palm_tree:", "👑 🌴");
+	try testUtf8ToAscii("* # + 5 X", "☆ ♯ ♰ ⚄ ⛌");
+	try testUtf8ToAscii("No M & A/S", "№ ℳ ⅋ ⅍");
+}
+
+/// Convert `utf8str` with `utf8ToAscii` and check that the result equals
+/// `expectedAscii`.
+fn testUtf8ToAscii(expectedAscii: []const u8, utf8str: []const u8) !void {
+	const testAllocator = std.testing.allocator;
+
+	const actual = try utf8ToAscii(testAllocator, utf8str);
+	defer testAllocator.free(actual);
+
+	try std.testing.expectEqualStrings(expectedAscii, actual);
+}