anyascii.zig/src/lib.zig
Madeorsk eb85754ece
anyasciiWrite with zig writer to avoid unnecessary allocations.
+ Add `anyasciiWrite` which writes to a zig writer directly to avoid unnecessary allocations in some use cases.
+ Use `anyasciiWrite` in `utf8ToAscii` to avoid the duplicate allocations of the previous implementation.
2025-01-08 22:14:56 +01:00

120 lines
4.9 KiB
Zig
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

const std = @import("std");
const c = @cImport({
@cInclude("anyascii.h");
});
/// Convert a unicode codepoint to its ascii equivalent.
///
/// The bytes reported by the C `anyascii` function are copied into a buffer obtained from
/// `allocator`. The caller owns the returned slice and must free it with the same allocator.
/// The returned slice is empty when the C function reports zero bytes.
/// Fails only if the allocation fails.
pub fn anyascii(allocator: std.mem.Allocator, codepoint: u21) ![]const u8 {
    // Call C anyascii function: it receives the address of a pointer to fill in and
    // returns the number of bytes readable through that pointer.
    var asciiPtr: [*]u8 = undefined;
    const asciiLen = c.anyascii(codepoint, @ptrCast(&asciiPtr));

    // With zero bytes reported the pointer may still be undefined, so do not slice it;
    // hand back an empty slice obtained from the allocator instead.
    if (asciiLen == 0) return try allocator.alloc(u8, 0);

    // Copy the C bytes into memory owned by the caller.
    return try allocator.dupe(u8, asciiPtr[0..asciiLen]);
}
/// Convert a unicode codepoint to its ascii equivalent, in the provided writer.
///
/// No memory is allocated here: the bytes reported by the C `anyascii` function are
/// forwarded to `writer` as they are. Nothing is written when the C function reports
/// zero bytes. Any error returned by `writer` is propagated.
pub fn anyasciiWrite(writer: std.io.AnyWriter, codepoint: u21) !void {
    // Call C anyascii function: it receives the address of a pointer to fill in and
    // returns the number of bytes readable through that pointer.
    var asciiPtr: [*]u8 = undefined;
    const asciiLen = c.anyascii(codepoint, @ptrCast(&asciiPtr));

    // With zero bytes reported the pointer may still be undefined, so do not slice it.
    if (asciiLen == 0) return;

    // Write the whole sequence in one call rather than one type-erased call per byte.
    try writer.writeAll(asciiPtr[0..asciiLen]);
}
/// Convert a given UTF-8 string to its ASCII equivalent using anyascii.
///
/// Every codepoint of `str` is passed through `anyasciiWrite` and the produced bytes are
/// accumulated in a single growable buffer. The caller owns the returned slice and must
/// free it with `allocator`.
/// Fails if `str` is rejected by `std.unicode.Utf8View.init` or if an allocation fails.
pub fn utf8ToAscii(allocator: std.mem.Allocator, str: []const u8) ![]const u8 {
    // Validate the input once, then walk it codepoint by codepoint.
    const view = try std.unicode.Utf8View.init(str);
    var codepoints = view.iterator();

    // Output buffer, pre-sized from the input length (OR-ing with 15 keeps it at least 15).
    var asciiBuffer = try std.ArrayList(u8).initCapacity(allocator, str.len | 15);
    defer asciiBuffer.deinit();

    // Type-erased writer appending to the buffer.
    const bufferWriter = asciiBuffer.writer();
    const sink = bufferWriter.any();

    // Append the ascii equivalent of each codepoint, in order.
    while (codepoints.nextCodepoint()) |codepoint| {
        try anyasciiWrite(sink, codepoint);
    }

    // Hand ownership of the accumulated bytes to the caller.
    return asciiBuffer.toOwnedSlice();
}
test anyascii {
    // Each case pairs the expected ASCII output with the single character it comes from.
    const cases = [_]struct { expected: []const u8, input: []const u8 }{
        .{ .expected = "a", .input = "a" },
        .{ .expected = "o", .input = "ø" },
        .{ .expected = "e", .input = "ë" },
        .{ .expected = "s", .input = "ŝ" },
        .{ .expected = "F", .input = "Φ" },
        .{ .expected = ":crown:", .input = "👑" },
    };

    // Run the cases in order, stopping at the first failure.
    for (cases) |case| {
        try testAnyascii(case.expected, case.input);
    }
}
/// Check that the single UTF-8 encoded character in `utf8str` converts to `expectedAscii`.
///
/// `utf8str` is decoded to one codepoint with `std.unicode.utf8Decode`, converted with
/// `anyascii` using the testing allocator, and compared to the expected string.
fn testAnyascii(expectedAscii: []const u8, utf8str: []const u8) !void {
    const testAllocator = std.testing.allocator;

    // Decode the character first, then convert the resulting codepoint.
    const codepoint = try std.unicode.utf8Decode(utf8str);
    const converted = try anyascii(testAllocator, codepoint);
    defer testAllocator.free(converted);

    try std.testing.expectEqualStrings(expectedAscii, converted);
}
test utf8ToAscii {
    // These examples are taken from anyascii examples, see https://github.com/anyascii/anyascii/tree/master#examples
    // Each call passes the expected ASCII output first, then the UTF-8 input.
    try testUtf8ToAscii("Rene Francois Lacote", "René François Lacôte");
    try testUtf8ToAscii("Blosse", "Blöße");
    try testUtf8ToAscii("Tran Hung Dao", "Trần Hưng Đạo");
    try testUtf8ToAscii("Naeroy", "Nærøy");
    try testUtf8ToAscii("Feidippidis", "Φειδιππίδης");
    try testUtf8ToAscii("Dimitris Fotopoylos", "Δημήτρης Φωτόπουλος");
    try testUtf8ToAscii("Boris Nikolaevich El'tsin", "Борис Николаевич Ельцин");
    try testUtf8ToAscii("Volodimir Gorbulin", "Володимир Горбулін");
    try testUtf8ToAscii("T'rgovishche", "Търговище");
    try testUtf8ToAscii("ShenZhen", "深圳");
    try testUtf8ToAscii("ShenShuiBu", "深水埗");
    try testUtf8ToAscii("HwaSeongSi", "화성시");
    try testUtf8ToAscii("HuaChengShi", "華城市");
    try testUtf8ToAscii("saitama", "さいたま");
    try testUtf8ToAscii("QiYuXian", "埼玉県");
    try testUtf8ToAscii("debre zeyt", "ደብረ ዘይት");
    try testUtf8ToAscii("dek'emhare", "ደቀምሓረ");
    try testUtf8ToAscii("dmnhwr", "دمنهور");
    try testUtf8ToAscii("Abovyan", "Աբովյան");
    try testUtf8ToAscii("samt'redia", "სამტრედია");
    try testUtf8ToAscii("'vrhm hlvy frnkl", "אברהם הלוי פרנקל");
    try testUtf8ToAscii("+say x ag", "⠠⠎⠁⠽⠀⠭⠀⠁⠛");
    try testUtf8ToAscii("mymnsimh", "ময়মনসিংহ");
    try testUtf8ToAscii("thntln", "ထန်တလန်");
    try testUtf8ToAscii("porbmdr", "પોરબંદર");
    try testUtf8ToAscii("mhasmumd", "महासमुंद");
    try testUtf8ToAscii("bemgluru", "ಬೆಂಗಳೂರು");
    try testUtf8ToAscii("siemrab", "សៀមរាប");
    try testUtf8ToAscii("sahvannaekhd", "ສະຫວັນນະເຂດ");
    try testUtf8ToAscii("klmsseri", "കളമശ്ശേരി");
    try testUtf8ToAscii("gjpti", "ଗଜପତି");
    try testUtf8ToAscii("jlmdhr", "ਜਲੰਧਰ");
    try testUtf8ToAscii("rtnpur", "රත්නපුර");
    try testUtf8ToAscii("knniyakumri", "கன்னியாகுமரி");
    try testUtf8ToAscii("srikakulm", "శ్రీకాకుళం");
    try testUtf8ToAscii("sngkhla", "สงขลา");
    try testUtf8ToAscii(":crown: :palm_tree:", "👑 🌴");
    try testUtf8ToAscii("* # + 5 X", "☆ ♯ ♰ ⚄ ⛌");
    // The input must start with "№ ℳ" for the expected "No M" prefix to be produced.
    try testUtf8ToAscii("No M & A/S", "№ ℳ ⅋ ⅍");
}
/// Check that the UTF-8 string `utf8str` converts to `expectedAscii`.
///
/// The conversion goes through `utf8ToAscii` with the testing allocator; the result is
/// freed once the comparison is done.
fn testUtf8ToAscii(expectedAscii: []const u8, utf8str: []const u8) !void {
    const testAllocator = std.testing.allocator;

    const converted = try utf8ToAscii(testAllocator, utf8str);
    defer testAllocator.free(converted);

    try std.testing.expectEqualStrings(expectedAscii, converted);
}