flow/src/buffer/unicode.zig
CJ van den Berg ce61c1765a
refactor: use changes_when_lowercased consistently in toggle_case
switch_case already uses changes_when_lowercased to detect uppercase
characters. toggle_case was using is_lowercase with inverted branch
order, which is functionally equivalent but inconsistent.
2026-04-13 22:21:34 +02:00

271 lines
8.2 KiB
Zig
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/// Maps a byte to a static, NUL-terminated display string.
///
/// Explicit arms exist for 0x00 through 0x20 and for 0x7F; every other value
/// takes the `else` arm.
///
/// NOTE(review): every string literal below reads as empty in this copy of the
/// file. The hosting page warns that the file contains ambiguous Unicode
/// characters, so the literals presumably hold glyphs that were lost in
/// extraction. Confirm against the repository before editing any arm.
pub fn control_code_to_unicode(code: u8) [:0]const u8 {
return switch (code) {
'\x00' => "",
'\x01' => "",
'\x02' => "",
'\x03' => "",
'\x04' => "",
'\x05' => "",
'\x06' => "",
'\x07' => "",
'\x08' => "",
'\x09' => "",
'\x0A' => "",
'\x0B' => "",
'\x0C' => "",
'\x0D' => "",
'\x0E' => "",
'\x0F' => "",
'\x10' => "",
'\x11' => "",
'\x12' => "",
'\x13' => "",
'\x14' => "",
'\x15' => "",
'\x16' => "",
'\x17' => "",
'\x18' => "",
'\x19' => "",
'\x1A' => "",
'\x1B' => "",
'\x1C' => "",
'\x1D' => "",
'\x1E' => "",
'\x1F' => "",
'\x20' => "",
'\x7F' => "",
// Any byte without an explicit arm above.
else => "",
};
}
/// Table of two-string tuples: the first element is the opening string and the
/// second is its matching closing string. Includes symmetric quote characters
/// (`"`, `'`, and the backtick) as well as bracket-style pairs.
///
/// NOTE(review): three entries read as `"", ""` in this copy of the file; they
/// presumably hold non-ASCII quote characters that were lost in extraction.
/// Confirm against the repository.
pub const char_pairs = [_]struct { []const u8, []const u8 }{
.{ "\"", "\"" },
.{ "'", "'" },
.{ "`", "`" },
.{ "(", ")" },
.{ "[", "]" },
.{ "{", "}" },
.{ "", "" },
.{ "", "" },
.{ "", "" },
.{ "«", "»" },
.{ "¿", "?" },
.{ "¡", "!" },
};
/// Table of two-string tuples whose opening and closing strings differ. Unlike
/// `char_pairs`, it has no entries for `"`, `'`, or the backtick.
///
/// NOTE(review): two entries read as `"", ""` in this copy of the file; they
/// presumably hold non-ASCII characters that were lost in extraction. Confirm
/// against the repository.
pub const open_close_pairs = [_]struct { []const u8, []const u8 }{
.{ "(", ")" },
.{ "[", "]" },
.{ "{", "}" },
.{ "", "" },
.{ "", "" },
.{ "«", "»" },
.{ "¿", "?" },
.{ "¡", "!" },
};
// Nine-entry string table, presumably the animation frames of a progress
// spinner (inferred from the name only; no use is visible in this file).
//
// NOTE(review): every entry reads as empty in this copy of the file; the
// literals presumably hold glyphs that were lost in extraction. Confirm
// against the repository.
const spinner = [_][]const u8{
"",
"",
"",
"",
"",
"",
"",
"",
"",
};
// Six-entry string table, presumably a shorter variant of the spinner frames
// (inferred from the name only; no use is visible in this file).
//
// NOTE(review): every entry reads as empty in this copy of the file; the
// literals presumably hold glyphs that were lost in extraction. Confirm
// against the repository.
const spinner_short = [_][]const u8{
"",
"",
"",
"",
"",
"",
};
/// Encodes the byte `cp` as the UTF-8 form of the code point with the same
/// numeric value (U+0000 through U+00FF).
///
/// The byte is widened to a single little-endian UTF-16 code unit and run
/// through `utf16LeToUtf8`. The encoded bytes are written to the start of
/// `buf`, and the returned slice aliases `buf`.
fn raw_byte_to_utf8(cp: u8, buf: []u8) ![]const u8 {
    // utf16LeToUtf8 reads little-endian code units, so fix the byte order
    // explicitly rather than relying on the host's.
    const unit = [1]u16{std.mem.nativeToLittle(u16, cp)};
    const len = try utf16LeToUtf8(buf, &unit);
    return buf[0..len];
}
/// Re-encodes `input` as UTF-8, treating every byte as the code point with the
/// same numeric value (U+0000 through U+00FF).
///
/// Each byte is converted on its own with `raw_byte_to_utf8`, so bytes at or
/// above 0x80 become two-byte sequences and the input never has to be valid
/// UTF-8.
///
/// Caller owns the returned slice and must free it with `allocator`. If an
/// error is returned, nothing is left allocated.
pub fn utf8_sanitize(allocator: std.mem.Allocator, input: []const u8) error{
    OutOfMemory,
    DanglingSurrogateHalf,
    ExpectedSecondSurrogateHalf,
    UnexpectedSecondSurrogateHalf,
}![]u8 {
    var output: std.ArrayListUnmanaged(u8) = .{};
    // Free the partially built buffer if a later step fails. On success the
    // memory is handed to the caller by toOwnedSlice and this does not run.
    errdefer output.deinit(allocator);

    // Scratch space for one encoded byte; a single UTF-16 code unit never
    // needs more than three UTF-8 bytes.
    var buf: [4]u8 = undefined;
    for (input) |byte| {
        const encoded = try raw_byte_to_utf8(byte, &buf);
        try output.appendSlice(allocator, encoded);
    }
    return output.toOwnedSlice(allocator);
}
/// Error set returned by the case-transformation functions in this file.
///
/// `OutOfMemory` covers allocation of the result, `WriteFailed` covers writes
/// to a `std.Io.Writer`, and the remaining two are the errors `utf8Encode`
/// can report when encoding a mapped code point.
pub const TransformError = error{
OutOfMemory,
Utf8CannotEncodeSurrogateHalf,
CodepointTooLarge,
WriteFailed,
};
/// Walks `text` code point by code point, maps each one through the uucode
/// property `field`, and writes the UTF-8 encoding of the result to `writer`.
///
/// `View` selects the iteration strategy: it must provide `initUnchecked` and
/// an `iterator()` whose result has `nextCodepoint()`. The finished iterator
/// is returned so callers can inspect where iteration stopped.
///
/// `field` must be `.simple_uppercase_mapping`, `.simple_lowercase_mapping` or
/// `.case_folding_simple`; anything else is a compile error. For the two
/// simple mappings a code point without a mapping is written unchanged.
fn utf8_write_transform_T(comptime View: anytype, comptime field: uucode.FieldEnum, writer: *std.Io.Writer, text: []const u8) TransformError!@typeInfo(@TypeOf(View.iterator)).@"fn".return_type.? {
    var iter = View.initUnchecked(text).iterator();
    while (iter.nextCodepoint()) |codepoint| {
        const mapped = switch (field) {
            .simple_uppercase_mapping, .simple_lowercase_mapping => uucode.get(field, codepoint) orelse codepoint,
            .case_folding_simple => uucode.get(field, codepoint),
            else => @compileError(@tagName(field) ++ " is not a unicode transformation"),
        };
        var encoded: [6]u8 = undefined;
        const len = try utf8Encode(mapped, &encoded);
        try writer.writeAll(encoded[0..len]);
    }
    return iter;
}
/// Writes `text` to `writer` with every code point mapped through `field`,
/// iterating with `Utf8View`. The iterator returned by
/// `utf8_write_transform_T` is discarded.
fn utf8_write_transform(comptime field: uucode.FieldEnum, writer: *std.Io.Writer, text: []const u8) TransformError!void {
_ = try utf8_write_transform_T(Utf8View, field, writer, text);
}
/// Like `utf8_write_transform`, but iterates with `Utf8PartialView`, whose
/// iterator stops early instead of asserting when it cannot take another
/// complete sequence.
///
/// Returns the prefix of `text` that was consumed, i.e. everything up to the
/// iterator's final `end` offset.
fn utf8_partial_write_transform(comptime field: uucode.FieldEnum, writer: *std.Io.Writer, text: []const u8) TransformError![]const u8 {
    const iter = try utf8_write_transform_T(Utf8PartialView, field, writer, text);
    const consumed = iter.end;
    return text[0..consumed];
}
/// Returns a newly allocated copy of `text` with every code point mapped
/// through `field` (see `utf8_write_transform`).
///
/// Caller owns the returned slice and frees it with `allocator`.
fn utf8_transform(comptime field: uucode.FieldEnum, allocator: std.mem.Allocator, text: []const u8) TransformError![]u8 {
    var sink = std.Io.Writer.Allocating.init(allocator);
    // Runs on every exit path, including after toOwnedSlice below.
    defer sink.deinit();

    try utf8_write_transform(field, &sink.writer, text);
    const owned = try sink.toOwnedSlice();
    return owned;
}
/// Returns true when the boolean uucode property `field` holds for every code
/// point in `text`. Empty text yields true.
///
/// `field` must be `.is_lowercase`, `.changes_when_casefolded` or
/// `.changes_when_lowercased`; anything else is a compile error.
fn utf8_predicate_all(comptime field: uucode.FieldEnum, text: []const u8) bool {
    var iter = Utf8View.initUnchecked(text).iterator();
    while (iter.nextCodepoint()) |codepoint| {
        const holds = switch (field) {
            .is_lowercase,
            .changes_when_casefolded,
            .changes_when_lowercased,
            => uucode.get(field, codepoint),
            else => @compileError(@tagName(field) ++ " is not a unicode predicate"),
        };
        // One counter-example is enough to decide the answer.
        if (!holds) return false;
    }
    return true;
}
/// Returns true when the boolean uucode property `field` holds for at least
/// one code point in `text`. Empty text yields false.
///
/// `field` must be `.is_lowercase`, `.changes_when_casefolded` or
/// `.changes_when_lowercased`; anything else is a compile error.
fn utf8_predicate_any(comptime field: uucode.FieldEnum, text: []const u8) bool {
    var iter = Utf8View.initUnchecked(text).iterator();
    while (iter.nextCodepoint()) |codepoint| {
        const holds = switch (field) {
            .is_lowercase,
            .changes_when_casefolded,
            .changes_when_lowercased,
            => uucode.get(field, codepoint),
            else => @compileError(@tagName(field) ++ " is not a unicode predicate"),
        };
        // The first match decides the answer.
        if (holds) return true;
    }
    return false;
}
/// Returns a newly allocated copy of `text` in which each code point is
/// replaced by its simple uppercase mapping; code points without a mapping are
/// copied unchanged. Caller owns the returned slice.
pub fn to_upper(allocator: std.mem.Allocator, text: []const u8) TransformError![]u8 {
return utf8_transform(.simple_uppercase_mapping, allocator, text);
}
/// Returns a newly allocated copy of `text` in which each code point is
/// replaced by its simple lowercase mapping; code points without a mapping are
/// copied unchanged. Caller owns the returned slice.
pub fn to_lower(allocator: std.mem.Allocator, text: []const u8) TransformError![]u8 {
return utf8_transform(.simple_lowercase_mapping, allocator, text);
}
/// Returns a newly allocated copy of `text` in which each code point is
/// replaced by its simple case folding. Caller owns the returned slice.
pub fn case_fold(allocator: std.mem.Allocator, text: []const u8) TransformError![]u8 {
return utf8_transform(.case_folding_simple, allocator, text);
}
/// Writes the simple case folding of `text` to `writer` without allocating a
/// result buffer of its own.
pub fn case_folded_write(writer: *std.Io.Writer, text: []const u8) TransformError!void {
return utf8_write_transform(.case_folding_simple, writer, text);
}
/// Writes the simple case folding of `text` to `writer`, iterating with the
/// partial view, and returns the prefix of `text` that was actually consumed.
/// The prefix is shorter than `text` when iteration stops early, for example
/// at a multi-byte sequence that is cut off at the end of `text`.
pub fn case_folded_write_partial(writer: *std.Io.Writer, text: []const u8) TransformError![]const u8 {
return utf8_partial_write_transform(.case_folding_simple, writer, text);
}
/// Switches the case of `text` as a whole: if any code point would change when
/// lowercased, the entire text is lowercased; otherwise the entire text is
/// uppercased. Caller owns the returned slice.
pub fn switch_case(allocator: std.mem.Allocator, text: []const u8) TransformError![]u8 {
    const has_uppercase = utf8_predicate_any(.changes_when_lowercased, text);
    if (has_uppercase) return to_lower(allocator, text);
    return to_upper(allocator, text);
}
/// Returns a newly allocated copy of `text` with the case of each code point
/// flipped individually: a code point that changes when lowercased gets its
/// simple lowercase mapping, and every other code point gets its simple
/// uppercase mapping. Code points without the relevant mapping are copied
/// unchanged. Caller owns the returned slice.
pub fn toggle_case(allocator: std.mem.Allocator, text: []const u8) TransformError![]u8 {
    var sink = std.Io.Writer.Allocating.init(allocator);
    defer sink.deinit();

    var iter = Utf8View.initUnchecked(text).iterator();
    while (iter.nextCodepoint()) |codepoint| {
        const toggled: u21 = switch (uucode.get(.changes_when_lowercased, codepoint)) {
            true => uucode.get(.simple_lowercase_mapping, codepoint) orelse codepoint,
            false => uucode.get(.simple_uppercase_mapping, codepoint) orelse codepoint,
        };
        var encoded: [6]u8 = undefined;
        const len = try utf8Encode(toggled, &encoded);
        try sink.writer.writeAll(encoded[0..len]);
    }
    return sink.toOwnedSlice();
}
/// Returns true when every code point in `text` has the `is_lowercase`
/// property. Empty text yields true, because `utf8_predicate_all` finds no
/// counter-example.
pub fn is_lowercase(text: []const u8) bool {
return utf8_predicate_all(.is_lowercase, text);
}
const std = @import("std");
const uucode = @import("vaxis").uucode;
const utf16LeToUtf8 = std.unicode.utf16LeToUtf8;
const utf8ByteSequenceLength = std.unicode.utf8ByteSequenceLength;
const utf8Decode = std.unicode.utf8Decode;
const utf8Encode = std.unicode.utf8Encode;
const Utf8View = std.unicode.Utf8View;
/// Code point iterator over bytes that may end in the middle of a multi-byte
/// sequence.
///
/// `end` is the offset just past the last sequence handed out, so
/// `bytes[0..end]` is the part consumed so far. Iteration stops (returns null)
/// without advancing `end` when the input is exhausted, when the next lead
/// byte is not a valid UTF-8 lead byte, or when the next sequence would run
/// past the end of `bytes`.
const Utf8PartialIterator = struct {
    bytes: []const u8,
    end: usize,

    /// Returns the bytes of the next complete sequence and advances past them,
    /// or null when no further complete sequence is available.
    fn nextCodepointSlice(it: *Utf8PartialIterator) ?[]const u8 {
        const start = it.end;
        if (start >= it.bytes.len) return null;

        // The lead byte alone determines how long the sequence must be.
        const seq_len = utf8ByteSequenceLength(it.bytes[start]) catch return null;
        const stop = start + seq_len;
        // Truncated tail: leave `end` where it is so the caller can see how
        // much was consumed.
        if (stop > it.bytes.len) return null;

        it.end = stop;
        return it.bytes[start..stop];
    }

    /// Decodes and returns the next code point, or null when
    /// `nextCodepointSlice` has nothing more to give. The sequence is assumed
    /// to be well formed; a decode failure is treated as unreachable.
    fn nextCodepoint(it: *Utf8PartialIterator) ?u21 {
        if (it.nextCodepointSlice()) |seq| {
            return utf8Decode(seq) catch unreachable;
        }
        return null;
    }
};
/// Thin view over a byte slice that hands out a `Utf8PartialIterator`.
/// It offers the `initUnchecked` and `iterator` functions that
/// `utf8_write_transform_T` calls on its `View` parameter.
const Utf8PartialView = struct {
    bytes: []const u8,

    /// Wraps `s` without validating it.
    fn initUnchecked(s: []const u8) Utf8PartialView {
        return .{ .bytes = s };
    }

    /// Returns an iterator positioned at the start of the viewed bytes.
    fn iterator(s: Utf8PartialView) Utf8PartialIterator {
        return .{ .bytes = s.bytes, .end = 0 };
    }
};