From c0a9be21f557a60b9a1760dd69a04f761a0824cb Mon Sep 17 00:00:00 2001 From: CJ van den Berg Date: Wed, 18 Dec 2024 15:52:57 +0100 Subject: [PATCH] feat: sanitize non utf-8 and display a status bar warning --- src/buffer/Buffer.zig | 28 +++++++++++++++++++--------- src/buffer/unicode.zig | 22 ++++++++++++++++++++++ src/tui/editor.zig | 27 +++++++++++++++++++-------- src/tui/status/filestate.zig | 8 +++++++- src/tui/status/linenumstate.zig | 14 ++++++++++++-- 5 files changed, 79 insertions(+), 20 deletions(-) diff --git a/src/buffer/Buffer.zig b/src/buffer/Buffer.zig index e43e545..2ca5a5a 100644 --- a/src/buffer/Buffer.zig +++ b/src/buffer/Buffer.zig @@ -24,7 +24,6 @@ pub const Metrics = struct { pub const egc_length_func = *const fn (self: Metrics, egcs: []const u8, colcount: *c_int, abs_col: usize) usize; pub const egc_chunk_width_func = *const fn (self: Metrics, chunk_: []const u8, abs_col_: usize) usize; pub const egc_last_func = *const fn (self: Metrics, egcs: []const u8) []const u8; - }; arena: std.heap.ArenaAllocator, @@ -38,6 +37,7 @@ last_save: ?Root = null, file_exists: bool = true, file_eol_mode: EolMode = .lf, last_save_eol_mode: EolMode = .lf, +file_utf8_sanitized: bool = false, undo_history: ?*UndoNode = null, redo_history: ?*UndoNode = null, @@ -1064,12 +1064,11 @@ fn new_file(self: *const Self, file_exists: *bool) !Root { return Leaf.new(self.allocator, "", true, false); } -pub fn load(self: *const Self, reader: anytype, size: usize, eol_mode: *EolMode) !Root { +pub fn load(self: *const Self, reader: anytype, size: usize, eol_mode: *EolMode, utf8_sanitized: *bool) !Root { const lf = '\n'; const cr = '\r'; var buf = try self.external_allocator.alloc(u8, size); const self_ = @constCast(self); - self_.file_buf = buf; const read_size = try reader.readAll(buf); if (read_size != size) return error.BufferUnderrun; @@ -1077,6 +1076,14 @@ pub fn load(self: *const Self, reader: anytype, size: usize, eol_mode: *EolMode) if (final_read != 0) @panic("unexpected data in final read"); + if (!std.unicode.utf8ValidateSlice(buf)) { + const converted = try unicode.utf8_sanitize(self.external_allocator, buf); + self.external_allocator.free(buf); + buf = converted; + utf8_sanitized.* = true; + } + self_.file_buf = buf; + eol_mode.* = .lf; var leaf_count: usize = 1; for (0..buf.len) |i| { @@ -1107,20 +1114,20 @@ pub fn load(self: *const Self, reader: anytype, size: usize, eol_mode: *EolMode) return Node.merge_in_place(leaves, self.allocator); } -pub fn load_from_string(self: *const Self, s: []const u8, eol_mode: *EolMode) !Root { +pub fn load_from_string(self: *const Self, s: []const u8, eol_mode: *EolMode, utf8_sanitized: *bool) !Root { var stream = std.io.fixedBufferStream(s); - return self.load(stream.reader(), s.len, eol_mode); + return self.load(stream.reader(), s.len, eol_mode, utf8_sanitized); } pub fn load_from_string_and_update(self: *Self, file_path: []const u8, s: []const u8) !void { - self.root = try self.load_from_string(s, &self.file_eol_mode); + self.root = try self.load_from_string(s, &self.file_eol_mode, &self.file_utf8_sanitized); self.file_path = try self.allocator.dupe(u8, file_path); self.last_save = self.root; self.last_save_eol_mode = self.file_eol_mode; self.file_exists = false; } -pub fn load_from_file(self: *const Self, file_path: []const u8, file_exists: *bool, eol_mode: *EolMode) !Root { +pub fn load_from_file(self: *const Self, file_path: []const u8, file_exists: *bool, eol_mode: *EolMode, utf8_sanitized: *bool) !Root { const file = cwd().openFile(file_path, .{ .mode = .read_only }) catch |e| switch (e) { error.FileNotFound => return self.new_file(file_exists), else => return e, @@ -1129,17 +1136,19 @@ pub fn load_from_file(self: *const Self, file_path: []const u8, file_exists: *bo file_exists.* = true; defer file.close(); const stat = try file.stat(); - return self.load(file.reader(), @intCast(stat.size), eol_mode); + return self.load(file.reader(), @intCast(stat.size), eol_mode, utf8_sanitized); } pub fn load_from_file_and_update(self: *Self, file_path: []const u8) !void { var file_exists: bool = false; var eol_mode: EolMode = .lf; - self.root = try self.load_from_file(file_path, &file_exists, &eol_mode); + var utf8_sanitized: bool = false; + self.root = try self.load_from_file(file_path, &file_exists, &eol_mode, &utf8_sanitized); self.file_path = try self.allocator.dupe(u8, file_path); self.last_save = self.root; self.file_exists = file_exists; self.file_eol_mode = eol_mode; + self.file_utf8_sanitized = utf8_sanitized; self.last_save_eol_mode = eol_mode; } @@ -1183,6 +1192,7 @@ pub fn store_to_file_and_clean(self: *Self, file_path: []const u8) !void { self.last_save = self.root; self.last_save_eol_mode = self.file_eol_mode; self.file_exists = true; + self.file_utf8_sanitized = false; } pub fn is_dirty(self: *const Self) bool { diff --git a/src/buffer/unicode.zig b/src/buffer/unicode.zig index a16983a..8d0bb63 100644 --- a/src/buffer/unicode.zig +++ b/src/buffer/unicode.zig @@ -37,3 +37,25 @@ pub fn control_code_to_unicode(code: u8) [:0]const u8 { else => "", }; } + +fn raw_byte_to_utf8(cp: u8, buf: []u8) ![]const u8 { + var utf16le: [1]u16 = undefined; + const utf16le_as_bytes = std.mem.sliceAsBytes(utf16le[0..]); + std.mem.writeInt(u16, utf16le_as_bytes[0..2], cp, .little); + return buf[0..try std.unicode.utf16LeToUtf8(buf, &utf16le)]; +} + +const std = @import("std"); + +pub fn utf8_sanitize(allocator: std.mem.Allocator, input: []const u8) error{ + OutOfMemory, + DanglingSurrogateHalf, + ExpectedSecondSurrogateHalf, + UnexpectedSecondSurrogateHalf, +}![]u8 { + var output: std.ArrayListUnmanaged(u8) = .{}; + const writer = output.writer(allocator); + var buf: [4]u8 = undefined; + for (input) |byte| try writer.writeAll(try raw_byte_to_utf8(byte, &buf)); + return output.toOwnedSlice(allocator); +} diff --git a/src/tui/editor.zig b/src/tui/editor.zig index e6738b9..419dd9e 100644 --- a/src/tui/editor.zig +++ b/src/tui/editor.zig @@ -224,6 +224,7 @@ pub const Editor = struct { bytes: usize = 0, chunks: usize = 0, eol_mode: Buffer.EolMode = .lf, + utf8_sanitized: bool = false, } = null, matches: Match.List, match_token: usize = 0, @@ -259,6 +260,7 @@ pub const Editor = struct { cursels: usize = 0, dirty: bool = false, eol_mode: Buffer.EolMode = .lf, + utf8_sanitized: bool = false, } = .{}, syntax: ?*syntax = null, @@ -414,6 +416,10 @@ pub const Editor = struct { return if (self.buffer) |p| p.file_eol_mode else error.Stop; } + fn buf_utf8_sanitized(self: *const Self) !bool { + return if (self.buffer) |p| p.file_utf8_sanitized else error.Stop; + } + fn buf_a(self: *const Self) !Allocator { return if (self.buffer) |p| p.allocator else error.Stop; } @@ -517,6 +523,7 @@ pub const Editor = struct { } else return error.SaveNoFileName; try self.send_editor_save(self.file_path.?); self.last.dirty = false; + self.update_event() catch {}; } fn save_as(self: *Self, file_path: []const u8) !void { @@ -525,6 +532,7 @@ pub const Editor = struct { self.file_path = try self.allocator.dupe(u8, file_path); try self.send_editor_save(self.file_path.?); self.last.dirty = false; + self.update_event() catch {}; } pub fn push_cursor(self: *Self) !void { @@ -575,10 +583,10 @@ pub const Editor = struct { fn update_buf(self: *Self, root: Buffer.Root) !void { const b = self.buffer orelse return error.Stop; - return self.update_buf_and_eol_mode(root, b.file_eol_mode); + return self.update_buf_and_eol_mode(root, b.file_eol_mode, b.file_utf8_sanitized); } - fn update_buf_and_eol_mode(self: *Self, root: Buffer.Root, eol_mode: Buffer.EolMode) !void { + fn update_buf_and_eol_mode(self: *Self, root: Buffer.Root, eol_mode: Buffer.EolMode, utf8_sanitized: bool) !void { const b = self.buffer orelse return error.Stop; var sfa = std.heap.stackFallback(512, self.allocator); const allocator = sfa.get(); @@ -587,6 +595,7 @@ pub const Editor = struct { try b.store_undo(meta); b.update(root); b.file_eol_mode = eol_mode; + b.file_utf8_sanitized = utf8_sanitized; try self.send_editor_modified(); } @@ -1210,13 +1219,14 @@ pub const Editor = struct { const root: ?Buffer.Root = self.buf_root() catch null; const eol_mode = self.buf_eol_mode() catch .lf; + const utf8_sanitized = self.buf_utf8_sanitized() catch false; if (token_from(self.last.root) != token_from(root)) { try self.send_editor_update(self.last.root, root, eol_mode); self.lsp_version += 1; } - if (self.last.eol_mode != eol_mode) - try self.send_editor_eol_mode(eol_mode); + if (self.last.eol_mode != eol_mode or self.last.utf8_sanitized != utf8_sanitized) + try self.send_editor_eol_mode(eol_mode, utf8_sanitized); if (self.last.dirty != dirty) try self.send_editor_dirty(dirty); @@ -1254,6 +1264,7 @@ pub const Editor = struct { self.last.dirty = dirty; self.last.root = root; self.last.eol_mode = eol_mode; + self.last.utf8_sanitized = utf8_sanitized; } fn send_editor_pos(self: *const Self, cursor: *const Cursor) !void { @@ -1333,8 +1344,8 @@ pub const Editor = struct { project_manager.did_change(file_path, self.lsp_version, token_from(new_root), token_from(old_root), eol_mode) catch {}; } - fn send_editor_eol_mode(self: *const Self, eol_mode: Buffer.EolMode) !void { - _ = try self.handlers.msg(.{ "E", "eol_mode", @intFromEnum(eol_mode) }); + fn send_editor_eol_mode(self: *const Self, eol_mode: Buffer.EolMode, utf8_sanitized: bool) !void { + _ = try self.handlers.msg(.{ "E", "eol_mode", @intFromEnum(eol_mode), utf8_sanitized }); } fn clamp_abs(self: *Self, abs: bool) void { @@ -4134,7 +4145,7 @@ pub const Editor = struct { self.cancel_all_selections(); self.cancel_all_matches(); if (state.whole_file) |buf| { - state.work_root = try b.load_from_string(buf.items, &state.eol_mode); + state.work_root = try b.load_from_string(buf.items, &state.eol_mode, &state.utf8_sanitized); state.bytes = buf.items.len; state.chunks = 1; primary.cursor = state.old_primary.cursor; @@ -4145,7 +4156,7 @@ pub const Editor = struct { if (state.old_primary_reversed) sel.reverse(); primary.cursor = sel.end; } - try self.update_buf_and_eol_mode(state.work_root, state.eol_mode); + try self.update_buf_and_eol_mode(state.work_root, state.eol_mode, state.utf8_sanitized); primary.cursor.clamp_to_buffer(state.work_root, self.metrics); self.logger.print("filter: done (bytes:{d} chunks:{d})", .{ state.bytes, state.chunks }); self.reset_syntax(); diff --git a/src/tui/status/filestate.zig b/src/tui/status/filestate.zig index f2067d8..91e4351 100644 --- a/src/tui/status/filestate.zig +++ b/src/tui/status/filestate.zig @@ -32,6 +32,7 @@ file_dirty: bool = false, detailed: bool = false, file: bool = false, eol_mode: Buffer.EolMode = .lf, +utf8_sanitized: bool = false, const project_icon = ""; const Self = @This(); @@ -161,6 +162,11 @@ fn render_detailed(self: *Self, plane: *Plane, theme: *const Widget.Theme) void _ = plane.print(" of {d} lines", .{self.lines}) catch {}; if (self.file_type.len > 0) _ = plane.print(" ({s}){s}", .{ self.file_type, eol_mode }) catch {}; + + if (self.utf8_sanitized) { + plane.set_style(.{ .fg = theme.editor_error.fg.? }); + _ = plane.putstr(" [UTF-8 sanitized]") catch {}; + } } return; } @@ -196,7 +202,7 @@ pub fn receive(self: *Self, _: *Button.State(Self), _: tp.pid_ref, m: tp.message return false; if (try m.match(.{ "E", "dirty", tp.extract(&file_dirty) })) { self.file_dirty = file_dirty; - } else if (try m.match(.{ "E", "eol_mode", tp.extract(&eol_mode) })) { + } else if (try m.match(.{ "E", "eol_mode", tp.extract(&eol_mode), tp.extract(&self.utf8_sanitized) })) { self.eol_mode = @enumFromInt(eol_mode); } else if (try m.match(.{ "E", "save", tp.extract(&file_path) })) { @memcpy(self.name_buf[0..file_path.len], file_path); diff --git a/src/tui/status/linenumstate.zig b/src/tui/status/linenumstate.zig index cbeec3f..3d1c11a 100644 --- a/src/tui/status/linenumstate.zig +++ b/src/tui/status/linenumstate.zig @@ -10,12 +10,15 @@ const EventHandler = @import("EventHandler"); const Widget = @import("../Widget.zig"); const Button = @import("../Button.zig"); +const utf8_sanitized_warning = "  UTF"; + line: usize = 0, lines: usize = 0, column: usize = 0, buf: [256]u8 = undefined, rendered: [:0]const u8 = "", eol_mode: Buffer.EolMode = .lf, +utf8_sanitized: bool = false, const Self = @This(); @@ -36,7 +39,8 @@ fn on_click(_: *Self, _: *Button.State(Self)) void { } pub fn layout(self: *Self, btn: *Button.State(Self)) Widget.Layout { - const len = btn.plane.egc_chunk_width(self.rendered, 0, 1); + const warn_len = if (self.utf8_sanitized) btn.plane.egc_chunk_width(utf8_sanitized_warning, 0, 1) else 0; + const len = btn.plane.egc_chunk_width(self.rendered, 0, 1) + warn_len; return .{ .static = len }; } @@ -47,6 +51,11 @@ pub fn render(self: *Self, btn: *Button.State(Self), theme: *const Widget.Theme) btn.plane.set_style(if (btn.active) theme.editor_cursor else if (btn.hover) theme.statusbar_hover else theme.statusbar); btn.plane.fill(" "); btn.plane.home(); + if (self.utf8_sanitized) { + btn.plane.set_style(.{ .fg = theme.editor_error.fg.? }); + _ = btn.plane.putstr(utf8_sanitized_warning) catch {}; + } + btn.plane.set_style(if (btn.active) theme.editor_cursor else if (btn.hover) theme.statusbar_hover else theme.statusbar); _ = btn.plane.putstr(self.rendered) catch {}; return false; } @@ -67,7 +76,7 @@ pub fn receive(self: *Self, _: *Button.State(Self), _: tp.pid_ref, m: tp.message var eol_mode: Buffer.EolModeTag = @intFromEnum(Buffer.EolMode.lf); if (try m.match(.{ "E", "pos", tp.extract(&self.lines), tp.extract(&self.line), tp.extract(&self.column) })) { self.format(); - } else if (try m.match(.{ "E", "eol_mode", tp.extract(&eol_mode) })) { + } else if (try m.match(.{ "E", "eol_mode", tp.extract(&eol_mode), tp.extract(&self.utf8_sanitized) })) { self.eol_mode = @enumFromInt(eol_mode); self.format(); } else if (try m.match(.{ "E", "open", tp.more })) { @@ -78,6 +87,7 @@ pub fn receive(self: *Self, _: *Button.State(Self), _: tp.pid_ref, m: tp.message self.column = 0; self.rendered = ""; self.eol_mode = .lf; + self.utf8_sanitized = false; } return false; }