我正在开发一个功能不完整的小型 Lisp 解释器,以学习一些 Zig。我的灵感是这样的 https://norvig.com/lispy.html,这可能不是在 C/Zig 中实现这一点的最惯用的方法,但仍然非常小且简单,这正是我正在寻找的。我现在正站在通过标记联合代表 AST 的舞台上。但是,我无法理解为什么下面的代码似乎使我陷入损坏/覆盖的内存状态。
const std = @import("std");
// const test_string = "(begin (define r 10) (* pi (* r r)))";
const shorter_test_string = "(begin (define r 10))";
const Allocator = std.mem.Allocator;
const Tag = enum { String, Number, Bool, Symbol };
const Type = union(Tag) { String: std.ArrayList(u8), Number: i64, Bool: bool, Symbol: []const u8 };
const TokenType = enum { Expr, Single };
const Token = union(TokenType) {
Expr: *std.ArrayList(*Token),
Single: *Type,
};
pub fn finalizeWord(string_collector: *std.ArrayList(u8), outer_list: *std.ArrayList(std.ArrayList(u8))) !void {
if (!std.mem.eql(u8, string_collector.items, "")) {
try outer_list.append(try string_collector.clone());
string_collector.clearRetainingCapacity();
}
}
// TODO: I commented out all the deallocations for debugging purposes, will add them later
pub fn tokenize(alloca: *Allocator, parsed_input: *std.ArrayList(std.ArrayList(u8))) !*Token {
var token: std.ArrayList(u8) = parsed_input.orderedRemove(0);
if (std.mem.eql(u8, token.items, "(")) {
std.debug.print("Open bracket encountered \n", .{});
// defer token.deinit();
var list = std.ArrayList(*Token).init(alloca.*);
var local_token = try moveToHeap(alloca, Token{ .Expr = &list });
while (!std.mem.eql(u8, parsed_input.items[0].items, ")")) {
std.debug.print("While loop iteration \n", .{});
var item_to_add = try tokenize(alloca, parsed_input);
try list.append(item_to_add);
}
_ = parsed_input.orderedRemove(0);
// defer closing_bracket.deinit();
return local_token;
} else if (std.mem.eql(u8, token.items, ")")) {
// TODO: This is definitely not unreachable but a Syntax Error
unreachable;
} else {
var item = try moveToHeap(alloca, Token{ .Single = try atomize(alloca, token) });
std.debug.print("Single item encountered: {s} \n", .{item.*.Single.*.String.items});
return item;
}
}
pub fn atomize(alloca: *Allocator, item: std.ArrayList(u8)) !*Type {
return try moveToHeap(alloca, Type{ .String = try item.clone() });
}
pub fn parse(alloca: *Allocator, input_string: []const u8) !std.ArrayList(std.ArrayList(u8)) {
var outer_list = std.ArrayList(std.ArrayList(u8)).init(alloca.*);
var string_collector = std.ArrayList(u8).init(alloca.*);
defer string_collector.deinit();
for (input_string) |character| {
if (character == '(') {
try finalizeWord(&string_collector, &outer_list);
var opening_bracket = std.ArrayList(u8).init(alloca.*);
try opening_bracket.append('(');
try outer_list.append(opening_bracket);
continue;
} else if (character == ')') {
try finalizeWord(&string_collector, &outer_list);
var closing_bracket = std.ArrayList(u8).init(alloca.*);
try closing_bracket.append(')');
try outer_list.append(closing_bracket);
continue;
} else if (character == ' ') {
try finalizeWord(&string_collector, &outer_list);
} else {
try string_collector.append(character);
}
}
return outer_list;
}
pub fn moveToHeap(allocator: *Allocator, value: anytype) !*@TypeOf(value) {
const T = @TypeOf(value);
std.debug.assert(@typeInfo(T) != .Pointer);
const ptr = try allocator.create(T);
ptr.* = value;
return ptr;
}
pub fn print_tokens(token: *Token) void {
switch (token.*) {
.Expr => |expr| {
// TODO: deallocations should happen in separate functions
// defer expr.deinit();
std.debug.print("Processing Expression \n", .{});
for (expr.items) |t| {
print_tokens(t);
}
},
.Single => |item| {
// TODO: deallocations should happen in separate functions
// defer item.*.String.deinit();
std.debug.print("Processing Token: {s}\n", .{item.*.String.items});
},
}
}
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer {
_ = gpa.deinit();
}
var allocator = gpa.allocator();
var parsed_string = try parse(&allocator, shorter_test_string);
defer parsed_string.deinit();
defer {
for (parsed_string.items) |item| {
item.deinit();
}
}
var tokens = try tokenize(&allocator, &parsed_string);
print_tokens(tokens); // SEGFAULT happens here
}
我正在努力解决的特定部分是函数
tokenize
和print_tokens
。如果您在 main
中运行代码,您会在打印标记时收到段错误,这似乎是因为一旦退出 tokenize
函数,某些嵌套 Expr
的内存就会被损坏/覆盖。我有一种直觉,这种表现形式
const Tag = enum { String, Number, Bool, Symbol };
const Type = union(Tag) { String: std.ArrayList(u8), Number: i64, Bool: bool, Symbol: []const u8 };
const TokenType = enum { Expr, Single };
const Token = union(TokenType) {
Expr: *std.ArrayList(*Token),
Single: *Type,
};
可能存在一些固有的设计问题,但此时我想知道内存覆盖的实际原因是什么,因为所有内容都是手动放置在堆上的。
我尝试使用 LLDB 进行调试,似乎甚至在这里放置
print
或进行堆分配
_ = parsed_input.orderedRemove(0);
std.debug.print("Sample print", .{});
// defer closing_bracket.deinit();
return local_token;
覆盖
Expr
中的内部 local_token.*.Expr.*.items[1]
。正如我之前提到的,我现在更感兴趣的是为什么在最终的 Lisp 解释器实现中会出现内存问题:)
问题在于如何在
tokenize
中创建表达式列表:
var list = std.ArrayList(*Token).init(alloca.*);
var local_token = try moveToHeap(alloca, Token{ .Expr = &list });
您存储了一个指向
list
的指针,但它位于函数的堆栈上。一旦tokenize
退出,指针就失效了。只需使用分配器创建列表即可。例如:
var list = try alloca.create(std.ArrayList(*Token));
list.* = std.ArrayList(*Token).init(alloca.*);
var local_token = try moveToHeap(alloca, Token{ .Expr = list });