#include #include #include #include #include #include #include #include #include #include #include #define NOB_IMPLEMENTATION #include "nob.h" #define ARENA_IMPLEMENTATION #include "arena.h" #define HT_IMPLEMENTATION #include "ht.h" #if defined(Y) || defined(N) #error "Can only use stb_c_lexer in contexts where the preprocessor symbols 'Y' and 'N' are not defined" #endif #define STB_C_LEX_C_DECIMAL_INTS Y // "0|[1-9][0-9]*" CLEX_intlit #define STB_C_LEX_C_HEX_INTS Y // "0x[0-9a-fA-F]+" CLEX_intlit #define STB_C_LEX_C_OCTAL_INTS Y // "[0-7]+" CLEX_intlit #define STB_C_LEX_C_DECIMAL_FLOATS Y // "[0-9]*(.[0-9]*([eE][-+]?[0-9]+)?)" CLEX_floatlit #define STB_C_LEX_C99_HEX_FLOATS N // "0x{hex}+(.{hex}*)?[pP][-+]?{hex}+" CLEX_floatlit #define STB_C_LEX_C_IDENTIFIERS Y // "[_a-zA-Z][_a-zA-Z0-9]*" CLEX_id #define STB_C_LEX_C_DQ_STRINGS Y // double-quote-delimited strings with escapes CLEX_dqstring #define STB_C_LEX_C_SQ_STRINGS N // single-quote-delimited strings with escapes CLEX_ssstring #define STB_C_LEX_C_CHARS Y // single-quote-delimited character with escape CLEX_charlits #define STB_C_LEX_C_COMMENTS Y // "/* comment */" #define STB_C_LEX_CPP_COMMENTS Y // "// comment to end of line\n" #define STB_C_LEX_C_COMPARISONS Y // "==" CLEX_eq "!=" CLEX_noteq "<=" CLEX_lesseq ">=" CLEX_greatereq #define STB_C_LEX_C_LOGICAL Y // "&&" CLEX_andand "||" CLEX_oror #define STB_C_LEX_C_SHIFTS Y // "<<" CLEX_shl ">>" CLEX_shr #define STB_C_LEX_C_INCREMENTS Y // "++" CLEX_plusplus "--" CLEX_minusminus #define STB_C_LEX_C_ARROW Y // "->" CLEX_arrow #define STB_C_LEX_EQUAL_ARROW N // "=>" CLEX_eqarrow #define STB_C_LEX_C_BITWISEEQ Y // "&=" CLEX_andeq "|=" CLEX_oreq "^=" CLEX_xoreq #define STB_C_LEX_C_ARITHEQ Y // "+=" CLEX_pluseq "-=" CLEX_minuseq // "*=" CLEX_muleq "/=" CLEX_diveq "%=" CLEX_modeq // if both STB_C_LEX_SHIFTS & STB_C_LEX_ARITHEQ: // "<<=" CLEX_shleq ">>=" CLEX_shreq #define STB_C_LEX_PARSE_SUFFIXES N // letters after numbers are parsed as part of those numbers, and must be in suffix list below #define STB_C_LEX_DECIMAL_SUFFIXES "" // decimal integer suffixes e.g. "uUlL" -- these are returned as-is in string storage #define STB_C_LEX_HEX_SUFFIXES "" // e.g. "uUlL" #define STB_C_LEX_OCTAL_SUFFIXES "" // e.g. "uUlL" #define STB_C_LEX_FLOAT_SUFFIXES "" // #define STB_C_LEX_0_IS_EOF N // if Y, ends parsing at '\0'; if N, returns '\0' as token #define STB_C_LEX_INTEGERS_AS_DOUBLES N // parses integers as doubles so they can be larger than 'int', but only if STB_C_LEX_STDLIB==N #define STB_C_LEX_MULTILINE_DSTRINGS N // allow newlines in double-quoted strings #define STB_C_LEX_MULTILINE_SSTRINGS N // allow newlines in single-quoted strings #define STB_C_LEX_USE_STDLIB Y // use strtod,strtol for parsing #s; otherwise inaccurate hack #define STB_C_LEX_DOLLAR_IDENTIFIER Y // allow $ as an identifier character #define STB_C_LEX_FLOAT_NO_DECIMAL Y // allow floats that have no decimal point if they have an exponent #define STB_C_LEX_DEFINE_ALL_TOKEN_NAMES N // if Y, all CLEX_ token names are defined, even if never returned // leaving it as N should help you catch config bugs #define STB_C_LEX_DISCARD_PREPROCESSOR Y // discard C-preprocessor directives (e.g. after prepocess // still have #line, #pragma, etc) //#define STB_C_LEX_ISWHITE(str) ... // return length in bytes of whitespace characters if first char is whitespace #define STB_C_LEXER_DEFINITIONS // This line prevents the header file from replacing your definitions #define STB_C_LEXER_IMPLEMENTATION #include "stb_c_lexer.h" static void print_token(FILE *stream, stb_lexer *lexer) { switch (lexer->token) { case CLEX_id : fprintf(stream, "_%s", lexer->string) ; break; case CLEX_eq : fprintf(stream, "==") ; break; case CLEX_noteq : fprintf(stream, "!=") ; break; case CLEX_lesseq : fprintf(stream, "<=") ; break; case CLEX_greatereq : fprintf(stream, ">=") ; break; case CLEX_andand : fprintf(stream, "&&") ; break; case CLEX_oror : fprintf(stream, "||") ; break; case CLEX_shl : fprintf(stream, "<<") ; break; case CLEX_shr : fprintf(stream, ">>") ; break; case CLEX_plusplus : fprintf(stream, "++") ; break; case CLEX_minusminus: fprintf(stream, "--") ; break; case CLEX_arrow : fprintf(stream, "->") ; break; case CLEX_andeq : fprintf(stream, "&=") ; break; case CLEX_oreq : fprintf(stream, "|=") ; break; case CLEX_xoreq : fprintf(stream, "^=") ; break; case CLEX_pluseq : fprintf(stream, "+=") ; break; case CLEX_minuseq : fprintf(stream, "-=") ; break; case CLEX_muleq : fprintf(stream, "*=") ; break; case CLEX_diveq : fprintf(stream, "/=") ; break; case CLEX_modeq : fprintf(stream, "%%=") ; break; case CLEX_shleq : fprintf(stream, "<<=") ; break; case CLEX_shreq : fprintf(stream, ">>=") ; break; case CLEX_eqarrow : fprintf(stream, "=>") ; break; case CLEX_dqstring : fprintf(stream, "\"%s\"", lexer->string) ; break; case CLEX_sqstring : fprintf(stream, "'\"%s\"'", lexer->string); break; case CLEX_charlit : fprintf(stream, "'%s'", lexer->string) ; break; #if defined(STB__clex_int_as_double) && !defined(STB__CLEX_use_stdlib) case CLEX_intlit : fprintf(stream, "#%g", lexer->real_number); break; #else case CLEX_intlit : fprintf(stream, "#%ld", lexer->int_number); break; #endif case CLEX_floatlit : fprintf(stream, "%g", lexer->real_number) ; break; default: if (lexer->token >= 0 && lexer->token < 256) fprintf(stream, "%c", (int) lexer->token); else { fprintf(stream, "<<>>\n", lexer->token); } break; } } //////////////////////////////////////////////////////////////// // TODO: GMP uses its own allocation hooks, so we may need custom allocators // for arena-friendly ownership or debugging. Example: // mp_set_memory_functions(my_alloc, my_realloc, my_free); // See: https://gmplib.org/manual/Custom-Allocation // // TODO: We currently leak GMP-owned memory because we do not call the // matching cleanup functions. Example: // mpz_init(x); // ... // mpz_clear(x); typedef enum { Node_Type_Nil, Node_Type_Integer, Node_Type_Real, Node_Type_String, Node_Type_Symbol, Node_Type_List, } Node_Type; typedef struct Node Node; struct Node { Node_Type type; union { mpz_t integer; String_View string; String_View symbol; struct { Node *head; Node *tail; } list; } as; Node *next; }; typedef Ht(String_View, Node *) Symbol_Table; typedef struct Scope Scope; struct Scope { Scope *parent; Symbol_Table symbols; }; static Scope * scope_new(Arena *arena, Scope *parent); static void scope_free(Scope *scope); static void scope_insert(Scope *scope, String_View name, Node *node); static Node * scope_lookup(Scope *scope, String_View name); static Node * scope_lookup_current(Scope *scope, String_View name); static Scope * scope_new(Arena *arena, Scope *parent) { // TODO: We can move the allocation and zeroing to a separate function. Scope *scope = arena_alloc(arena, sizeof(Scope)); assert(scope != NULL); memset(scope, 0, sizeof(Scope)); scope->parent = parent; scope->symbols.hasheq = ht_sv_hasheq; return scope; } static void scope_free(Scope *scope) { // NOTE: Even if the hash table allocates on the heap, that is fine because // scopes are meant to be freed individually. ht_free(&scope->symbols); } static void scope_insert(Scope *scope, String_View name, Node *node) { *ht_find_or_put(&scope->symbols, name) = node; } static Node * scope_lookup(Scope *scope, String_View name) { Node *result = NULL; while (!result && scope) { result = scope_lookup_current(scope, name); scope = scope->parent; } return result; } static Node * scope_lookup_current(Scope *scope, String_View name) { return *ht_find(&scope->symbols, name); } // TODO: Add diagnostics, symbol table, options, etc. typedef struct Context Context; struct Context { Arena arena; Scope *global_scope; Scope *current_scope; }; typedef struct Parser Parser; struct Parser { Context *context; stb_lexer lexer; bool had_error; }; static Node * parser_parse(Parser *parser); static Node * parser_parse_expr(Parser *parser); static Node * parser_parse_list(Parser *parser); static Node * parser_parse_atom(Parser *parser); static bool parser_accept(Parser *parser, long token); static bool parser_expect(Parser *parser, long token); static void parser_advance(Parser *parser); static void parser_error(Parser *parser, const char *message); static bool parser_accept(Parser *parser, long token) { if (parser->lexer.token == token) { parser_advance(parser); return true; } return false; } static bool parser_expect(Parser *parser, long token) { bool ok = parser_accept(parser, token); if (!ok) parser_error(parser, "parser_expect"); return ok; } static void parser_advance(Parser *parser) { stb_c_lexer_get_token(&parser->lexer); } // TODO: Error handling needs to be revised. static void parser_error(Parser *parser, const char *message) { parser->had_error = true; fprintf(stderr, "Error: %s\n", message); } static Node * parser_parse(Parser *parser) { parser_advance(parser); return parser_parse_expr(parser); } static Node * parser_parse_expr(Parser *parser) { if (parser->lexer.token == '(') return parser_parse_list(parser); return parser_parse_atom(parser); } static Node * parser_parse_list(Parser *parser) { // TODO: We can move the allocation and zeroing to a separate function. Node *list = arena_alloc(&parser->context->arena, sizeof(Node)); assert(list != NULL); memset(list, 0, sizeof(Node)); list->type = Node_Type_List; parser_expect(parser, '('); while (!parser_accept(parser, ')')) { Node *expr = parser_parse_expr(parser); if (list->as.list.head == NULL) { list->as.list.head = list->as.list.tail = expr; } else { list->as.list.tail = list->as.list.tail->next = expr; } } return list; } static Node * parser_parse_atom(Parser *parser) { // TODO: We can move the allocation and zeroing to a separate function. Node *atom = arena_alloc(&parser->context->arena, sizeof(Node)); assert(atom != NULL); memset(atom, 0, sizeof(Node)); switch (parser->lexer.token) { case CLEX_intlit: atom->type = Node_Type_Integer; mpz_init(atom->as.integer); mpz_set_si(atom->as.integer, parser->lexer.int_number); parser_advance(parser); break; case CLEX_floatlit: atom->type = Node_Type_Real; // TODO: Store real numbers inside the AST. assert(false && "Not implemented."); (void) parser->lexer.real_number; parser_advance(parser); break; case CLEX_dqstring: atom->type = Node_Type_String; atom->as.string = nob_sv_from_parts(parser->lexer.string, parser->lexer.string_len); parser_advance(parser); break; case CLEX_id: atom->type = Node_Type_Symbol; atom->as.symbol = nob_sv_from_parts(parser->lexer.string, parser->lexer.string_len); parser_advance(parser); break; default: parser_error(parser, "parser_parse_atom"); break; } return atom; } //////////////////////////////////////////////////////////////// int main(int argc, char **argv) { fprintf(stdout, "GMP version: %s\n", gmp_version); if (argc != 2) { assert(argc >= 1); fprintf(stderr, "Usage: %s \n", argv[0]); return 1; } String_Builder sb = {0}; if (!read_entire_file(argv[1], &sb)) return 1; char string_store[1024 * 1024]; int store_length = ARRAY_LEN(string_store); // stb_lexer lexer = {0}; // const char *input_stream = sb.items; // const char *input_stream_end = sb.items + sb.count; // stb_c_lexer_init(&lexer, input_stream, input_stream_end, // string_store, store_length); // while (stb_c_lexer_get_token(&lexer)) { // if (lexer.token == CLEX_parse_error) { // fprintf(stderr, "\n<<>>\n"); // break; // } // print_token(stdout, &lexer); // fprintf(stdout, " "); // } // fprintf(stdout, "\n"); Context context = { .arena = {0}, }; Scope *scope = scope_new(context.arena, NULL); context.global_scope = scope; context.current_scope = scope; Parser parser = { .context = &context, .lexer = {0}, .had_error = false, }; const char *input_stream = sb.items; const char *input_stream_end = sb.items + sb.count; stb_c_lexer_init(&parser.lexer, input_stream, input_stream_end, string_store, store_length); Node *root = parser_parse(&parser); (void) root; scope_free(scope); return 0; }