1 files changed, 260 insertions, 60 deletions
diff --git a/source/forest.c b/source/forest.c
index ccec067..3e756fb 100644
--- a/source/forest.c
+++ b/source/forest.c
@@ -1,5 +1,6 @@
 #include <assert.h>
 #include <inttypes.h>
+#include <stdarg.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <string.h>
@@ -9,6 +10,8 @@
 #include <stdlib.h>
 #include <unistd.h>
 
+#include <gmp.h>
+
 #define ARENA_IMPLEMENTATION
 #include "arena.h"
 
@@ -25,8 +28,8 @@
 #define STB_C_LEX_C_DECIMAL_INTS    Y   //  "0|[1-9][0-9]*"                        CLEX_intlit
 #define STB_C_LEX_C_HEX_INTS        Y   //  "0x[0-9a-fA-F]+"                       CLEX_intlit
 #define STB_C_LEX_C_OCTAL_INTS      Y   //  "[0-7]+"                               CLEX_intlit
-#define STB_C_LEX_C_DECIMAL_FLOATS  Y   //  "[0-9]*(.[0-9]*([eE][-+]?[0-9]+)?)     CLEX_floatlit
-#define STB_C_LEX_C99_HEX_FLOATS    N   //  "0x{hex}+(.{hex}*)?[pP][-+]?{hex}+     CLEX_floatlit
+#define STB_C_LEX_C_DECIMAL_FLOATS  Y   //  "[0-9]*(.[0-9]*([eE][-+]?[0-9]+)?)"    CLEX_floatlit
+#define STB_C_LEX_C99_HEX_FLOATS    N   //  "0x{hex}+(.{hex}*)?[pP][-+]?{hex}+"    CLEX_floatlit
 #define STB_C_LEX_C_IDENTIFIERS     Y   //  "[_a-zA-Z][_a-zA-Z0-9]*"               CLEX_id
 #define STB_C_LEX_C_DQ_STRINGS      Y   //  double-quote-delimited strings with escapes  CLEX_dqstring
 #define STB_C_LEX_C_SQ_STRINGS      N   //  single-quote-delimited strings with escapes  CLEX_ssstring
@@ -41,9 +44,9 @@
 #define STB_C_LEX_EQUAL_ARROW       N   //  "=>"  CLEX_eqarrow
 #define STB_C_LEX_C_BITWISEEQ       Y   //  "&="  CLEX_andeq    "|="  CLEX_oreq     "^="  CLEX_xoreq
 #define STB_C_LEX_C_ARITHEQ         Y   //  "+="  CLEX_pluseq   "-="  CLEX_minuseq
-                                        //  "*="  CLEX_muleq    "/="  CLEX_diveq    "%=" CLEX_modeq
-                                        //  if both STB_C_LEX_SHIFTS & STB_C_LEX_ARITHEQ:
-                                        //                      "<<=" CLEX_shleq    ">>=" CLEX_shreq
+					//  "*="  CLEX_muleq    "/="  CLEX_diveq    "%=" CLEX_modeq
+					//  if both STB_C_LEX_SHIFTS & STB_C_LEX_ARITHEQ:
+					//                      "<<=" CLEX_shleq    ">>=" CLEX_shreq
 
 #define STB_C_LEX_PARSE_SUFFIXES    N   // letters after numbers are parsed as part of those numbers, and must be in suffix list below
 #define STB_C_LEX_DECIMAL_SUFFIXES  ""  // decimal integer suffixes e.g. "uUlL" -- these are returned as-is in string storage
@@ -60,67 +63,245 @@
 #define STB_C_LEX_FLOAT_NO_DECIMAL     Y  // allow floats that have no decimal point if they have an exponent
 
 #define STB_C_LEX_DEFINE_ALL_TOKEN_NAMES  N   // if Y, all CLEX_ token names are defined, even if never returned
-                                              // leaving it as N should help you catch config bugs
+					      // leaving it as N should help you catch config bugs
 
 #define STB_C_LEX_DISCARD_PREPROCESSOR    Y   // discard C-preprocessor directives (e.g. after prepocess
-                                              // still have #line, #pragma, etc)
+					      // still have #line, #pragma, etc)
 
-//#define STB_C_LEX_ISWHITE(str)    ... // return length in bytes of whitespace characters if first char is whitespace
+					      //#define STB_C_LEX_ISWHITE(str)    ... // return length in bytes of whitespace characters if first char is whitespace
 
 #define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
 
 #define STB_C_LEXER_IMPLEMENTATION
 #include "stb_c_lexer.h"
 
+static void
+print_token(FILE *stream, stb_lexer *lexer) {
+	switch (lexer->token) {
+		case CLEX_id        : fprintf(stream, "_%s", lexer->string)     ; break;
+		case CLEX_eq        : fprintf(stream, "==")                     ; break;
+		case CLEX_noteq     : fprintf(stream, "!=")                     ; break;
+		case CLEX_lesseq    : fprintf(stream, "<=")                     ; break;
+		case CLEX_greatereq : fprintf(stream, ">=")                     ; break;
+		case CLEX_andand    : fprintf(stream, "&&")                     ; break;
+		case CLEX_oror      : fprintf(stream, "||")                     ; break;
+		case CLEX_shl       : fprintf(stream, "<<")                     ; break;
+		case CLEX_shr       : fprintf(stream, ">>")                     ; break;
+		case CLEX_plusplus  : fprintf(stream, "++")                     ; break;
+		case CLEX_minusminus: fprintf(stream, "--")                     ; break;
+		case CLEX_arrow     : fprintf(stream, "->")                     ; break;
+		case CLEX_andeq     : fprintf(stream, "&=")                     ; break;
+		case CLEX_oreq      : fprintf(stream, "|=")                     ; break;
+		case CLEX_xoreq     : fprintf(stream, "^=")                     ; break;
+		case CLEX_pluseq    : fprintf(stream, "+=")                     ; break;
+		case CLEX_minuseq   : fprintf(stream, "-=")                     ; break;
+		case CLEX_muleq     : fprintf(stream, "*=")                     ; break;
+		case CLEX_diveq     : fprintf(stream, "/=")                     ; break;
+		case CLEX_modeq     : fprintf(stream, "%%=")                    ; break;
+		case CLEX_shleq     : fprintf(stream, "<<=")                    ; break;
+		case CLEX_shreq     : fprintf(stream, ">>=")                    ; break;
+		case CLEX_eqarrow   : fprintf(stream, "=>")                     ; break;
+		case CLEX_dqstring  : fprintf(stream, "\"%s\"", lexer->string)  ; break;
+		case CLEX_sqstring  : fprintf(stream, "'\"%s\"'", lexer->string); break;
+		case CLEX_charlit   : fprintf(stream, "'%s'", lexer->string)    ; break;
+#if defined(STB__clex_int_as_double) && !defined(STB__CLEX_use_stdlib)
+		case CLEX_intlit    : fprintf(stream, "#%g", lexer->real_number); break;
+#else
+		case CLEX_intlit    : fprintf(stream, "#%ld", lexer->int_number); break;
+#endif
+		case CLEX_floatlit  : fprintf(stream, "%g", lexer->real_number) ; break;
+		default:
+			if (lexer->token >= 0 && lexer->token < 256)
+				fprintf(stream, "%c", (int) lexer->token);
+			else {
+				fprintf(stream, "<<<UNKNOWN TOKEN %ld >>>\n", lexer->token);
+			}
+			break;
+	}
+}
+
+////////////////////////////////////////////////////////////////
+
+// TODO: GMP uses its own allocation hooks, so we may need custom allocators
+//       for arena-friendly ownership or debugging. Example:
+//         mp_set_memory_functions(my_alloc, my_realloc, my_free);
+//       See: https://gmplib.org/manual/Custom-Allocation
+//
+// TODO: We currently leak GMP-owned memory because we do not call the
+//       matching cleanup functions. Example:
+//         mpz_init(x);
+//         ...
+//         mpz_clear(x);
+
+
+typedef enum {
+	Node_Type_Nil,
+	Node_Type_Integer,
+	Node_Type_Real,
+	Node_Type_String,
+	Node_Type_Symbol,
+	Node_Type_List,
+} Node_Type;
+
+typedef struct Node Node;
+struct Node {
+	Node_Type type;
+	union {
+		mpz_t integer;
+		String_View string;
+		String_View symbol;
+
+		struct {
+			Node *head;
+			Node *tail;
+		} list;
+	} as;
+	Node *next;
+};
+
+
+// TODO: Add diagnostics, symbol table, options, etc.
+typedef struct Context Context;
+struct Context {
+	Arena arena;
+};
+
+
+typedef struct Parser Parser;
+struct Parser {
+	Context *context;
+	stb_lexer lexer;
+	bool had_error;
+};
+
+static Node *
+parser_parse(Parser *parser);
+static Node *
+parser_parse_expr(Parser *parser);
+static Node *
+parser_parse_list(Parser *parser);
+static Node *
+parser_parse_atom(Parser *parser);
+
+static bool
+parser_accept(Parser *parser, long token);
+static bool
+parser_expect(Parser *parser, long token);
+static void
+parser_advance(Parser *parser);
+static void
+parser_error(Parser *parser, const char *message);
+
+static bool
+parser_accept(Parser *parser, long token) {
+	if (parser->lexer.token == token) {
+		parser_advance(parser);
+		return true;
+	}
+	return false;
+}
+
+static bool
+parser_expect(Parser *parser, long token) {
+	bool ok = parser_accept(parser, token);
+	if (!ok)
+		parser_error(parser, "parser_expect");
+	return ok;
+}
 
 static void
-print_token(stb_lexer *lexer) {
-   switch (lexer->token) {
-      case CLEX_id        : printf("_%s", lexer->string); break;
-      case CLEX_eq        : printf("=="); break;
-      case CLEX_noteq     : printf("!="); break;
-      case CLEX_lesseq    : printf("<="); break;
-      case CLEX_greatereq : printf(">="); break;
-      case CLEX_andand    : printf("&&"); break;
-      case CLEX_oror      : printf("||"); break;
-      case CLEX_shl       : printf("<<"); break;
-      case CLEX_shr       : printf(">>"); break;
-      case CLEX_plusplus  : printf("++"); break;
-      case CLEX_minusminus: printf("--"); break;
-      case CLEX_arrow     : printf("->"); break;
-      case CLEX_andeq     : printf("&="); break;
-      case CLEX_oreq      : printf("|="); break;
-      case CLEX_xoreq     : printf("^="); break;
-      case CLEX_pluseq    : printf("+="); break;
-      case CLEX_minuseq   : printf("-="); break;
-      case CLEX_muleq     : printf("*="); break;
-      case CLEX_diveq     : printf("/="); break;
-      case CLEX_modeq     : printf("%%="); break;
-      case CLEX_shleq     : printf("<<="); break;
-      case CLEX_shreq     : printf(">>="); break;
-      case CLEX_eqarrow   : printf("=>"); break;
-      case CLEX_dqstring  : printf("\"%s\"", lexer->string); break;
-      case CLEX_sqstring  : printf("'\"%s\"'", lexer->string); break;
-      case CLEX_charlit   : printf("'%s'", lexer->string); break;
-      #if defined(STB__clex_int_as_double) && !defined(STB__CLEX_use_stdlib)
-      case CLEX_intlit    : printf("#%g", lexer->real_number); break;
-      #else
-      case CLEX_intlit    : printf("#%ld", lexer->int_number); break;
-      #endif
-      case CLEX_floatlit  : printf("%g", lexer->real_number); break;
-      default:
-         if (lexer->token >= 0 && lexer->token < 256)
-            printf("%c", (int) lexer->token);
-         else {
-            printf("<<<UNKNOWN TOKEN %ld >>>\n", lexer->token);
-         }
-         break;
-   }
+parser_advance(Parser *parser) {
+	stb_c_lexer_get_token(&parser->lexer);
+}
+
+// TODO: Error handling needs to be revised.
+static void
+parser_error(Parser *parser, const char *message) {
+	parser->had_error = true;
+	fprintf(stderr, "Error: %s\n", message);
+}
+
+static Node *
+parser_parse(Parser *parser) {
+	parser_advance(parser);
+	return parser_parse_expr(parser);
+}
+
+static Node *
+parser_parse_expr(Parser *parser) {
+	if (parser->lexer.token == '(')
+		return parser_parse_list(parser);
+	return parser_parse_atom(parser);
 }
 
+static Node *
+parser_parse_list(Parser *parser) {
+	// TODO: We can move the allocation and zeroing to a separate function that accepts the context.
+	Node *list = arena_alloc(&parser->context->arena, sizeof(Node));
+	assert(list != NULL);
+	memset(list, 0, sizeof(Node));
+
+	list->type = Node_Type_List;
+
+	parser_expect(parser, '(');
+	while (!parser_accept(parser, ')')) {
+		Node *expr = parser_parse_expr(parser);
+
+		if (list->as.list.head == NULL) {
+			list->as.list.head = list->as.list.tail = expr;
+		} else {
+			list->as.list.tail = list->as.list.tail->next = expr;
+		}
+	}
+
+	return list;
+}
+
+static Node *
+parser_parse_atom(Parser *parser) {
+	// TODO: We can move the allocation and zeroing to a separate function that accepts the context.
+	Node *atom = arena_alloc(&parser->context->arena, sizeof(Node));
+	assert(atom != NULL);
+	memset(atom, 0, sizeof(Node));
+
+	switch (parser->lexer.token) {
+		case CLEX_intlit:
+			atom->type = Node_Type_Integer;
+			mpz_init(atom->as.integer);
+			mpz_set_si(atom->as.integer, parser->lexer.int_number);
+			parser_advance(parser);
+			break;
+		case CLEX_floatlit:
+			atom->type = Node_Type_Real;
+			// TODO: Store real numbers inside the AST.
+			assert(false && "Not implemented.");
+			(void) parser->lexer.real_number;
+			parser_advance(parser);
+			break;
+		case CLEX_dqstring:
+			atom->type = Node_Type_String;
+			atom->as.string = nob_sv_from_parts(parser->lexer.string, parser->lexer.string_len);
+			parser_advance(parser);
+			break;
+		case CLEX_id:
+			atom->type = Node_Type_Symbol;
+			atom->as.symbol = nob_sv_from_parts(parser->lexer.string, parser->lexer.string_len);
+			parser_advance(parser);
+			break;
+		default:
+			parser_error(parser, "parser_parse_atom");
+			break;
+	}
+
+	return atom;
+}
+
+////////////////////////////////////////////////////////////////
 
 int
 main(int argc, char **argv) {
+	fprintf(stdout, "GMP version: %s\n", gmp_version);
+
 	if (argc != 2) {
 		assert(argc >= 1);
 		fprintf(stderr, "Usage: %s <source>\n", argv[0]);
@@ -134,20 +315,39 @@ main(int argc, char **argv) {
 	char string_store[1024 * 1024];
 	int store_length = ARRAY_LEN(string_store);
 
-	stb_lexer lexer = {0};
+	// stb_lexer lexer = {0};
+	// const char *input_stream = sb.items;
+	// const char *input_stream_end = sb.items + sb.count;
+	// stb_c_lexer_init(&lexer, input_stream, input_stream_end,
+	//		string_store, store_length);
+
+	// while (stb_c_lexer_get_token(&lexer)) {
+	// 	if (lexer.token == CLEX_parse_error) {
+	// 		fprintf(stderr, "\n<<<PARSE ERROR>>>\n");
+	// 		break;
+	// 	}
+	// 	print_token(stdout, &lexer);
+	// 	fprintf(stdout, "  ");
+	// }
+	// fprintf(stdout, "\n");
+
+	Context context = {
+		.arena = {0},
+	};
+
+	Parser parser = {
+		.context = &context,
+		.lexer = {0},
+		.had_error = false,
+	};
+
 	const char *input_stream = sb.items;
 	const char *input_stream_end = sb.items + sb.count;
-	stb_c_lexer_init(&lexer, input_stream, input_stream_end,
+	stb_c_lexer_init(&parser.lexer, input_stream, input_stream_end,
 			string_store, store_length);
 
-	while (stb_c_lexer_get_token(&lexer)) {
-		if (lexer.token == CLEX_parse_error) {
-			fprintf(stderr, "\n<<<PARSE ERROR>>>\n");
-			break;
-		}
-		print_token(&lexer);
-		printf("  ");
-	}
+	Node *root = parser_parse(&parser);
+	(void) root;
 
 	return 0;
 }