From 10bb8a05088f1d3bb24f7167f609b5f6fb0ba026 Mon Sep 17 00:00:00 2001
From: Johnny Richard <johnny@johnnyrichard.com>
Date: Wed, 30 Oct 2024 22:58:03 +0100
Subject: bootstrap project

Signed-off-by: Johnny Richard <johnny@johnnyrichard.com>
---
 src/lexer.c | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 222 insertions(+)
 create mode 100644 src/lexer.c

(limited to 'src/lexer.c')

diff --git a/src/lexer.c b/src/lexer.c
new file mode 100644
index 0000000..3ad8751
--- /dev/null
+++ b/src/lexer.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (C) 2025 Johnny Richard <johnny@johnnyrichard.com>
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ *
+ * This file is part of obe.
+ *
+ * obe is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * obe is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with obe. If not, see <https://www.gnu.org/licenses/>.
+ */
+#include <assert.h>
+#include <ctype.h>
+#include <obe/lexer.h>
+#include <obe/string.h>
+#include <obe/utils.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Read `filename` into `lexer` and zero its location; exits on read failure. */
+void
+obe_lexer_init(obe_lexer_t* lexer, char* filename)
+{
+    assert(lexer);
+
+    char* contents = read_file_contents(filename);
+    if (!contents) {
+        fprintf(stderr, "Unable to read file contents <%s>\n", filename);
+        exit(EXIT_FAILURE);
+    }
+    lexer->source = obe_string_from_cstr(contents);
+    lexer->loc = (obe_lexer_loc_t){ 0 };
+    lexer->filename = filename;
+}
+
+bool
+obe_lexer_is_eof(obe_lexer_t* lexer)
+{
+    return lexer->loc.offset >= lexer->source.length; /* at or past the end */
+}
+
+char
+obe_lexer_current_char(obe_lexer_t* lexer)
+{
+    return lexer->source.chars[lexer->loc.offset]; /* no bounds check here */
+}
+
+/* Step past the current character and return the one now under the cursor. */
+char
+obe_lexer_next_char(obe_lexer_t* lexer)
+{
+    assert(lexer->loc.offset < lexer->source.length);
+    bool leaving_newline = obe_lexer_current_char(lexer) == '\n';
+
+    lexer->loc.offset++;
+    if (leaving_newline) {
+        lexer->loc.lineno++;
+        lexer->loc.lineoffset = lexer->loc.offset; /* new line starts here */
+    }
+    return obe_lexer_current_char(lexer);
+}
+
+/* <ctype.h> classifiers are undefined for negative arguments other than EOF,
+ * and plain `char` may be signed, so the wrappers below convert each byte to
+ * `unsigned char` before classifying it. */
+static bool
+lexer_is_space(char c)
+{
+    return isspace((unsigned char)c) != 0;
+}
+
+/* True for a character that may start a word: a letter or `_`. */
+static bool
+lexer_is_word_start(char c)
+{
+    return c == '_' || isalpha((unsigned char)c) != 0;
+}
+
+/* True for a character that may continue a word: a letter, digit or `_`. */
+static bool
+lexer_is_word_part(char c)
+{
+    return c == '_' || isalnum((unsigned char)c) != 0;
+}
+
+/* True for a decimal digit. */
+static bool
+lexer_is_digit(char c)
+{
+    return isdigit((unsigned char)c) != 0;
+}
+
+/* Advance the lexer while `accepts` holds for the current character and the
+ * end of the source has not been reached. */
+static void
+lexer_skip_while(obe_lexer_t* lexer, bool (*accepts)(char))
+{
+    while (!obe_lexer_is_eof(lexer) && accepts(obe_lexer_current_char(lexer))) {
+        obe_lexer_next_char(lexer);
+    }
+}
+
+/* Store `kind`, the start location and the source slice running from `start`
+ * to the current offset into `token`. The slice points into the lexer's source
+ * buffer; nothing is copied. */
+static void
+lexer_fill_token(obe_lexer_t* lexer,
+                 obe_token_t* token,
+                 obe_token_kind_t kind,
+                 obe_lexer_loc_t start)
+{
+    token->kind = kind;
+    token->loc = start;
+    token->value = (obe_string_t){ .chars = lexer->source.chars + start.offset,
+                                   .length = lexer->loc.offset - start.offset };
+}
+
+/* Map a scanned word to its keyword kind, or TOKEN_IDENT when it is not one. */
+static obe_token_kind_t
+lexer_word_kind(obe_string_t word)
+{
+    static const struct { char* text; obe_token_kind_t kind; } keywords[] = {
+        { "fn", TOKEN_KW_FN },
+        { "br", TOKEN_KW_BR },
+        { "return", TOKEN_KW_RETURN },
+        { "int", TOKEN_INT },
+    };
+
+    for (size_t i = 0; i < sizeof(keywords) / sizeof(keywords[0]); i++) {
+        if (obe_string_eq(word, obe_string_from_cstr(keywords[i].text))) {
+            return keywords[i].kind;
+        }
+    }
+    return TOKEN_IDENT;
+}
+
+/*
+ * Scan the next token from `lexer` into `token`.
+ *
+ * Leading whitespace is skipped. At the end of the source `*token` is reset to
+ * a zeroed token of kind TOKEN_EOF. Otherwise kind, loc and value are set, with
+ * value referencing the matched characters inside the source buffer:
+ *   - a letter or `_` followed by letters, digits or `_` yields a keyword
+ *     kind (fn, br, return, int) or TOKEN_IDENT;
+ *   - `.` followed by letters, digits or `_` yields TOKEN_LABEL;
+ *   - a run of digits yields TOKEN_NUMBER;
+ *   - `;` `:` `=` `{` `}` yield their punctuation kinds;
+ *   - any other character yields a one-character TOKEN_UNKOWN.
+ */
+void
+obe_lexer_next_token(obe_lexer_t* lexer, obe_token_t* token)
+{
+    lexer_skip_while(lexer, lexer_is_space);
+    if (obe_lexer_is_eof(lexer)) {
+        *token = (obe_token_t){ .kind = TOKEN_EOF };
+        return;
+    }
+
+    obe_lexer_loc_t start = lexer->loc;
+    char c = obe_lexer_current_char(lexer);
+
+    if (lexer_is_word_start(c)) {
+        lexer_skip_while(lexer, lexer_is_word_part);
+        lexer_fill_token(lexer, token, TOKEN_IDENT, start);
+        token->kind = lexer_word_kind(token->value);
+        return;
+    }
+    if (c == '.') {
+        obe_lexer_next_char(lexer); /* the dot belongs to the label text */
+        lexer_skip_while(lexer, lexer_is_word_part);
+        lexer_fill_token(lexer, token, TOKEN_LABEL, start);
+        return;
+    }
+    if (lexer_is_digit(c)) {
+        lexer_skip_while(lexer, lexer_is_digit);
+        lexer_fill_token(lexer, token, TOKEN_NUMBER, start);
+        return;
+    }
+
+    /* Everything else is a single-character token. */
+    obe_token_kind_t kind;
+    switch (c) {
+        case ';': kind = TOKEN_SEMICOLON; break;
+        case ':': kind = TOKEN_COLON; break;
+        case '=': kind = TOKEN_EQ; break;
+        case '{': kind = TOKEN_LBRACE; break;
+        case '}': kind = TOKEN_RBRACE; break;
+        default: kind = TOKEN_UNKOWN; break;
+    }
+    obe_lexer_next_char(lexer);
+    lexer_fill_token(lexer, token, kind, start);
+}
+
+/* Display text for each token kind; the array is indexed by the kind value. */
+static char* token_to_cstr_table[] = { [TOKEN_EOF] = "<eof>",
+                                       [TOKEN_UNKOWN] = "<?unkown?>",
+                                       [TOKEN_IDENT] = "<ident>",
+                                       [TOKEN_LABEL] = "<label>",
+                                       [TOKEN_NUMBER] = "<number>",
+                                       [TOKEN_INT] = "int",
+                                       [TOKEN_KW_FN] = "fn",
+                                       [TOKEN_KW_BR] = "br",
+                                       [TOKEN_KW_RETURN] = "return",
+                                       [TOKEN_EQ] = "=", [TOKEN_COLON] = ":",
+                                       [TOKEN_SEMICOLON] = ";",
+                                       [TOKEN_LBRACE] = "{",
+                                       [TOKEN_RBRACE] = "}" };
+
+char*
+obe_token_to_cstr(obe_token_kind_t kind)
+{
+    return token_to_cstr_table[kind]; /* `kind` is not range-checked */
+}
-- 
cgit v1.2.3