summaryrefslogtreecommitdiff
path: root/src/lexer.c
diff options
context:
space:
mode:
authorJohnny Richard <johnny@johnnyrichard.com>2024-10-30 22:58:03 +0100
committerJohnny Richard <johnny@johnnyrichard.com>2025-12-14 09:53:52 +0100
commit10bb8a05088f1d3bb24f7167f609b5f6fb0ba026 (patch)
tree7a4b3f69a461301c45204ed856b61f92a7d42233 /src/lexer.c
bootstrap projectHEADmaster
Signed-off-by: Johnny Richard <johnny@johnnyrichard.com>
Diffstat (limited to 'src/lexer.c')
-rw-r--r--src/lexer.c222
1 files changed, 222 insertions, 0 deletions
diff --git a/src/lexer.c b/src/lexer.c
new file mode 100644
index 0000000..3ad8751
--- /dev/null
+++ b/src/lexer.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (C) 2025 Johnny Richard <johnny@johnnyrichard.com>
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ *
+ * This file is part of obe.
+ *
+ * obe is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * obe is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with obe. If not, see <https://www.gnu.org/licenses/>.
+ */
+#include <assert.h>
+#include <ctype.h>
+#include <obe/lexer.h>
+#include <obe/string.h>
+#include <obe/utils.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/**
+ * Prepare a lexer to scan the contents of a source file.
+ *
+ * The file is loaded through read_file_contents() and wrapped with
+ * obe_string_from_cstr(); the lexer location is reset to all zeroes. When
+ * the file cannot be read, a message is written to stderr and the process
+ * exits with EXIT_FAILURE.
+ *
+ * @param lexer    lexer to initialise; must not be NULL (asserted).
+ * @param filename path passed to read_file_contents(). The pointer itself is
+ *                 stored in lexer->filename (no copy is made here).
+ */
+void
+obe_lexer_init(obe_lexer_t* lexer, char* filename)
+{
+    assert(lexer);
+
+    char* contents = read_file_contents(filename);
+    if (!contents) {
+        fprintf(stderr, "Unable to read file contents <%s>\n", filename);
+        exit(EXIT_FAILURE);
+    }
+
+    lexer->source = obe_string_from_cstr(contents);
+    lexer->loc = (obe_lexer_loc_t){ 0 };
+    lexer->filename = filename;
+}
+
+/**
+ * Tell whether the lexer has consumed all of its source.
+ *
+ * @param lexer lexer to inspect.
+ * @return true when the current offset is at or beyond the source length.
+ */
+bool
+obe_lexer_is_eof(obe_lexer_t* lexer)
+{
+    return lexer->loc.offset >= lexer->source.length;
+}
+
+/**
+ * Return the character at the lexer's current offset.
+ *
+ * At (or past) the end of the source this returns '\0' instead of indexing
+ * chars[length]: that read is only valid if the backing buffer happens to
+ * carry a NUL terminator, which the length-delimited string type does not
+ * promise by itself. obe_lexer_next_char() hits this case every time it
+ * consumes the last character of the input.
+ *
+ * @param lexer lexer to inspect.
+ * @return the current character, or '\0' when the lexer is at end of input.
+ */
+char
+obe_lexer_current_char(obe_lexer_t* lexer)
+{
+    if (lexer->loc.offset >= lexer->source.length) {
+        return '\0';
+    }
+    return lexer->source.chars[lexer->loc.offset];
+}
+
+/**
+ * Consume the current character and return the one that follows it.
+ *
+ * When the consumed character is a newline, the line number is incremented
+ * and lineoffset is set to the offset of the first character of the new line.
+ * Must not be called at end of input (asserted).
+ *
+ * @param lexer lexer to advance.
+ * @return the character at the new offset, as reported by
+ *         obe_lexer_current_char().
+ */
+char
+obe_lexer_next_char(obe_lexer_t* lexer)
+{
+    assert(lexer->loc.offset < lexer->source.length);
+
+    const bool leaving_line = obe_lexer_current_char(lexer) == '\n';
+
+    lexer->loc.offset++;
+    if (leaving_line) {
+        lexer->loc.lineno++;
+        lexer->loc.lineoffset = lexer->loc.offset;
+    }
+
+    return obe_lexer_current_char(lexer);
+}
+
+void
+obe_lexer_next_token(obe_lexer_t* lexer, obe_token_t* token)
+{
+ if (obe_lexer_is_eof(lexer)) {
+ *token = (obe_token_t){ .kind = TOKEN_EOF };
+ return;
+ }
+
+ char c = obe_lexer_current_char(lexer);
+ if (isspace(c) && !obe_lexer_is_eof(lexer)) {
+ while (isspace(c) && !obe_lexer_is_eof(lexer)) {
+ c = obe_lexer_next_char(lexer);
+ }
+ }
+
+ if (obe_lexer_is_eof(lexer)) {
+ *token = (obe_token_t){ .kind = TOKEN_EOF };
+ return;
+ }
+
+ if (isalpha(c) || c == '_') {
+ obe_lexer_loc_t start_loc = lexer->loc;
+ while ((isalnum(c) || c == '_') && !obe_lexer_is_eof(lexer)) {
+ c = obe_lexer_next_char(lexer);
+ }
+ obe_string_t token_value = {
+ .chars = lexer->source.chars + start_loc.offset,
+ .length = lexer->loc.offset - start_loc.offset
+ };
+ token->value = token_value;
+ token->loc = start_loc;
+ if (obe_string_eq(token_value, obe_string_from_cstr("fn"))) {
+ token->kind = TOKEN_KW_FN;
+ return;
+ }
+ if (obe_string_eq(token_value, obe_string_from_cstr("br"))) {
+ token->kind = TOKEN_KW_BR;
+ return;
+ }
+ if (obe_string_eq(token_value,
+ obe_string_from_cstr("return"))) {
+ token->kind = TOKEN_KW_RETURN;
+ return;
+ }
+ if (obe_string_eq(token_value, obe_string_from_cstr("int"))) {
+ token->kind = TOKEN_INT;
+ return;
+ }
+ token->kind = TOKEN_IDENT;
+ return;
+ }
+
+ if (c == '.') {
+ obe_lexer_loc_t start_loc = lexer->loc;
+ do {
+ c = obe_lexer_next_char(lexer);
+ } while ((isalnum(c) || c == '_') && !obe_lexer_is_eof(lexer));
+
+ obe_string_t token_value = {
+ .chars = lexer->source.chars + start_loc.offset,
+ .length = lexer->loc.offset - start_loc.offset
+ };
+
+ token->value = token_value;
+ token->loc = start_loc;
+ token->kind = TOKEN_LABEL;
+ return;
+ }
+
+ if (isdigit(c)) {
+ obe_lexer_loc_t start_loc = lexer->loc;
+ while (isdigit(c) && !obe_lexer_is_eof(lexer)) {
+ c = obe_lexer_next_char(lexer);
+ }
+ obe_string_t token_value = {
+ .chars = lexer->source.chars + start_loc.offset,
+ .length = lexer->loc.offset - start_loc.offset
+ };
+ token->kind = TOKEN_NUMBER;
+ token->value = token_value;
+ token->loc = start_loc;
+ return;
+ }
+
+ if (c == ';') {
+ token->kind = TOKEN_SEMICOLON;
+ token->loc = lexer->loc;
+ token->value = (obe_string_t){ .chars = lexer->source.chars + lexer->loc.offset, .length = 1 };
+ obe_lexer_next_char(lexer);
+ return;
+ }
+
+ if (c == ':') {
+ token->kind = TOKEN_COLON;
+ token->value = (obe_string_t){ .chars = lexer->source.chars + lexer->loc.offset , .length = 1} ;
+ token->loc = lexer->loc;
+ obe_lexer_next_char(lexer);
+ return;
+ }
+
+ if (c == '=') {
+ token->kind = TOKEN_EQ;
+ token->value = (obe_string_t){ .chars = lexer->source.chars + lexer->loc.offset, .length = 1 };
+ token->loc = lexer->loc;
+ obe_lexer_next_char(lexer);
+ return;
+ }
+
+ if (c == '{') {
+ token->kind = TOKEN_LBRACE;
+ token->value = (obe_string_t){ .chars = lexer->source.chars + lexer->loc.offset, .length = 1 };
+ token->loc = lexer->loc;
+ obe_lexer_next_char(lexer);
+ return;
+ }
+
+ if (c == '}') {
+ token->kind = TOKEN_RBRACE;
+ token->value = (obe_string_t){ .chars = lexer->source.chars + lexer->loc.offset, .length = 1 };
+ token->loc = lexer->loc;
+ obe_lexer_next_char(lexer);
+ return;
+ }
+
+ token->kind = TOKEN_UNKOWN;
+ token->value = (obe_string_t){ .chars = lexer->source.chars + lexer->loc.offset, .length = 1 };
+ token->loc = lexer->loc;
+ obe_lexer_next_char(lexer);
+ return;
+}
+
+/* Display string for each token kind, indexed by obe_token_kind_t. */
+static char* const token_to_cstr_table[] = {
+    [TOKEN_KW_RETURN] = "return",
+    [TOKEN_KW_FN] = "fn",
+    [TOKEN_KW_BR] = "br",
+    [TOKEN_IDENT] = "<ident>",
+    [TOKEN_LABEL] = "<label>",
+    [TOKEN_NUMBER] = "<number>",
+    [TOKEN_INT] = "int",
+    [TOKEN_EQ] = "=",
+    [TOKEN_COLON] = ":",
+    [TOKEN_SEMICOLON] = ";",
+    [TOKEN_LBRACE] = "{",
+    [TOKEN_RBRACE] = "}",
+    [TOKEN_EOF] = "<eof>",
+    [TOKEN_UNKOWN] = "<?unkown?>",
+};
+
+/**
+ * Return a printable name for a token kind.
+ *
+ * A kind outside the table, or one with no entry in it, yields the same
+ * string as TOKEN_UNKOWN rather than reading out of bounds or returning
+ * NULL.
+ *
+ * @param kind token kind to describe.
+ * @return a string with static storage duration; callers must not modify
+ *         or free it.
+ */
+char*
+obe_token_to_cstr(obe_token_kind_t kind)
+{
+    size_t count = sizeof token_to_cstr_table / sizeof token_to_cstr_table[0];
+
+    /* A negative kind converts to a huge size_t and is rejected too. */
+    if ((size_t)kind >= count || token_to_cstr_table[kind] == NULL) {
+        return token_to_cstr_table[TOKEN_UNKOWN];
+    }
+    return token_to_cstr_table[kind];
+}