From 10bb8a05088f1d3bb24f7167f609b5f6fb0ba026 Mon Sep 17 00:00:00 2001 From: Johnny Richard Date: Wed, 30 Oct 2024 22:58:03 +0100 Subject: bootstrap project Signed-off-by: Johnny Richard --- src/lexer.c | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 src/lexer.c (limited to 'src/lexer.c') diff --git a/src/lexer.c b/src/lexer.c new file mode 100644 index 0000000..3ad8751 --- /dev/null +++ b/src/lexer.c @@ -0,0 +1,222 @@ +/* + * Copyright (C) 2025 Johnny Richard + * + * SPDX-License-Identifier: LGPL-3.0-or-later + * + * This file is part of obe. + * + * obe is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * obe is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with obe. If not, see <https://www.gnu.org/licenses/>. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +void +obe_lexer_init(obe_lexer_t* lexer, char* filename) +{ + assert(lexer); + + char* program = read_file_contents(filename); + if (program == NULL) { + fprintf(stderr, "Unable to read file contents <%s>\n", filename); + exit(EXIT_FAILURE); + } + + lexer->filename = filename; + lexer->loc = (obe_lexer_loc_t){ 0 }; + lexer->source = obe_string_from_cstr(program); +} + +bool +obe_lexer_is_eof(obe_lexer_t* lexer) +{ + return !(lexer->loc.offset < lexer->source.length); +} + +char +obe_lexer_current_char(obe_lexer_t* lexer) +{ + return lexer->source.chars[lexer->loc.offset]; +} + +char +obe_lexer_next_char(obe_lexer_t* lexer) +{ + assert(lexer->loc.offset < lexer->source.length); + + char previous_char = obe_lexer_current_char(lexer); + if (previous_char == '\n') { + lexer->loc.lineno++; + lexer->loc.lineoffset = ++lexer->loc.offset; + } else { + lexer->loc.offset++; + } + return obe_lexer_current_char(lexer); +} + +void +obe_lexer_next_token(obe_lexer_t* lexer, obe_token_t* token) +{ + if (obe_lexer_is_eof(lexer)) { + *token = (obe_token_t){ .kind = TOKEN_EOF }; + return; + } + + char c = obe_lexer_current_char(lexer); + if (isspace(c) && !obe_lexer_is_eof(lexer)) { + while (isspace(c) && !obe_lexer_is_eof(lexer)) { + c = obe_lexer_next_char(lexer); + } + } + + if (obe_lexer_is_eof(lexer)) { + *token = (obe_token_t){ .kind = TOKEN_EOF }; + return; + } + + if (isalpha(c) || c == '_') { + obe_lexer_loc_t start_loc = lexer->loc; + while ((isalnum(c) || c == '_') && !obe_lexer_is_eof(lexer)) { + c = obe_lexer_next_char(lexer); + } + obe_string_t token_value = { + .chars = lexer->source.chars + start_loc.offset, + .length = lexer->loc.offset - start_loc.offset + }; + token->value = token_value; + token->loc = start_loc; + if (obe_string_eq(token_value, obe_string_from_cstr("fn"))) { + token->kind = TOKEN_KW_FN; + return; + } + if (obe_string_eq(token_value, obe_string_from_cstr("br"))) { + 
token->kind = TOKEN_KW_BR; + return; + } + if (obe_string_eq(token_value, + obe_string_from_cstr("return"))) { + token->kind = TOKEN_KW_RETURN; + return; + } + if (obe_string_eq(token_value, obe_string_from_cstr("int"))) { + token->kind = TOKEN_INT; + return; + } + token->kind = TOKEN_IDENT; + return; + } + + if (c == '.') { + obe_lexer_loc_t start_loc = lexer->loc; + do { + c = obe_lexer_next_char(lexer); + } while ((isalnum(c) || c == '_') && !obe_lexer_is_eof(lexer)); + + obe_string_t token_value = { + .chars = lexer->source.chars + start_loc.offset, + .length = lexer->loc.offset - start_loc.offset + }; + + token->value = token_value; + token->loc = start_loc; + token->kind = TOKEN_LABEL; + return; + } + + if (isdigit(c)) { + obe_lexer_loc_t start_loc = lexer->loc; + while (isdigit(c) && !obe_lexer_is_eof(lexer)) { + c = obe_lexer_next_char(lexer); + } + obe_string_t token_value = { + .chars = lexer->source.chars + start_loc.offset, + .length = lexer->loc.offset - start_loc.offset + }; + token->kind = TOKEN_NUMBER; + token->value = token_value; + token->loc = start_loc; + return; + } + + if (c == ';') { + token->kind = TOKEN_SEMICOLON; + token->loc = lexer->loc; + token->value = (obe_string_t){ .chars = lexer->source.chars + lexer->loc.offset, .length = 1 }; + obe_lexer_next_char(lexer); + return; + } + + if (c == ':') { + token->kind = TOKEN_COLON; + token->value = (obe_string_t){ .chars = lexer->source.chars + lexer->loc.offset , .length = 1} ; + token->loc = lexer->loc; + obe_lexer_next_char(lexer); + return; + } + + if (c == '=') { + token->kind = TOKEN_EQ; + token->value = (obe_string_t){ .chars = lexer->source.chars + lexer->loc.offset, .length = 1 }; + token->loc = lexer->loc; + obe_lexer_next_char(lexer); + return; + } + + if (c == '{') { + token->kind = TOKEN_LBRACE; + token->value = (obe_string_t){ .chars = lexer->source.chars + lexer->loc.offset, .length = 1 }; + token->loc = lexer->loc; + obe_lexer_next_char(lexer); + return; + } + + if (c == '}') { 
+ token->kind = TOKEN_RBRACE; + token->value = (obe_string_t){ .chars = lexer->source.chars + lexer->loc.offset, .length = 1 }; + token->loc = lexer->loc; + obe_lexer_next_char(lexer); + return; + } + + token->kind = TOKEN_UNKOWN; + token->value = (obe_string_t){ .chars = lexer->source.chars + lexer->loc.offset, .length = 1 }; + token->loc = lexer->loc; + obe_lexer_next_char(lexer); + return; +} + +static char* token_to_cstr_table[] = { [TOKEN_KW_RETURN] = "return", + [TOKEN_KW_FN] = "fn", + [TOKEN_KW_BR] = "br", + [TOKEN_IDENT] = "", + [TOKEN_LABEL] = "