diff options
| author | Johnny Richard <johnny@johnnyrichard.com> | 2025-04-11 01:15:01 +0200 |
|---|---|---|
| committer | Johnny Richard <johnny@johnnyrichard.com> | 2025-04-14 23:11:22 +0200 |
| commit | e7f69c8fbbbcbddde84933b2becd91e787d1ac63 (patch) | |
| tree | 16cd17da17133494dd06aab614724e76b059d4ad /src/lexer.c | |
Initial commit
Signed-off-by: Johnny Richard <johnny@johnnyrichard.com>
Diffstat (limited to 'src/lexer.c')
| -rw-r--r-- | src/lexer.c | 253 |
1 files changed, 253 insertions, 0 deletions
diff --git a/src/lexer.c b/src/lexer.c new file mode 100644 index 0000000..9e9ab90 --- /dev/null +++ b/src/lexer.c @@ -0,0 +1,253 @@ +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <ctype.h> +#include "utils.h" +#include "array.h" +#include "string_view.h" +#include "lexer.h" + +void +lexer_init(lexer_t *lexer, char *file_name) +{ + assert(lexer); + + char *program = read_file_contents(file_name); + if (program == NULL) { + fprintf(stderr, "Unable to read file <%s>\n", file_name); + exit(EXIT_FAILURE); + } + + lexer->file_name = file_name; + lexer->loc = (lex_loc_t) { 0 }; + lexer->source = string_view_from_cstr(program); +} + +bool +lexer_is_eof(lexer_t *lexer) +{ + return !(lexer->loc.offset < lexer->source.size); +} + +char +lexer_current_char(lexer_t *lexer) +{ + return lexer->source.chars[lexer->loc.offset]; +} + +static bool +_isspace(char c) +{ + return isspace(c) && c != '\n'; +} + +char +lexer_next_char(lexer_t *lexer) +{ + assert(lexer->loc.offset < lexer->source.size); + char previous_char = lexer_current_char(lexer); + if (previous_char == '\n') { + lexer->loc.lineno++; + lexer->loc.lineoffset = ++lexer->loc.offset; + } else { + lexer->loc.offset++; + } + return lexer_current_char(lexer); +} + +void +lexer_next_token(lexer_t *lexer, token_t *token) +{ + if (lexer_is_eof(lexer)) { + *token = (token_t) { .kind = TOKEN_EOF }; + return; + } + + char c = lexer_current_char(lexer); + if (_isspace(c) && !lexer_is_eof(lexer)) { + while (_isspace(c) && !lexer_is_eof(lexer)) { + c = lexer_next_char(lexer); + } + } + + if (lexer_is_eof(lexer)) { + *token = (token_t) { .kind = TOKEN_EOF }; + return; + } + + if (c == '\n') { + token->kind = TOKEN_EOS; + token->loc = lexer->loc; + token->value = (string_view_t) { .size = 1, .chars = lexer->source.chars + lexer->loc.offset }; + lexer_next_char(lexer); + return; + } + + if (isalpha(c)) { + lex_loc_t start_loc = lexer->loc; + while (isalnum(c) && !lexer_is_eof(lexer)) { + c 
= lexer_next_char(lexer); + } + string_view_t token_value = { + .size = lexer->loc.offset - start_loc.offset, + .chars = lexer->source.chars + start_loc.offset + }; + token->value = token_value; + token->loc = start_loc; + if (string_view_eq(token_value, string_view_from_cstr("push"))) { + token->kind = TOKEN_KW_PUSH; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("dup"))) { + token->kind = TOKEN_KW_DUP; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("copy"))) { + token->kind = TOKEN_KW_COPY; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("swap"))) { + token->kind = TOKEN_KW_SWAP; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("drop"))) { + token->kind = TOKEN_KW_DROP; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("slide"))) { + token->kind = TOKEN_KW_SLIDE; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("add"))) { + token->kind = TOKEN_KW_ADD; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("sub"))) { + token->kind = TOKEN_KW_SUB; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("mul"))) { + token->kind = TOKEN_KW_MUL; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("div"))) { + token->kind = TOKEN_KW_DIV; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("mod"))) { + token->kind = TOKEN_KW_MOD; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("store"))) { + token->kind = TOKEN_KW_STORE; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("load"))) { + token->kind = TOKEN_KW_LOAD; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("call"))) { + token->kind = TOKEN_KW_CALL; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("ret"))) { + token->kind = TOKEN_KW_RET; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("jmp"))) { + token->kind = 
TOKEN_KW_JMP; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("jz"))) { + token->kind = TOKEN_KW_JMPZ; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("jn"))) { + token->kind = TOKEN_KW_JMPN; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("printi"))) { + token->kind = TOKEN_KW_PRINTI; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("printc"))) { + token->kind = TOKEN_KW_PRINTC; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("readi"))) { + token->kind = TOKEN_KW_READI; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("readc"))) { + token->kind = TOKEN_KW_READC; + return; + } + if (string_view_eq(token_value, string_view_from_cstr("end"))) { + token->kind = TOKEN_KW_END; + return; + } + token->kind = TOKEN_IDENT; + return; + } + + if (isdigit(c)) { + lex_loc_t start_loc = lexer->loc; + while (isdigit(c) && !lexer_is_eof(lexer)) { + c = lexer_next_char(lexer); + } + string_view_t token_value = { + .size = lexer->loc.offset - start_loc.offset, + .chars = lexer->source.chars + start_loc.offset + }; + token->kind = TOKEN_NUMBER; + token->value = token_value; + token->loc = start_loc; + } + + if (c == ':') { + token->kind = TOKEN_COLON; + token->value = (string_view_t) { .size = 1, .chars = lexer->source.chars + lexer->loc.offset}; + token->loc = lexer->loc; + lexer_next_char(lexer); + return; + } +} + +static char *token_to_cstr_table[] = { + [TOKEN_KW_PUSH] = "push", + [TOKEN_KW_DUP] = "dup", + [TOKEN_KW_COPY] = "copy", + [TOKEN_KW_SWAP] = "swap", + [TOKEN_KW_DROP] = "drop", + [TOKEN_KW_SLIDE] = "slide", + [TOKEN_KW_ADD] = "add", + [TOKEN_KW_SUB] = "sub", + [TOKEN_KW_MUL] = "mul", + [TOKEN_KW_DIV] = "div", + [TOKEN_KW_MOD] = "mod", + [TOKEN_KW_STORE] = "store", + [TOKEN_KW_LOAD] = "load", + [TOKEN_KW_CALL] = "call", + [TOKEN_KW_RET] = "ret", + [TOKEN_KW_JMP] = "jmp", + [TOKEN_KW_JMPZ] = "jz", + [TOKEN_KW_JMPN] = "jn", + [TOKEN_KW_PRINTI] = 
"printi", + [TOKEN_KW_PRINTC] = "printc", + [TOKEN_KW_READI] = "readi", + [TOKEN_KW_READC] = "readc", + [TOKEN_KW_END] = "end", + [TOKEN_IDENT] = "identifier", + [TOKEN_EOS] = "<eos>", + [TOKEN_NUMBER] = "number", + [TOKEN_COLON] = ":", + [TOKEN_UNKOWN] = "<unkown>", + [TOKEN_EOF] = "<eof>", +}; + +char * +token_to_cstr(token_kind_t kind) +{ + return token_to_cstr_table[kind]; +} |
