summary refs log tree commit diff
path: root/src/lexer.c
diff options
context:
space:
mode:
author Johnny Richard <johnny@johnnyrichard.com> 2025-04-11 01:15:01 +0200
committer Johnny Richard <johnny@johnnyrichard.com> 2025-04-14 23:11:22 +0200
commit e7f69c8fbbbcbddde84933b2becd91e787d1ac63 (patch)
tree 16cd17da17133494dd06aab614724e76b059d4ad /src/lexer.c
Initial commit
Signed-off-by: Johnny Richard <johnny@johnnyrichard.com>
Diffstat (limited to 'src/lexer.c')
-rw-r--r-- src/lexer.c 253
1 files changed, 253 insertions, 0 deletions
diff --git a/src/lexer.c b/src/lexer.c
new file mode 100644
index 0000000..9e9ab90
--- /dev/null
+++ b/src/lexer.c
@@ -0,0 +1,253 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <ctype.h>
+#include "utils.h"
+#include "array.h"
+#include "string_view.h"
+#include "lexer.h"
+
+/**
+ * Prepare `lexer` to scan the file named `file_name`.
+ *
+ * The file is loaded through read_file_contents(). If that returns NULL an
+ * error message is written to stderr and the process exits with
+ * EXIT_FAILURE. Otherwise the lexer records `file_name`, a zero-initialised
+ * location, and the string view that string_view_from_cstr() builds from the
+ * loaded buffer.
+ *
+ * `lexer` must not be NULL (checked with assert).
+ */
+void
+lexer_init(lexer_t *lexer, char *file_name)
+{
+    assert(lexer);
+
+    char *contents = read_file_contents(file_name);
+    if (!contents) {
+        fprintf(stderr, "Unable to read file <%s>\n", file_name);
+        exit(EXIT_FAILURE);
+    }
+
+    lexer->source = string_view_from_cstr(contents);
+    lexer->loc = (lex_loc_t) { 0 };
+    lexer->file_name = file_name;
+}
+
+/**
+ * Return true once the lexer's offset has reached (or passed) the size of
+ * its source view, i.e. there is no character left to read.
+ */
+bool
+lexer_is_eof(lexer_t *lexer)
+{
+    return lexer->loc.offset >= lexer->source.size;
+}
+
+/**
+ * Return the character stored at the lexer's current offset.
+ *
+ * No bounds check is performed here; callers use lexer_is_eof() to decide
+ * whether the offset still lies inside the source.
+ */
+char
+lexer_current_char(lexer_t *lexer)
+{
+    const char *text = lexer->source.chars;
+
+    return text[lexer->loc.offset];
+}
+
+/**
+ * Report whether `c` is a whitespace character other than '\n'.
+ *
+ * Newline is excluded because lexer_next_token() turns it into a token of
+ * its own (TOKEN_EOS) instead of skipping it.
+ *
+ * @param c character to classify
+ * @return true for whitespace except newline, false otherwise
+ */
+static bool
+_isspace(char c)
+{
+    /*
+     * The <ctype.h> classifiers require an argument representable as
+     * unsigned char (or EOF). Plain char may be signed, so a byte >= 0x80
+     * would otherwise be passed as a negative value, which is undefined.
+     */
+    return c != '\n' && isspace((unsigned char) c);
+}
+
+/**
+ * Advance the lexer by one character and return the character at the new
+ * offset.
+ *
+ * The lexer must not already be at the end of its source (asserted). When
+ * the character being left behind is '\n', the line number is incremented
+ * and the line offset is set to the new offset, i.e. the start of the next
+ * line.
+ */
+char
+lexer_next_char(lexer_t *lexer)
+{
+    assert(lexer->loc.offset < lexer->source.size);
+
+    bool leaving_newline = lexer_current_char(lexer) == '\n';
+
+    lexer->loc.offset++;
+    if (leaving_newline) {
+        lexer->loc.lineno++;
+        lexer->loc.lineoffset = lexer->loc.offset;
+    }
+
+    return lexer_current_char(lexer);
+}
+
+/* Spelling of every keyword together with the token kind it produces. */
+static const struct {
+    char *spelling;
+    token_kind_t kind;
+} lexer_keywords[] = {
+    { "push",   TOKEN_KW_PUSH   },
+    { "dup",    TOKEN_KW_DUP    },
+    { "copy",   TOKEN_KW_COPY   },
+    { "swap",   TOKEN_KW_SWAP   },
+    { "drop",   TOKEN_KW_DROP   },
+    { "slide",  TOKEN_KW_SLIDE  },
+    { "add",    TOKEN_KW_ADD    },
+    { "sub",    TOKEN_KW_SUB    },
+    { "mul",    TOKEN_KW_MUL    },
+    { "div",    TOKEN_KW_DIV    },
+    { "mod",    TOKEN_KW_MOD    },
+    { "store",  TOKEN_KW_STORE  },
+    { "load",   TOKEN_KW_LOAD   },
+    { "call",   TOKEN_KW_CALL   },
+    { "ret",    TOKEN_KW_RET    },
+    { "jmp",    TOKEN_KW_JMP    },
+    { "jz",     TOKEN_KW_JMPZ   },
+    { "jn",     TOKEN_KW_JMPN   },
+    { "printi", TOKEN_KW_PRINTI },
+    { "printc", TOKEN_KW_PRINTC },
+    { "readi",  TOKEN_KW_READI  },
+    { "readc",  TOKEN_KW_READC  },
+    { "end",    TOKEN_KW_END    },
+};
+
+/* Return the keyword kind spelled by `word`, or TOKEN_IDENT if it is not a keyword. */
+static token_kind_t
+lexer_keyword_kind(string_view_t word)
+{
+    size_t count = sizeof(lexer_keywords) / sizeof(lexer_keywords[0]);
+
+    for (size_t i = 0; i < count; i++) {
+        if (string_view_eq(word, string_view_from_cstr(lexer_keywords[i].spelling))) {
+            return lexer_keywords[i].kind;
+        }
+    }
+    return TOKEN_IDENT;
+}
+
+/* Fill `token` as a one-character token of `kind` at the current position, then advance. */
+static void
+lexer_emit_char_token(lexer_t *lexer, token_t *token, token_kind_t kind)
+{
+    token->kind = kind;
+    token->loc = lexer->loc;
+    token->value = (string_view_t) { .size = 1, .chars = lexer->source.chars + lexer->loc.offset };
+    lexer_next_char(lexer);
+}
+
+/**
+ * Read the next token from `lexer` into `*token`.
+ *
+ * Whitespace other than '\n' is skipped first. Then, depending on the
+ * current character:
+ *   - end of source          -> TOKEN_EOF (all other fields zeroed)
+ *   - '\n'                   -> TOKEN_EOS
+ *   - ':'                    -> TOKEN_COLON
+ *   - letter                 -> a run of letters/digits; a keyword kind if
+ *                               the spelling matches one, else TOKEN_IDENT
+ *   - digit                  -> a run of digits as TOKEN_NUMBER
+ *   - anything else          -> a one-character TOKEN_UNKOWN
+ *
+ * For every kind except TOKEN_EOF, `token->loc` is the location of the
+ * token's first character and `token->value` is a view into the lexer's
+ * source (no copy is made).
+ *
+ * @param lexer lexer to read from; advanced past the returned token
+ * @param token output token; always written
+ */
+void
+lexer_next_token(lexer_t *lexer, token_t *token)
+{
+    /* Newlines are significant, so _isspace() deliberately does not match them. */
+    while (!lexer_is_eof(lexer) && _isspace(lexer_current_char(lexer))) {
+        lexer_next_char(lexer);
+    }
+
+    if (lexer_is_eof(lexer)) {
+        *token = (token_t) { .kind = TOKEN_EOF };
+        return;
+    }
+
+    char c = lexer_current_char(lexer);
+
+    if (c == '\n') {
+        lexer_emit_char_token(lexer, token, TOKEN_EOS);
+        return;
+    }
+
+    if (c == ':') {
+        lexer_emit_char_token(lexer, token, TOKEN_COLON);
+        return;
+    }
+
+    /*
+     * <ctype.h> classifiers need an unsigned char value; a plain char may
+     * be negative for bytes >= 0x80, which would be undefined behaviour.
+     */
+    if (isalpha((unsigned char) c)) {
+        lex_loc_t start_loc = lexer->loc;
+        while (!lexer_is_eof(lexer) && isalnum((unsigned char) c)) {
+            c = lexer_next_char(lexer);
+        }
+        token->value = (string_view_t) {
+            .size = lexer->loc.offset - start_loc.offset,
+            .chars = lexer->source.chars + start_loc.offset
+        };
+        token->loc = start_loc;
+        token->kind = lexer_keyword_kind(token->value);
+        return;
+    }
+
+    if (isdigit((unsigned char) c)) {
+        lex_loc_t start_loc = lexer->loc;
+        while (!lexer_is_eof(lexer) && isdigit((unsigned char) c)) {
+            c = lexer_next_char(lexer);
+        }
+        token->kind = TOKEN_NUMBER;
+        token->value = (string_view_t) {
+            .size = lexer->loc.offset - start_loc.offset,
+            .chars = lexer->source.chars + start_loc.offset
+        };
+        token->loc = start_loc;
+        /*
+         * Return here: a ':' directly after the digits must be produced by
+         * the next call instead of replacing this number token.
+         */
+        return;
+    }
+
+    /*
+     * Unrecognised character: report it and step over it so the caller
+     * always receives an initialised token and the lexer always makes
+     * progress.
+     */
+    lexer_emit_char_token(lexer, token, TOKEN_UNKOWN);
+}
+
+/*
+ * Printable text for each token kind, indexed by the token kind value.
+ * Used by token_to_cstr().
+ */
+static char *token_to_cstr_table[] = {
+    /* TOKEN_KW_* entries: the text is the keyword's spelling. */
+    [TOKEN_KW_PUSH]   = "push",
+    [TOKEN_KW_DUP]    = "dup",
+    [TOKEN_KW_COPY]   = "copy",
+    [TOKEN_KW_SWAP]   = "swap",
+    [TOKEN_KW_DROP]   = "drop",
+    [TOKEN_KW_SLIDE]  = "slide",
+    [TOKEN_KW_ADD]    = "add",
+    [TOKEN_KW_SUB]    = "sub",
+    [TOKEN_KW_MUL]    = "mul",
+    [TOKEN_KW_DIV]    = "div",
+    [TOKEN_KW_MOD]    = "mod",
+    [TOKEN_KW_STORE]  = "store",
+    [TOKEN_KW_LOAD]   = "load",
+    [TOKEN_KW_CALL]   = "call",
+    [TOKEN_KW_RET]    = "ret",
+    [TOKEN_KW_JMP]    = "jmp",
+    [TOKEN_KW_JMPZ]   = "jz",
+    [TOKEN_KW_JMPN]   = "jn",
+    [TOKEN_KW_PRINTI] = "printi",
+    [TOKEN_KW_PRINTC] = "printc",
+    [TOKEN_KW_READI]  = "readi",
+    [TOKEN_KW_READC]  = "readc",
+    [TOKEN_KW_END]    = "end",
+
+    /* Remaining token kinds: descriptive names or the literal character. */
+    [TOKEN_IDENT]     = "identifier",
+    [TOKEN_NUMBER]    = "number",
+    [TOKEN_COLON]     = ":",
+    [TOKEN_EOS]       = "<eos>",
+    [TOKEN_EOF]       = "<eof>",
+    [TOKEN_UNKOWN]    = "<unkown>",
+};
+
+/**
+ * Return the printable text for the token kind `kind`.
+ *
+ * The text comes from token_to_cstr_table. A `kind` that lies outside the
+ * table, or whose table slot is empty, yields the text of TOKEN_UNKOWN
+ * rather than reading past the array or returning NULL.
+ *
+ * @param kind token kind to describe
+ * @return pointer to a string owned by the table; never NULL
+ */
+char *
+token_to_cstr(token_kind_t kind)
+{
+    size_t count = sizeof(token_to_cstr_table) / sizeof(token_to_cstr_table[0]);
+    /* Converting to size_t makes a negative kind a huge index, caught below. */
+    size_t index = (size_t) kind;
+
+    if (index >= count || token_to_cstr_table[index] == NULL) {
+        return token_to_cstr_table[TOKEN_UNKOWN];
+    }
+    return token_to_cstr_table[index];
+}