/*
 * Copyright (C) 2023 Johnny Richard
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */
#include "lexer.h"

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Initializes the lexer for the given source file and eagerly loads the
 * whole file into memory (lexer->src).  Rows and columns are 0-based.
 * Ownership: lexer->src is heap-allocated here; the caller is responsible
 * for freeing it when done with the lexer.
 */
void
lexer_init(lexer_t *lexer, char *filepath)
{
  assert(lexer && "lexer must be defined");
  assert(filepath && "filepath must be defined");
  lexer->filepath = filepath;
  lexer->srclen = 0;
  lexer->cur = 0;
  lexer->row = 0;
  lexer->bol = 0;
  lexer_load_file_contents(lexer);
}

/*
 * Fills in token metadata for a single-character token located at the
 * current cursor position.  Does NOT advance the cursor.
 */
static void
lexer_define_literal_token_props(lexer_t *lexer, token_t *token, token_kind_t kind)
{
  token->kind = kind;
  token->value = string_view_new(lexer->src + lexer->cur, 1);
  token->filepath = lexer->filepath;
  token->row = lexer->row;
  token->col = lexer->cur - lexer->bol;
  token->bol = lexer->bol;
}

/*
 * Skips whitespace, tracking line numbers: every '\n' bumps the row and
 * records the beginning-of-line offset used for column computation.
 */
static void
lexer_drop_spaces(lexer_t *lexer)
{
  /* Cast to unsigned char: passing a negative char to <ctype.h> is UB. */
  while (lexer_is_not_eof(lexer) && isspace((unsigned char)lexer_current_char(lexer))) {
    if (lexer_current_char(lexer) == '\n') {
      lexer_drop_char(lexer);
      lexer->row++;
      lexer->bol = lexer->cur;
      continue;
    }
    lexer_drop_char(lexer);
  }
}

/*
 * Builds a token whose text spans [position, lexer->cur).  The column is
 * derived from `position`, so the token points at its first character.
 */
static void
lexer_tokenize_from_given_position_to_cursor(lexer_t *lexer, token_t *token,
                                             size_t position, token_kind_t kind)
{
  token->kind = kind;
  token->value = string_view_new(lexer->src + position, lexer->cur - position);
  token->filepath = lexer->filepath;
  token->row = lexer->row;
  token->col = position - lexer->bol;
  token->bol = lexer->bol;
}

/* Consumes a run of decimal digits and emits a TOKEN_NUMBER. */
static void
lexer_tokenize_number(lexer_t *lexer, token_t *token)
{
  size_t begin = lexer->cur;
  while (lexer_is_not_eof(lexer) && isdigit((unsigned char)lexer_current_char(lexer))) {
    lexer_drop_char(lexer);
  }
  lexer_tokenize_from_given_position_to_cursor(lexer, token, begin, TOKEN_NUMBER);
}

/*
 * Consumes an identifier ([A-Za-z][A-Za-z0-9_]*) and emits a TOKEN_NAME.
 * The caller decides whether the name is actually a keyword.
 */
static void
lexer_tokenize_name(lexer_t *lexer, token_t *token)
{
  size_t begin = lexer->cur;
  while (lexer_is_not_eof(lexer)
         && (isalnum((unsigned char)lexer_current_char(lexer))
             || lexer_current_char(lexer) == '_')) {
    lexer_drop_char(lexer);
  }
  lexer_tokenize_from_given_position_to_cursor(lexer, token, begin, TOKEN_NAME);
}

/* Reclassifies a TOKEN_NAME as a keyword token when its text matches one. */
static void
lexer_token_process_keyword(token_t *token)
{
  if (string_view_eq(string_view_from_str("let"), token->value)) {
    token->kind = TOKEN_KEYWORD_LET;
    return;
  }
  if (string_view_eq(string_view_from_str("return"), token->value)) {
    token->kind = TOKEN_KEYWORD_RETURN;
    return;
  }
  if (string_view_eq(string_view_from_str("fn"), token->value)) {
    token->kind = TOKEN_KEYWORD_FN;
    return;
  }
  if (string_view_eq(string_view_from_str("if"), token->value)) {
    token->kind = TOKEN_KEYWORD_IF;
    return;
  }
  if (string_view_eq(string_view_from_str("true"), token->value)) {
    token->kind = TOKEN_TRUE;
    return;
  }
  if (string_view_eq(string_view_from_str("false"), token->value)) {
    token->kind = TOKEN_FALSE;
    return;
  }
}

/* Emits a one-character token of the given kind and advances the cursor. */
static void
lexer_tokenize_single_char(lexer_t *lexer, token_t *token, token_kind_t kind)
{
  lexer_define_literal_token_props(lexer, token, kind);
  lexer_drop_char(lexer);
}

/*
 * After a one-character token has been emitted and consumed, upgrades it to
 * a two-character token of `kind` if the next character equals `expected`.
 * Safe at end of input: src is NUL-terminated, so the peek reads '\0'.
 * Returns true when the upgrade happened.
 */
static bool
lexer_try_compound(lexer_t *lexer, token_t *token, char expected, token_kind_t kind)
{
  if (lexer_current_char(lexer) != expected) {
    return false;
  }
  lexer_drop_char(lexer);
  /* The compound token started two characters back. */
  lexer_tokenize_from_given_position_to_cursor(lexer, token, lexer->cur - 2, kind);
  return true;
}

/*
 * Produces the next token from the source.  Unknown characters yield
 * TOKEN_UNKNOWN; end of input yields TOKEN_EOF.
 */
void
lexer_next_token(lexer_t *lexer, token_t *token)
{
  lexer_drop_spaces(lexer);

  if (lexer_is_eof(lexer)) {
    lexer_define_literal_token_props(lexer, token, TOKEN_EOF);
    lexer_drop_char(lexer);
    return;
  }

  char c = lexer_current_char(lexer);

  if (isdigit((unsigned char)c)) {
    lexer_tokenize_number(lexer, token);
    return;
  }

  if (isalpha((unsigned char)c)) {
    lexer_tokenize_name(lexer, token);
    lexer_token_process_keyword(token);
    return;
  }

  switch (c) {
  case '(':
    lexer_tokenize_single_char(lexer, token, TOKEN_OPAREN);
    return;
  case ')':
    lexer_tokenize_single_char(lexer, token, TOKEN_CPAREN);
    return;
  case ':':
    lexer_tokenize_single_char(lexer, token, TOKEN_COLON);
    return;
  case ',':
    lexer_tokenize_single_char(lexer, token, TOKEN_COMMA);
    return;
  case ';':
    lexer_tokenize_single_char(lexer, token, TOKEN_SEMICOLON);
    return;
  case '{':
    lexer_tokenize_single_char(lexer, token, TOKEN_OCURLY);
    return;
  case '}':
    lexer_tokenize_single_char(lexer, token, TOKEN_CCURLY);
    return;
  case '+':
    lexer_tokenize_single_char(lexer, token, TOKEN_PLUS);
    return;
  case '-':
    lexer_tokenize_single_char(lexer, token, TOKEN_MINUS);
    return;
  case '*':
    lexer_tokenize_single_char(lexer, token, TOKEN_STAR);
    return;
  case '/':
    lexer_tokenize_single_char(lexer, token, TOKEN_SLASH);
    return;
  case '^':
    lexer_tokenize_single_char(lexer, token, TOKEN_BITWISE_XOR);
    return;
  case '~':
    lexer_tokenize_single_char(lexer, token, TOKEN_BITWISE_NOT);
    return;
  case '=':
    lexer_tokenize_single_char(lexer, token, TOKEN_ASSIGN);
    lexer_try_compound(lexer, token, '=', TOKEN_EQUAL);
    return;
  case '!':
    lexer_tokenize_single_char(lexer, token, TOKEN_NOT);
    lexer_try_compound(lexer, token, '=', TOKEN_NOT_EQUAL);
    return;
  case '>':
    lexer_tokenize_single_char(lexer, token, TOKEN_GT);
    if (lexer_try_compound(lexer, token, '=', TOKEN_GT_EQUAL)) {
      return;
    }
    lexer_try_compound(lexer, token, '>', TOKEN_BITWISE_SHIFT_RIGHT);
    return;
  case '<':
    lexer_tokenize_single_char(lexer, token, TOKEN_LT);
    if (lexer_try_compound(lexer, token, '=', TOKEN_LT_EQUAL)) {
      return;
    }
    lexer_try_compound(lexer, token, '<', TOKEN_BITWISE_SHIFT_LEFT);
    return;
  case '&':
    lexer_tokenize_single_char(lexer, token, TOKEN_BITWISE_AND);
    lexer_try_compound(lexer, token, '&', TOKEN_AND);
    return;
  case '|':
    lexer_tokenize_single_char(lexer, token, TOKEN_BITWISE_OR);
    lexer_try_compound(lexer, token, '|', TOKEN_OR);
    return;
  default:
    lexer_tokenize_single_char(lexer, token, TOKEN_UNKNOWN);
    return;
  }
}

/*
 * Reads the whole source file into a NUL-terminated heap buffer
 * (lexer->src) and records its length (lexer->srclen).
 * Exits the process on any I/O or allocation failure.
 */
void
lexer_load_file_contents(lexer_t *lexer)
{
  assert(lexer && "lexer must be defined");

  FILE *file = fopen(lexer->filepath, "r");
  if (!file) {
    fprintf(stderr, "tried to open file '%s': %s\n", lexer->filepath, strerror(errno));
    exit(EXIT_FAILURE);
  }

  /* Determine the file size; check both calls (ftell returns -1 on error,
   * which would otherwise silently become a huge srclen). */
  long filesize = -1;
  if (fseek(file, 0L, SEEK_END) == 0) {
    filesize = ftell(file);
  }
  if (filesize < 0) {
    fclose(file);
    fprintf(stderr, "could not read file '%s'\n", lexer->filepath);
    exit(EXIT_FAILURE);
  }
  rewind(file);
  lexer->srclen = (size_t)filesize;

  /* +1 so the buffer is always NUL-terminated (calloc zero-fills). */
  lexer->src = calloc(1, lexer->srclen + 1);
  if (!lexer->src) {
    fclose(file);
    perror("lexer_load_file_contents -> calloc");
    exit(EXIT_FAILURE);
  }

  /* Skip the read for empty files: fread returns 0 when size*nmemb is 0,
   * which would spuriously trip the != 1 check. */
  if (lexer->srclen > 0 && fread(lexer->src, lexer->srclen, 1, file) != 1) {
    fclose(file);
    free(lexer->src);
    // FIXME: distinguish error using ferror and feof functions
    fprintf(stderr, "could not read file '%s'\n", lexer->filepath);
    exit(EXIT_FAILURE);
  }

  fclose(file); /* was leaked on the success path before */
}

/*
 * Tokenizes `level` tokens ahead without consuming them: the cursor, row
 * and beginning-of-line state are restored afterwards.  `token` receives
 * the last token produced (or TOKEN_EOF if input ran out first).
 */
void
lexer_lookahead(lexer_t *lexer, token_t *token, size_t level)
{
  uint32_t cur = lexer->cur;
  uint32_t row = lexer->row;
  uint32_t bol = lexer->bol;
  while (level != 0) {
    lexer_next_token(lexer, token);
    level--;
    if (token->kind == TOKEN_EOF) {
      break;
    }
  }
  lexer->cur = cur;
  lexer->row = row;
  lexer->bol = bol;
}

/* Peeks the next token without consuming it. */
void
lexer_peek_next_token(lexer_t *lexer, token_t *token)
{
  lexer_lookahead(lexer, token, 1);
}

/* Consumes and discards the next token. */
void
lexer_drop_next_token(lexer_t *lexer)
{
  token_t token;
  lexer_next_token(lexer, &token);
}

/* Advances the cursor by one character. */
void
lexer_drop_char(lexer_t *lexer)
{
  lexer->cur++;
}

/* True once the cursor has reached (or passed) the end of the source. */
bool
lexer_is_eof(lexer_t *lexer)
{
  return lexer->cur >= lexer->srclen;
}

bool
lexer_is_not_eof(lexer_t *lexer)
{
  return !lexer_is_eof(lexer);
}

/* Character under the cursor; reads the trailing NUL at end of input. */
char
lexer_current_char(lexer_t *lexer)
{
  return lexer->src[lexer->cur];
}

/*
 * Human-readable spelling of a token kind, used for diagnostics.
 * Punctuation kinds return their literal spelling; structural kinds
 * return their enum name.
 */
char *
token_kind_to_str(token_kind_t kind)
{
  switch (kind) {
  case TOKEN_NAME:
    return "TOKEN_NAME";
  case TOKEN_OPAREN:
    return "(";
  case TOKEN_CPAREN:
    return ")";
  case TOKEN_COLON:
    return ":";
  case TOKEN_COMMA:
    return ",";
  case TOKEN_SEMICOLON:
    return ";";
  case TOKEN_OCURLY:
    return "{";
  case TOKEN_CCURLY:
    return "}";
  case TOKEN_NUMBER:
    return "TOKEN_NUMBER";
  case TOKEN_PLUS:
    return "+";
  case TOKEN_MINUS:
    return "-";
  case TOKEN_STAR:
    return "*";
  case TOKEN_SLASH:
    return "/";
  case TOKEN_ASSIGN:
    return "=";
  case TOKEN_EQUAL:
    return "==";
  case TOKEN_NOT:
    return "!";
  case TOKEN_NOT_EQUAL:
    return "!=";
  case TOKEN_GT:
    return ">";
  case TOKEN_GT_EQUAL:
    return ">=";
  case TOKEN_LT:
    return "<";
  case TOKEN_LT_EQUAL:
    return "<=";
  case TOKEN_AND:
    return "&&";
  case TOKEN_OR:
    return "||";
  case TOKEN_BITWISE_AND:
    return "&";
  case TOKEN_BITWISE_OR:
    return "|";
  case TOKEN_BITWISE_SHIFT_LEFT:
    return "<<";
  case TOKEN_BITWISE_SHIFT_RIGHT:
    return ">>";
  case TOKEN_BITWISE_XOR:
    return "^";
  case TOKEN_BITWISE_NOT:
    return "~";
  case TOKEN_KEYWORD_RETURN:
    return "return";
  case TOKEN_KEYWORD_FN:
    return "fn";
  case TOKEN_KEYWORD_LET:
    return "let";
  case TOKEN_KEYWORD_IF:
    return "if";
  case TOKEN_TRUE:
    return "true";
  case TOKEN_FALSE:
    return "false";
  case TOKEN_EOF:
    return "TOKEN_EOF";
  case TOKEN_UNKNOWN:
    return "TOKEN_UNKNOWN";
  }
  assert(false && "unreachable");
}