libpulsar/lexer_8c_source.html

// Copyright (C) 2023 Ethan Uppal. All rights reserved.


#include <string.h>

#include <stdbool.h>

#include <stdio.h>

#include <ctype.h>

#include "frontend/lexer/lexer.h"

#include "frontend/lexer/token.h"

#include "util/arena.h"

#include "error/error.h"

#include "util/mtrack.h"


// The global lexer state used by every function in this file; ps_lex()
// assigns its fields before lexing starts.
struct ps_lexer_state lexer;


// One entry of a token lookup table: the literal text of a token together
// with its length and the token type it maps to.
struct ps_token_matcher {

    // The exact text to match.
    STR token;

    // Number of characters in `token` (the tables in this file store
    // sizeof(text) - 1).
    usize length;

    // The token type produced when the text matches.
    enum ps_token_type type;

};


// Keyword lookup table. Each ENUM(type, keyword) entry in
// token_type_keyword.h is expanded into a {text, length, type} initializer
// (presumably one entry per language keyword -- confirm against that header).
static struct ps_token_matcher keyword_map[] = {

#define ENUM(type, keyword) {keyword, sizeof(keyword) - 1, type},

#include "token_type_keyword.h"

#undef ENUM

};


// Symbol/operator lookup table. Each ENUM(type, _, token) entry in
// token_type_symbols_operators.h is expanded into a {text, length, type}
// initializer; the second macro argument is ignored here. ps_get_token()
// scans this table in order and takes the first entry that matches.
static struct ps_token_matcher symbolic_map[] = {

#define ENUM(type, _, token) {token, sizeof(token) - 1, type},

#include "token_type_symbols_operators.h"

#undef ENUM

};


/* clang-format off */

// Debug printer for struct ps_lexer_state, generated through PS_PRINT_IMPL.
// The body prints the fields of `self` with printf, one per line.
// NOTE(review): _A(unused) presumably expands to an "unused" attribute so the
// generated function does not trigger warnings when never called -- confirm.
static _A(unused) PS_PRINT_IMPL(ps_lexer_state, {

    printf("struct ps_lexer_state {\n");

    printf("  code = %p\n", (void*)self->code);

    printf("  loc = ");

    ps_loc_print(&self->loc, '\n');

    printf("  tokens = %p\n", (void*)self->tokens);

    // printf("  defines = %p\n", self->defines);

    printf("  in_comment = %s\n", BOOLSTR(self->in_comment));

    printf("  par_nesting = %zu\n", self->par_nesting);

    printf("  is_string_escaped = %s\n", BOOLSTR(self->is_string_escaped));

    printf("  interp_string = %s\n", BOOLSTR(self->interp_string));

    printf("  interp_par_nesting = %zu\n", self->interp_par_nesting);

    printf("}");

})


/* State-querying */


// Reports whether the lexer position has reached (or passed) the end of the
// input buffer.
inline bool ps_lexer_is_eof(void) {
    const bool has_input_left = lexer.loc.pos < lexer.length;
    return !has_input_left;
}


/* clang-format on */


// Reports whether the position `n` characters ahead of the current one lies
// at or beyond the end of the input buffer.
static bool ps_lexer_will_be_eof(usize n) {
    const usize target = lexer.loc.pos + n;
    return target >= lexer.length;
}


// Returns the character at the current lexer position, or EOF (narrowed to
// char) once the end of the input has been reached.
static inline char ps_lexer_current(void) {
    if (ps_lexer_is_eof()) {
        return EOF;
    }
    return lexer.code[lexer.loc.pos];
}


// Reports whether the character at the current lexer position equals `c`.
static inline bool ps_lexer_is_at(char c) {
    const char here = ps_lexer_current();
    return here == c;
}


// Returns the character `n` positions ahead of the current one without
// advancing, or EOF (narrowed to char) if that position is past the input.
static inline char ps_lexer_peek(usize n) {
    if (ps_lexer_will_be_eof(n)) {
        return EOF;
    }
    return lexer.code[lexer.loc.pos + n];
}


// Reports whether the character immediately after the current one equals
// `next`.
static inline bool ps_lexer_next_is(char next) {
    const char following = ps_lexer_peek(1);
    return following == next;
}


// Reports whether the `n` characters starting at the current lexer position
// equal the first `n` characters of `next`. Characters past the end of the
// input are read as EOF by ps_lexer_peek() and compared like any other.
static inline bool _ps_lexer_next_are(STR next, usize n) {
    usize matched = 0;
    while (matched < n && ps_lexer_peek(matched) == next[matched]) {
        matched++;
    }
    return matched == n;
}


// Reports whether `c` is horizontal whitespace (space or tab) that the lexer
// skips between tokens. Newlines are not trimmed.
static inline bool ps_lexer_should_trim(char c) {
    switch (c) {
        case ' ':
        case '\t':
            return true;
        default:
            return false;
    }
}


// Reports whether `c` may begin an identifier: a letter, '_' or '$'.
static inline bool ps_starts_id(char c) {
    // <ctype.h> functions require a value representable as unsigned char (or
    // EOF); a plain char may be negative for bytes >= 0x80.
    return isalpha((unsigned char)c) || c == '_' || c == '$';
}


// Reports whether `c` may appear inside an identifier: a letter, a digit,
// '_' or '$'.
static inline bool ps_is_id_char(char c) {
    // <ctype.h> functions require a value representable as unsigned char (or
    // EOF); a plain char may be negative for bytes >= 0x80.
    return isalnum((unsigned char)c) || c == '_' || c == '$';
}


// Reports whether `c` may begin a numeric literal, i.e. is a decimal digit.
static inline bool ps_starts_num(char c) {
    // Cast first: passing a negative plain char to isdigit() is undefined.
    return isdigit((unsigned char)c) != 0;
}


// Reports whether `c` may continue a numeric literal of the given `base`.
//
// For PS_TOKEN_LIT_DEC a single '.' is accepted: the first one sets
// *encounted_dot and returns true, any later one returns false. The other
// bases never read or write *encounted_dot. Any `base` other than the four
// literal kinds handled below yields false.
static inline bool ps_is_num_char(char c, enum ps_token_type base,
    bool* encounted_dot) {
    // <ctype.h> functions require a value representable as unsigned char (or
    // EOF); a plain char may be negative for bytes >= 0x80.
    const unsigned char uc = (unsigned char)c;

    switch (base) {
        case PS_TOKEN_LIT_DEC: {
            if (c == '.') {
                if (*encounted_dot) {
                    return false;
                }
                *encounted_dot = true;
                return true;
            }
            return isdigit(uc) != 0;
        }
        case PS_TOKEN_LIT_HEX: {
            return isxdigit(uc) != 0;
        }
        case PS_TOKEN_LIT_OCT: {
            return c >= '0' && c <= '7';
        }
        case PS_TOKEN_LIT_BIN: {
            return c == '0' || c == '1';
        }
        default: {
            return false;
        }
    }
}


// Reports whether a string segment begins at the current position.
//
// Outside an interpolated string that means a double quote. While
// lexer.interp_string is set, it means a ')' seen at a parenthesis nesting
// exactly one deeper than the nesting recorded in lexer.interp_par_nesting.
static inline bool ps_lexer_starts_str(void) {
    if (!lexer.interp_string) {
        return ps_lexer_is_at('"');
    }

    const bool at_interp_depth =
        lexer.par_nesting == lexer.interp_par_nesting + 1;
    return ps_lexer_is_at(')') && at_interp_depth;
}


// Reports whether the string segment being scanned ends at the current
// position, storing the kind of ending in *result:
//   - unescaped '"'  -> PS_TOKEN_LIT_STR
//   - escaped '('    -> PS_TOKEN_LIT_STR_INTERP
//   - end of input   -> PS_TOKEN_UNKNOWN
// When false is returned, *result is left untouched.
static inline bool ps_lexer_end_str(enum ps_token_type* result) {
    const bool escaped = lexer.is_string_escaped;

    if (!escaped && ps_lexer_is_at('"')) {
        *result = PS_TOKEN_LIT_STR;
        return true;
    }

    if (escaped && ps_lexer_is_at('(')) {
        *result = PS_TOKEN_LIT_STR_INTERP;
        return true;
    }

    if (!ps_lexer_is_eof()) {
        return false;
    }

    *result = PS_TOKEN_UNKNOWN;
    return true;
}


// Reports whether a character literal begins at the current position, i.e.
// the current character is a single quote.
static inline bool ps_lexer_starts_char(void) {
    const char quote = '\'';
    return ps_lexer_is_at(quote);
}


static inline bool ps_lexer_ends_char(enum ps_token_type* result) {

    if (ps_lexer_is_at('\'') && !lexer.is_string_escaped) {

        *result = PS_TOKEN_LIT_CHR;

        return true;

    }

    if (ps_lexer_is_eof()) {

        return true;

    }

    return false;

}


/* State-changing */


// Advances the lexer past any run of spaces and tabs at the current position.
static inline void ps_lexer_trim(void) {
    for (;;) {
        const char here = ps_lexer_current();
        if (!ps_lexer_should_trim(here)) {
            return;
        }
        ps_lexer_advance();
    }
}


// Advances the lexer by one character and updates the derived state:
//   - line/column: a newline starts a new line and the following character
//     is at column 1; any other character moves one column to the right;
//   - parenthesis nesting: '(' and ')' adjust lexer.par_nesting (not tracked
//     inside comments);
//   - escaping: an unescaped '\\' marks the next character as escaped; the
//     mark lasts for exactly one character (not tracked inside comments).
// The position is incremented even when the lexer is already at end of input.
void ps_lexer_advance(void) {
    // Consume the pending escape mark: it applies only to this character.
    const bool this_char_escaped = lexer.is_string_escaped;
    lexer.is_string_escaped = false;

    const char current = ps_lexer_current();

    lexer.loc.pos++;

    if (current == '\n') {
        lexer.loc.line++;
        // Columns are 1-based (ps_lex starts at col 1), so the character
        // after a newline is at column 1. Resetting to 1 and then also
        // incrementing would put it at column 2.
        lexer.loc.col = 1;
        return;
    }

    lexer.loc.col++;

    if (lexer.in_comment) {
        return;
    }

    switch (current) {
        case '(': {
            lexer.par_nesting++;
            break;
        }
        case ')': {
            lexer.par_nesting--;
            break;
        }
        case '\\': {
            // If not escaped, we use this as an escape
            if (!this_char_escaped) {
                lexer.is_string_escaped = true;
            }
            break;
        }
        default: {
            break;
        }
    }
}


// Advances the lexer by up to `n` characters, stopping early once the end of
// the input is reached.
inline void ps_lexer_advance_n(usize n) {
    for (usize remaining = n; remaining > 0 && !ps_lexer_is_eof();
         remaining--) {
        ps_lexer_advance();
    }
}


// Marks the lexer as being inside a comment by setting lexer.in_comment.
static inline void ps_lexer_begin_comment(void) {

    lexer.in_comment = true;

}


// Marks the lexer as no longer inside a comment by clearing lexer.in_comment.
static inline void ps_lexer_end_comment(void) {

    lexer.in_comment = false;

}


/* Multi-character tokens */


static inline void ps_lexer_id_or_keyword(void) {

    struct ps_loc loc = lexer.loc;


    usize length = 0;

    do {

        ps_lexer_advance();

        length++;

    } while (ps_is_id_char(ps_lexer_current()));


    STR start = lexer.code + loc.pos;

    for (usize i = 0; i < lengthof(keyword_map); i++) {

        if (length == keyword_map[i].length

            && memcmp(start, keyword_map[i].token, length) == 0) {

            ps_add_token_with_loc(keyword_map[i].type, length, loc);

            return;

        }

    }


    ps_add_token_with_loc(PS_TOKEN_ID, length, loc);

}


static inline int ps_lexer_number_literal(void) {

    struct ps_loc loc = lexer.loc;


    enum ps_token_type literal_type = PS_TOKEN_LIT_DEC;

    bool encounted_dot = false;


    // Handle different bases

    if (ps_lexer_is_at('0')) {

        switch (ps_lexer_peek(1)) {

            case 'x': {

                literal_type = PS_TOKEN_LIT_HEX;

                ps_lexer_advance_n(2);

                break;

            }

            case 'o': {

                literal_type = PS_TOKEN_LIT_OCT;

                ps_lexer_advance_n(2);

                break;

            }

            case 'b': {

                literal_type = PS_TOKEN_LIT_BIN;

                ps_lexer_advance_n(2);

                break;

            }

            default: {

                break;

            }

        }

    }


    while (1) {

        ps_lexer_advance();

        const char current = ps_lexer_current();


        if (!ps_is_num_char(current, literal_type, &encounted_dot)) {

            if (encounted_dot && literal_type == PS_TOKEN_LIT_DEC) {

                literal_type = PS_TOKEN_LIT_FLT;


                if (current == 'e' && literal_type == PS_TOKEN_LIT_FLT) {

                    break;

                }

            }


            if (isxdigit(current)) {

                ps_error(PS_SCOPE_ERROR, PS_ECODE_INVALID_NUMBER, lexer.code,

                    lexer.loc, 1, "Invalid digit",

                    (literal_type == PS_TOKEN_LIT_DEC)

                        ? "Decimal numbers can only contain the digits 0-9."

                    : (literal_type == PS_TOKEN_LIT_OCT)

                        ? "Octal numbers can only contain the digits 0-7."

                    : (literal_type == PS_TOKEN_LIT_BIN)

                        ? "Binary numbers can only contain the digits 0 and 1."

                        : ps_token_type_to_string(literal_type),

                    NULL);

                return 1;

            }


            break;

        }

    }


    ps_add_token_with_loc(literal_type, lexer.loc.pos - loc.pos, loc);


    if (ps_lexer_is_at('e')) {

        ps_add_token(PS_TOKEN_E, 1);


        if (ps_lexer_is_eof() || !ps_starts_num(ps_lexer_current())) {

            ps_error(PS_SCOPE_ERROR, PS_ECODE_INVALID_NUMBER, lexer.code,

                lexer.loc, ps_lexer_is_eof() ? 0 : 1,

                "Expected decimal literal for scientific notation exponent "

                "after 'e'.",

                "Decimal literal expected here", NULL);

            return 1;

        }

    }


    return 0;

}


// Lexes one string segment and appends its token.
//
// The current character is the opening delimiter accepted by
// ps_lexer_starts_str() and is skipped. Characters are then consumed until
// ps_lexer_end_str() reports an end:
//   - an unescaped '"' yields a PS_TOKEN_LIT_STR and clears
//     lexer.interp_string;
//   - an escaped '(' yields a PS_TOKEN_LIT_STR_INTERP, sets
//     lexer.interp_string and records the current parenthesis nesting in
//     lexer.interp_par_nesting;
//   - end of input is an error.
// The token text excludes the delimiters (and the backslash before '(').
//
// Returns 0 on success, or 1 after reporting an unterminated literal.
static inline int ps_lexer_string_literal(void) {
    const struct ps_loc start_loc = lexer.loc;

    // skip past the opening delimiter
    ps_lexer_advance();

    const struct ps_loc token_loc = lexer.loc;
    enum ps_token_type type = PS_TOKEN_UNKNOWN;

    usize length = 0;

    while (!ps_lexer_end_str(&type)) {
        ps_lexer_advance();
        length++;
    }

    // PS_TOKEN_UNKNOWN is only produced when the input ended first, so it
    // always means "unterminated" regardless of how many characters were
    // read. (Requiring length > 1 let short unterminated literals through as
    // PS_TOKEN_UNKNOWN tokens.)
    if (type == PS_TOKEN_UNKNOWN) {
        ps_error(PS_SCOPE_ERROR, PS_ECODE_MISMATCH, lexer.code, start_loc, 1,
            "Unterminated string literal",
            "String literal begun here was not terminated.", NULL);
        return 1;
    }

    if (type == PS_TOKEN_LIT_STR_INTERP) {
        lexer.interp_string = true;

        // Start counting nesting
        lexer.interp_par_nesting = lexer.par_nesting;

        // Ignore the \ before the (
        length--;
    } else {
        lexer.interp_string = false;
    }

    // advance past string token end character, be it '(' or '"'
    ps_lexer_advance();

    // TODO: process string for escape sequences

    ps_add_token_with_loc(type, length, token_loc);

    return 0;
}


// Lexes a character literal and appends a PS_TOKEN_LIT_CHR token whose text
// is everything between the quotes.
//
// The current character is the opening quote and is skipped. Characters are
// consumed until an unescaped closing quote or the end of input.
//
// Returns 0 on success, or 1 after reporting an unterminated literal
// (PS_ECODE_MISMATCH) or an empty literal (PS_ECODE_EMPTY_CHR_LIT).
static inline int ps_lexer_char_literal(void) {
    const struct ps_loc start_loc = lexer.loc;

    // skip past quote
    ps_lexer_advance();

    const struct ps_loc token_loc = lexer.loc;
    enum ps_token_type type = PS_TOKEN_UNKNOWN;

    usize length = 0;

    while (!ps_lexer_ends_char(&type)) {
        ps_lexer_advance();
        length++;
    }

    // Check for a missing closing quote first: a lone quote at the end of
    // the input is unterminated, not empty.
    if (type == PS_TOKEN_UNKNOWN) {
        ps_error(PS_SCOPE_ERROR, PS_ECODE_MISMATCH, lexer.code, start_loc, 1,
            "Unterminated character literal",
            "Character literal begun here was not terminated.", NULL);
        return 1;
    }

    if (length == 0) {
        ps_error(PS_SCOPE_ERROR, PS_ECODE_EMPTY_CHR_LIT, lexer.code, start_loc,
            1, "Character literal cannot be empty", NULL, NULL);
        return 1;
    }

    // skip past ending quote
    ps_lexer_advance();

    // TODO: process char for escape sequences

    ps_add_token_with_loc(PS_TOKEN_LIT_CHR, length, token_loc);

    return 0;
}


/* Main functions */


// Appends a token of `type` spanning `length` characters from the current
// lexer location, then advances the lexer past those characters.
inline void ps_add_token(enum ps_token_type type, usize length) {
    const struct ps_loc here = lexer.loc;
    ps_add_token_with_loc(type, length, here);
    ps_lexer_advance_n(length);
}


// Appends a token to lexer.tokens without moving the lexer.
//
// The new token records `type`, `length` and `loc`, and its `start` is the
// result of ast_strndup() on the `length` characters of lexer.code beginning
// at loc.pos.
// NOTE(review): ps_add / $(token) in {...} are project macros not visible
// here; presumably they grow the array and expose the new element as `token`
// -- confirm against dynarr.h.
void ps_add_token_with_loc(enum ps_token_type type, usize length,

    struct ps_loc loc) {

    ps_add(

        &lexer.tokens, $(token) in {

            token->start = ast_strndup(lexer.code + loc.pos, length);

            token->length = length;

            token->loc = loc;

            token->type = type;

        });

}


// Lexes one token, or skips one comment, at the current position.
//
// Leading spaces and tabs are skipped first. Then, in order: a newline
// becomes a PS_TOKEN_NL; "//" and "/* ... */" comments are skipped without
// producing a token; identifiers/keywords, numeric, string and character
// literals are delegated to their helpers; finally the first entry of
// symbolic_map that matches the upcoming characters is emitted.
//
// Returns 0 on success (including when only blanks or a comment were
// consumed), or 1 after an error has been reported through ps_error().
static inline int ps_get_token(void) {
    ps_lexer_trim();

    // Trailing blanks at the end of the input leave nothing to lex. Without
    // this check the end-of-input sentinel falls through every branch below
    // and is reported as an unknown token.
    if (ps_lexer_is_eof()) {
        return 0;
    }

    char current = ps_lexer_current();
    if (current == '\n') {
        ps_add_token(PS_TOKEN_NL, 1);
        return 0;
    } else if (current == '/') {
        if (ps_lexer_next_is('/')) {
            // if we found a double matching comment //, then go till \n
            ps_lexer_begin_comment();
            ps_lexer_advance_n(2);
            // Also stop at end of input: a comment on the last line need not
            // be followed by a newline, and would otherwise loop forever.
            while (!ps_lexer_is_eof() && ps_lexer_current() != '\n') {
                ps_lexer_advance();
            }
            ps_lexer_end_comment();
            return 0;
        } else if (ps_lexer_next_is('*')) {
            // if we found a multiline comment /*, then go till */
            ps_lexer_begin_comment();
            struct ps_loc prior_loc = lexer.loc;
            ps_lexer_advance_n(2);
            while (!ps_lexer_next_are_lit("*/")) {
                ps_lexer_advance();
                if (ps_lexer_is_eof()) {
                    // Do not leave the lexer flagged as inside a comment.
                    ps_lexer_end_comment();
                    ps_error(PS_SCOPE_ERROR, PS_ECODE_UNBALANCED_SLASH_COMMENT,
                        lexer.code, prior_loc,
                        2,  // length
                        "Unbalanced slash comment",
                        "Comments starting with '/*' must be terminated with "
                        "'*/'",
                        NULL);
                    return 1;
                }
            }
            ps_lexer_advance_n(2);
            ps_lexer_end_comment();
            return 0;
        }
        // A lone '/' is handled by the symbol table below.
    } else if (ps_starts_id(current)) {
        ps_lexer_id_or_keyword();
        return 0;
    } else if (ps_starts_num(current)) {
        return ps_lexer_number_literal();
    } else if (ps_lexer_starts_str()) {
        return ps_lexer_string_literal();
    } else if (ps_lexer_starts_char()) {
        return ps_lexer_char_literal();
    }

    for (usize i = 0; i < lengthof(symbolic_map); i++) {
        auto_t symbolic = symbolic_map[i];
        if (ps_lexer_next_are(symbolic.token, symbolic.length)) {
            ps_add_token(symbolic.type, symbolic.length);
            return 0;
        }
    }

    ps_error(PS_SCOPE_ERROR, PS_ECODE_UNKNOWN_TOKEN, lexer.code, lexer.loc,
        1,  // length
        "Unknown token", "This character was not recognized.",
        "Check for encoding errors or invalid characters.");

    return 1;
}


// Lexes the buffer described by `file_ctx` into a token array.
//
// Resets the global lexer state, then repeatedly calls ps_get_token() until
// the end of the input. After a failed token the lexer skips one character
// and keeps going so that several errors can be reported in one run; once
// ps_error_count() reaches PS_LEXER_MAX_ERRORS it stops.
//
// Returns the token array on success. Returns NULL if the array could not be
// created or if any error was reported; in the latter case the array is
// freed.
struct ps_token_arr* ps_lex(const struct ps_file_ctx* file_ctx) {
    lexer.code = file_ctx->buffer;
    lexer.length = file_ctx->length;

    lexer.loc.filename = file_ctx->filename;
    lexer.loc.line = 1;
    lexer.loc.col = 1;
    lexer.loc.pos = 0;

    lexer.in_comment = false;
    lexer.par_nesting = 0;

    lexer.is_string_escaped = false;
    lexer.interp_string = false;
    lexer.interp_par_nesting = 0;

    lexer.tokens = ps_token_arr_new();

    if (!lexer.tokens) {
        return NULL;
    }

    // Process tokens until PS_LEXER_MAX_ERRORS errors have been reported;
    // any error at all makes the whole lex fail.
    int status = 0;
    while (!ps_lexer_is_eof()) {
        if (ps_get_token() != 0) {
            status = 1;
            if (ps_error_count() < PS_LEXER_MAX_ERRORS) {
                // Skip the offending character and try to resynchronize.
                ps_lexer_advance();
            } else {
                ps_error(PS_SCOPE_INFO, PS_ECODE_STOP, lexer.code, lexer.loc, 1,
                    "Stopping after max errors receieved.", NULL, NULL);
                break;
            }
        }
    }

    if (status != 0) {
        ps_token_arr_free(lexer.tokens);
        // Do not leave a dangling pointer in the global lexer state.
        lexer.tokens = NULL;
        return NULL;
    }

    return lexer.tokens;
}


arena.h
Defines an arena allocator for the compiler.

ast_strndup
#define ast_strndup(str, len)
Definition arena.h:63

in
#define in

_A
#define _A(attr)
Definition def.h:56

BOOLSTR
#define BOOLSTR(x)
Definition def.h:75

STR
#define STR
Definition def.h:40

lengthof
#define lengthof(array)
Definition def.h:55

auto_t
#define auto_t
Definition def.h:53

usize
#define usize
Definition def.h:50

ps_add
#define ps_add(__arrptr, __capture)
Definition dynarr.h:50

ps_error_count
usize ps_error_count(void)
The number of errors that have been reported in the error reporting system.
Definition error.c:240

error.h
Error reporting and displaying utilities.

ps_add_token
void ps_add_token(enum ps_token_type type, usize length)
Definition lexer.c:477

lexer
struct ps_lexer_state lexer
Definition lexer.c:13

ps_lexer_is_eof
bool ps_lexer_is_eof(void)
Checks whether the lexer is at the end of input.
Definition lexer.c:55

ps_lexer_advance
void ps_lexer_advance(void)
Advances past one character in the lexer.
Definition lexer.c:230

ps_lex
struct ps_token_arr * ps_lex(const struct ps_file_ctx *file_ctx)
Lexes the given source code into tokens.
Definition lexer.c:559

ps_add_token_with_loc
void ps_add_token_with_loc(enum ps_token_type type, usize length, struct ps_loc loc)
Definition lexer.c:482

ps_lexer_advance_n
void ps_lexer_advance_n(usize n)
Advances past the next n characters in the lexer, updating internal state as necessary.
Definition lexer.c:272

lexer.h
Interface for the lexer.

ps_lexer_next_are
#define ps_lexer_next_are(next, n)
Whether the next n characters in the lexer stream, starting from the current character,...
Definition lexer.h:51

PS_LEXER_MAX_ERRORS
#define PS_LEXER_MAX_ERRORS
The lexer will stop in its tracks after it encounters this many errors.
Definition lexer.h:22

loc
static void usize struct ps_loc loc
Definition lexer.h:76

ps_lexer_next_are_lit
#define ps_lexer_next_are_lit(next)
Whether the next characters in the lexer stream, starting from the current character,...
Definition lexer.h:56

length
static void usize length
Definition lexer.h:71

mtrack.h
Enables malloc tracking functionality.

PS_PRINT_IMPL
#define PS_PRINT_IMPL(T,...)
Definition print.h:10

ps_error
Represents an error or source-referencing display message.
Definition error.h:43

ps_file_ctx
Information captured in a file necessary for effective info/error reporting.
Definition io.h:15

ps_file_ctx::length
usize length
Definition io.h:18

ps_file_ctx::buffer
char * buffer
Definition io.h:17

ps_file_ctx::filename
STR filename
Definition io.h:16

ps_lexer_state
Represents lexer state for a single file.
Definition lexer.h:26

ps_lexer_state::loc
struct ps_loc loc
The current location of the lexer.
Definition lexer.h:29

ps_lexer_state::length
usize length
The length of the file.
Definition lexer.h:28

ps_lexer_state::par_nesting
usize par_nesting
Nesting level of parentheses.
Definition lexer.h:35

ps_lexer_state::code
STR code
The source code of the file.
Definition lexer.h:27

ps_lexer_state::is_string_escaped
bool is_string_escaped
Whether the next character in the string should be escaped.
Definition lexer.h:38

ps_lexer_state::interp_par_nesting
usize interp_par_nesting
The nesting of parentheses at the start of string interpolation.
Definition lexer.h:42

ps_lexer_state::in_comment
bool in_comment
Whether the lexer is currently in a comment.
Definition lexer.h:34

ps_lexer_state::interp_string
bool interp_string
Whether the lexer is currently in an interpolated string.
Definition lexer.h:40

ps_lexer_state::tokens
struct ps_token_arr * tokens
The token array.
Definition lexer.h:30

ps_loc
Represents a token location.
Definition loc.h:16

ps_loc::filename
STR filename
The file where the token is from.
Definition loc.h:17

ps_loc::pos
usize pos
The token's position in the text.
Definition loc.h:20

ps_loc::line
usize line
The line number where the token is found.
Definition loc.h:18

ps_loc::col
usize col
The column number where the token starts.
Definition loc.h:19

ps_token_type_to_string
STR ps_token_type_to_string(enum ps_token_type type)
Returns: the string representation of the given token type type.
Definition token.c:13

token.h
Defines a token.

ps_token_arr_free
#define ps_token_arr_free(arr)
Definition token.h:48

ps_token_arr_new
#define ps_token_arr_new()
Definition token.h:47

ps_token_type
ps_token_type
The type of a token.
Definition token.h:18

token_type_keyword.h
Keyword token types.

token_type_symbols_operators.h
Symbol and operator token types.