16struct ps_token_matcher {
22static struct ps_token_matcher keyword_map[] = {
23#define ENUM(type, keyword) {keyword, sizeof(keyword) - 1, type},
28static struct ps_token_matcher symbolic_map[] = {
29#define ENUM(type, _, token) {token, sizeof(token) - 1, type},
36 printf(
"struct ps_lexer_state {\n");
37 printf(
" code = %p\n", (
void*)self->code);
39 ps_loc_print(&self->loc,
'\n');
40 printf(
" tokens = %p\n", (
void*)self->tokens);
42 printf(
" in_comment = %s\n",
BOOLSTR(self->in_comment));
43 printf(
" par_nesting = %zu\n", self->par_nesting);
44 printf(
" is_string_escaped = %s\n",
BOOLSTR(self->is_string_escaped));
45 printf(
" interp_string = %s\n",
BOOLSTR(self->interp_string));
46 printf(
" interp_par_nesting = %zu\n", self->interp_par_nesting);
63static bool ps_lexer_will_be_eof(
usize n) {
70static inline char ps_lexer_current(
void) {
75static inline bool ps_lexer_is_at(
char c) {
76 return ps_lexer_current() == c;
82static inline char ps_lexer_peek(
usize n) {
89static inline bool ps_lexer_next_is(
char next) {
90 return ps_lexer_peek(1) == next;
97static inline bool _ps_lexer_next_are(
STR next,
usize n) {
101 for (
usize i = 0; i < n; i++) {
102 if (ps_lexer_peek(i) != next[i]) {
110static inline bool ps_lexer_should_trim(
char c) {
111 return c ==
' ' || c ==
'\t';
115static inline bool ps_starts_id(
char c) {
116 return isalpha(c) || c ==
'_' || c ==
'$';
120static inline bool ps_is_id_char(
char c) {
121 return isalnum(c) || c ==
'_' || c ==
'$';
125static inline bool ps_starts_num(
char c) {
129static inline bool ps_is_num_char(
char c,
enum ps_token_type base,
130 bool* encounted_dot) {
132 case PS_TOKEN_LIT_DEC: {
134 if (*encounted_dot) {
137 *encounted_dot =
true;
144 case PS_TOKEN_LIT_HEX: {
147 case PS_TOKEN_LIT_OCT: {
148 return c >=
'0' && c <=
'7';
150 case PS_TOKEN_LIT_BIN: {
151 return c ==
'0' || c ==
'1';
160static inline bool ps_lexer_starts_str(
void) {
162 return ps_lexer_is_at(
')')
165 return ps_lexer_is_at(
'"');
177static inline bool ps_lexer_end_str(
enum ps_token_type* result) {
179 *result = PS_TOKEN_LIT_STR;
183 *result = PS_TOKEN_LIT_STR_INTERP;
187 *result = PS_TOKEN_UNKNOWN;
193static inline bool ps_lexer_starts_char(
void) {
194 return ps_lexer_is_at(
'\'');
197static inline bool ps_lexer_ends_char(
enum ps_token_type* result) {
199 *result = PS_TOKEN_LIT_CHR;
215static inline void ps_lexer_trim(
void) {
216 while (ps_lexer_should_trim(ps_lexer_current())) {
231 bool this_char_escaped =
false;
234 this_char_escaped =
true;
237 const char current = ps_lexer_current();
239 if (current ==
'\n') {
254 if (!this_char_escaped) {
279static inline void ps_lexer_begin_comment(
void) {
284static inline void ps_lexer_end_comment(
void) {
291static inline void ps_lexer_id_or_keyword(
void) {
298 }
while (ps_is_id_char(ps_lexer_current()));
303 && memcmp(start, keyword_map[i].token,
length) == 0) {
312static inline int ps_lexer_number_literal(
void) {
316 bool encounted_dot =
false;
319 if (ps_lexer_is_at(
'0')) {
320 switch (ps_lexer_peek(1)) {
322 literal_type = PS_TOKEN_LIT_HEX;
327 literal_type = PS_TOKEN_LIT_OCT;
332 literal_type = PS_TOKEN_LIT_BIN;
344 const char current = ps_lexer_current();
346 if (!ps_is_num_char(current, literal_type, &encounted_dot)) {
347 if (encounted_dot && literal_type == PS_TOKEN_LIT_DEC) {
348 literal_type = PS_TOKEN_LIT_FLT;
350 if (current ==
'e' && literal_type == PS_TOKEN_LIT_FLT) {
355 if (isxdigit(current)) {
358 (literal_type == PS_TOKEN_LIT_DEC)
359 ?
"Decimal numbers can only contain the digits 0-9."
360 : (literal_type == PS_TOKEN_LIT_OCT)
361 ?
"Octal numbers can only contain the digits 0-7."
362 : (literal_type == PS_TOKEN_LIT_BIN)
363 ?
"Binary numbers can only contain the digits 0 and 1."
375 if (ps_lexer_is_at(
'e')) {
381 "Expected decimal literal for scientific notation exponent "
383 "Decimal literal expected here", NULL);
391static inline int ps_lexer_string_literal(
void) {
402 while (!ps_lexer_end_str(&type)) {
407 if (type == PS_TOKEN_UNKNOWN &&
length > 1) {
409 "Unterminated string literal",
410 "String literal begun here was not terminated.", NULL);
414 if (type == PS_TOKEN_LIT_STR_INTERP) {
436static inline int ps_lexer_char_literal(
void) {
447 while (!ps_lexer_ends_char(&type)) {
454 1,
"Character literal cannot be empty", NULL, NULL);
458 if (type == PS_TOKEN_UNKNOWN) {
460 "Unterminated character literal",
461 "Character literal begun here was not terminated.", NULL);
493static inline int ps_get_token(
void) {
496 char current = ps_lexer_current();
497 if (current ==
'\n') {
500 }
else if (current ==
'/') {
501 if (ps_lexer_next_is(
'/')) {
503 ps_lexer_begin_comment();
505 while (ps_lexer_current() !=
'\n') {
508 ps_lexer_end_comment();
510 }
else if (ps_lexer_next_is(
'*')) {
512 ps_lexer_begin_comment();
518 ps_error(PS_SCOPE_ERROR, PS_ECODE_UNBALANCED_SLASH_COMMENT,
521 "Unbalanced slash comment",
522 "Comments starting with '/*' must be terminated with "
529 ps_lexer_end_comment();
532 }
else if (ps_starts_id(current)) {
533 ps_lexer_id_or_keyword();
535 }
else if (ps_starts_num(current)) {
536 return ps_lexer_number_literal();
537 }
else if (ps_lexer_starts_str()) {
538 return ps_lexer_string_literal();
539 }
else if (ps_lexer_starts_char()) {
540 return ps_lexer_char_literal();
544 auto_t symbolic = symbolic_map[i];
553 "Unknown token",
"This character was not recognized.",
554 "Check for encoding errors or invalid characters.");
588 if (ps_get_token() != 0) {
594 "Stopping after max errors receieved.", NULL, NULL);
Defines an arena allocator for the compiler.
#define ast_strndup(str, len)
#define ps_add(__arrptr, __capture)
usize ps_error_count(void)
The number of errors that have been reported in the error reporting system.
Error reporting and displaying utilities.
void ps_add_token(enum ps_token_type type, usize length)
struct ps_lexer_state lexer
bool ps_lexer_is_eof(void)
Checks whether the lexer is at the end of input.
void ps_lexer_advance(void)
Advances past one character in the lexer.
struct ps_token_arr * ps_lex(const struct ps_file_ctx *file_ctx)
Lexes the given source code into tokens.
void ps_add_token_with_loc(enum ps_token_type type, usize length, struct ps_loc loc)
void ps_lexer_advance_n(usize n)
Advances past the next n characters in the lexer, updating internal state as necessary.
#define ps_lexer_next_are(next, n)
Whether the next n characters in the lexer stream, starting from the current character,...
#define PS_LEXER_MAX_ERRORS
The lexer will stop in its tracks after it encounters this many errors.
static void usize struct ps_loc loc
#define ps_lexer_next_are_lit(next)
Whether the next characters in the lexer stream, starting from the current character,...
Enables malloc tracking functionality.
#define PS_PRINT_IMPL(T,...)
Represents an error or source-referencing display message.
Information captured in a file necessary for effective info/error reporting.
Represents lexer state for a single file.
struct ps_loc loc
The current location of the lexer.
usize length
The length of the file.
usize par_nesting
Nesting level of parentheses.
STR code
The source code of the file.
bool is_string_escaped
Whether the next character in the string should be escaped.
usize interp_par_nesting
The nesting of parentheses at the start of string interpolation.
bool in_comment
Whether the lexer is currently in a comment.
bool interp_string
Whether the lexer is currently in an interpolated string.
struct ps_token_arr * tokens
The token array.
Represents a token location.
STR filename
The file where the token is from.
usize pos
The token's position in the text.
usize line
The line number where the token is found.
usize col
The column number where the token starts.
STR ps_token_type_to_string(enum ps_token_type type)
Returns: the string representation of the given token type type.
#define ps_token_arr_free(arr)
#define ps_token_arr_new()
ps_token_type
The type of a token.
Symbol and operator token types.