libpulsar
A modular compiler for the pulsar programming language
Loading...
Searching...
No Matches
lexer.c
Go to the documentation of this file.
1// Copyright (C) 2023 Ethan Uppal. All rights reserved.
2
3#include <string.h>
4#include <stdbool.h>
5#include <stdio.h>
6#include <ctype.h>
9#include "util/arena.h"
10#include "error/error.h"
11#include "util/mtrack.h"
12
14
// Pairs the source spelling of a token with the token type it lexes to. Used
// as the element type of the keyword_map and symbolic_map tables.
// NOTE(review): the table initializers supply three values per entry (text,
// sizeof(text) - 1, type) and the lexer reads a `.length` member, so a length
// field presumably sits between `token` and `type`; listing line 18 is absent
// from this view — confirm against the real file.
16struct ps_token_matcher {
17 STR token; // spelling of the keyword or symbol
19 enum ps_token_type type; // token type emitted when the spelling matches
20};
21
// Keyword lookup table. While token_type_keyword.h is included, ENUM(type,
// keyword) expands to one initializer {keyword, sizeof(keyword) - 1, type};
// presumably the header consists of ENUM(...) entries with string-literal
// keywords, making the middle value the spelling's length without its NUL —
// confirm against the header.
22static struct ps_token_matcher keyword_map[] = {
23#define ENUM(type, keyword) {keyword, sizeof(keyword) - 1, type},
24#include "token_type_keyword.h"
25#undef ENUM
26};
27
// Symbol/operator lookup table, scanned in order by ps_get_token. ENUM(type,
// _, token) expands to one initializer {token, sizeof(token) - 1, type}; the
// second macro argument is ignored.
// NOTE(review): listing line 30 is absent from this view; it presumably
// includes the header that supplies the ENUM(...) entries — confirm.
28static struct ps_token_matcher symbolic_map[] = {
29#define ENUM(type, _, token) {token, sizeof(token) - 1, type},
31#undef ENUM
32};
33
34/* clang-format off */
// Debug printer for struct ps_lexer_state, generated through PS_PRINT_IMPL.
// The body writes each field to stdout with printf, one per line, and prints
// the location via ps_loc_print. `self` is presumably the pointer parameter
// introduced by the PS_PRINT_IMPL macro — confirm in print.h.
35static _A(unused) PS_PRINT_IMPL(ps_lexer_state, {
36 printf("struct ps_lexer_state {\n");
37 printf(" code = %p\n", (void*)self->code);
38 printf(" loc = ");
39 ps_loc_print(&self->loc, '\n');
40 printf(" tokens = %p\n", (void*)self->tokens);
41 // printf(" defines = %p\n", self->defines);
42 printf(" in_comment = %s\n", BOOLSTR(self->in_comment));
43 printf(" par_nesting = %zu\n", self->par_nesting);
44 printf(" is_string_escaped = %s\n", BOOLSTR(self->is_string_escaped));
45 printf(" interp_string = %s\n", BOOLSTR(self->interp_string));
46 printf(" interp_par_nesting = %zu\n", self->interp_par_nesting);
47 printf("}");
48})
49
50/* State-querying */
51
55inline bool ps_lexer_is_eof(void) {
56 return lexer.loc.pos >= lexer.length;
57}
58/* clang-format on */
59
63static bool ps_lexer_will_be_eof(usize n) {
64 return lexer.loc.pos + n >= lexer.length;
65}
66
70static inline char ps_lexer_current(void) {
71 return ps_lexer_is_eof() ? EOF : lexer.code[lexer.loc.pos];
72}
73
/* Returns true if the current character equals `c`. */
static inline bool ps_lexer_is_at(char c) {
    const char here = ps_lexer_current();
    return here == c;
}
78
82static inline char ps_lexer_peek(usize n) {
83 return ps_lexer_will_be_eof(n) ? EOF : lexer.code[lexer.loc.pos + n];
84}
85
/* Returns true if the character immediately after the current one is `next`. */
static inline bool ps_lexer_next_is(char next) {
    const char following = ps_lexer_peek(1);
    return following == next;
}
92
97static inline bool _ps_lexer_next_are(STR next, usize n) {
98 // if (ps_lexer_will_be_eof(n)) {
99 // return false;
100 // }
101 for (usize i = 0; i < n; i++) {
102 if (ps_lexer_peek(i) != next[i]) {
103 return false;
104 }
105 }
106 return true;
107}
108
/* Returns true for the characters skipped between tokens: space and
 * horizontal tab. Newlines are not trimmed. */
static inline bool ps_lexer_should_trim(char c) {
    switch (c) {
        case ' ':
        case '\t': {
            return true;
        }
        default: {
            return false;
        }
    }
}
113
/* Returns true if `c` may begin an identifier: a letter, '_' or '$'. */
static inline bool ps_starts_id(char c) {
    /* <ctype.h> functions require a value representable as unsigned char (or
     * EOF); passing a negative plain char is undefined behaviour. */
    return isalpha((unsigned char)c) != 0 || c == '_' || c == '$';
}
118
/* Returns true if `c` may appear inside an identifier: a letter, a digit, '_'
 * or '$'. */
static inline bool ps_is_id_char(char c) {
    /* <ctype.h> functions require a value representable as unsigned char (or
     * EOF); passing a negative plain char is undefined behaviour. */
    return isalnum((unsigned char)c) != 0 || c == '_' || c == '$';
}
123
/* Returns true if `c` may begin a numeric literal, i.e. is a decimal digit. */
static inline bool ps_starts_num(char c) {
    /* <ctype.h> functions require a value representable as unsigned char (or
     * EOF); passing a negative plain char is undefined behaviour. */
    return isdigit((unsigned char)c) != 0;
}
128
129static inline bool ps_is_num_char(char c, enum ps_token_type base,
130 bool* encounted_dot) {
131 switch (base) {
132 case PS_TOKEN_LIT_DEC: {
133 if (c == '.') {
134 if (*encounted_dot) {
135 return false;
136 } else {
137 *encounted_dot = true;
138 return true;
139 }
140 }
141
142 return isdigit(c);
143 }
144 case PS_TOKEN_LIT_HEX: {
145 return isxdigit(c);
146 }
147 case PS_TOKEN_LIT_OCT: {
148 return c >= '0' && c <= '7';
149 }
150 case PS_TOKEN_LIT_BIN: {
151 return c == '0' || c == '1';
152 }
153 default: {
154 return false;
155 }
156 }
157}
158
// Reports whether a string literal segment starts at the current character.
// Outside string interpolation this is a double quote; while
// lexer.interp_string is set, the check starts from a ')' instead.
// NOTE(review): listing line 163 is absent from this view, so the rest of the
// ')' condition (and its terminating ';') is not visible here — confirm
// against the real file.
160static inline bool ps_lexer_starts_str(void) {
161 if (lexer.interp_string) {
162 return ps_lexer_is_at(')')
164 } else {
165 return ps_lexer_is_at('"');
166 }
167}
168
177static inline bool ps_lexer_end_str(enum ps_token_type* result) {
178 if (ps_lexer_is_at('"') && !lexer.is_string_escaped) {
179 *result = PS_TOKEN_LIT_STR;
180 return true;
181 }
182 if (ps_lexer_is_at('(') && lexer.is_string_escaped) {
183 *result = PS_TOKEN_LIT_STR_INTERP;
184 return true;
185 }
186 if (ps_lexer_is_eof()) {
187 *result = PS_TOKEN_UNKNOWN;
188 return true;
189 }
190 return false;
191}
192
/* Returns true if the current character is a single quote, i.e. a character
 * literal starts here. */
static inline bool ps_lexer_starts_char(void) {
    const char quote = '\'';
    return ps_lexer_is_at(quote);
}
196
197static inline bool ps_lexer_ends_char(enum ps_token_type* result) {
198 if (ps_lexer_is_at('\'') && !lexer.is_string_escaped) {
199 *result = PS_TOKEN_LIT_CHR;
200 return true;
201 }
202 if (ps_lexer_is_eof()) {
203 return true;
204 }
205 return false;
206}
207
208/* State-changing */
209
// Loops while the current character satisfies ps_lexer_should_trim (space or
// tab).
// NOTE(review): listing line 217 (the loop body) is absent from this view; it
// presumably advances the lexer by one character — confirm against the real
// file.
215static inline void ps_lexer_trim(void) {
216 while (ps_lexer_should_trim(ps_lexer_current())) {
218 }
219}
220
// Interior of ps_lexer_advance. The signature (listing line 230) and the
// condition opening the first block (listing line 232) are absent from this
// view, as are the bodies of the three switch cases (245, 249, 255).
231 bool this_char_escaped = false;
// Inside the block whose opening line is missing: the pending escape flag is
// cleared and this character is remembered as escaped. Presumably guarded by
// lexer.is_string_escaped — confirm.
233 lexer.is_string_escaped = false;
234 this_char_escaped = true;
235 }
236
237 const char current = ps_lexer_current();
238
// A newline bumps the line counter and resets the column.
// NOTE(review): the unconditional col++ at the end of the function then moves
// the column to 2 for the first character of the new line, whereas ps_lex
// starts the first line at column 1 — verify this is intended.
239 if (current == '\n') {
240 lexer.loc.line++;
241 lexer.loc.col = 1;
// Parentheses and backslashes are only inspected outside comments.
242 } else if (!lexer.in_comment) {
243 switch (current) {
244 case '(': {
246 break;
247 }
248 case ')': {
250 break;
251 }
252 case '\\': {
253 // If not escaped, we use this as an escape
254 if (!this_char_escaped) {
256 }
257 break;
258 }
259 }
260 }
261
// Position and column always move forward by one; no end-of-input check is
// visible in this part of the function.
262 lexer.loc.pos++;
263 lexer.loc.col++;
264}
265
// Iterates up to `n` times, stopping early once the lexer reaches the end of
// input. The end-of-input test is evaluated first, so `n` is not decremented
// when the loop stops because of it.
// NOTE(review): listing line 274 (the loop body) is absent from this view; it
// presumably calls ps_lexer_advance() — confirm against the real file.
272inline void ps_lexer_advance_n(usize n) {
273 while (!ps_lexer_is_eof() && n--) {
275 }
276}
277
279static inline void ps_lexer_begin_comment(void) {
280 lexer.in_comment = true;
281}
282
284static inline void ps_lexer_end_comment(void) {
285 lexer.in_comment = false;
286}
287
288/* Multi-character tokens */
289
// Lexes an identifier or keyword starting at the current position and adds
// the corresponding token. The caller is expected to have checked that the
// current character can start an identifier (ps_get_token does so).
291static inline void ps_lexer_id_or_keyword(void) {
// Remember where the word begins; the token is reported at this location.
292 struct ps_loc loc = lexer.loc;
293
// Count characters for as long as the current one is an identifier character.
// NOTE(review): listing line 296 is absent from this view; it presumably
// advances the lexer by one character — confirm against the real file.
294 usize length = 0;
295 do {
297 length++;
298 } while (ps_is_id_char(ps_lexer_current()));
299
// A word is a keyword when both its length and its bytes match a table entry.
300 STR start = lexer.code + loc.pos;
301 for (usize i = 0; i < lengthof(keyword_map); i++) {
302 if (length == keyword_map[i].length
303 && memcmp(start, keyword_map[i].token, length) == 0) {
304 ps_add_token_with_loc(keyword_map[i].type, length, loc);
305 return;
306 }
307 }
308
// No keyword matched: emit a plain identifier.
309 ps_add_token_with_loc(PS_TOKEN_ID, length, loc);
310}
311
// Lexes a numeric literal starting at the current position and adds its
// token. Returns 0 on success and 1 after reporting an error through ps_error.
// NOTE(review): listing lines 323, 328, 333 and 343 are absent from this view;
// they presumably skip the two-character base prefix and advance inside the
// scanning loop — confirm against the real file.
312static inline int ps_lexer_number_literal(void) {
313 struct ps_loc loc = lexer.loc;
314
315 enum ps_token_type literal_type = PS_TOKEN_LIT_DEC;
316 bool encounted_dot = false;
317
318 // Handle different bases
// A leading "0x", "0o" or "0b" selects hexadecimal, octal or binary.
319 if (ps_lexer_is_at('0')) {
320 switch (ps_lexer_peek(1)) {
321 case 'x': {
322 literal_type = PS_TOKEN_LIT_HEX;
324 break;
325 }
326 case 'o': {
327 literal_type = PS_TOKEN_LIT_OCT;
329 break;
330 }
331 case 'b': {
332 literal_type = PS_TOKEN_LIT_BIN;
334 break;
335 }
336 default: {
337 break;
338 }
339 }
340 }
341
// Scan until a character that does not belong to the literal is found.
342 while (1) {
344 const char current = ps_lexer_current();
345
346 if (!ps_is_num_char(current, literal_type, &encounted_dot)) {
// A decimal literal that contained a '.' becomes a float literal.
347 if (encounted_dot && literal_type == PS_TOKEN_LIT_DEC) {
348 literal_type = PS_TOKEN_LIT_FLT;
349
// NOTE(review): literal_type was just set to PS_TOKEN_LIT_FLT, so the second
// half of this condition is always true. Because this check is nested in the
// dot branch, a dot-less decimal followed by 'e' falls through to the
// isxdigit check below and is reported as an invalid digit — verify that is
// intended.
350 if (current == 'e' && literal_type == PS_TOKEN_LIT_FLT) {
351 break;
352 }
353 }
354
// A hexadecimal digit that the current base rejected is reported as an error.
355 if (isxdigit(current)) {
356 ps_error(PS_SCOPE_ERROR, PS_ECODE_INVALID_NUMBER, lexer.code,
357 lexer.loc, 1, "Invalid digit",
358 (literal_type == PS_TOKEN_LIT_DEC)
359 ? "Decimal numbers can only contain the digits 0-9."
360 : (literal_type == PS_TOKEN_LIT_OCT)
361 ? "Octal numbers can only contain the digits 0-7."
362 : (literal_type == PS_TOKEN_LIT_BIN)
363 ? "Binary numbers can only contain the digits 0 and 1."
364 : ps_token_type_to_string(literal_type),
365 NULL);
366 return 1;
367 }
368
369 break;
370 }
371 }
372
// The token spans from the saved start position to the current position.
373 ps_add_token_with_loc(literal_type, lexer.loc.pos - loc.pos, loc);
374
// An 'e' right after the literal is emitted as its own PS_TOKEN_E token and
// must be followed by a digit. The check below reads the current character
// after ps_add_token, so ps_add_token presumably advances past the 'e' —
// confirm.
375 if (ps_lexer_is_at('e')) {
376 ps_add_token(PS_TOKEN_E, 1);
377
378 if (ps_lexer_is_eof() || !ps_starts_num(ps_lexer_current())) {
379 ps_error(PS_SCOPE_ERROR, PS_ECODE_INVALID_NUMBER, lexer.code,
380 lexer.loc, ps_lexer_is_eof() ? 0 : 1,
381 "Expected decimal literal for scientific notation exponent "
382 "after 'e'.",
383 "Decimal literal expected here", NULL);
384 return 1;
385 }
386 }
387
388 return 0;
389}
390
// Lexes one string literal segment and adds its token. Returns 0 on success
// and 1 after reporting an unterminated string through ps_error.
// NOTE(review): listing lines 395, 403, 418 and 427 are absent from this view;
// going by the neighbouring comments they skip the opening quote, advance
// inside the loop, record the interpolation nesting and skip the closing
// character — confirm against the real file.
391static inline int ps_lexer_string_literal(void) {
// start_loc is used for error reporting; token_loc (taken below) is where the
// token's text begins.
392 const struct ps_loc start_loc = lexer.loc;
393
394 // skip past quote
396
397 const struct ps_loc token_loc = lexer.loc;
398 enum ps_token_type type = PS_TOKEN_UNKNOWN;
399
400 usize length = 0;
401
// Count characters until ps_lexer_end_str reports a terminator (or the end of
// input) and stores the resulting token type in `type`.
402 while (!ps_lexer_end_str(&type)) {
404 length++;
405 }
406
// PS_TOKEN_UNKNOWN is what ps_lexer_end_str stores at the end of input.
// NOTE(review): the error is raised only when more than one character was
// counted; a shorter unterminated string continues and is added below as a
// PS_TOKEN_UNKNOWN token — verify that is intended.
407 if (type == PS_TOKEN_UNKNOWN && length > 1) {
408 ps_error(PS_SCOPE_ERROR, PS_ECODE_MISMATCH, lexer.code, start_loc, 1,
409 "Unterminated string literal",
410 "String literal begun here was not terminated.", NULL);
411 return 1;
412 }
413
// Ending on an interpolation opener turns interpolation mode on; any other
// ending turns it off.
414 if (type == PS_TOKEN_LIT_STR_INTERP) {
415 lexer.interp_string = true;
416
417 // Start counting nesting
419
420 // Ignore the \ before the (
421 length--;
422 } else {
423 lexer.interp_string = false;
424 }
425
426 // advance past string token end character, be it '(' or '"'
428
429 // TODO: process string for escape sequences
430
431 ps_add_token_with_loc(type, length, token_loc);
432
433 return 0;
434}
435
// Lexes a character literal and adds a PS_TOKEN_LIT_CHR token. Returns 0 on
// success and 1 after reporting an empty or unterminated literal through
// ps_error.
// NOTE(review): listing lines 440, 448 and 466 are absent from this view;
// going by the neighbouring comments they skip the opening quote, advance
// inside the loop and skip the closing quote — confirm against the real file.
436static inline int ps_lexer_char_literal(void) {
437 const struct ps_loc start_loc = lexer.loc;
438
439 // skip past quote
441
442 const struct ps_loc token_loc = lexer.loc;
// ps_lexer_ends_char does not write `type` when it stops at the end of input,
// so this initial value is what marks an unterminated literal below.
443 enum ps_token_type type = PS_TOKEN_UNKNOWN;
444
445 usize length = 0;
446
447 while (!ps_lexer_ends_char(&type)) {
449 length++;
450 }
451
// The empty check runs before the unterminated check, so a literal with no
// characters counted is reported as empty even when the end of input was hit.
452 if (length == 0) {
453 ps_error(PS_SCOPE_ERROR, PS_ECODE_EMPTY_CHR_LIT, lexer.code, start_loc,
454 1, "Character literal cannot be empty", NULL, NULL);
455 return 1;
456 }
457
458 if (type == PS_TOKEN_UNKNOWN) {
459 ps_error(PS_SCOPE_ERROR, PS_ECODE_MISMATCH, lexer.code, start_loc, 1,
460 "Unterminated character literal",
461 "Character literal begun here was not terminated.", NULL);
462 return 1;
463 }
464
465 // skip past ending quote
467
468 // TODO: process char for escape sequences
469
// No limit on `length` is enforced here, so a multi-character body is passed
// through as a single PS_TOKEN_LIT_CHR token.
470 ps_add_token_with_loc(PS_TOKEN_LIT_CHR, length, token_loc);
471
472 return 0;
473}
474
475/* Main functions */
476
481
// Tail of ps_add_token_with_loc(type, length, loc); the first line of the
// signature (listing line 482) is absent from this view. Adds an element to
// lexer.tokens through ps_add and fills it in: `start` is set from
// ast_strndup over the `length` bytes at lexer.code + loc.pos (presumably an
// arena-allocated copy — confirm in arena.h), and length, loc and type are
// stored as given.
483 struct ps_loc loc) {
484 ps_add(
485 &lexer.tokens, $(token) in {
486 token->start = ast_strndup(lexer.code + loc.pos, length);
487 token->length = length;
488 token->loc = loc;
489 token->type = type;
490 });
491}
492
// Lexes the next token (or skips the next comment) at the current position.
// Returns 0 on success and 1 after reporting an error through ps_error.
// NOTE(review): listing lines 504, 506, 514, 516 and 528 are absent from this
// view; they presumably advance the lexer past the comment delimiters and
// through the comment bodies — confirm against the real file.
493static inline int ps_get_token(void) {
// Skip leading spaces and tabs.
494 ps_lexer_trim();
495
496 char current = ps_lexer_current();
// Newlines are significant and get their own token.
497 if (current == '\n') {
498 ps_add_token(PS_TOKEN_NL, 1);
499 return 0;
500 } else if (current == '/') {
501 if (ps_lexer_next_is('/')) {
502 // if we found a double matching comment //, then go till \n
503 ps_lexer_begin_comment();
// NOTE(review): the loop condition only tests for '\n'. If the input ends
// inside a line comment, ps_lexer_current() keeps returning EOF; confirm the
// missing loop body cannot spin forever in that case.
505 while (ps_lexer_current() != '\n') {
507 }
508 ps_lexer_end_comment();
509 return 0;
510 } else if (ps_lexer_next_is('*')) {
511 // if we found a multiline comment /*, then go till */
512 ps_lexer_begin_comment();
// Remember where the comment opened so an unterminated comment is reported at
// its start.
513 struct ps_loc prior_loc = lexer.loc;
515 while (!ps_lexer_next_are_lit("*/")) {
517 if (ps_lexer_is_eof()) {
518 ps_error(PS_SCOPE_ERROR, PS_ECODE_UNBALANCED_SLASH_COMMENT,
519 lexer.code, prior_loc,
520 2, // length
521 "Unbalanced slash comment",
522 "Comments starting with '/*' must be terminated with "
523 "'*/'",
524 NULL);
525 return 1;
526 }
527 }
529 ps_lexer_end_comment();
530 return 0;
531 }
// A '/' that does not open a comment falls through to the symbol table below.
532 } else if (ps_starts_id(current)) {
533 ps_lexer_id_or_keyword();
534 return 0;
535 } else if (ps_starts_num(current)) {
536 return ps_lexer_number_literal();
537 } else if (ps_lexer_starts_str()) {
538 return ps_lexer_string_literal();
539 } else if (ps_lexer_starts_char()) {
540 return ps_lexer_char_literal();
541 }
542
// Try each symbol/operator spelling in table order; the first match wins.
543 for (usize i = 0; i < lengthof(symbolic_map); i++) {
544 auto_t symbolic = symbolic_map[i];
545 if (ps_lexer_next_are(symbolic.token, symbolic.length)) {
546 ps_add_token(symbolic.type, symbolic.length);
547 return 0;
548 }
549 }
550
// Nothing matched.
// NOTE(review): this is also reached when trimming leaves the lexer at the
// end of input (current is then EOF) — confirm whether trailing spaces or
// tabs at the end of a file should be reported as an unknown token.
551 ps_error(PS_SCOPE_ERROR, PS_ECODE_UNKNOWN_TOKEN, lexer.code, lexer.loc,
552 1, // length
553 "Unknown token", "This character was not recognized.",
554 "Check for encoding errors or invalid characters.");
555
556 return 1;
557}
558
// Lexes the buffer described by `file_ctx` into tokens using the global
// `lexer` state. Returns lexer.tokens on success; returns NULL when
// lexer.tokens is null after setup or when any token failed to lex.
// NOTE(review): listing lines 577, 579, 590, 591 and 600 are absent from this
// view, so the braces below do not balance as shown. The missing lines
// presumably reset interp_par_nesting, create the token array, compare the
// error count against PS_LEXER_MAX_ERRORS and free the token array on
// failure — confirm against the real file.
559struct ps_token_arr* ps_lex(const struct ps_file_ctx* file_ctx) {
560 // if (length == -1) {
561 // length = strlen(file_ctx->buffer);
562 // }
563
// Point the lexer at the input buffer.
564 lexer.code = file_ctx->buffer;
565 lexer.length = file_ctx->length;
566
// Start at the first byte, line 1, column 1.
567 lexer.loc.filename = file_ctx->filename;
568 lexer.loc.line = 1;
569 lexer.loc.col = 1;
570 lexer.loc.pos = 0;
571
572 lexer.in_comment = false;
573 lexer.par_nesting = 0;
574
575 lexer.is_string_escaped = false;
576 lexer.interp_string = false;
578
580
581 if (!lexer.tokens) {
582 return NULL;
583 }
584
585 // Lex tokens until the end of input. A failed token marks the run as failed;
// the else branch below emits a stop message and ends the loop early.
586 int status = 0;
587 while (!ps_lexer_is_eof()) {
588 if (ps_get_token() != 0) {
589 status = 1;
592 } else {
593 ps_error(PS_SCOPE_INFO, PS_ECODE_STOP, lexer.code, lexer.loc, 1,
594 "Stopping after max errors receieved.", NULL, NULL);
595 break;
596 }
597 }
598 }
599 if (status != 0) {
601 return NULL;
602 }
603
604 return lexer.tokens;
605}
Defines an arena allocator for the compiler.
#define ast_strndup(str, len)
Definition arena.h:63
#define in
#define _A(attr)
Definition def.h:56
#define BOOLSTR(x)
Definition def.h:75
#define STR
Definition def.h:40
#define lengthof(array)
Definition def.h:55
#define auto_t
Definition def.h:53
#define usize
Definition def.h:50
#define ps_add(__arrptr, __capture)
Definition dynarr.h:50
usize ps_error_count(void)
The number of errors that have been reported in the error reporting system.
Definition error.c:240
Error reporting and displaying utilities.
void ps_add_token(enum ps_token_type type, usize length)
Definition lexer.c:477
struct ps_lexer_state lexer
Definition lexer.c:13
bool ps_lexer_is_eof(void)
Checks whether the lexer is at the end of input.
Definition lexer.c:55
void ps_lexer_advance(void)
Advances past one character in the lexer.
Definition lexer.c:230
struct ps_token_arr * ps_lex(const struct ps_file_ctx *file_ctx)
Lexes the given source code into tokens.
Definition lexer.c:559
void ps_add_token_with_loc(enum ps_token_type type, usize length, struct ps_loc loc)
Definition lexer.c:482
void ps_lexer_advance_n(usize n)
Advances past the next n characters in the lexer, updating internal state as necessary.
Definition lexer.c:272
Interface for the lexer.
#define ps_lexer_next_are(next, n)
Whether the next n characters in the lexer stream, starting from the current character,...
Definition lexer.h:51
#define PS_LEXER_MAX_ERRORS
The lexer will stop in its tracks after it encounters this many errors.
Definition lexer.h:22
static void usize struct ps_loc loc
Definition lexer.h:76
#define ps_lexer_next_are_lit(next)
Whether the next characters in the lexer stream, starting from the current character,...
Definition lexer.h:56
static void usize length
Definition lexer.h:71
Enables malloc tracking functionality.
#define PS_PRINT_IMPL(T,...)
Definition print.h:10
Represents an error or source-referencing display message.
Definition error.h:43
Information captured in a file necessary for effective info/error reporting.
Definition io.h:15
usize length
Definition io.h:18
char * buffer
Definition io.h:17
STR filename
Definition io.h:16
Represents lexer state for a single file.
Definition lexer.h:26
struct ps_loc loc
The current location of the lexer.
Definition lexer.h:29
usize length
The length of the file.
Definition lexer.h:28
usize par_nesting
Nesting level of parentheses.
Definition lexer.h:35
STR code
The source code of the file.
Definition lexer.h:27
bool is_string_escaped
Whether the next character in the string should be escaped.
Definition lexer.h:38
usize interp_par_nesting
The nesting of parentheses at the start of string interpolation.
Definition lexer.h:42
bool in_comment
Whether the lexer is currently in a comment.
Definition lexer.h:34
bool interp_string
Whether the lexer is currently in an interpolated string.
Definition lexer.h:40
struct ps_token_arr * tokens
The token array.
Definition lexer.h:30
Represents a token location.
Definition loc.h:16
STR filename
The file where the token is from.
Definition loc.h:17
usize pos
The token's position in the text.
Definition loc.h:20
usize line
The line number where the token is found.
Definition loc.h:18
usize col
The column number where the token starts.
Definition loc.h:19
STR ps_token_type_to_string(enum ps_token_type type)
Returns: the string representation of the given token type type.
Definition token.c:13
Defines a token.
#define ps_token_arr_free(arr)
Definition token.h:48
#define ps_token_arr_new()
Definition token.h:47
ps_token_type
The type of a token.
Definition token.h:18
Keyword token types.
Symbol and operator token types.