GCC Code Coverage Report


Directory: ./
File: src/lexer/lexer.cpp
Date: 2023-04-27 00:55:30
Exec Total Coverage
Lines: 150 168 89.3%
Functions: 6 7 85.7%
Branches: 173 233 74.2%

Line Branch Exec Source
1 #include "lexer.h"
2 #include "unlex.h"
3 #include "utilities/strings.h"
4
5 namespace lython {
6
7 Dict<String, OpConfig> const& default_precedence() {
8 // clang-format off
9 static Dict<String, OpConfig> val = {
10 // Precedence, Left Associative, is_binary, is_bool, can_be_unary, kind
11 // Arithmetic
12 {"+", {20, true , tok_operator, BinaryOperator::Add, UnaryOperator::UAdd}},
13 {"-", {20, true , tok_operator, BinaryOperator::Sub, UnaryOperator::USub}},
14 {"%", {10, true , tok_operator, BinaryOperator::Mod}},
15 {"*", {30, true , tok_operator, BinaryOperator::Mult}},
16 {"**", {40, true , tok_operator, BinaryOperator::Pow}},
17 {"/", {30, true , tok_operator, BinaryOperator::Div}},
18 {"//", {30, true , tok_operator, BinaryOperator::FloorDiv}},
19 {".*", {20, true , tok_operator, BinaryOperator::EltMult}},
20 {"./", {20, true , tok_operator, BinaryOperator::EltDiv}},
21 //*/ Shorthand
22 {"+=", {50, true , tok_augassign, BinaryOperator::Add}},
23 {"-=", {50, true , tok_augassign, BinaryOperator::Sub}},
24 {"*=", {50, true , tok_augassign, BinaryOperator::Mult}},
25 {"/=", {50, true , tok_augassign, BinaryOperator::Div}},
26 {"%=", {50, true , tok_augassign, BinaryOperator::Mod}},
27 {"**=", {50, true , tok_augassign, BinaryOperator::Pow}},
28 {"//=", {50, true , tok_augassign, BinaryOperator::FloorDiv}},
29 //*/
30 // Assignment
31 {"=", {50, true , tok_assign}},
32 // Logic
33 {"~", {40, false, tok_operator, BinaryOperator::None, UnaryOperator::Invert}},
34 {"<<", {40, false, tok_operator, BinaryOperator::LShift}},
35 {">>", {40, false, tok_operator, BinaryOperator::RShift}},
36 {"^", {40, false, tok_operator, BinaryOperator::BitXor}},
37 {"&", {40, true , tok_operator, BinaryOperator::BitAnd}},
38 {"and", {40, true , tok_operator, BinaryOperator::None, UnaryOperator::None, BoolOperator::And}},
39 {"|", {40, true , tok_operator, BinaryOperator::BitOr}},
40 {"or", {40, true , tok_operator, BinaryOperator::None, UnaryOperator::None, BoolOperator::Or}},
41 {"!", {40, true , tok_operator, BinaryOperator::None, UnaryOperator::Not}},
42 {"not", {40, true , tok_operator, BinaryOperator::None, UnaryOperator::Not}},
43 // Comparison
44 {"==", {40, true , tok_operator, BinaryOperator::None, UnaryOperator::None, BoolOperator::None, CmpOperator::Eq}},
45 {"!=", {40, true , tok_operator, BinaryOperator::None, UnaryOperator::None, BoolOperator::None, CmpOperator::NotEq}},
46 {">=", {40, true , tok_operator, BinaryOperator::None, UnaryOperator::None, BoolOperator::None, CmpOperator::GtE}},
47 {"<=", {40, true , tok_operator, BinaryOperator::None, UnaryOperator::None, BoolOperator::None, CmpOperator::LtE}},
48 {">", {40, true , tok_operator, BinaryOperator::None, UnaryOperator::None, BoolOperator::None, CmpOperator::Gt}},
49 {"<", {40, true , tok_operator, BinaryOperator::None, UnaryOperator::None, BoolOperator::None, CmpOperator::Lt}},
50 // membership
51 {"in", {40, false, tok_in , BinaryOperator::None, UnaryOperator::None, BoolOperator::None, CmpOperator::In}},
52 {"not in", {40, false, tok_in , BinaryOperator::None, UnaryOperator::None, BoolOperator::None, CmpOperator::NotIn}},
53 // identity
54 {"is", {40, false, tok_operator, BinaryOperator::None, UnaryOperator::None, BoolOperator::None, CmpOperator::Is}},
55 {"is not", {40, false, tok_operator, BinaryOperator::None, UnaryOperator::None, BoolOperator::None, CmpOperator::IsNot}},
56 // Not an operator but we use same data structure for parsing
57 {"->", {10, false, tok_arrow}},
58 {":=", {10, false, tok_walrus}},
59 {":", {10, false, (TokenType)':'}},
60 {".", {60, true , tok_dot}},
61
6/11
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12872 times.
✓ Branch 3 taken 12 times.
✗ Branch 4 not taken.
✓ Branch 6 taken 12 times.
✓ Branch 10 taken 492 times.
✓ Branch 11 taken 12 times.
✗ Branch 13 not taken.
✗ Branch 14 not taken.
✗ Branch 16 not taken.
✗ Branch 17 not taken.
13388 };
62 // clang-format on
63 12884 return val;
64 24 }
65
66 std::ostream& AbstractLexer::debug_print(std::ostream& out) {
67
68 Token t = next_token();
69 int k = 1;
70 do {
71 out << fmt::format("{:4}", k) << " ";
72 t.debug_print(out) << std::endl;
73 k += 1;
74 } while ((t = next_token()));
75
76 out << fmt::format("{:4}", k) << " ";
77 t.debug_print(out) << std::endl; // eof
78
79 return out;
80 }
81
82 // print out tokens as they were input
83 std::ostream& AbstractLexer::print(std::ostream& out) {
84
85
2/2
✓ Branch 1 taken 16 times.
✓ Branch 4 taken 16 times.
16 Token t = next_token();
86 16 Unlex unlex;
87
88 do {
89
1/1
✓ Branch 1 taken 339 times.
339 unlex.format(out, t);
90
4/4
✓ Branch 1 taken 339 times.
✓ Branch 4 taken 339 times.
✓ Branch 7 taken 323 times.
✓ Branch 8 taken 16 times.
339 } while ((t = next_token()));
91
92 // send eof for reset
93
1/1
✓ Branch 1 taken 16 times.
16 unlex.format(out, t);
94
95 16 return out;
96 16 }
97
98 int Lexer::get_mode() const {
99 55 return int(_fmtstr);
100 }
101
102 void Lexer::set_mode(int mode) {
103 110 _fmtstr = mode > 0;
104 110 }
105
106 Token const& Lexer::format_tokenizer() {
107 195 char c = peek();
108 195 nextc();
109 195 return make_token(c);
110 }
111
112
113 Token const& Lexer::next_token() {
114 12838 _count += 1;
115
116 // if we peeked ahead return that one
117
2/2
✓ Branch 1 taken 168 times.
✓ Branch 2 taken 12670 times.
12838 if (_buffer.size() > 0) {
118 168 _token = _buffer[_buffer.size() - 1];
119 168 _buffer.pop_back();
120 168 return _token;
121 }
122
123
2/2
✓ Branch 0 taken 195 times.
✓ Branch 1 taken 12475 times.
12670 if (_fmtstr) {
124 195 return format_tokenizer();
125 }
126
127 12475 char c = peek();
128
129 // newline
130
2/2
✓ Branch 0 taken 1216 times.
✓ Branch 1 taken 11259 times.
12475 if (c == '\n') {
131 // Only reset current indentation once in case of double new_lines
132
2/2
✓ Branch 0 taken 578 times.
✓ Branch 1 taken 638 times.
1216 if (_cindent != 0) {
133 578 _oindent = _cindent;
134 578 _cindent = 0;
135 }
136 1216 consume();
137 1216 return make_token(tok_newline);
138 }
139
140
2/2
✓ Branch 0 taken 2528 times.
✓ Branch 1 taken 8731 times.
11259 if (c == EOF)
141 2528 return make_token(tok_eof);
142
143 // Indentation
144 // --------------------------------
145
6/6
✓ Branch 0 taken 4206 times.
✓ Branch 1 taken 4525 times.
✓ Branch 3 taken 742 times.
✓ Branch 4 taken 3464 times.
✓ Branch 5 taken 742 times.
✓ Branch 6 taken 7989 times.
8731 if (c == ' ' && empty_line()) {
146 742 int k = 1;
147 do {
148 2226 c = nextc();
149 2226 k++;
150
151
4/4
✓ Branch 0 taken 742 times.
✓ Branch 1 taken 1484 times.
✓ Branch 2 taken 740 times.
✓ Branch 3 taken 2 times.
2226 if (k == LYTHON_INDENT && c == ' ') {
152 740 consume();
153 740 break;
154 }
155
2/2
✓ Branch 0 taken 1484 times.
✓ Branch 1 taken 2 times.
1486 } while (c == ' ');
156
157 742 _cindent += LYTHON_INDENT;
158
159 // if current indent is the same do nothing
160
2/2
✓ Branch 0 taken 330 times.
✓ Branch 1 taken 412 times.
742 if (_cindent <= _oindent)
161 330 return next_token();
162
163 // else increase indent
164 412 return make_token(tok_indent);
165 }
166
167 // only broadcast desindent on actual code
168 // comments have no impact on our indentation level
169 //
170 // Doing it here brings another problem:
171 // - now comment indentation is going to change
172 // this could be a good thing as it forces comment
173 // to be at the "right" indentation
174 //
175 // but if you write a comment after a class its indentation is going to be wrong
176 //
177 // class X:
178 // # comment
179 // def __init__(self):
180 // ...
181 //
182 // becomes
183 //
184 // class X:
185 // # comment
186 // def __init__(self):
187 // ...
188 //
189 // and
190 //
191 // for i in range(10):
192 // ...
193 // # comment
194 //
195 // becomes
196 //
197 // for i in range(10):
198 // ...
199 // # comment
200 //
201 // 1) is ok, the comment was written inside a statement block
202 // 2) is problematic, the comment was written outside the block
203 // but we cannot tell until we reached a desindent block
204 // which happens AFTER the comment
205 //
206 // SOLUTION: make the parser associate comment with the upcoming statement
207 #define FORCE_COMMENT_INDENT(X) X
208
209
4/4
✓ Branch 0 taken 163 times.
✓ Branch 1 taken 7826 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 151 times.
7989 bool desindent_comment = _cindent < _oindent && c == tok_comment;
210
211
2/2
✓ Branch 0 taken 163 times.
✓ Branch 1 taken 7826 times.
7989 if (_cindent < _oindent) {
212 // TODO: this behaviour is not good for the Unlexer
213 // but it is fine for the parser
214
2/2
✓ Branch 0 taken 151 times.
✓ Branch 1 taken 12 times.
163 if (FORCE_COMMENT_INDENT(c != tok_comment)) {
215 151 _oindent -= LYTHON_INDENT;
216 151 return make_token(tok_desindent);
217 } else {
218 // reset current indent to match previous indentation level
219 // because comment indentation do not matter
220 12 _cindent = _oindent;
221 }
222 }
223
224 // remove white space
225
2/2
✓ Branch 0 taken 3794 times.
✓ Branch 1 taken 7838 times.
11632 while (c == ' ') {
226 3794 c = nextc();
227 }
228
229 // Identifiers
230 // -----------
231
4/4
✓ Branch 0 taken 3777 times.
✓ Branch 1 taken 4061 times.
✓ Branch 2 taken 15 times.
✓ Branch 3 taken 3762 times.
7838 if ((isalpha(c) || c == '_')) {
232 4076 String identifier;
233
234 // FIXME: check that ident can be an identifier
235
1/1
✓ Branch 1 taken 4076 times.
4076 identifier.push_back(c);
236
237
3/3
✓ Branch 1 taken 10425 times.
✓ Branch 4 taken 6349 times.
✓ Branch 5 taken 4076 times.
10425 while (is_identifier(c = nextc())) {
238
1/1
✓ Branch 1 taken 6349 times.
6349 identifier.push_back(c);
239 }
240
241 // is it a string operator (is, not, in, and, or) ?
242 {
243
2/2
✓ Branch 1 taken 4076 times.
✓ Branch 4 taken 4076 times.
4076 auto result = default_precedence().find(identifier);
244
3/3
✓ Branch 1 taken 4076 times.
✓ Branch 5 taken 119 times.
✓ Branch 6 taken 3957 times.
4076 if (result != default_precedence().end()) {
245 119 OpConfig const& conf = result->second;
246
1/1
✓ Branch 2 taken 119 times.
119 Token tok = dummy();
247
248 // combine is not & not in right now
249
6/6
✓ Branch 1 taken 105 times.
✓ Branch 2 taken 14 times.
✓ Branch 4 taken 14 times.
✓ Branch 5 taken 91 times.
✓ Branch 6 taken 28 times.
✓ Branch 7 taken 91 times.
119 if (identifier == "is" || identifier == "not") {
250
2/2
✓ Branch 1 taken 28 times.
✓ Branch 4 taken 28 times.
28 tok = next_token();
251 } else {
252
1/1
✓ Branch 1 taken 91 times.
91 return make_token(conf.type, identifier);
253 }
254
255
6/6
✓ Branch 1 taken 14 times.
✓ Branch 2 taken 14 times.
✓ Branch 5 taken 7 times.
✓ Branch 6 taken 7 times.
✓ Branch 7 taken 7 times.
✓ Branch 8 taken 21 times.
28 if (identifier == "is" && tok.operator_name() == "not") {
256
2/2
✓ Branch 2 taken 7 times.
✓ Branch 5 taken 7 times.
7 return make_token(conf.type, "is not");
257 }
258
259
6/6
✓ Branch 1 taken 14 times.
✓ Branch 2 taken 7 times.
✓ Branch 5 taken 7 times.
✓ Branch 6 taken 7 times.
✓ Branch 7 taken 7 times.
✓ Branch 8 taken 14 times.
21 if (identifier == "not" && tok.operator_name() == "in") {
260
2/2
✓ Branch 2 taken 7 times.
✓ Branch 5 taken 7 times.
7 return make_token(conf.type, "not in");
261 }
262
263
1/1
✓ Branch 1 taken 14 times.
14 _buffer.push_back(tok);
264
1/1
✓ Branch 1 taken 14 times.
14 return make_token(conf.type, identifier);
265 119 }
266 }
267
268 // is it a keyword ?
269 {
270
2/2
✓ Branch 1 taken 3957 times.
✓ Branch 4 taken 3957 times.
3957 auto result = keywords().find(identifier);
271
3/3
✓ Branch 1 taken 3957 times.
✓ Branch 5 taken 1189 times.
✓ Branch 6 taken 2768 times.
3957 if (result != keywords().end()) {
272
1/1
✓ Branch 2 taken 1189 times.
1189 return make_token(result->second);
273 }
274 }
275
276 // is it followed by a quote
277
5/6
✓ Branch 1 taken 2750 times.
✓ Branch 2 taken 18 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 2750 times.
✓ Branch 6 taken 18 times.
✓ Branch 7 taken 2750 times.
2768 if (peek() == '"' || peek() == '\'') {
278
1/1
✓ Branch 1 taken 18 times.
18 return make_token(tok_formatstr, identifier);
279 }
280
281 // then it must be an identifier
282
1/1
✓ Branch 1 taken 2750 times.
2750 return make_token(tok_identifier, identifier);
283 4076 }
284
285 // Operators
286 // -----------------------------------------------
287 // c is not alpha num
288 {
289 3762 auto next = _operators.match(c);
290
2/2
✓ Branch 0 taken 1480 times.
✓ Branch 1 taken 2282 times.
3762 if (next != nullptr) {
291 1480 String op;
292
1/1
✓ Branch 1 taken 1480 times.
1480 op.reserve(6);
293 1480 auto prev = next;
294
295
2/2
✓ Branch 0 taken 1628 times.
✓ Branch 1 taken 1480 times.
3108 while (next != nullptr) {
296
1/1
✓ Branch 1 taken 1628 times.
1628 op.push_back(c);
297
1/1
✓ Branch 1 taken 1628 times.
1628 c = nextc();
298 1628 prev = next;
299
1/1
✓ Branch 1 taken 1628 times.
1628 next = prev->matching(c);
300 }
301
302
1/2
✓ Branch 1 taken 1480 times.
✗ Branch 2 not taken.
1480 if (prev->leaf()) {
303
1/1
✓ Branch 1 taken 1480 times.
1480 op = strip(op);
304
2/2
✓ Branch 1 taken 1480 times.
✓ Branch 4 taken 1480 times.
1480 auto result = default_precedence().find(op);
305
2/3
✓ Branch 1 taken 1480 times.
✓ Branch 5 taken 1480 times.
✗ Branch 6 not taken.
1480 if (result != default_precedence().end()) {
306 1480 OpConfig const& conf = result->second;
307
1/1
✓ Branch 1 taken 1480 times.
1480 return make_token(conf.type, op);
308 }
309 }
310 1480 }
311 }
312
313 // Numbers
314 // -----------------------------------------------
315
2/2
✓ Branch 0 taken 436 times.
✓ Branch 1 taken 1846 times.
2282 if (std::isdigit(c)) {
316 436 String num;
317 436 TokenType ntype = tok_int;
318
319
2/2
✓ Branch 0 taken 445 times.
✓ Branch 1 taken 436 times.
881 while (std::isdigit(c)) {
320
1/1
✓ Branch 1 taken 445 times.
445 num.push_back(c);
321
1/1
✓ Branch 1 taken 445 times.
445 c = nextc();
322 }
323
324
2/2
✓ Branch 0 taken 88 times.
✓ Branch 1 taken 348 times.
436 if (c == '.') {
325 88 ntype = tok_float;
326
1/1
✓ Branch 1 taken 88 times.
88 num.push_back(c);
327
1/1
✓ Branch 1 taken 88 times.
88 c = nextc();
328
2/2
✓ Branch 0 taken 88 times.
✓ Branch 1 taken 88 times.
176 while (std::isdigit(c)) {
329
1/1
✓ Branch 1 taken 88 times.
88 num.push_back(c);
330
1/1
✓ Branch 1 taken 88 times.
88 c = nextc();
331 }
332 }
333
334 /*/ Incorrect Numbers
335 while (c != ' ' && c != '\n' && c != EOF){
336 num.push_back(c);
337 c = nextc();
338 ntype = tok_incorrect;
339 }*/
340
341 // std::cout << '"' << num << '"' << ntype << ',' << tok_incorrect << std::endl;
342 // throw 0;
343
1/1
✓ Branch 1 taken 436 times.
436 return make_token(ntype, num);
344 436 }
345
346 // Strings
347 // --------------------------------------------------
348
349 // Regular string
350 // --------------
351
3/4
✓ Branch 0 taken 1761 times.
✓ Branch 1 taken 85 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 1761 times.
1846 if (c == '"' || c == '\'') {
352 85 char end = c;
353 85 String str;
354 85 TokenType tok = tok_string;
355
1/1
✓ Branch 1 taken 85 times.
85 char c2 = nextc();
356 85 char c3 = '\0';
357
358
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 69 times.
85 if (c2 == end) {
359
1/1
✓ Branch 1 taken 16 times.
16 char c3 = nextc();
360
1/2
✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
16 if (c3 == end) {
361 16 tok = tok_docstring;
362 } else {
363 str.push_back(c2);
364 str.push_back(c3);
365 }
366 } else {
367
1/1
✓ Branch 1 taken 69 times.
69 str.push_back(c2);
368 }
369
370
2/2
✓ Branch 0 taken 69 times.
✓ Branch 1 taken 16 times.
85 if (tok == tok_string)
371
6/7
✓ Branch 1 taken 175 times.
✓ Branch 3 taken 106 times.
✓ Branch 4 taken 69 times.
✓ Branch 5 taken 106 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 106 times.
✓ Branch 8 taken 69 times.
175 while ((c = nextc()) != end && c != EOF) {
372
1/1
✓ Branch 1 taken 106 times.
106 str.push_back(c);
373 }
374 else {
375
1/2
✓ Branch 0 taken 180 times.
✗ Branch 1 not taken.
180 while (c != EOF) {
376
1/1
✓ Branch 1 taken 180 times.
180 c = nextc();
377
378
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 164 times.
180 if (c == end) {
379
1/1
✓ Branch 1 taken 16 times.
16 c2 = nextc();
380
1/2
✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
16 if (c2 == end) {
381
1/1
✓ Branch 1 taken 16 times.
16 c3 = nextc();
382
1/2
✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
16 if (c3 == end) {
383 16 break;
384 } else {
385 str.push_back(c);
386 str.push_back(c2);
387 str.push_back(c3);
388 }
389 } else {
390 str.push_back(c);
391 str.push_back(c2);
392 }
393 } else {
394
1/1
✓ Branch 1 taken 164 times.
164 str.push_back(c);
395 }
396 }
397 }
398
1/1
✓ Branch 1 taken 85 times.
85 consume();
399
1/1
✓ Branch 1 taken 85 times.
85 return make_token(tok, str);
400 85 }
401
402 1761 c = peek();
403
2/2
✓ Branch 0 taken 278 times.
✓ Branch 1 taken 1483 times.
1761 if (c == tok_comment) {
404 278 String comment;
405
1/1
✓ Branch 1 taken 278 times.
278 comment.reserve(128);
406
407 // eat the comment token
408
1/1
✓ Branch 1 taken 278 times.
278 c = nextc();
409
410 // eat all characters until the newline
411
3/4
✓ Branch 0 taken 4173 times.
✓ Branch 1 taken 278 times.
✓ Branch 2 taken 4173 times.
✗ Branch 3 not taken.
4451 while (c != '\n' && c != EOF) {
412
1/1
✓ Branch 1 taken 4173 times.
4173 comment.push_back(c);
413
1/1
✓ Branch 1 taken 4173 times.
4173 c = nextc();
414 };
415
416
1/1
✓ Branch 1 taken 278 times.
278 return make_token(tok_comment, comment);
417 278 }
418
419 // get next char
420 1483 c = peek();
421 1483 consume();
422
423
1/2
✓ Branch 0 taken 1483 times.
✗ Branch 1 not taken.
1483 if (c > 0) {
424 1483 return make_token(c);
425 }
426 return make_token(tok_incorrect);
427 }
428
429 } // namespace lython
430