| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #pragma once | ||
| 2 | |||
| 3 | #include <cctype> | ||
| 4 | #include <ostream> | ||
| 5 | |||
| 6 | #include "ast/nodes.h" | ||
| 7 | #include "lexer/buffer.h" | ||
| 8 | #include "lexer/token.h" | ||
| 9 | #include "utilities/trie.h" | ||
| 10 | |||
| 11 | #include "dtypes.h" | ||
| 12 | |||
| 13 | #include <iostream> | ||
| 14 | |||
| 15 | /* | ||
| 16 | * Lexer is a stream of tokens | ||
| 17 | * | ||
| 18 | * TODO: DocString support | ||
| 19 | */ | ||
| 20 | |||
| 21 | namespace lython { | ||
| 22 | |||
// Membership test helpers: in(e, a, b, c) is true when `e` compares equal
// (operator==) to at least one of the candidates a, b, c.

// Base case with no candidates: nothing can match, so the answer is false.
// (This replaces a catch-all variadic overload that was only ever selected
// for the one-argument call and then recursed into itself forever.)
template <typename T>
bool in(T const&) {
    return false;
}

// Single candidate: plain equality.
template <typename T, typename N>
bool in(T const& e, N const& v) {
    return e == v;
}

// Two or more candidates: test the first one, then recurse on the rest.
// Evaluation short-circuits on the first match.
template <typename T, typename N, typename... Args>
bool in(T const& e, N const& v, Args... args) {
    return e == v || in(e, args...);
}
| 37 | |||
| 38 | struct OpConfig { | ||
| 39 | int precedence = -1; | ||
| 40 | bool left_associative = true; | ||
| 41 | TokenType type = TokenType::tok_eof; | ||
| 42 | BinaryOperator binarykind = BinaryOperator::None; | ||
| 43 | UnaryOperator unarykind = UnaryOperator::None; | ||
| 44 | BoolOperator boolkind = BoolOperator::None; | ||
| 45 | CmpOperator cmpkind = CmpOperator::None; | ||
| 46 | |||
| 47 | void print(std::ostream& out) const { | ||
| 48 | ✗ | out << to_string(type) << "(pred: " << precedence << ") " | |
| 49 | ✗ | << "(binary: " << int(binarykind) << ") " | |
| 50 | ✗ | << "(unary: " << int(unarykind) << ") " | |
| 51 | ✗ | << "(bool: " << int(boolkind) << ") " | |
| 52 | ✗ | << "(cmp: " << int(cmpkind) << ") "; | |
| 53 | ✗ | } | |
| 54 | }; | ||
| 55 | |||
| 56 | Dict<String, OpConfig> const& default_precedence(); | ||
| 57 | |||
| 58 | class LexerOperators { | ||
| 59 | public: | ||
| 60 | LexerOperators() { | ||
| 61 |
2/2✓ Branch 4 taken 41 times.
✓ Branch 5 taken 1 times.
|
42 | for (auto& c: _precedence_table) { |
| 62 |
1/2✓ Branch 2 taken 41 times.
✗ Branch 3 not taken.
|
41 | _operators.insert(c.first); |
| 63 | } | ||
| 64 | 1 | } | |
| 65 | |||
| 66 | Trie<128> const* match(int c) const { return _operators.trie().matching(c); } | ||
| 67 | |||
| 68 | Dict<String, OpConfig> const& precedence_table() const { return _precedence_table; } | ||
| 69 | |||
| 70 | TokenType token_type(String const& str) const { return _precedence_table.at(str).type; } | ||
| 71 | |||
| 72 | private: | ||
| 73 | CoWTrie<128> _operators; | ||
| 74 | Dict<String, OpConfig> _precedence_table = default_precedence(); | ||
| 75 | }; | ||
| 76 | |||
| 77 | class AbstractLexer { | ||
| 78 | public: | ||
| 79 | virtual ~AbstractLexer() {} | ||
| 80 | |||
| 81 | virtual Token const& next_token() = 0; | ||
| 82 | |||
| 83 | virtual Token const& peek_token() = 0; | ||
| 84 | |||
| 85 | virtual Token const& token() = 0; | ||
| 86 | |||
| 87 | virtual char peekc() const { return '\0'; } | ||
| 88 | |||
| 89 | virtual const String& file_name() = 0; | ||
| 90 | |||
| 91 | virtual int get_mode() const { return 0; } | ||
| 92 | virtual void set_mode(int mode) {} | ||
| 93 | |||
| 94 | // print tokens with their info | ||
| 95 | ::std::ostream& debug_print(::std::ostream& out); | ||
| 96 | |||
| 97 | // print out tokens as they were inputed | ||
| 98 | ::std::ostream& print(::std::ostream& out); | ||
| 99 | |||
| 100 | // extract a token stream into a token vector | ||
| 101 | Array<Token> extract_token() { | ||
| 102 | Array<Token> v; | ||
| 103 | |||
| 104 | Token t = next_token(); | ||
| 105 | do { | ||
| 106 | v.push_back(t); | ||
| 107 | } while ((t = next_token())); | ||
| 108 | |||
| 109 | v.push_back(t); // push eof token | ||
| 110 | return v; | ||
| 111 | } | ||
| 112 | }; | ||
| 113 | |||
| 114 | class ReplayLexer: public AbstractLexer { | ||
| 115 | public: | ||
| 116 | ReplayLexer(Array<Token>& tokens): tokens(tokens) { | ||
| 117 | Token& last = tokens[tokens.size() - 1]; | ||
| 118 | if (last.type() != tok_eof) { | ||
| 119 | tokens.emplace_back(tok_eof, 0, 0); | ||
| 120 | } | ||
| 121 | } | ||
| 122 | |||
| 123 | Token const& next_token() override final { | ||
| 124 | if (i + 1 < tokens.size()) | ||
| 125 | i += 1; | ||
| 126 | |||
| 127 | return tokens[i]; | ||
| 128 | } | ||
| 129 | |||
| 130 | Token const& peek_token() override final { | ||
| 131 | auto n = i + 1; | ||
| 132 | |||
| 133 | if (n >= tokens.size()) | ||
| 134 | n = i; | ||
| 135 | |||
| 136 | return tokens[n]; | ||
| 137 | } | ||
| 138 | |||
| 139 | Token const& token() override final { return tokens[i]; } | ||
| 140 | |||
| 141 | const String& file_name() override { | ||
| 142 | static String fakefile = "<replay buffer>"; | ||
| 143 | return fakefile; | ||
| 144 | } | ||
| 145 | |||
| 146 | ~ReplayLexer() {} | ||
| 147 | |||
| 148 | private: | ||
| 149 | ::std::size_t i = 0; | ||
| 150 | Array<Token>& tokens; | ||
| 151 | }; | ||
| 152 | |||
// Lexing modes.
// NOTE(review): presumably the integer values exchanged through
// get_mode()/set_mode() -- confirm against the lexer implementation.
enum class LexerMode
{
    Default   = 0,
    Character = 1,
};
| 157 | |||
| 158 | class Lexer: public AbstractLexer { | ||
| 159 | public: | ||
| 160 | Lexer(AbstractBuffer& reader): | ||
| 161 |
2/4✓ Branch 3 taken 1 times.
✗ Branch 4 not taken.
✓ Branch 8 taken 1 times.
✗ Branch 9 not taken.
|
1 | AbstractLexer(), _reader(reader), _cindent(indent()), _oindent(indent()) {} |
| 162 | |||
| 163 | ~Lexer() {} | ||
| 164 | |||
| 165 | Token const& token() override final { | ||
| 166 |
2/2✓ Branch 0 taken 744 times.
✓ Branch 1 taken 107709 times.
|
108453 | if (_count == 0) { |
| 167 | 744 | return next_token(); | |
| 168 | } | ||
| 169 | 107709 | return _token; | |
| 170 | } | ||
| 171 | |||
| 172 | int get_mode() const override final; | ||
| 173 | void set_mode(int mode) override final; | ||
| 174 | Token const& format_tokenizer() ; | ||
| 175 | Token const& next_token() override final; | ||
| 176 | Token const& peek_token() override final { | ||
| 177 | // we can only peek ahead once | ||
| 178 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 154 times.
|
154 | if (_buffer.size() > 0) |
| 179 | ✗ | return _buffer[_buffer.size() - 1]; | |
| 180 | |||
| 181 | // Save current token a get next | ||
| 182 |
1/1✓ Branch 1 taken 154 times.
|
154 | Token current_token = _token; |
| 183 |
2/2✓ Branch 1 taken 154 times.
✓ Branch 4 taken 154 times.
|
154 | _buffer.push_back(next_token()); |
| 184 |
1/1✓ Branch 1 taken 154 times.
|
154 | _token = current_token; |
| 185 | 154 | return _buffer[_buffer.size() - 1]; | |
| 186 | 154 | } | |
| 187 | |||
| 188 | Token const& make_token(int8 t) { | ||
| 189 | 7174 | _token = Token(t, line(), col()); | |
| 190 | 7174 | return _token; | |
| 191 | } | ||
| 192 | |||
| 193 | Token const& make_token(int8 t, const String& identifier) { | ||
| 194 | 5166 | _token = Token(t, line(), col()); | |
| 195 | 5166 | _token.identifier() = identifier; | |
| 196 | 5166 | return _token; | |
| 197 | } | ||
| 198 | |||
| 199 | const String& file_name() override { return _reader.file_name(); } | ||
| 200 | char peekc() const { return _reader.peek(); } | ||
| 201 | |||
| 202 | private: | ||
| 203 | int _count = 0; | ||
| 204 | AbstractBuffer& _reader; | ||
| 205 | Token _token{dummy()}; | ||
| 206 | int32 _cindent; | ||
| 207 | int32 _oindent; | ||
| 208 | LexerOperators _operators; | ||
| 209 | Array<Token> _buffer; | ||
| 210 | bool _fmtstr = false; | ||
| 211 | char _quote; | ||
| 212 | int _quotes = 0; | ||
| 213 | |||
| 214 | // shortcuts | ||
| 215 | |||
| 216 | int32 line() { return _reader.line(); } | ||
| 217 | int32 col() { return _reader.col(); } | ||
| 218 | int32 indent() { return _reader.indent(); } | ||
| 219 | void consume() { return _reader.consume(); } | ||
| 220 | char peek() { return _reader.peek(); } | ||
| 221 | bool empty_line() { return _reader.empty_line(); } | ||
| 222 | |||
| 223 | // state | ||
| 224 | bool desindent_for_comment = false; | ||
| 225 | |||
| 226 | char nextc() { | ||
| 227 | 23828 | _reader.consume(); | |
| 228 | 23828 | return _reader.peek(); | |
| 229 | } | ||
| 230 | |||
| 231 | // what characters are allowed in identifiers | ||
| 232 | bool is_identifier(char c) { | ||
| 233 |
7/10✓ Branch 0 taken 4190 times.
✓ Branch 1 taken 6235 times.
✓ Branch 2 taken 4076 times.
✓ Branch 3 taken 114 times.
✓ Branch 4 taken 4076 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 4076 times.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✓ Branch 9 taken 4076 times.
|
10425 | if (::std::isalnum(c) || c == '_' || c == '?' || c == '!' || c == '-') |
| 234 | 6349 | return true; | |
| 235 | 4076 | return false; | |
| 236 | } | ||
| 237 | }; | ||
| 238 | |||
| 239 | } // namespace lython | ||
| 240 |