Chromium Code Reviews
| Index: src/lexer/lexer.re |
| diff --git a/src/lexer/lexer.re b/src/lexer/lexer.re |
| index 370de928ae7e3dc92aa924eac4674741a3a4a020..d2a8603b6107e7d5e61af0d1bb83d71b0d1ce10f 100644 |
| --- a/src/lexer/lexer.re |
| +++ b/src/lexer/lexer.re |
| @@ -4,7 +4,6 @@ |
| #include <stdlib.h> |
| #include <string.h> |
| - |
| /* |
| TODO: |
| - SpiderMonkey compatibility hack: " --> something" is treated as a single line comment. |
| @@ -13,7 +12,16 @@ TODO: |
| */ |
| -/*!types:re2c */ |
| + |
| +enum Condition { |
| + EConditionNormal, |
| + EConditionDoubleQuoteString, |
| + EConditionSingleQuoteString, |
| + EConditionIdentifier, |
| + EConditionSingleLineComment, |
| + EConditionMultiLineComment, |
| + EConditionHtmlComment |
|
ulan
2013/10/14 14:28:52
Had to move it here from the generated file, because … [remainder of comment truncated in this capture]
|
| +}; |
| #if defined(WIN32) |
| @@ -36,102 +44,17 @@ TODO: |
| #endif |
| -// ---------------------------------------------------------------------- |
| -#define PUSH_EOS(T) { printf("got eos\n"); } |
| -#define PUSH_TOKEN(T) { \ |
| - printf("got token %s (%d)\n", tokenNames[T], T); \ |
| - SKIP(); } |
| -#define PUSH_STRING() { \ |
| - printf("got string\n"); \ |
| - size_t tokenSize = cursor-start; \ |
| - fwrite(start, tokenSize, 1, stdout); \ |
| - printf("\n"); \ |
| - SKIP(); } |
| -#define PUSH_NUMBER() { \ |
| - printf("got number\n"); \ |
| - size_t tokenSize = cursor-start; \ |
| - fwrite(start, tokenSize, 1, stdout); \ |
| - printf("\n"); \ |
| - SKIP(); } |
| -#define PUSH_IDENTIFIER() { \ |
| - --cursor; \ |
| - printf("got identifier: "); \ |
| - size_t tokenSize = cursor-start; \ |
| - fwrite(start, tokenSize, 1, stdout); \ |
| - printf("\n"); \ |
| - SKIP(); } |
| -#define PUSH_LINE_TERMINATOR() { printf("got line terminator\n"); SKIP();} |
| -#define TERMINATE_ILLEGAL() { return 1; } |
| - |
| -#define TOKENS \ |
| - TOK(EOS) \ |
| - TOK(LPAREN) \ |
| - TOK(RPAREN) \ |
| - TOK(LBRACK) \ |
| - TOK(RBRACK) \ |
| - TOK(LBRACE) \ |
| - TOK(RBRACE) \ |
| - TOK(COLON) \ |
| - TOK(SEMICOLON) \ |
| - TOK(PERIOD) \ |
| - TOK(CONDITIONAL) \ |
| - TOK(INC) \ |
| - TOK(DEC) \ |
| - TOK(ASSIGN) \ |
| - TOK(ASSIGN_BIT_OR) \ |
| - TOK(ASSIGN_BIT_XOR) \ |
| - TOK(ASSIGN_BIT_AND) \ |
| - TOK(ASSIGN_SHL) \ |
| - TOK(ASSIGN_SAR) \ |
| - TOK(ASSIGN_SHR) \ |
| - TOK(ASSIGN_ADD) \ |
| - TOK(ASSIGN_SUB) \ |
| - TOK(ASSIGN_MUL) \ |
| - TOK(ASSIGN_DIV) \ |
| - TOK(ASSIGN_MOD) \ |
| - TOK(COMMA) \ |
| - TOK(OR) \ |
| - TOK(AND) \ |
| - TOK(BIT_OR) \ |
| - TOK(BIT_XOR) \ |
| - TOK(BIT_AND) \ |
| - TOK(SHL) \ |
| - TOK(SAR) \ |
| - TOK(ADD) \ |
| - TOK(SUB) \ |
| - TOK(MUL) \ |
| - TOK(DIV) \ |
| - TOK(MOD) \ |
| - TOK(EQ) \ |
| - TOK(NE) \ |
| - TOK(EQ_STRICT) \ |
| - TOK(NE_STRICT) \ |
| - TOK(LT) \ |
| - TOK(GT) \ |
| - TOK(LTE) \ |
| - TOK(GTE) \ |
| - TOK(NOT) \ |
| - TOK(BIT_NOT) \ |
| +#include "lexer.h" |
| +using namespace v8::internal; |
| // ---------------------------------------------------------------------- |
| -static const char *tokenNames[] = |
| -{ |
| - #define TOK(x) #x, |
| - TOKENS |
| - #undef TOK |
| -}; |
| +#define PUSH_TOKEN(T) { send(T); SKIP(); } |
| +#define PUSH_LINE_TERMINATOR() { SKIP(); } |
| +#define TERMINATE_ILLEGAL() { return 1; } |
| // ---------------------------------------------------------------------- |
| class PushScanner |
| { |
| -public: |
| - |
| - enum Token |
| - { |
| - #define TOK(x) x, |
| - TOKENS |
| - #undef TOK |
| - }; |
| private: |
| @@ -143,6 +66,7 @@ private: |
| uint8_t *start; |
| uint8_t *cursor; |
| uint8_t *marker; |
| + int real_start; |
| uint8_t *buffer; |
| uint8_t *bufferEnd; |
| @@ -150,10 +74,12 @@ private: |
| uint8_t yych; |
| uint32_t yyaccept; |
| + ExperimentalScanner* sink_; |
| + |
| public: |
| // ---------------------------------------------------------------------- |
| - PushScanner() |
| + PushScanner(ExperimentalScanner* sink) |
| { |
| limit = 0; |
| start = 0; |
| @@ -164,6 +90,8 @@ public: |
| buffer = 0; |
| eof = false; |
| bufferEnd = 0; |
| + sink_ = sink; |
| + real_start = 0; |
| } |
| // ---------------------------------------------------------------------- |
| @@ -172,49 +100,27 @@ public: |
| } |
| // ---------------------------------------------------------------------- |
| - void send( |
| - Token token |
| - ) |
| - { |
| - size_t tokenSize = cursor-start; |
| - const char *tokenName = tokenNames[token]; |
| - printf( |
| - "scanner is pushing out a token of type %d (%s)", |
| - token, |
| - tokenName |
| - ); |
| - |
| - if(token==EOS) putchar('\n'); |
| - else |
| - { |
| - size_t tokenNameSize = strlen(tokenNames[token]); |
| - size_t padSize = 20-(20<tokenNameSize ? 20 : tokenNameSize); |
| - for(size_t i=0; i<padSize; ++i) putchar(' '); |
| - printf(" : ---->"); |
| - |
| - fwrite( |
| - start, |
| - tokenSize, |
| - 1, |
| - stdout |
| - ); |
| - |
| - printf("<----\n"); |
| + void send(Token::Value token) { |
| + int beg = (start - buffer) + real_start; |
| + int end = (cursor - buffer) + real_start; |
| + if (FLAG_trace_lexer) { |
| + printf("got %s at (%d, %d): ", Token::Name(token), beg, end); |
| + for (uint8_t* s = start; s != cursor; s++) printf("%c", (char)*s); |
| + printf(".\n"); |
| } |
| + sink_->Record(token, beg, end); |
| } |
| // ---------------------------------------------------------------------- |
| - uint32_t push( |
| - const void *input, |
| - ssize_t inputSize |
| - ) |
| - { |
| - printf( |
| - "scanner is receiving a new data batch of length %ld\n" |
| - "scanner continues with saved state = %d\n", |
| - inputSize, |
| - state |
| - ); |
| + uint32_t push(const void *input, int input_size) { |
| + if (FLAG_trace_lexer) { |
| + printf( |
| + "scanner is receiving a new data batch of length %d\n" |
| + "scanner continues with saved state = %d\n", |
| + input_size, |
| + state |
| + ); |
| + } |
| /* |
| * Data source is signaling end of file when batch size |
| @@ -224,12 +130,12 @@ public: |
| * the longest keyword, so given our grammar, 32 is a safe bet. |
| */ |
| uint8_t null[64]; |
| - const ssize_t maxFill = 32; |
| - if(inputSize<maxFill) // FIXME: do something about this!!! |
| + const int maxFill = 32; |
| + if(input_size<maxFill) // FIXME: do something about this!!! |
| { |
| eof = true; |
| input = null; |
| - inputSize = sizeof(null); |
| + input_size = sizeof(null); |
| memset(null, 0, sizeof(null)); |
| } |
| @@ -246,7 +152,7 @@ public: |
| * |
| */ |
| size_t used = limit-buffer; |
| - size_t needed = used+inputSize; |
| + size_t needed = used+input_size; |
| size_t allocated = bufferEnd-buffer; |
| if(allocated<needed) |
| { |
| @@ -263,8 +169,8 @@ public: |
| start = buffer + startOffset; |
| limit = limitOffset + buffer; |
| } |
| - memcpy(limit, input, inputSize); |
| - limit += inputSize; |
| + memcpy(limit, input, input_size); |
| + limit += input_size; |
| // The scanner starts here |
| #define YYLIMIT limit |
| @@ -283,7 +189,9 @@ public: |
| start: |
| - printf("Starting a round; state: %d, condition: %d\n", state, condition); |
| + if (FLAG_trace_lexer) { |
| + printf("Starting a round; state: %d, condition: %d\n", state, condition); |
| + } |
| /*!re2c |
| re2c:indent:top = 1; |
| @@ -302,66 +210,66 @@ public: |
| hex_digit = [0-9a-fA-F]; |
| maybe_exponent = ('e' [-+]? digit+)?; |
| - <Normal> "|=" { PUSH_TOKEN(ASSIGN_BIT_OR); } |
| - <Normal> "^=" { PUSH_TOKEN(ASSIGN_BIT_XOR); } |
| - <Normal> "&=" { PUSH_TOKEN(ASSIGN_BIT_AND); } |
| - <Normal> "+=" { PUSH_TOKEN(ASSIGN_ADD); } |
| - <Normal> "-=" { PUSH_TOKEN(ASSIGN_SUB); } |
| - <Normal> "*=" { PUSH_TOKEN(ASSIGN_MUL); } |
| - <Normal> "/=" { PUSH_TOKEN(ASSIGN_DIV); } |
| - <Normal> "%=" { PUSH_TOKEN(ASSIGN_MOD); } |
| - |
| - <Normal> "===" { PUSH_TOKEN(EQ_STRICT); } |
| - <Normal> "==" { PUSH_TOKEN(EQ); } |
| - <Normal> "=" { PUSH_TOKEN(ASSIGN); } |
| - <Normal> "!==" { PUSH_TOKEN(NE_STRICT); } |
| - <Normal> "!=" { PUSH_TOKEN(NE); } |
| - <Normal> "!" { PUSH_TOKEN(NOT); } |
| + <Normal> "|=" { PUSH_TOKEN(Token::ASSIGN_BIT_OR); } |
| + <Normal> "^=" { PUSH_TOKEN(Token::ASSIGN_BIT_XOR); } |
| + <Normal> "&=" { PUSH_TOKEN(Token::ASSIGN_BIT_AND); } |
| + <Normal> "+=" { PUSH_TOKEN(Token::ASSIGN_ADD); } |
| + <Normal> "-=" { PUSH_TOKEN(Token::ASSIGN_SUB); } |
| + <Normal> "*=" { PUSH_TOKEN(Token::ASSIGN_MUL); } |
| + <Normal> "/=" { PUSH_TOKEN(Token::ASSIGN_DIV); } |
| + <Normal> "%=" { PUSH_TOKEN(Token::ASSIGN_MOD); } |
| + |
| + <Normal> "===" { PUSH_TOKEN(Token::EQ_STRICT); } |
| + <Normal> "==" { PUSH_TOKEN(Token::EQ); } |
| + <Normal> "=" { PUSH_TOKEN(Token::ASSIGN); } |
| + <Normal> "!==" { PUSH_TOKEN(Token::NE_STRICT); } |
| + <Normal> "!=" { PUSH_TOKEN(Token::NE); } |
| + <Normal> "!" { PUSH_TOKEN(Token::NOT); } |
| <Normal> "//" :=> SingleLineComment |
| <Normal> "/*" :=> MultiLineComment |
| <Normal> "<!--" :=> HtmlComment |
| - <Normal> ">>>=" { PUSH_TOKEN(ASSIGN_SHR); } |
| - <Normal> "<<=" { PUSH_TOKEN(ASSIGN_SHL); } |
| - <Normal> ">>=" { PUSH_TOKEN(ASSIGN_SAR); } |
| - <Normal> "<=" { PUSH_TOKEN(LTE); } |
| - <Normal> ">=" { PUSH_TOKEN(GTE); } |
| - <Normal> "<<" { PUSH_TOKEN(SHL); } |
| - <Normal> ">>" { PUSH_TOKEN(SAR); } |
| - <Normal> "<" { PUSH_TOKEN(LT); } |
| - <Normal> ">" { PUSH_TOKEN(GT); } |
| - |
| - <Normal> '0x' hex_digit+ { PUSH_NUMBER(); } |
| - <Normal> "." digit+ maybe_exponent { PUSH_NUMBER(); } |
| - <Normal> digit+ ("." digit+)? maybe_exponent { PUSH_NUMBER(); } |
| - |
| - <Normal> "(" { PUSH_TOKEN(LPAREN); } |
| - <Normal> ")" { PUSH_TOKEN(RPAREN); } |
| - <Normal> "[" { PUSH_TOKEN(LBRACK); } |
| - <Normal> "]" { PUSH_TOKEN(RBRACK); } |
| - <Normal> "{" { PUSH_TOKEN(LBRACE); } |
| - <Normal> "}" { PUSH_TOKEN(RBRACE); } |
| - <Normal> ":" { PUSH_TOKEN(COLON); } |
| - <Normal> ";" { PUSH_TOKEN(SEMICOLON); } |
| - <Normal> "." { PUSH_TOKEN(PERIOD); } |
| - <Normal> "?" { PUSH_TOKEN(CONDITIONAL); } |
| - <Normal> "++" { PUSH_TOKEN(INC); } |
| - <Normal> "--" { PUSH_TOKEN(DEC); } |
| - |
| - <Normal> "||" { PUSH_TOKEN(OR); } |
| - <Normal> "&&" { PUSH_TOKEN(AND); } |
| - |
| - <Normal> "|" { PUSH_TOKEN(BIT_OR); } |
| - <Normal> "^" { PUSH_TOKEN(BIT_XOR); } |
| - <Normal> "&" { PUSH_TOKEN(BIT_AND); } |
| - <Normal> "+" { PUSH_TOKEN(ADD); } |
| - <Normal> "-" { PUSH_TOKEN(SUB); } |
| - <Normal> "*" { PUSH_TOKEN(MUL); } |
| - <Normal> "/" { PUSH_TOKEN(DIV); } |
| - <Normal> "%" { PUSH_TOKEN(MOD); } |
| - <Normal> "~" { PUSH_TOKEN(BIT_NOT); } |
| - <Normal> "," { PUSH_TOKEN(COMMA); } |
| + <Normal> ">>>=" { PUSH_TOKEN(Token::ASSIGN_SHR); } |
| + <Normal> "<<=" { PUSH_TOKEN(Token::ASSIGN_SHL); } |
| + <Normal> ">>=" { PUSH_TOKEN(Token::ASSIGN_SAR); } |
| + <Normal> "<=" { PUSH_TOKEN(Token::LTE); } |
| + <Normal> ">=" { PUSH_TOKEN(Token::GTE); } |
| + <Normal> "<<" { PUSH_TOKEN(Token::SHL); } |
| + <Normal> ">>" { PUSH_TOKEN(Token::SAR); } |
| + <Normal> "<" { PUSH_TOKEN(Token::LT); } |
| + <Normal> ">" { PUSH_TOKEN(Token::GT); } |
| + |
| + <Normal> '0x' hex_digit+ { PUSH_TOKEN(Token::NUMBER); } |
| + <Normal> "." digit+ maybe_exponent { PUSH_TOKEN(Token::NUMBER); } |
| + <Normal> digit+ ("." digit+)? maybe_exponent { PUSH_TOKEN(Token::NUMBER); } |
| + |
| + <Normal> "(" { PUSH_TOKEN(Token::LPAREN); } |
| + <Normal> ")" { PUSH_TOKEN(Token::RPAREN); } |
| + <Normal> "[" { PUSH_TOKEN(Token::LBRACK); } |
| + <Normal> "]" { PUSH_TOKEN(Token::RBRACK); } |
| + <Normal> "{" { PUSH_TOKEN(Token::LBRACE); } |
| + <Normal> "}" { PUSH_TOKEN(Token::RBRACE); } |
| + <Normal> ":" { PUSH_TOKEN(Token::COLON); } |
| + <Normal> ";" { PUSH_TOKEN(Token::SEMICOLON); } |
| + <Normal> "." { PUSH_TOKEN(Token::PERIOD); } |
| + <Normal> "?" { PUSH_TOKEN(Token::CONDITIONAL); } |
| + <Normal> "++" { PUSH_TOKEN(Token::INC); } |
| + <Normal> "--" { PUSH_TOKEN(Token::DEC); } |
| + |
| + <Normal> "||" { PUSH_TOKEN(Token::OR); } |
| + <Normal> "&&" { PUSH_TOKEN(Token::AND); } |
| + |
| + <Normal> "|" { PUSH_TOKEN(Token::BIT_OR); } |
| + <Normal> "^" { PUSH_TOKEN(Token::BIT_XOR); } |
| + <Normal> "&" { PUSH_TOKEN(Token::BIT_AND); } |
| + <Normal> "+" { PUSH_TOKEN(Token::ADD); } |
| + <Normal> "-" { PUSH_TOKEN(Token::SUB); } |
| + <Normal> "*" { PUSH_TOKEN(Token::MUL); } |
| + <Normal> "/" { PUSH_TOKEN(Token::DIV); } |
| + <Normal> "%" { PUSH_TOKEN(Token::MOD); } |
| + <Normal> "~" { PUSH_TOKEN(Token::BIT_NOT); } |
| + <Normal> "," { PUSH_TOKEN(Token::COMMA); } |
| <Normal> line_terminator+ { PUSH_LINE_TERMINATOR(); } |
| <Normal> whitespace { SKIP(); } |
| @@ -371,19 +279,19 @@ public: |
| <Normal> identifier_start :=> Identifier |
| - <Normal> eof { PUSH_EOS(); return 1; } |
| + <Normal> eof { PUSH_TOKEN(Token::EOS); return 1; } |
| <Normal> any { TERMINATE_ILLEGAL(); } |
| <DoubleQuoteString> "\\\"" { goto yy0; } |
| - <DoubleQuoteString> '"' { PUSH_STRING();} |
| + <DoubleQuoteString> '"' { PUSH_TOKEN(Token::STRING);} |
| <DoubleQuoteString> any { goto yy0; } |
| <SingleQuoteString> "\\'" { goto yy0; } |
| - <SingleQuoteString> "'" { PUSH_STRING();} |
| + <SingleQuoteString> "'" { PUSH_TOKEN(Token::STRING);} |
| <SingleQuoteString> any { goto yy0; } |
| <Identifier> identifier_char+ { goto yy0; } |
| - <Identifier> any { PUSH_IDENTIFIER(); } |
| + <Identifier> any { cursor--; PUSH_TOKEN(Token::IDENTIFIER); } |
| <SingleLineComment> line_terminator { PUSH_LINE_TERMINATOR();} |
| <SingleLineComment> eof { PUSH_LINE_TERMINATOR();} |
| @@ -399,82 +307,80 @@ public: |
| */ |
| fill: |
| - ssize_t unfinishedSize = cursor-start; |
| - printf( |
| - "scanner needs a refill. Exiting for now with:\n" |
| - " saved fill state = %d\n" |
| - " unfinished token size = %ld\n", |
| - state, |
| - unfinishedSize |
| - ); |
| - |
| - if(0<unfinishedSize && start<limit) |
| - { |
| - printf(" unfinished token is: "); |
| - fwrite(start, 1, cursor-start, stdout); |
| + int unfinishedSize = cursor-start; |
| + if (FLAG_trace_lexer) { |
| + printf( |
| + "scanner needs a refill. Exiting for now with:\n" |
| + " saved fill state = %d\n" |
| + " unfinished token size = %d\n", |
| + state, |
| + unfinishedSize |
| + ); |
| + if(0 < unfinishedSize && start < limit) { |
| + printf(" unfinished token is: "); |
| + fwrite(start, 1, cursor-start, stdout); |
| + putchar('\n'); |
| + } |
| putchar('\n'); |
| } |
| - putchar('\n'); |
| /* |
| * Once we get here, we can get rid of |
| * everything before start and after limit. |
| */ |
| - if(eof==true) goto start; |
| - if(buffer<start) |
| - { |
| - size_t startOffset = start-buffer; |
| - memmove(buffer, start, limit-start); |
| - marker -= startOffset; |
| - cursor -= startOffset; |
| - limit -= startOffset; |
| - start -= startOffset; |
| + if (eof == true) goto start; |
| + if (buffer < start) { |
| + size_t start_offset = start - buffer; |
| + memmove(buffer, start, limit - start); |
| + marker -= start_offset; |
| + cursor -= start_offset; |
| + limit -= start_offset; |
| + start -= start_offset; |
| + real_start += start_offset; |
| } |
| return 0; |
| } |
| }; |
| -// ---------------------------------------------------------------------- |
| -int main( |
| - int argc, |
| - char **argv |
| -) |
| -{ |
| - // Parse cmd line |
| - int input = 0; |
| - if(1<argc) |
| - { |
| - input = open(argv[1], O_RDONLY | O_BINARY); |
| - if(input<0) |
| - { |
| - fprintf( |
| - stderr, |
| - "could not open file %s\n", |
| - argv[1] |
| - ); |
| - exit(1); |
| - } |
| - } |
| - /* |
| - * Tokenize input file by pushing batches |
| - * of data one by one into the scanner. |
| - */ |
| - const size_t batchSize = 256; |
| - uint8_t buffer[batchSize]; |
| - PushScanner scanner; |
| - while(1) |
| - { |
| - ssize_t n = read(input, buffer, batchSize); |
| - if (scanner.push(buffer, n)) { |
| - printf("Scanner: illegal data\n"); |
| - return 1; |
| - } |
| - if(n<batchSize) break; |
| - } |
| - scanner.push(0, -1); |
| - close(input); |
| +ExperimentalScanner::ExperimentalScanner(const char* fname) : |
| + current_(0), fetched_(0) { |
| + file_ = fopen(fname, "rb"); |
| + scanner_ = new PushScanner(this); |
| +} |
| + |
| + |
| +ExperimentalScanner::~ExperimentalScanner() { |
| + fclose(file_); |
| +} |
| + |
| + |
| +void ExperimentalScanner::FillTokens() { |
| + current_ = 0; |
| + fetched_ = 0; |
| + uint8_t chars[BUFFER_SIZE]; |
| + int n = static_cast<int>(fread(&chars, 1, BUFFER_SIZE, file_)); |
| + scanner_->push(chars, n); |
| +} |
| + |
| + |
| +Token::Value ExperimentalScanner::Next(int* beg_pos, int* end_pos) { |
| + if (current_ == fetched_) { |
| + FillTokens(); |
| + } |
| + *beg_pos = beg_[current_]; |
| + *end_pos = end_[current_]; |
| + Token::Value res = token_[current_]; |
| + if (token_[current_] != Token::Token::EOS && |
| + token_[current_] != Token::ILLEGAL) current_++; |
| + return res; |
| +} |
| + |
| - // Done |
| - return 0; |
| +void ExperimentalScanner::Record(Token::Value token, int beg, int end) { |
| + if (token == Token::EOS) end--; |
| + token_[fetched_] = token; |
| + beg_[fetched_] = beg; |
| + end_[fetched_] = end; |
| + fetched_++; |
| } |