src/lexer/lexer.re - Issue 26764004: Invoke generated lexer along with baseline lexer to compare results.

Unified Diff: src/lexer/lexer.re

Issue 26764004: Invoke generated lexer along with baseline lexer to compare results. (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser

Patch Set: Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/lexer/lexer.re

diff --git a/src/lexer/lexer.re b/src/lexer/lexer.re

index 370de928ae7e3dc92aa924eac4674741a3a4a020..d2a8603b6107e7d5e61af0d1bb83d71b0d1ce10f 100644

--- a/src/lexer/lexer.re

+++ b/src/lexer/lexer.re

@@ -4,7 +4,6 @@

#include <stdlib.h>

#include <string.h>

TODO:

- SpiderMonkey compatibility hack: " --> something" is treated as a single line comment.

@@ -13,7 +12,16 @@ TODO:

-/*!types:re2c */

+enum Condition {

+ EConditionNormal,

+ EConditionDoubleQuoteString,

+ EConditionSingleQuoteString,

+ EConditionIdentifier,

+ EConditionSingleLineComment,

+ EConditionMultiLineComment,

+ EConditionHtmlComment

ulan 2013/10/14 14:28:52 Had to move it here from the generated file, because… (comment truncated in the review page)

+};

#if defined(WIN32)

@@ -36,102 +44,17 @@ TODO:

#endif

-// ----------------------------------------------------------------------

-#define PUSH_EOS(T) { printf("got eos\n"); }

-#define PUSH_TOKEN(T) { \

- printf("got token %s (%d)\n", tokenNames[T], T); \

- SKIP(); }

-#define PUSH_STRING() { \

- printf("got string\n"); \

- size_t tokenSize = cursor-start; \

- fwrite(start, tokenSize, 1, stdout); \

- printf("\n"); \

- SKIP(); }

-#define PUSH_NUMBER() { \

- printf("got number\n"); \

- size_t tokenSize = cursor-start; \

- fwrite(start, tokenSize, 1, stdout); \

- printf("\n"); \

- SKIP(); }

-#define PUSH_IDENTIFIER() { \

- --cursor; \

- printf("got identifier: "); \

- size_t tokenSize = cursor-start; \

- fwrite(start, tokenSize, 1, stdout); \

- printf("\n"); \

- SKIP(); }

-#define PUSH_LINE_TERMINATOR() { printf("got line terminator\n"); SKIP();}

-#define TERMINATE_ILLEGAL() { return 1; }

-#define TOKENS \

- TOK(EOS) \

- TOK(LPAREN) \

- TOK(RPAREN) \

- TOK(LBRACK) \

- TOK(RBRACK) \

- TOK(LBRACE) \

- TOK(RBRACE) \

- TOK(COLON) \

- TOK(SEMICOLON) \

- TOK(PERIOD) \

- TOK(CONDITIONAL) \

- TOK(INC) \

- TOK(DEC) \

- TOK(ASSIGN) \

- TOK(ASSIGN_BIT_OR) \

- TOK(ASSIGN_BIT_XOR) \

- TOK(ASSIGN_BIT_AND) \

- TOK(ASSIGN_SHL) \

- TOK(ASSIGN_SAR) \

- TOK(ASSIGN_SHR) \

- TOK(ASSIGN_ADD) \

- TOK(ASSIGN_SUB) \

- TOK(ASSIGN_MUL) \

- TOK(ASSIGN_DIV) \

- TOK(ASSIGN_MOD) \

- TOK(COMMA) \

- TOK(OR) \

- TOK(AND) \

- TOK(BIT_OR) \

- TOK(BIT_XOR) \

- TOK(BIT_AND) \

- TOK(SHL) \

- TOK(SAR) \

- TOK(ADD) \

- TOK(SUB) \

- TOK(MUL) \

- TOK(DIV) \

- TOK(MOD) \

- TOK(EQ) \

- TOK(NE) \

- TOK(EQ_STRICT) \

- TOK(NE_STRICT) \

- TOK(LT) \

- TOK(GT) \

- TOK(LTE) \

- TOK(GTE) \

- TOK(NOT) \

- TOK(BIT_NOT) \

+#include "lexer.h"

+using namespace v8::internal;

// ----------------------------------------------------------------------

-static const char *tokenNames[] =

- #define TOK(x) #x,

- TOKENS

- #undef TOK

-};

+#define PUSH_TOKEN(T) { send(T); SKIP(); }

+#define PUSH_LINE_TERMINATOR() { SKIP(); }

+#define TERMINATE_ILLEGAL() { return 1; }

// ----------------------------------------------------------------------

class PushScanner

{

-public:

- enum Token

- {

- #define TOK(x) x,

- TOKENS

- #undef TOK

- };

private:

@@ -143,6 +66,7 @@ private:

uint8_t *start;

uint8_t *cursor;

uint8_t *marker;

+ int real_start;

uint8_t *buffer;

uint8_t *bufferEnd;

@@ -150,10 +74,12 @@ private:

uint8_t yych;

uint32_t yyaccept;

+ ExperimentalScanner* sink_;

public:

// ----------------------------------------------------------------------

- PushScanner()

+ PushScanner(ExperimentalScanner* sink)

{

limit = 0;

start = 0;

@@ -164,6 +90,8 @@ public:

buffer = 0;

eof = false;

bufferEnd = 0;

+ sink_ = sink;

+ real_start = 0;

}

// ----------------------------------------------------------------------

@@ -172,49 +100,27 @@ public:

}

// ----------------------------------------------------------------------

- void send(

- Token token

- )

- {

- size_t tokenSize = cursor-start;

- const char *tokenName = tokenNames[token];

- printf(

- "scanner is pushing out a token of type %d (%s)",

- token,

- tokenName

- );

- if(token==EOS) putchar('\n');

- else

- {

- size_t tokenNameSize = strlen(tokenNames[token]);

- size_t padSize = 20-(20<tokenNameSize ? 20 : tokenNameSize);

- for(size_t i=0; i<padSize; ++i) putchar(' ');

- printf(" : ---->");

- fwrite(

- start,

- tokenSize,

- 1,

- stdout

- );

- printf("<----\n");

+ void send(Token::Value token) {

+ int beg = (start - buffer) + real_start;

+ int end = (cursor - buffer) + real_start;

+ if (FLAG_trace_lexer) {

+ printf("got %s at (%d, %d): ", Token::Name(token), beg, end);

+ for (uint8_t* s = start; s != cursor; s++) printf("%c", (char)*s);

+ printf(".\n");

}

+ sink_->Record(token, beg, end);

}

// ----------------------------------------------------------------------

- uint32_t push(

- const void *input,

- ssize_t inputSize

- )

- {

- printf(

- "scanner is receiving a new data batch of length %ld\n"

- "scanner continues with saved state = %d\n",

- inputSize,

- state

- );

+ uint32_t push(const void *input, int input_size) {

+ if (FLAG_trace_lexer) {

+ printf(

+ "scanner is receiving a new data batch of length %d\n"

+ "scanner continues with saved state = %d\n",

+ input_size,

+ state

+ );

+ }

* Data source is signaling end of file when batch size

@@ -224,12 +130,12 @@ public:

* the longest keyword, so given our grammar, 32 is a safe bet.

uint8_t null[64];

- const ssize_t maxFill = 32;

- if(inputSize<maxFill) // FIXME: do something about this!!!

+ const int maxFill = 32;

+ if(input_size<maxFill) // FIXME: do something about this!!!

{

eof = true;

input = null;

- inputSize = sizeof(null);

+ input_size = sizeof(null);

memset(null, 0, sizeof(null));

}

@@ -246,7 +152,7 @@ public:

size_t used = limit-buffer;

- size_t needed = used+inputSize;

+ size_t needed = used+input_size;

size_t allocated = bufferEnd-buffer;

if(allocated<needed)

{

@@ -263,8 +169,8 @@ public:

start = buffer + startOffset;

limit = limitOffset + buffer;

}

- memcpy(limit, input, inputSize);

- limit += inputSize;

+ memcpy(limit, input, input_size);

+ limit += input_size;

// The scanner starts here

#define YYLIMIT limit

@@ -283,7 +189,9 @@ public:

start:

- printf("Starting a round; state: %d, condition: %d\n", state, condition);

+ if (FLAG_trace_lexer) {

+ printf("Starting a round; state: %d, condition: %d\n", state, condition);

+ }

/*!re2c

re2c:indent:top = 1;

@@ -302,66 +210,66 @@ public:

hex_digit = [0-9a-fA-F];

maybe_exponent = ('e' [-+]? digit+)?;

- <Normal> "|=" { PUSH_TOKEN(ASSIGN_BIT_OR); }

- <Normal> "^=" { PUSH_TOKEN(ASSIGN_BIT_XOR); }

- <Normal> "&=" { PUSH_TOKEN(ASSIGN_BIT_AND); }

- <Normal> "+=" { PUSH_TOKEN(ASSIGN_ADD); }

- <Normal> "-=" { PUSH_TOKEN(ASSIGN_SUB); }

- <Normal> "*=" { PUSH_TOKEN(ASSIGN_MUL); }

- <Normal> "/=" { PUSH_TOKEN(ASSIGN_DIV); }

- <Normal> "%=" { PUSH_TOKEN(ASSIGN_MOD); }

- <Normal> "===" { PUSH_TOKEN(EQ_STRICT); }

- <Normal> "==" { PUSH_TOKEN(EQ); }

- <Normal> "=" { PUSH_TOKEN(ASSIGN); }

- <Normal> "!==" { PUSH_TOKEN(NE_STRICT); }

- <Normal> "!=" { PUSH_TOKEN(NE); }

- <Normal> "!" { PUSH_TOKEN(NOT); }

+ <Normal> "|=" { PUSH_TOKEN(Token::ASSIGN_BIT_OR); }

+ <Normal> "^=" { PUSH_TOKEN(Token::ASSIGN_BIT_XOR); }

+ <Normal> "&=" { PUSH_TOKEN(Token::ASSIGN_BIT_AND); }

+ <Normal> "+=" { PUSH_TOKEN(Token::ASSIGN_ADD); }

+ <Normal> "-=" { PUSH_TOKEN(Token::ASSIGN_SUB); }

+ <Normal> "*=" { PUSH_TOKEN(Token::ASSIGN_MUL); }

+ <Normal> "/=" { PUSH_TOKEN(Token::ASSIGN_DIV); }

+ <Normal> "%=" { PUSH_TOKEN(Token::ASSIGN_MOD); }

+ <Normal> "===" { PUSH_TOKEN(Token::EQ_STRICT); }

+ <Normal> "==" { PUSH_TOKEN(Token::EQ); }

+ <Normal> "=" { PUSH_TOKEN(Token::ASSIGN); }

+ <Normal> "!==" { PUSH_TOKEN(Token::NE_STRICT); }

+ <Normal> "!=" { PUSH_TOKEN(Token::NE); }

+ <Normal> "!" { PUSH_TOKEN(Token::NOT); }

<Normal> "//" :=> SingleLineComment

<Normal> "/*" :=> MultiLineComment

<Normal> "<!--" :=> HtmlComment

- <Normal> ">>>=" { PUSH_TOKEN(ASSIGN_SHR); }

- <Normal> "<<=" { PUSH_TOKEN(ASSIGN_SHL); }

- <Normal> ">>=" { PUSH_TOKEN(ASSIGN_SAR); }

- <Normal> "<=" { PUSH_TOKEN(LTE); }

- <Normal> ">=" { PUSH_TOKEN(GTE); }

- <Normal> "<<" { PUSH_TOKEN(SHL); }

- <Normal> ">>" { PUSH_TOKEN(SAR); }

- <Normal> "<" { PUSH_TOKEN(LT); }

- <Normal> ">" { PUSH_TOKEN(GT); }

- <Normal> '0x' hex_digit+ { PUSH_NUMBER(); }

- <Normal> "." digit+ maybe_exponent { PUSH_NUMBER(); }

- <Normal> digit+ ("." digit+)? maybe_exponent { PUSH_NUMBER(); }

- <Normal> "(" { PUSH_TOKEN(LPAREN); }

- <Normal> ")" { PUSH_TOKEN(RPAREN); }

- <Normal> "[" { PUSH_TOKEN(LBRACK); }

- <Normal> "]" { PUSH_TOKEN(RBRACK); }

- <Normal> "{" { PUSH_TOKEN(LBRACE); }

- <Normal> "}" { PUSH_TOKEN(RBRACE); }

- <Normal> ":" { PUSH_TOKEN(COLON); }

- <Normal> ";" { PUSH_TOKEN(SEMICOLON); }

- <Normal> "." { PUSH_TOKEN(PERIOD); }

- <Normal> "?" { PUSH_TOKEN(CONDITIONAL); }

- <Normal> "++" { PUSH_TOKEN(INC); }

- <Normal> "--" { PUSH_TOKEN(DEC); }

- <Normal> "||" { PUSH_TOKEN(OR); }

- <Normal> "&&" { PUSH_TOKEN(AND); }

- <Normal> "|" { PUSH_TOKEN(BIT_OR); }

- <Normal> "^" { PUSH_TOKEN(BIT_XOR); }

- <Normal> "&" { PUSH_TOKEN(BIT_AND); }

- <Normal> "+" { PUSH_TOKEN(ADD); }

- <Normal> "-" { PUSH_TOKEN(SUB); }

- <Normal> "*" { PUSH_TOKEN(MUL); }

- <Normal> "/" { PUSH_TOKEN(DIV); }

- <Normal> "%" { PUSH_TOKEN(MOD); }

- <Normal> "~" { PUSH_TOKEN(BIT_NOT); }

- <Normal> "," { PUSH_TOKEN(COMMA); }

+ <Normal> ">>>=" { PUSH_TOKEN(Token::ASSIGN_SHR); }

+ <Normal> "<<=" { PUSH_TOKEN(Token::ASSIGN_SHL); }

+ <Normal> ">>=" { PUSH_TOKEN(Token::ASSIGN_SAR); }

+ <Normal> "<=" { PUSH_TOKEN(Token::LTE); }

+ <Normal> ">=" { PUSH_TOKEN(Token::GTE); }

+ <Normal> "<<" { PUSH_TOKEN(Token::SHL); }

+ <Normal> ">>" { PUSH_TOKEN(Token::SAR); }

+ <Normal> "<" { PUSH_TOKEN(Token::LT); }

+ <Normal> ">" { PUSH_TOKEN(Token::GT); }

+ <Normal> '0x' hex_digit+ { PUSH_TOKEN(Token::NUMBER); }

+ <Normal> "." digit+ maybe_exponent { PUSH_TOKEN(Token::NUMBER); }

+ <Normal> digit+ ("." digit+)? maybe_exponent { PUSH_TOKEN(Token::NUMBER); }

+ <Normal> "(" { PUSH_TOKEN(Token::LPAREN); }

+ <Normal> ")" { PUSH_TOKEN(Token::RPAREN); }

+ <Normal> "[" { PUSH_TOKEN(Token::LBRACK); }

+ <Normal> "]" { PUSH_TOKEN(Token::RBRACK); }

+ <Normal> "{" { PUSH_TOKEN(Token::LBRACE); }

+ <Normal> "}" { PUSH_TOKEN(Token::RBRACE); }

+ <Normal> ":" { PUSH_TOKEN(Token::COLON); }

+ <Normal> ";" { PUSH_TOKEN(Token::SEMICOLON); }

+ <Normal> "." { PUSH_TOKEN(Token::PERIOD); }

+ <Normal> "?" { PUSH_TOKEN(Token::CONDITIONAL); }

+ <Normal> "++" { PUSH_TOKEN(Token::INC); }

+ <Normal> "--" { PUSH_TOKEN(Token::DEC); }

+ <Normal> "||" { PUSH_TOKEN(Token::OR); }

+ <Normal> "&&" { PUSH_TOKEN(Token::AND); }

+ <Normal> "|" { PUSH_TOKEN(Token::BIT_OR); }

+ <Normal> "^" { PUSH_TOKEN(Token::BIT_XOR); }

+ <Normal> "&" { PUSH_TOKEN(Token::BIT_AND); }

+ <Normal> "+" { PUSH_TOKEN(Token::ADD); }

+ <Normal> "-" { PUSH_TOKEN(Token::SUB); }

+ <Normal> "*" { PUSH_TOKEN(Token::MUL); }

+ <Normal> "/" { PUSH_TOKEN(Token::DIV); }

+ <Normal> "%" { PUSH_TOKEN(Token::MOD); }

+ <Normal> "~" { PUSH_TOKEN(Token::BIT_NOT); }

+ <Normal> "," { PUSH_TOKEN(Token::COMMA); }

<Normal> line_terminator+ { PUSH_LINE_TERMINATOR(); }

<Normal> whitespace { SKIP(); }

@@ -371,19 +279,19 @@ public:

<Normal> identifier_start :=> Identifier

- <Normal> eof { PUSH_EOS(); return 1; }

+ <Normal> eof { PUSH_TOKEN(Token::EOS); return 1; }

<Normal> any { TERMINATE_ILLEGAL(); }

<DoubleQuoteString> "\\\"" { goto yy0; }

- <DoubleQuoteString> '"' { PUSH_STRING();}

+ <DoubleQuoteString> '"' { PUSH_TOKEN(Token::STRING);}

<DoubleQuoteString> any { goto yy0; }

<SingleQuoteString> "\\'" { goto yy0; }

- <SingleQuoteString> "'" { PUSH_STRING();}

+ <SingleQuoteString> "'" { PUSH_TOKEN(Token::STRING);}

<SingleQuoteString> any { goto yy0; }

<Identifier> identifier_char+ { goto yy0; }

- <Identifier> any { PUSH_IDENTIFIER(); }

+ <Identifier> any { cursor--; PUSH_TOKEN(Token::IDENTIFIER); }

<SingleLineComment> line_terminator { PUSH_LINE_TERMINATOR();}

<SingleLineComment> eof { PUSH_LINE_TERMINATOR();}

@@ -399,82 +307,80 @@ public:

fill:

- ssize_t unfinishedSize = cursor-start;

- printf(

- "scanner needs a refill. Exiting for now with:\n"

- " saved fill state = %d\n"

- " unfinished token size = %ld\n",

- state,

- unfinishedSize

- );

- if(0<unfinishedSize && start<limit)

- {

- printf(" unfinished token is: ");

- fwrite(start, 1, cursor-start, stdout);

+ int unfinishedSize = cursor-start;

+ if (FLAG_trace_lexer) {

+ printf(

+ "scanner needs a refill. Exiting for now with:\n"

+ " saved fill state = %d\n"

+ " unfinished token size = %d\n",

+ state,

+ unfinishedSize

+ );

+ if(0 < unfinishedSize && start < limit) {

+ printf(" unfinished token is: ");

+ fwrite(start, 1, cursor-start, stdout);

+ putchar('\n');

+ }

putchar('\n');

}

- putchar('\n');

* Once we get here, we can get rid of

* everything before start and after limit.

- if(eof==true) goto start;

- if(buffer<start)

- {

- size_t startOffset = start-buffer;

- memmove(buffer, start, limit-start);

- marker -= startOffset;

- cursor -= startOffset;

- limit -= startOffset;

- start -= startOffset;

+ if (eof == true) goto start;

+ if (buffer < start) {

+ size_t start_offset = start - buffer;

+ memmove(buffer, start, limit - start);

+ marker -= start_offset;

+ cursor -= start_offset;

+ limit -= start_offset;

+ start -= start_offset;

+ real_start += start_offset;

}

return 0;

}

};

-// ----------------------------------------------------------------------

-int main(

- int argc,

- char **argv

- // Parse cmd line

- int input = 0;

- if(1<argc)

- {

- input = open(argv[1], O_RDONLY | O_BINARY);

- if(input<0)

- {

- fprintf(

- stderr,

- "could not open file %s\n",

- argv[1]

- );

- exit(1);

- }

- /*

- * Tokenize input file by pushing batches

- * of data one by one into the scanner.

- */

- const size_t batchSize = 256;

- uint8_t buffer[batchSize];

- PushScanner scanner;

- while(1)

- {

- ssize_t n = read(input, buffer, batchSize);

- if (scanner.push(buffer, n)) {

- printf("Scanner: illegal data\n");

- return 1;

- }

- if(n<batchSize) break;

- }

- scanner.push(0, -1);

- close(input);

+ExperimentalScanner::ExperimentalScanner(const char* fname) :

+ current_(0), fetched_(0) {

+ file_ = fopen(fname, "rb");

+ scanner_ = new PushScanner(this);

+ExperimentalScanner::~ExperimentalScanner() {

+ fclose(file_);

+void ExperimentalScanner::FillTokens() {

+ current_ = 0;

+ fetched_ = 0;

+ uint8_t chars[BUFFER_SIZE];

+ int n = static_cast<int>(fread(&chars, 1, BUFFER_SIZE, file_));

+ scanner_->push(chars, n);

+Token::Value ExperimentalScanner::Next(int* beg_pos, int* end_pos) {

+ if (current_ == fetched_) {

+ FillTokens();

+ }

+ *beg_pos = beg_[current_];

+ *end_pos = end_[current_];

+ Token::Value res = token_[current_];

+ if (token_[current_] != Token::Token::EOS &&

+ token_[current_] != Token::ILLEGAL) current_++;

+ return res;

- // Done

- return 0;

+void ExperimentalScanner::Record(Token::Value token, int beg, int end) {

+ if (token == Token::EOS) end--;

+ token_[fetched_] = token;

+ beg_[fetched_] = beg;

+ end_[fetched_] = end;

+ fetched_++;

}

« no previous file with comments | « src/lexer/lexer.gyp ('k') | src/lexer/lexer-shell.cc » ('j') | no next file with comments »