src/jsregexp.h - Issue 9961009: Regexp: Unify the lookahead code for the BoyerMoore-like skipping

Unified Diff: src/jsregexp.h

Issue 9961009: Regexp: Unify the lookahead code for the BoyerMoore-like skipping (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.h

===================================================================

--- src/jsregexp.h (revision 11224)

+++ src/jsregexp.h (working copy)

@@ -40,6 +40,7 @@

class RegExpMacroAssembler;

class RegExpNode;

class RegExpTree;

+class BoyerMooreLookahead;

class RegExpImpl {

public:

@@ -224,48 +225,6 @@

};

-// Represents the relation of two sets.

-// Sets can be either disjoint, partially or fully overlapping, or equal.

-class SetRelation BASE_EMBEDDED {

- public:

- // Relation is represented by a bit saying whether there are elements in

- // one set that is not in the other, and a bit saying that there are elements

- // that are in both sets.

- // Location of an element. Corresponds to the internal areas of

- // a Venn diagram.

- enum {

- kInFirst = 1 << kInsideFirst,

- kInSecond = 1 << kInsideSecond,

- kInBoth = 1 << kInsideBoth

- };

- SetRelation() : bits_(0) {}

- ~SetRelation() {}

- // Add the existence of objects in a particular

- void SetElementsInFirstSet() { bits_ |= kInFirst; }

- void SetElementsInSecondSet() { bits_ |= kInSecond; }

- void SetElementsInBothSets() { bits_ |= kInBoth; }

- // Check the currently known relation of the sets (common functions only,

- // for other combinations, use value() to get the bits and check them

- // manually).

- // Sets are completely disjoint.

- bool Disjoint() { return (bits_ & kInBoth) == 0; }

- // Sets are equal.

- bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; }

- // First set contains second.

- bool Contains() { return (bits_ & kInSecond) == 0; }

- // Second set contains first.

- bool ContainedIn() { return (bits_ & kInFirst) == 0; }

- bool NonTrivialIntersection() {

- return (bits_ == (kInFirst | kInSecond | kInBoth));

- }

- int value() { return bits_; }

- private:

- int bits_;

-};

class CharacterRange {

public:

CharacterRange() : from_(0), to_(0) { }

@@ -273,7 +232,7 @@

CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT

CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { }

static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges);

- static Vector<const uc16> GetWordBounds();

+ static Vector<const int> GetWordBounds();

static inline CharacterRange Singleton(uc16 value) {

return CharacterRange(value, value);

}

@@ -294,7 +253,7 @@

bool IsSingleton() { return (from_ == to_); }

void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii);

static void Split(ZoneList<CharacterRange>* base,

- Vector<const uc16> overlay,

+ Vector<const int> overlay,

ZoneList<CharacterRange>** included,

ZoneList<CharacterRange>** excluded);

// Whether a range list is in canonical form: Ranges ordered by from value,

@@ -305,28 +264,6 @@

// adjacent ranges are merged. The resulting list may be shorter than the

// original, but cannot be longer.

static void Canonicalize(ZoneList<CharacterRange>* ranges);

- // Check how the set of characters defined by a CharacterRange list relates

- // to the set of word characters. List must be in canonical form.

- static SetRelation WordCharacterRelation(ZoneList<CharacterRange>* ranges);

- // Takes two character range lists (representing character sets) in canonical

- // form and merges them.

- // The characters that are only covered by the first set are added to

- // first_set_only_out. the characters that are only in the second set are

- // added to second_set_only_out, and the characters that are in both are

- // added to both_sets_out.

- // The pointers to first_set_only_out, second_set_only_out and both_sets_out

- // should be to empty lists, but they need not be distinct, and may be NULL.

- // If NULL, the characters are dropped, and if two arguments are the same

- // pointer, the result is the union of the two sets that would be created

- // if the pointers had been distinct.

- // This way, the Merge function can compute all the usual set operations:

- // union (all three out-sets are equal), intersection (only both_sets_out is

- // non-NULL), and set difference (only first_set is non-NULL).

- static void Merge(ZoneList<CharacterRange>* first_set,

- ZoneList<CharacterRange>* second_set,

- ZoneList<CharacterRange>* first_set_only_out,

- ZoneList<CharacterRange>* second_set_only_out,

- ZoneList<CharacterRange>* both_sets_out);

// Negate the contents of a character range in canonical form.

static void Negate(ZoneList<CharacterRange>* src,

ZoneList<CharacterRange>* dst);

@@ -421,90 +358,6 @@

};

-// Improve the speed that we scan for an initial point where a non-anchored

-// regexp can match by using a Boyer-Moore-like table. This is done by

-// identifying non-greedy non-capturing loops in the nodes that eat any

-// character one at a time. For example in the middle of the regexp

-// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly

-// inserted at the start of any non-anchored regexp.

-//

-// When we have found such a loop we look ahead in the nodes to find the set of

-// characters that can come at given distances. For example for the regexp

-// /.?foo/ we know that there are at least 3 characters ahead of us, and the

-// sets of characters that can occur are [any, [f, o], [o]]. We find a range in

-// the lookahead info where the set of characters is reasonably constrained. In

-// our example this is from index 1 to 2 (0 is not constrained). We can now

-// look 3 characters ahead and if we don't find one of [f, o] (the union of

-// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).

-//

-// For Unicode input strings we do the same, but modulo 128.

-//

-// We also look at the first string fed to the regexp and use that to get a hint

-// of the character frequencies in the inputs. This affects the assessment of

-// whether the set of characters is 'reasonably constrained'.

-//

-// We also have another lookahead mechanism (called quick check in the code),

-// which uses a wide load of multiple characters followed by a mask and compare

-// to determine whether a match is possible at this point.

-class BoyerMooreLookahead {

- public:

- BoyerMooreLookahead(int length, int map_length, RegExpCompiler* compiler);

- int length() { return length_; }

- int max_char() { return max_char_; }

- RegExpCompiler* compiler() { return compiler_; }

- static const int kTooManyCharacters = 32;

- int Count(int map_number) {

- ZoneList<bool>* map = bitmaps_->at(map_number);

- if (map == NULL) return map_length_;

- int count = 0;

- for (int i = 0; i < map_length_; i++) {

- if (map->at(i)) count++;

- }

- return count;

- }

- void Set(int map_number, int character) {

- if (character > max_char_) return;

- ZoneList<bool>* map = bitmaps_->at(map_number);

- if (map == NULL) return;

- map->at(character & (map_length_ - 1)) = true;

- }

- void SetAll(int map_number) {

- bitmaps_->at(map_number) = NULL;

- }

- void SetRest(int from_map) {

- for (int i = from_map; i < length_; i++) SetAll(i);

- }

- bool EmitSkipInstructions(RegExpMacroAssembler* masm);

- private:

- // This is the value obtained by EatsAtLeast. If we do not have at least this

- // many characters left in the sample string then the match is bound to fail.

- // Therefore it is OK to read a character this far ahead of the current match

- // point.

- int length_;

- // We conservatively consider all character values modulo this length. For

- // ASCII there is no loss of precision, since this has a value of 128.

- int map_length_;

- RegExpCompiler* compiler_;

- // 0x7f for ASCII, 0xffff for UTF-16.

- int max_char_;

- ZoneList<ZoneList<bool>*>* bitmaps_;

- int GetSkipTable(int min_lookahead,

- int max_lookahead,

- Handle<ByteArray> boolean_skip_table);

- bool FindWorthwhileInterval(int* from, int* to);

- int FindBestInterval(

- int max_number_of_chars, int old_biggest_points, int* from, int* to);

-};

#define FOR_EACH_NODE_TYPE(VISIT) \

VISIT(End) \

VISIT(Action) \

@@ -687,7 +540,9 @@

class RegExpNode: public ZoneObject {

public:

- RegExpNode() : first_character_set_(NULL), trace_count_(0) { }

+ RegExpNode() : first_character_set_(NULL), trace_count_(0) {

+ bm_info_[0] = bm_info_[1] = NULL;

+ }

virtual ~RegExpNode();

virtual void Accept(NodeVisitor* visitor) = 0;

// Generates a goto to this node or actually generates the code at this point.

@@ -731,7 +586,7 @@

// Collects information on the possible code units (mod 128) that can match if

// we look forward. This is used for a Boyer-Moore-like string searching

// implementation. TODO(erikcorry): This should share more code with

- // EatsAtLeast, GetQuickCheckDetails and ComputeFirstCharacterSet.

+ // EatsAtLeast, GetQuickCheckDetails.

virtual void FillInBMInfo(

int offset, BoyerMooreLookahead* bm, bool not_at_start) {

UNREACHABLE();

@@ -759,17 +614,6 @@

SiblingList* siblings() { return &siblings_; }

void set_siblings(SiblingList* other) { siblings_ = *other; }

- // Return the set of possible next characters recognized by the regexp

- // (or a safe subset, potentially the set of all characters).

- ZoneList<CharacterRange>* FirstCharacterSet();

- // Compute (if possible within the budget of traversed nodes) the

- // possible first characters of the input matched by this node and

- // its continuation. Returns the remaining budget after the computation.

- // If the budget is spent, the result is negative, and the cached

- // first_character_set_ value isn't set.

- virtual int ComputeFirstCharacterSet(int budget);

// Get and set the cached first character set value.

ZoneList<CharacterRange>* first_character_set() {

return first_character_set_;

@@ -777,6 +621,9 @@

void set_first_character_set(ZoneList<CharacterRange>* character_set) {

first_character_set_ = character_set;

}

+ BoyerMooreLookahead* bm_info(bool not_at_start) {

+ return bm_info_[not_at_start ? 1 : 0];

+ }

protected:

enum LimitResult { DONE, CONTINUE };

@@ -801,6 +648,10 @@

// processed before it is on a usable state.

virtual RegExpNode* Clone() = 0;

+ void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {

+ bm_info_[not_at_start ? 1 : 0] = bm;

+ }

private:

static const int kFirstCharBudget = 10;

Label label_;

@@ -813,6 +664,7 @@

// a trace, in which case it is generic and can be reused by flushing the

// deferred operations in the current trace and generating a goto.

int trace_count_;

+ BoyerMooreLookahead* bm_info_[2];

};

@@ -833,8 +685,8 @@

return (from_ <= value) && (value <= to_);

}

bool is_empty() { return from_ == kNone; }

- int from() { return from_; }

- int to() { return to_; }

+ int from() const { return from_; }

+ int to() const { return to_; }

static Interval Empty() { return Interval(); }

static const int kNone = -1;

private:

@@ -852,6 +704,7 @@

virtual void FillInBMInfo(

int offset, BoyerMooreLookahead* bm, bool not_at_start) {

on_success_->FillInBMInfo(offset, bm, not_at_start);

+ if (offset == 0) set_bm_info(not_at_start, bm);

}

private:

RegExpNode* on_success_;

@@ -905,7 +758,6 @@

// TODO(erikcorry): We should allow some action nodes in greedy loops.

virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }

virtual ActionNode* Clone() { return new ActionNode(*this); }

- virtual int ComputeFirstCharacterSet(int budget);

private:

union {

@@ -978,7 +830,6 @@

return result;

}

void CalculateOffsets();

- virtual int ComputeFirstCharacterSet(int budget);

private:

enum TextEmitPassType {

@@ -1009,12 +860,7 @@

AT_START,

AT_BOUNDARY,

AT_NON_BOUNDARY,

- AFTER_NEWLINE,

- // Types not directly expressible in regexp syntax.

- // Used for modifying a boundary node if its following character is

- // known to be word and/or non-word.

- AFTER_NONWORD_CHARACTER,

- AFTER_WORD_CHARACTER

+ AFTER_NEWLINE

};

static AssertionNode* AtEnd(RegExpNode* on_success) {

return new AssertionNode(AT_END, on_success);

@@ -1042,12 +888,16 @@

bool not_at_start);

virtual void FillInBMInfo(

int offset, BoyerMooreLookahead* bm, bool not_at_start);

- virtual int ComputeFirstCharacterSet(int budget);

virtual AssertionNode* Clone() { return new AssertionNode(*this); }

AssertionNodeType type() { return type_; }

void set_type(AssertionNodeType type) { type_ = type; }

private:

+ void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);

+ enum IfPrevious { kIsNonWord, kIsWord };

+ void BacktrackIfPrevious(RegExpCompiler* compiler,

+ Trace* trace,

+ IfPrevious backtrack_if_previous);

AssertionNode(AssertionNodeType t, RegExpNode* on_success)

: SeqRegExpNode(on_success), type_(t) { }

AssertionNodeType type_;

@@ -1076,13 +926,8 @@

return;

}

virtual void FillInBMInfo(

- int offset, BoyerMooreLookahead* bm, bool not_at_start) {

- // Working out the set of characters that a backreference can match is too

- // hard, so we just say that any character can match.

- bm->SetRest(offset);

- }

+ int offset, BoyerMooreLookahead* bm, bool not_at_start);

virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }

- virtual int ComputeFirstCharacterSet(int budget);

private:

int start_reg_;

@@ -1249,6 +1094,7 @@

virtual void FillInBMInfo(

int offset, BoyerMooreLookahead* bm, bool not_at_start) {

alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start);

+ if (offset == 0) set_bm_info(not_at_start, bm);

}

// For a negative lookahead we don't emit the quick check for the

// alternative that is expected to fail. This is because quick check code

@@ -1256,7 +1102,6 @@

// characters, but on a negative lookahead the negative branch did not take

// part in that calculation (EatsAtLeast) so the assumptions don't hold.

virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }

- virtual int ComputeFirstCharacterSet(int budget);

};

@@ -1279,7 +1124,6 @@

bool not_at_start);

virtual void FillInBMInfo(

int offset, BoyerMooreLookahead* bm, bool not_at_start);

- virtual int ComputeFirstCharacterSet(int budget);

virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }

RegExpNode* loop_node() { return loop_node_; }

RegExpNode* continue_node() { return continue_node_; }

@@ -1300,6 +1144,141 @@

};

+// Improve the speed that we scan for an initial point where a non-anchored

+// regexp can match by using a Boyer-Moore-like table. This is done by

+// identifying non-greedy non-capturing loops in the nodes that eat any

+// character one at a time. For example in the middle of the regexp

+// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly

+// inserted at the start of any non-anchored regexp.

+//

+// When we have found such a loop we look ahead in the nodes to find the set of

+// characters that can come at given distances. For example for the regexp

+// /.?foo/ we know that there are at least 3 characters ahead of us, and the

+// sets of characters that can occur are [any, [f, o], [o]]. We find a range in

+// the lookahead info where the set of characters is reasonably constrained. In

+// our example this is from index 1 to 2 (0 is not constrained). We can now

+// look 3 characters ahead and if we don't find one of [f, o] (the union of

+// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).

+//

+// For Unicode input strings we do the same, but modulo 128.

+//

+// We also look at the first string fed to the regexp and use that to get a hint

+// of the character frequencies in the inputs. This affects the assessment of

+// whether the set of characters is 'reasonably constrained'.

+//

+// We also have another lookahead mechanism (called quick check in the code),

+// which uses a wide load of multiple characters followed by a mask and compare

+// to determine whether a match is possible at this point.

+enum ContainedInLattice {

+ kNotYet = 0,

+ kLatticeIn = 1,

+ kLatticeOut = 2,

+ kLatticeUnknown = 3 // Can also mean both in and out.

+};

+ContainedInLattice AddRange(ContainedInLattice a,

+ const int* ranges,

+ int ranges_size,

+ Interval new_range);

+class BoyerMoorePositionInfo : public ZoneObject {

+ public:

+ BoyerMoorePositionInfo()

+ : map_(new ZoneList<bool>(kMapSize)),

+ map_count_(0),

+ w_(kNotYet),

+ s_(kNotYet),

+ d_(kNotYet),

+ surrogate_(kNotYet) {

+ for (int i = 0; i < kMapSize; i++) {

+ map_->Add(false);

+ }

+ bool& at(int i) { return map_->at(i); }

+ static const int kMapSize = 128;

+ static const int kMask = kMapSize -1 ;

ulan 2012/04/16 13:52:22 Should be kMapSize - 1;

Erik Corry 2012/04/17 07:51:45 Done.

+ int map_count() const { return map_count_; }

+ void Set(int character);

+ void SetInterval(const Interval& interval);

+ void SetAll();

+ bool is_non_word() { return w_ == kLatticeOut; }

+ bool is_word() { return w_ == kLatticeIn; }

+ private:

+ ZoneList<bool>* map_;

+ int map_count_; // Number of set bits in the map.

+ ContainedInLattice w_; // The \w character class.

+ ContainedInLattice s_; // The \s character class.

+ ContainedInLattice d_; // The \d character class.

+ ContainedInLattice surrogate_; // Surrogate UTF-16 code units.

+};

+class BoyerMooreLookahead : public ZoneObject {

+ public:

+ BoyerMooreLookahead(int length, RegExpCompiler* compiler);

+ int length() { return length_; }

+ int max_char() { return max_char_; }

+ RegExpCompiler* compiler() { return compiler_; }

+ int Count(int map_number) {

+ return bitmaps_->at(map_number)->map_count();

+ }

+ BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }

+ void Set(int map_number, int character) {

+ if (character > max_char_) return;

+ BoyerMoorePositionInfo* info = bitmaps_->at(map_number);

+ info->Set(character);

+ }

+ void SetInterval(int map_number, const Interval& interval) {

+ if (interval.from() > max_char_) return;

+ BoyerMoorePositionInfo* info = bitmaps_->at(map_number);

+ if (interval.to() > max_char_) {

+ info->SetInterval(Interval(interval.from(), max_char_));

+ } else {

+ info->SetInterval(interval);

+ }

+ void SetAll(int map_number) {

+ bitmaps_->at(map_number)->SetAll();

+ }

+ void SetRest(int from_map) {

+ for (int i = from_map; i < length_; i++) SetAll(i);

+ }

+ bool EmitSkipInstructions(RegExpMacroAssembler* masm);

+ private:

+ // This is the value obtained by EatsAtLeast. If we do not have at least this

+ // many characters left in the sample string then the match is bound to fail.

+ // Therefore it is OK to read a character this far ahead of the current match

+ // point.

+ int length_;

+ RegExpCompiler* compiler_;

+ // 0x7f for ASCII, 0xffff for UTF-16.

+ int max_char_;

+ ZoneList<BoyerMoorePositionInfo*>* bitmaps_;

+ int GetSkipTable(int min_lookahead,

+ int max_lookahead,

+ Handle<ByteArray> boolean_skip_table);

+ bool FindWorthwhileInterval(int* from, int* to);

+ int FindBestInterval(

+ int max_number_of_chars, int old_biggest_points, int* from, int* to);

+};

// There are many ways to generate code for a node. This class encapsulates

// the current way we should be generating. In other words it encapsulates

// the current state of the code generator. The effect of this is that we

« no previous file with comments | « no previous file | src/jsregexp.cc » ('j') | src/jsregexp.cc » ('J')