icu51/source/common/unisetspan.cpp - Issue 20882002: Check in the pristine copy of ICU 51.2

Unified Diff: icu51/source/common/unisetspan.cpp

Issue 20882002: Check in the pristine copy of ICU 51.2 (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu51/source/common/unisetspan.cpp

===================================================================

--- icu51/source/common/unisetspan.cpp (revision 0)

+++ icu51/source/common/unisetspan.cpp (revision 0)

@@ -0,0 +1,1510 @@

+/*

+******************************************************************************

+* file name: unisetspan.cpp

+* encoding: US-ASCII

+* tab size: 8 (not used)

+* indentation:4

+* created on: 2007mar01

+* created by: Markus W. Scherer

+*/

+#include "unicode/utypes.h"

+#include "unicode/uniset.h"

+#include "unicode/ustring.h"

+#include "unicode/utf8.h"

+#include "unicode/utf16.h"

+#include "cmemory.h"

+#include "uvector.h"

+#include "unisetspan.h"

+U_NAMESPACE_BEGIN

+/*

+ * List of offsets from the current position from where to try matching

+ * a code point or a string.

+ * Store offsets rather than indexes to simplify the code and use the same list

+ * for both increments (in span()) and decrements (in spanBack()).

+ *

+ * Assumption: The maximum offset is limited, and the offsets that are stored

+ * at any one time are relatively dense, that is, there are normally no gaps of

+ * hundreds or thousands of offset values.

+ *

+ * The implementation uses a circular buffer of byte flags,

+ * each indicating whether the corresponding offset is in the list.

+ * This avoids inserting into a sorted list of offsets (or absolute indexes) and

+ * physically moving part of the list.

+ *

+ * Note: In principle, the caller should setMaxLength() to the maximum of the

+ * max string length and U16_LENGTH/U8_LENGTH to account for

+ * "long" single code points.

+ * However, this implementation uses at least a staticList with more than

+ * U8_LENGTH entries anyway.

+ *

+ * Note: If maxLength were guaranteed to be no more than 32 or 64,

+ * the list could be stored as bit flags in a single integer.

+ * Rather than handling a circular buffer with a start list index,

+ * the integer would simply be shifted when lower offsets are removed.

+ * UnicodeSet does not have a limit on the lengths of strings.

+ */

+class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.

+public:

+ OffsetList() : list(staticList), capacity(0), length(0), start(0) {}

+ ~OffsetList() {

+ if(list!=staticList) {

+ uprv_free(list);

+ }

+ // Call exactly once if the list is to be used.

+ void setMaxLength(int32_t maxLength) {

+ if(maxLength<=(int32_t)sizeof(staticList)) {

+ capacity=(int32_t)sizeof(staticList);

+ } else {

+ UBool *l=(UBool *)uprv_malloc(maxLength);

+ if(l!=NULL) {

+ list=l;

+ capacity=maxLength;

+ }

+ uprv_memset(list, 0, capacity);

+ }

+ void clear() {

+ uprv_memset(list, 0, capacity);

+ start=length=0;

+ }

+ UBool isEmpty() const {

+ return (UBool)(length==0);

+ }

+ // Reduce all stored offsets by delta, used when the current position

+ // moves by delta.

+ // There must not be any offsets lower than delta.

+ // If there is an offset equal to delta, it is removed.

+ // delta=[1..maxLength]

+ void shift(int32_t delta) {

+ int32_t i=start+delta;

+ if(i>=capacity) {

+ i-=capacity;

+ }

+ if(list[i]) {

+ list[i]=FALSE;

+ --length;

+ }

+ start=i;

+ }

+ // Add an offset. The list must not contain it yet.

+ // offset=[1..maxLength]

+ void addOffset(int32_t offset) {

+ int32_t i=start+offset;

+ if(i>=capacity) {

+ i-=capacity;

+ }

+ list[i]=TRUE;

+ ++length;

+ }

+ // offset=[1..maxLength]

+ UBool containsOffset(int32_t offset) const {

+ int32_t i=start+offset;

+ if(i>=capacity) {

+ i-=capacity;

+ }

+ return list[i];

+ }

+ // Find the lowest stored offset from a non-empty list, remove it,

+ // and reduce all other offsets by this minimum.

+ // Returns [1..maxLength].

+ int32_t popMinimum() {

+ // Look for the next offset in list[start+1..capacity-1].

+ int32_t i=start, result;

+ while(++i<capacity) {

+ if(list[i]) {

+ list[i]=FALSE;

+ --length;

+ result=i-start;

+ start=i;

+ return result;

+ }

+ // i==capacity

+ // Wrap around and look for the next offset in list[0..start].

+ // Since the list is not empty, there will be one.

+ result=capacity-start;

+ i=0;

+ while(!list[i]) {

+ ++i;

+ }

+ list[i]=FALSE;

+ --length;

+ start=i;

+ return result+=i;

+ }

+private:

+ UBool *list;

+ int32_t capacity;

+ int32_t length;

+ int32_t start;

+ UBool staticList[16];

+};

+// Get the number of UTF-8 bytes for a UTF-16 (sub)string.

+static int32_t

+getUTF8Length(const UChar *s, int32_t length) {

+ UErrorCode errorCode=U_ZERO_ERROR;

+ int32_t length8=0;

+ u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);

+ if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {

+ return length8;

+ } else {

+ // The string contains an unpaired surrogate.

+ // Ignore this string.

+ return 0;

+ }

+// Append the UTF-8 version of the string to t and return the appended UTF-8 length.

+static int32_t

+appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {

+ UErrorCode errorCode=U_ZERO_ERROR;

+ int32_t length8=0;

+ u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);

+ if(U_SUCCESS(errorCode)) {

+ return length8;

+ } else {

+ // The string contains an unpaired surrogate.

+ // Ignore this string.

+ return 0;

+ }

+static inline uint8_t

+makeSpanLengthByte(int32_t spanLength) {

+ // 0xfe==UnicodeSetStringSpan::LONG_SPAN

+ return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;

+// Construct for all variants of span(), or only for any one variant.

+// Initialize as little as possible, for single use.

+UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,

+ const UVector &setStrings,

+ uint32_t which)

+ : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),

+ utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),

+ utf8Length(0),

+ maxLength16(0), maxLength8(0),

+ all((UBool)(which==ALL)) {

+ spanSet.retainAll(set);

+ if(which&NOT_CONTAINED) {

+ // Default to the same sets.

+ // addToSpanNotSet() will create a separate set if necessary.

+ pSpanNotSet=&spanSet;

+ }

+ // Determine if the strings even need to be taken into account at all for span() etc.

+ // If any string is relevant, then all strings need to be used for

+ // span(longest match) but only the relevant ones for span(while contained).

+ // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH

+ // and do not store UTF-8 strings if !thisRelevant and CONTAINED.

+ // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)

+ // Also count the lengths of the UTF-8 versions of the strings for memory allocation.

+ int32_t stringsLength=strings.size();

+ int32_t i, spanLength;

+ UBool someRelevant=FALSE;

+ for(i=0; i<stringsLength; ++i) {

+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

+ const UChar *s16=string.getBuffer();

+ int32_t length16=string.length();

+ UBool thisRelevant;

+ spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);

+ if(spanLength<length16) { // Relevant string.

+ someRelevant=thisRelevant=TRUE;

+ } else {

+ thisRelevant=FALSE;

+ }

+ if((which&UTF16) && length16>maxLength16) {

+ maxLength16=length16;

+ }

+ if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {

+ int32_t length8=getUTF8Length(s16, length16);

+ utf8Length+=length8;

+ if(length8>maxLength8) {

+ maxLength8=length8;

+ }

+ if(!someRelevant) {

+ maxLength16=maxLength8=0;

+ return;

+ }

+ // Freeze after checking for the need to use strings at all because freezing

+ // a set takes some time and memory which are wasted if there are no relevant strings.

+ if(all) {

+ spanSet.freeze();

+ }

+ uint8_t *spanBackLengths;

+ uint8_t *spanUTF8Lengths;

+ uint8_t *spanBackUTF8Lengths;

+ // Allocate a block of meta data.

+ int32_t allocSize;

+ if(all) {

+ // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.

+ allocSize=stringsLength*(4+1+1+1+1)+utf8Length;

+ } else {

+ allocSize=stringsLength; // One set of span lengths.

+ if(which&UTF8) {

+ // UTF-8 lengths and UTF-8 strings.

+ allocSize+=stringsLength*4+utf8Length;

+ }

+ if(allocSize<=(int32_t)sizeof(staticLengths)) {

+ utf8Lengths=staticLengths;

+ } else {

+ utf8Lengths=(int32_t *)uprv_malloc(allocSize);

+ if(utf8Lengths==NULL) {

+ maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.

+ return; // Out of memory.

+ }

+ if(all) {

+ // Store span lengths for all span() variants.

+ spanLengths=(uint8_t *)(utf8Lengths+stringsLength);

+ spanBackLengths=spanLengths+stringsLength;

+ spanUTF8Lengths=spanBackLengths+stringsLength;

+ spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;

+ utf8=spanBackUTF8Lengths+stringsLength;

+ } else {

+ // Store span lengths for only one span() variant.

+ if(which&UTF8) {

+ spanLengths=(uint8_t *)(utf8Lengths+stringsLength);

+ utf8=spanLengths+stringsLength;

+ } else {

+ spanLengths=(uint8_t *)utf8Lengths;

+ }

+ spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;

+ }

+ // Set the meta data and pSpanNotSet and write the UTF-8 strings.

+ int32_t utf8Count=0; // Count UTF-8 bytes written so far.

+ for(i=0; i<stringsLength; ++i) {

+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

+ const UChar *s16=string.getBuffer();

+ int32_t length16=string.length();

+ spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);

+ if(spanLength<length16) { // Relevant string.

+ if(which&UTF16) {

+ if(which&CONTAINED) {

+ if(which&FWD) {

+ spanLengths[i]=makeSpanLengthByte(spanLength);

+ }

+ if(which&BACK) {

+ spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);

+ spanBackLengths[i]=makeSpanLengthByte(spanLength);

+ }

+ } else /* not CONTAINED, not all, but NOT_CONTAINED */ {

+ spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.

+ }

+ if(which&UTF8) {

+ uint8_t *s8=utf8+utf8Count;

+ int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);

+ utf8Count+=utf8Lengths[i]=length8;

+ if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.

+ spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;

+ } else { // Relevant for UTF-8.

+ if(which&CONTAINED) {

+ if(which&FWD) {

+ spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);

+ spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);

+ }

+ if(which&BACK) {

+ spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);

+ spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);

+ }

+ } else /* not CONTAINED, not all, but NOT_CONTAINED */ {

+ spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.

+ }

+ if(which&NOT_CONTAINED) {

+ // Add string start and end code points to the spanNotSet so that

+ // a span(while not contained) stops before any string.

+ UChar32 c;

+ if(which&FWD) {

+ int32_t len=0;

+ U16_NEXT(s16, len, length16, c);

+ addToSpanNotSet(c);

+ }

+ if(which&BACK) {

+ int32_t len=length16;

+ U16_PREV(s16, 0, len, c);

+ addToSpanNotSet(c);

+ }

+ } else { // Irrelevant string.

+ if(which&UTF8) {

+ if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.

+ uint8_t *s8=utf8+utf8Count;

+ int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);

+ utf8Count+=utf8Lengths[i]=length8;

+ } else {

+ utf8Lengths[i]=0;

+ }

+ if(all) {

+ spanLengths[i]=spanBackLengths[i]=

+ spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=

+ (uint8_t)ALL_CP_CONTAINED;

+ } else {

+ // All spanXYZLengths pointers contain the same address.

+ spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;

+ }

+ // Finish.

+ if(all) {

+ pSpanNotSet->freeze();

+ }

+// Copy constructor. Assumes which==ALL for a frozen set.

+UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,

+ const UVector &newParentSetStrings)

+ : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),

+ utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),

+ utf8Length(otherStringSpan.utf8Length),

+ maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),

+ all(TRUE) {

+ if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {

+ pSpanNotSet=&spanSet;

+ } else {

+ pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();

+ }

+ // Allocate a block of meta data.

+ // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.

+ int32_t stringsLength=strings.size();

+ int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;

+ if(allocSize<=(int32_t)sizeof(staticLengths)) {

+ utf8Lengths=staticLengths;

+ } else {

+ utf8Lengths=(int32_t *)uprv_malloc(allocSize);

+ if(utf8Lengths==NULL) {

+ maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.

+ return; // Out of memory.

+ }

+ spanLengths=(uint8_t *)(utf8Lengths+stringsLength);

+ utf8=spanLengths+stringsLength*4;

+ uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);

+UnicodeSetStringSpan::~UnicodeSetStringSpan() {

+ if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {

+ delete pSpanNotSet;

+ }

+ if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {

+ uprv_free(utf8Lengths);

+ }

+void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {

+ if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {

+ if(spanSet.contains(c)) {

+ return; // Nothing to do.

+ }

+ UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();

+ if(newSet==NULL) {

+ return; // Out of memory.

+ } else {

+ pSpanNotSet=newSet;

+ }

+ pSpanNotSet->add(c);

+// Compare strings without any argument checks. Requires length>0.

+static inline UBool

+matches16(const UChar *s, const UChar *t, int32_t length) {

+ do {

+ if(*s++!=*t++) {

+ return FALSE;

+ }

+ } while(--length>0);

+ return TRUE;

+static inline UBool

+matches8(const uint8_t *s, const uint8_t *t, int32_t length) {

+ do {

+ if(*s++!=*t++) {

+ return FALSE;

+ }

+ } while(--length>0);

+ return TRUE;

+// Compare 16-bit Unicode strings (which may be malformed UTF-16)

+// at code point boundaries.

+// That is, each edge of a match must not be in the middle of a surrogate pair.

+static inline UBool

+matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {

+ s+=start;

+ limit-=start;

+ return matches16(s, t, length) &&

+ !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&

+ !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));

+// Does the set contain the next code point?

+// If so, return its length; otherwise return its negative length.

+static inline int32_t

+spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {

+ UChar c=*s, c2;

+ if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {

+ return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;

+ }

+ return set.contains(c) ? 1 : -1;

+static inline int32_t

+spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {

+ UChar c=s[length-1], c2;

+ if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {

+ return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;

+ }

+ return set.contains(c) ? 1 : -1;

+static inline int32_t

+spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {

+ UChar32 c=*s;

+ if((int8_t)c>=0) {

+ return set.contains(c) ? 1 : -1;

+ }

+ // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().

+ int32_t i=0;

+ U8_NEXT_OR_FFFD(s, i, length, c);

+ return set.contains(c) ? i : -i;

+static inline int32_t

+spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {

+ UChar32 c=s[length-1];

+ if((int8_t)c>=0) {

+ return set.contains(c) ? 1 : -1;

+ }

+ int32_t i=length-1;

+ c=utf8_prevCharSafeBody(s, 0, &i, c, -3);

+ length-=i;

+ return set.contains(c) ? length : -length;

+/*

+ * Note: In span() when spanLength==0 (after a string match, or at the beginning

+ * after an empty code point span) and in spanNot() and spanNotUTF8(),

+ * string matching could use a binary search

+ * because all string matches are done from the same start index.

+ *

+ * For UTF-8, this would require a comparison function that returns UTF-16 order.

+ *

+ * This optimization should not be necessary for normal UnicodeSets because

+ * most sets have no strings, and most sets with strings have

+ * very few very short strings.

+ * For cases with many strings, it might be better to use a different API

+ * and implementation with a DFA (state machine).

+ */

+/*

+ * Algorithm for span(USET_SPAN_CONTAINED)

+ *

+ * Theoretical algorithm:

+ * - Iterate through the string, and at each code point boundary:

+ * + If the code point there is in the set, then remember to continue after it.

+ * + If a set string matches at the current position, then remember to continue after it.

+ * + Either recursively span for each code point or string match,

+ * or recursively span for all but the shortest one and

+ * iteratively continue the span with the shortest local match.

+ * + Remember the longest recursive span (the farthest end point).

+ * + If there is no match at the current position, neither for the code point there

+ * nor for any set string, then stop and return the longest recursive span length.

+ *

+ * Optimized implementation:

+ *

+ * (We assume that most sets will have very few very short strings.

+ * A span using a string-less set is extremely fast.)

+ *

+ * Create and cache a spanSet which contains all of the single code points

+ * of the original set but none of its strings.

+ *

+ * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).

+ * - Loop:

+ * + Try to match each set string at the end of the spanLength.

+ * ~ Set strings that start with set-contained code points must be matched

+ * with a partial overlap because the recursive algorithm would have tried

+ * to match them at every position.

+ * ~ Set strings that entirely consist of set-contained code points

+ * are irrelevant for span(USET_SPAN_CONTAINED) because the

+ * recursive algorithm would continue after them anyway

+ * and find the longest recursive match from their end.

+ * ~ Rather than recursing, note each end point of a set string match.

+ * + If no set string matched after spanSet.span(), then return

+ * with where the spanSet.span() ended.

+ * + If at least one set string matched after spanSet.span(), then

+ * pop the shortest string match end point and continue

+ * the loop, trying to match all set strings from there.

+ * + If at least one more set string matched after a previous string match,

+ * then test if the code point after the previous string match is also

+ * contained in the set.

+ * Continue the loop with the shortest end point of either this code point

+ * or a matching set string.

+ * + If no more set string matched after a previous string match,

+ * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).

+ * Stop if spanLength==0, otherwise continue the loop.

+ *

+ * By noting each end point of a set string match,

+ * the function visits each string position at most once and finishes

+ * in linear time.

+ *

+ * The recursive algorithm may visit the same string position many times

+ * if multiple paths lead to it and finishes in exponential time.

+ */

+/*

+ * Algorithm for span(USET_SPAN_SIMPLE)

+ *

+ * Theoretical algorithm:

+ * - Iterate through the string, and at each code point boundary:

+ * + If the code point there is in the set, then remember to continue after it.

+ * + If a set string matches at the current position, then remember to continue after it.

+ * + Continue from the farthest match position and ignore all others.

+ * + If there is no match at the current position,

+ * then stop and return the current position.

+ *

+ * Optimized implementation:

+ *

+ * (Same assumption and spanSet as above.)

+ *

+ * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).

+ * - Loop:

+ * + Try to match each set string at the end of the spanLength.

+ * ~ Set strings that start with set-contained code points must be matched

+ * with a partial overlap because the standard algorithm would have tried

+ * to match them earlier.

+ * ~ Set strings that entirely consist of set-contained code points

+ * must be matched with a full overlap because the longest-match algorithm

+ * would hide set string matches that end earlier.

+ * Such set strings need not be matched earlier inside the code point span

+ * because the standard algorithm would then have continued after

+ * the set string match anyway.

+ * ~ Remember the longest set string match (farthest end point) from the earliest

+ * starting point.

+ * + If no set string matched after spanSet.span(), then return

+ * with where the spanSet.span() ended.

+ * + If at least one set string matched, then continue the loop after the

+ * longest match from the earliest position.

+ * + If no more set string matched after a previous string match,

+ * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).

+ * Stop if spanLength==0, otherwise continue the loop.

+ */

+int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {

+ if(spanCondition==USET_SPAN_NOT_CONTAINED) {

+ return spanNot(s, length);

+ }

+ int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);

+ if(spanLength==length) {

+ return length;

+ }

+ // Consider strings; they may overlap with the span.

+ OffsetList offsets;

+ if(spanCondition==USET_SPAN_CONTAINED) {

+ // Use offset list to try all possibilities.

+ offsets.setMaxLength(maxLength16);

+ }

+ int32_t pos=spanLength, rest=length-pos;

+ int32_t i, stringsLength=strings.size();

+ for(;;) {

+ if(spanCondition==USET_SPAN_CONTAINED) {

+ for(i=0; i<stringsLength; ++i) {

+ int32_t overlap=spanLengths[i];

+ if(overlap==ALL_CP_CONTAINED) {

+ continue; // Irrelevant string.

+ }

+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

+ const UChar *s16=string.getBuffer();

+ int32_t length16=string.length();

+ // Try to match this string at pos-overlap..pos.

+ if(overlap>=LONG_SPAN) {

+ overlap=length16;

+ // While contained: No point matching fully inside the code point span.

+ U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.

+ }

+ if(overlap>spanLength) {

+ overlap=spanLength;

+ }

+ int32_t inc=length16-overlap; // Keep overlap+inc==length16.

+ for(;;) {

+ if(inc>rest) {

+ break;

+ }

+ // Try to match if the increment is not listed already.

+ if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {

+ if(inc==rest) {

+ return length; // Reached the end of the string.

+ }

+ offsets.addOffset(inc);

+ }

+ if(overlap==0) {

+ break;

+ }

+ --overlap;

+ ++inc;

+ }

+ } else /* USET_SPAN_SIMPLE */ {

+ int32_t maxInc=0, maxOverlap=0;

+ for(i=0; i<stringsLength; ++i) {

+ int32_t overlap=spanLengths[i];

+ // For longest match, we do need to try to match even an all-contained string

+ // to find the match from the earliest start.

+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

+ const UChar *s16=string.getBuffer();

+ int32_t length16=string.length();

+ // Try to match this string at pos-overlap..pos.

+ if(overlap>=LONG_SPAN) {

+ overlap=length16;

+ // Longest match: Need to match fully inside the code point span

+ // to find the match from the earliest start.

+ }

+ if(overlap>spanLength) {

+ overlap=spanLength;

+ }

+ int32_t inc=length16-overlap; // Keep overlap+inc==length16.

+ for(;;) {

+ if(inc>rest || overlap<maxOverlap) {

+ break;

+ }

+ // Try to match if the string is longer or starts earlier.

+ if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&

+ matches16CPB(s, pos-overlap, length, s16, length16)

+ ) {

+ maxInc=inc; // Longest match from earliest start.

+ maxOverlap=overlap;

+ break;

+ }

+ --overlap;

+ ++inc;

+ }

+ if(maxInc!=0 || maxOverlap!=0) {

+ // Longest-match algorithm, and there was a string match.

+ // Simply continue after it.

+ pos+=maxInc;

+ rest-=maxInc;

+ if(rest==0) {

+ return length; // Reached the end of the string.

+ }

+ spanLength=0; // Match strings from after a string match.

+ continue;

+ }

+ // Finished trying to match all strings at pos.

+ if(spanLength!=0 || pos==0) {

+ // The position is after an unlimited code point span (spanLength!=0),

+ // not after a string match.

+ // The only position where spanLength==0 after a span is pos==0.

+ // Otherwise, an unlimited code point span is only tried again when no

+ // strings match, and if such a non-initial span fails we stop.

+ if(offsets.isEmpty()) {

+ return pos; // No strings matched after a span.

+ }

+ // Match strings from after the next string match.

+ } else {

+ // The position is after a string match (or a single code point).

+ if(offsets.isEmpty()) {

+ // No more strings matched after a previous string match.

+ // Try another code point span from after the last string match.

+ spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);

+ if( spanLength==rest || // Reached the end of the string, or

+ spanLength==0 // neither strings nor span progressed.

+ ) {

+ return pos+spanLength;

+ }

+ pos+=spanLength;

+ rest-=spanLength;

+ continue; // spanLength>0: Match strings from after a span.

+ } else {

+ // Try to match only one code point from after a string match if some

+ // string matched beyond it, so that we try all possible positions

+ // and don't overshoot.

+ spanLength=spanOne(spanSet, s+pos, rest);

+ if(spanLength>0) {

+ if(spanLength==rest) {

+ return length; // Reached the end of the string.

+ }

+ // Match strings after this code point.

+ // There cannot be any increments below it because UnicodeSet strings

+ // contain multiple code points.

+ pos+=spanLength;

+ rest-=spanLength;

+ offsets.shift(spanLength);

+ spanLength=0;

+ continue; // Match strings from after a single code point.

+ }

+ // Match strings from after the next string match.

+ }

+ int32_t minOffset=offsets.popMinimum();

+ pos+=minOffset;

+ rest-=minOffset;

+ spanLength=0; // Match strings from after a string match.

+ }

+int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {

+ if(spanCondition==USET_SPAN_NOT_CONTAINED) {

+ return spanNotBack(s, length);

+ }

+ int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);

+ if(pos==0) {

+ return 0;

+ }

+ int32_t spanLength=length-pos;

+ // Consider strings; they may overlap with the span.

+ OffsetList offsets;

+ if(spanCondition==USET_SPAN_CONTAINED) {

+ // Use offset list to try all possibilities.

+ offsets.setMaxLength(maxLength16);

+ }

+ int32_t i, stringsLength=strings.size();

+ uint8_t *spanBackLengths=spanLengths;

+ if(all) {

+ spanBackLengths+=stringsLength;

+ }

+ for(;;) {

+ if(spanCondition==USET_SPAN_CONTAINED) {

+ for(i=0; i<stringsLength; ++i) {

+ int32_t overlap=spanBackLengths[i];

+ if(overlap==ALL_CP_CONTAINED) {

+ continue; // Irrelevant string.

+ }

+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

+ const UChar *s16=string.getBuffer();

+ int32_t length16=string.length();

+ // Try to match this string at pos-(length16-overlap)..pos-length16.

+ if(overlap>=LONG_SPAN) {

+ overlap=length16;

+ // While contained: No point matching fully inside the code point span.

+ int32_t len1=0;

+ U16_FWD_1(s16, len1, overlap);

+ overlap-=len1; // Length of the string minus the first code point.

+ }

+ if(overlap>spanLength) {

+ overlap=spanLength;

+ }

+ int32_t dec=length16-overlap; // Keep dec+overlap==length16.

+ for(;;) {

+ if(dec>pos) {

+ break;

+ }

+ // Try to match if the decrement is not listed already.

+ if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {

+ if(dec==pos) {

+ return 0; // Reached the start of the string.

+ }

+ offsets.addOffset(dec);

+ }

+ if(overlap==0) {

+ break;

+ }

+ --overlap;

+ ++dec;

+ }

+ } else /* USET_SPAN_SIMPLE */ {

+ int32_t maxDec=0, maxOverlap=0;

+ for(i=0; i<stringsLength; ++i) {

+ int32_t overlap=spanBackLengths[i];

+ // For longest match, we do need to try to match even an all-contained string

+ // to find the match from the latest end.

+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

+ const UChar *s16=string.getBuffer();

+ int32_t length16=string.length();

+ // Try to match this string at pos-(length16-overlap)..pos-length16.

+ if(overlap>=LONG_SPAN) {

+ overlap=length16;

+ // Longest match: Need to match fully inside the code point span

+ // to find the match from the latest end.

+ }

+ if(overlap>spanLength) {

+ overlap=spanLength;

+ }

+ int32_t dec=length16-overlap; // Keep dec+overlap==length16.

+ for(;;) {

+ if(dec>pos || overlap<maxOverlap) {

+ break;

+ }

+ // Try to match if the string is longer or ends later.

+ if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&

+ matches16CPB(s, pos-dec, length, s16, length16)

+ ) {

+ maxDec=dec; // Longest match from latest end.

+ maxOverlap=overlap;

+ break;

+ }

+ --overlap;

+ ++dec;

+ }

+ if(maxDec!=0 || maxOverlap!=0) {

+ // Longest-match algorithm, and there was a string match.

+ // Simply continue before it.

+ pos-=maxDec;

+ if(pos==0) {

+ return 0; // Reached the start of the string.

+ }

+ spanLength=0; // Match strings from before a string match.

+ continue;

+ }

+ // Finished trying to match all strings at pos.

+ if(spanLength!=0 || pos==length) {

+ // The position is before an unlimited code point span (spanLength!=0),

+ // not before a string match.

+ // The only position where spanLength==0 before a span is pos==length.

+ // Otherwise, an unlimited code point span is only tried again when no

+ // strings match, and if such a non-initial span fails we stop.

+ if(offsets.isEmpty()) {

+ return pos; // No strings matched before a span.

+ }

+ // Match strings from before the next string match.

+ } else {

+ // The position is before a string match (or a single code point).

+ if(offsets.isEmpty()) {

+ // No more strings matched before a previous string match.

+ // Try another code point span from before the last string match.

+ int32_t oldPos=pos;

+ pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);

+ spanLength=oldPos-pos;

+ if( pos==0 || // Reached the start of the string, or

+ spanLength==0 // neither strings nor span progressed.

+ ) {

+ return pos;

+ }

+ continue; // spanLength>0: Match strings from before a span.

+ } else {

+ // Try to match only one code point from before a string match if some

+ // string matched beyond it, so that we try all possible positions

+ // and don't overshoot.

+ spanLength=spanOneBack(spanSet, s, pos);

+ if(spanLength>0) {

+ if(spanLength==pos) {

+ return 0; // Reached the start of the string.

+ }

+ // Match strings before this code point.

+ // There cannot be any decrements below it because UnicodeSet strings

+ // contain multiple code points.

+ pos-=spanLength;

+ offsets.shift(spanLength);

+ spanLength=0;

+ continue; // Match strings from before a single code point.

+ }

+ // Match strings from before the next string match.

+ }

+ pos-=offsets.popMinimum();

+ spanLength=0; // Match strings from before a string match.

+ }

+int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {

+ if(spanCondition==USET_SPAN_NOT_CONTAINED) {

+ return spanNotUTF8(s, length);

+ }

+ int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);

+ if(spanLength==length) {

+ return length;

+ }

+ // Consider strings; they may overlap with the span.

+ OffsetList offsets;

+ if(spanCondition==USET_SPAN_CONTAINED) {

+ // Use offset list to try all possibilities.

+ offsets.setMaxLength(maxLength8);

+ }

+ int32_t pos=spanLength, rest=length-pos;

+ int32_t i, stringsLength=strings.size();

+ uint8_t *spanUTF8Lengths=spanLengths;

+ if(all) {

+ spanUTF8Lengths+=2*stringsLength;

+ }

+ for(;;) {

+ const uint8_t *s8=utf8;

+ int32_t length8;

+ if(spanCondition==USET_SPAN_CONTAINED) {

+ for(i=0; i<stringsLength; ++i) {

+ length8=utf8Lengths[i];

+ if(length8==0) {

+ continue; // String not representable in UTF-8.

+ }

+ int32_t overlap=spanUTF8Lengths[i];

+ if(overlap==ALL_CP_CONTAINED) {

+ s8+=length8;

+ continue; // Irrelevant string.

+ }

+ // Try to match this string at pos-overlap..pos.

+ if(overlap>=LONG_SPAN) {

+ overlap=length8;

+ // While contained: No point matching fully inside the code point span.

+ U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.

+ }

+ if(overlap>spanLength) {

+ overlap=spanLength;

+ }

+ int32_t inc=length8-overlap; // Keep overlap+inc==length8.

+ for(;;) {

+ if(inc>rest) {

+ break;

+ }

+ // Try to match if the increment is not listed already.

+ // Match at code point boundaries. (The UTF-8 strings were converted

+ // from UTF-16 and are guaranteed to be well-formed.)

+ if( !U8_IS_TRAIL(s[pos-overlap]) &&

+ !offsets.containsOffset(inc) &&

+ matches8(s+pos-overlap, s8, length8)

+ ) {

+ if(inc==rest) {

+ return length; // Reached the end of the string.

+ }

+ offsets.addOffset(inc);

+ }

+ if(overlap==0) {

+ break;

+ }

+ --overlap;

+ ++inc;

+ }

+ s8+=length8;

+ }

+ } else /* USET_SPAN_SIMPLE */ {

+ int32_t maxInc=0, maxOverlap=0;

+ for(i=0; i<stringsLength; ++i) {

+ length8=utf8Lengths[i];

+ if(length8==0) {

+ continue; // String not representable in UTF-8.

+ }

+ int32_t overlap=spanUTF8Lengths[i];

+ // For longest match, we do need to try to match even an all-contained string

+ // to find the match from the earliest start.

+ // Try to match this string at pos-overlap..pos.

+ if(overlap>=LONG_SPAN) {

+ overlap=length8;

+ // Longest match: Need to match fully inside the code point span

+ // to find the match from the earliest start.

+ }

+ if(overlap>spanLength) {

+ overlap=spanLength;

+ }

+ int32_t inc=length8-overlap; // Keep overlap+inc==length8.

+ for(;;) {

+ if(inc>rest || overlap<maxOverlap) {

+ break;

+ }

+ // Try to match if the string is longer or starts earlier.

+ // Match at code point boundaries. (The UTF-8 strings were converted

+ // from UTF-16 and are guaranteed to be well-formed.)

+ if( !U8_IS_TRAIL(s[pos-overlap]) &&

+ (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&

+ matches8(s+pos-overlap, s8, length8)

+ ) {

+ maxInc=inc; // Longest match from earliest start.

+ maxOverlap=overlap;

+ break;

+ }

+ --overlap;

+ ++inc;

+ }

+ s8+=length8;

+ }

+ if(maxInc!=0 || maxOverlap!=0) {

+ // Longest-match algorithm, and there was a string match.

+ // Simply continue after it.

+ pos+=maxInc;

+ rest-=maxInc;

+ if(rest==0) {

+ return length; // Reached the end of the string.

+ }

+ spanLength=0; // Match strings from after a string match.

+ continue;

+ }

+ // Finished trying to match all strings at pos.

+ if(spanLength!=0 || pos==0) {

+ // The position is after an unlimited code point span (spanLength!=0),

+ // not after a string match.

+ // The only position where spanLength==0 after a span is pos==0.

+ // Otherwise, an unlimited code point span is only tried again when no

+ // strings match, and if such a non-initial span fails we stop.

+ if(offsets.isEmpty()) {

+ return pos; // No strings matched after a span.

+ }

+ // Match strings from after the next string match.

+ } else {

+ // The position is after a string match (or a single code point).

+ if(offsets.isEmpty()) {

+ // No more strings matched after a previous string match.

+ // Try another code point span from after the last string match.

+ spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);

+ if( spanLength==rest || // Reached the end of the string, or

+ spanLength==0 // neither strings nor span progressed.

+ ) {

+ return pos+spanLength;

+ }

+ pos+=spanLength;

+ rest-=spanLength;

+ continue; // spanLength>0: Match strings from after a span.

+ } else {

+ // Try to match only one code point from after a string match if some

+ // string matched beyond it, so that we try all possible positions

+ // and don't overshoot.

+ spanLength=spanOneUTF8(spanSet, s+pos, rest);

+ if(spanLength>0) {

+ if(spanLength==rest) {

+ return length; // Reached the end of the string.

+ }

+ // Match strings after this code point.

+ // There cannot be any increments below it because UnicodeSet strings

+ // contain multiple code points.

+ pos+=spanLength;

+ rest-=spanLength;

+ offsets.shift(spanLength);

+ spanLength=0;

+ continue; // Match strings from after a single code point.

+ }

+ // Match strings from after the next string match.

+ }

+ int32_t minOffset=offsets.popMinimum();

+ pos+=minOffset;

+ rest-=minOffset;

+ spanLength=0; // Match strings from after a string match.

+ }

+int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {

+ if(spanCondition==USET_SPAN_NOT_CONTAINED) {

+ return spanNotBackUTF8(s, length);

+ }

+ int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);

+ if(pos==0) {

+ return 0;

+ }

+ int32_t spanLength=length-pos;

+ // Consider strings; they may overlap with the span.

+ OffsetList offsets;

+ if(spanCondition==USET_SPAN_CONTAINED) {

+ // Use offset list to try all possibilities.

+ offsets.setMaxLength(maxLength8);

+ }

+ int32_t i, stringsLength=strings.size();

+ uint8_t *spanBackUTF8Lengths=spanLengths;

+ if(all) {

+ spanBackUTF8Lengths+=3*stringsLength;

+ }

+ for(;;) {

+ const uint8_t *s8=utf8;

+ int32_t length8;

+ if(spanCondition==USET_SPAN_CONTAINED) {

+ for(i=0; i<stringsLength; ++i) {

+ length8=utf8Lengths[i];

+ if(length8==0) {

+ continue; // String not representable in UTF-8.

+ }

+ int32_t overlap=spanBackUTF8Lengths[i];

+ if(overlap==ALL_CP_CONTAINED) {

+ s8+=length8;

+ continue; // Irrelevant string.

+ }

+ // Try to match this string at pos-(length8-overlap)..pos-length8.

+ if(overlap>=LONG_SPAN) {

+ overlap=length8;

+ // While contained: No point matching fully inside the code point span.

+ int32_t len1=0;

+ U8_FWD_1(s8, len1, overlap);

+ overlap-=len1; // Length of the string minus the first code point.

+ }

+ if(overlap>spanLength) {

+ overlap=spanLength;

+ }

+ int32_t dec=length8-overlap; // Keep dec+overlap==length8.

+ for(;;) {

+ if(dec>pos) {

+ break;

+ }

+ // Try to match if the decrement is not listed already.

+ // Match at code point boundaries. (The UTF-8 strings were converted

+ // from UTF-16 and are guaranteed to be well-formed.)

+ if( !U8_IS_TRAIL(s[pos-dec]) &&

+ !offsets.containsOffset(dec) &&

+ matches8(s+pos-dec, s8, length8)

+ ) {

+ if(dec==pos) {

+ return 0; // Reached the start of the string.

+ }

+ offsets.addOffset(dec);

+ }

+ if(overlap==0) {

+ break;

+ }

+ --overlap;

+ ++dec;

+ }

+ s8+=length8;

+ }

+ } else /* USET_SPAN_SIMPLE */ {

+ int32_t maxDec=0, maxOverlap=0;

+ for(i=0; i<stringsLength; ++i) {

+ length8=utf8Lengths[i];

+ if(length8==0) {

+ continue; // String not representable in UTF-8.

+ }

+ int32_t overlap=spanBackUTF8Lengths[i];

+ // For longest match, we do need to try to match even an all-contained string

+ // to find the match from the latest end.

+ // Try to match this string at pos-(length8-overlap)..pos-length8.

+ if(overlap>=LONG_SPAN) {

+ overlap=length8;

+ // Longest match: Need to match fully inside the code point span

+ // to find the match from the latest end.

+ }

+ if(overlap>spanLength) {

+ overlap=spanLength;

+ }

+ int32_t dec=length8-overlap; // Keep dec+overlap==length8.

+ for(;;) {

+ if(dec>pos || overlap<maxOverlap) {

+ break;

+ }

+ // Try to match if the string is longer or ends later.

+ // Match at code point boundaries. (The UTF-8 strings were converted

+ // from UTF-16 and are guaranteed to be well-formed.)

+ if( !U8_IS_TRAIL(s[pos-dec]) &&

+ (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&

+ matches8(s+pos-dec, s8, length8)

+ ) {

+ maxDec=dec; // Longest match from latest end.

+ maxOverlap=overlap;

+ break;

+ }

+ --overlap;

+ ++dec;

+ }

+ s8+=length8;

+ }

+ if(maxDec!=0 || maxOverlap!=0) {

+ // Longest-match algorithm, and there was a string match.

+ // Simply continue before it.

+ pos-=maxDec;

+ if(pos==0) {

+ return 0; // Reached the start of the string.

+ }

+ spanLength=0; // Match strings from before a string match.

+ continue;

+ }

+ // Finished trying to match all strings at pos.

+ if(spanLength!=0 || pos==length) {

+ // The position is before an unlimited code point span (spanLength!=0),

+ // not before a string match.

+ // The only position where spanLength==0 before a span is pos==length.

+ // Otherwise, an unlimited code point span is only tried again when no

+ // strings match, and if such a non-initial span fails we stop.

+ if(offsets.isEmpty()) {

+ return pos; // No strings matched before a span.

+ }

+ // Match strings from before the next string match.

+ } else {

+ // The position is before a string match (or a single code point).

+ if(offsets.isEmpty()) {

+ // No more strings matched before a previous string match.

+ // Try another code point span from before the last string match.

+ int32_t oldPos=pos;

+ pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);

+ spanLength=oldPos-pos;

+ if( pos==0 || // Reached the start of the string, or

+ spanLength==0 // neither strings nor span progressed.

+ ) {

+ return pos;

+ }

+ continue; // spanLength>0: Match strings from before a span.

+ } else {

+ // Try to match only one code point from before a string match if some

+ // string matched beyond it, so that we try all possible positions

+ // and don't overshoot.

+ spanLength=spanOneBackUTF8(spanSet, s, pos);

+ if(spanLength>0) {

+ if(spanLength==pos) {

+ return 0; // Reached the start of the string.

+ }

+ // Match strings before this code point.

+ // There cannot be any decrements below it because UnicodeSet strings

+ // contain multiple code points.

+ pos-=spanLength;

+ offsets.shift(spanLength);

+ spanLength=0;

+ continue; // Match strings from before a single code point.

+ }

+ // Match strings from before the next string match.

+ }

+ pos-=offsets.popMinimum();

+ spanLength=0; // Match strings from before a string match.

+ }

+/*

+ * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)

+ *

+ * Theoretical algorithm:

+ * - Iterate through the string, and at each code point boundary:

+ * + If the code point there is in the set, then return with the current position.

+ * + If a set string matches at the current position, then return with the current position.

+ *

+ * Optimized implementation:

+ *

+ * (Same assumption as for span() above.)

+ *

+ * Create and cache a spanNotSet which contains all of the single code points

+ * of the original set but none of its strings.

+ * For each set string add its initial code point to the spanNotSet.

+ * (Also add its final code point for spanNotBack().)

+ *

+ * - Loop:

+ * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).

+ * + If the current code point is in the original set, then

+ * return the current position.

+ * + If any set string matches at the current position, then

+ * return the current position.

+ * + If there is no match at the current position, neither for the code point there

+ * nor for any set string, then skip this code point and continue the loop.

+ * This happens for set-string-initial code points that were added to spanNotSet

+ * when there is not actually a match for such a set string.

+ */

+int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {

+ int32_t pos=0, rest=length;

+ int32_t i, stringsLength=strings.size();

+ do {

+ // Span until we find a code point from the set,

+ // or a code point that starts or ends some string.

+ i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);

+ if(i==rest) {

+ return length; // Reached the end of the string.

+ }

+ pos+=i;

+ rest-=i;

+ // Check whether the current code point is in the original set,

+ // without the string starts and ends.

+ int32_t cpLength=spanOne(spanSet, s+pos, rest);

+ if(cpLength>0) {

+ return pos; // There is a set element at pos.

+ }

+ // Try to match the strings at pos.

+ for(i=0; i<stringsLength; ++i) {

+ if(spanLengths[i]==ALL_CP_CONTAINED) {

+ continue; // Irrelevant string.

+ }

+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

+ const UChar *s16=string.getBuffer();

+ int32_t length16=string.length();

+ if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {

+ return pos; // There is a set element at pos.

+ }

+ // The span(while not contained) ended on a string start/end which is

+ // not in the original set. Skip this code point and continue.

+ // cpLength<0

+ pos-=cpLength;

+ rest+=cpLength;

+ } while(rest!=0);

+ return length; // Reached the end of the string.

+int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {

+ int32_t pos=length;

+ int32_t i, stringsLength=strings.size();

+ do {

+ // Span until we find a code point from the set,

+ // or a code point that starts or ends some string.

+ pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);

+ if(pos==0) {

+ return 0; // Reached the start of the string.

+ }

+ // Check whether the current code point is in the original set,

+ // without the string starts and ends.

+ int32_t cpLength=spanOneBack(spanSet, s, pos);

+ if(cpLength>0) {

+ return pos; // There is a set element at pos.

+ }

+ // Try to match the strings at pos.

+ for(i=0; i<stringsLength; ++i) {

+ // Use spanLengths rather than a spanBackLengths pointer because

+ // it is easier and we only need to know whether the string is irrelevant

+ // which is the same in either array.

+ if(spanLengths[i]==ALL_CP_CONTAINED) {

+ continue; // Irrelevant string.

+ }

+ const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

+ const UChar *s16=string.getBuffer();

+ int32_t length16=string.length();

+ if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {

+ return pos; // There is a set element at pos.

+ }

+ // The span(while not contained) ended on a string start/end which is

+ // not in the original set. Skip this code point and continue.

+ // cpLength<0

+ pos+=cpLength;

+ } while(pos!=0);

+ return 0; // Reached the start of the string.

+int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {

+ int32_t pos=0, rest=length;

+ int32_t i, stringsLength=strings.size();

+ uint8_t *spanUTF8Lengths=spanLengths;

+ if(all) {

+ spanUTF8Lengths+=2*stringsLength;

+ }

+ do {

+ // Span until we find a code point from the set,

+ // or a code point that starts or ends some string.

+ i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);

+ if(i==rest) {

+ return length; // Reached the end of the string.

+ }

+ pos+=i;

+ rest-=i;

+ // Check whether the current code point is in the original set,

+ // without the string starts and ends.

+ int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);

+ if(cpLength>0) {

+ return pos; // There is a set element at pos.

+ }

+ // Try to match the strings at pos.

+ const uint8_t *s8=utf8;

+ int32_t length8;

+ for(i=0; i<stringsLength; ++i) {

+ length8=utf8Lengths[i];

+ // ALL_CP_CONTAINED: Irrelevant string.

+ if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {

+ return pos; // There is a set element at pos.

+ }

+ s8+=length8;

+ }

+ // The span(while not contained) ended on a string start/end which is

+ // not in the original set. Skip this code point and continue.

+ // cpLength<0

+ pos-=cpLength;

+ rest+=cpLength;

+ } while(rest!=0);

+ return length; // Reached the end of the string.

+int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {

+ int32_t pos=length;

+ int32_t i, stringsLength=strings.size();

+ uint8_t *spanBackUTF8Lengths=spanLengths;

+ if(all) {

+ spanBackUTF8Lengths+=3*stringsLength;

+ }

+ do {

+ // Span until we find a code point from the set,

+ // or a code point that starts or ends some string.

+ pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);

+ if(pos==0) {

+ return 0; // Reached the start of the string.

+ }

+ // Check whether the current code point is in the original set,

+ // without the string starts and ends.

+ int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);

+ if(cpLength>0) {

+ return pos; // There is a set element at pos.

+ }

+ // Try to match the strings at pos.

+ const uint8_t *s8=utf8;

+ int32_t length8;

+ for(i=0; i<stringsLength; ++i) {

+ length8=utf8Lengths[i];

+ // ALL_CP_CONTAINED: Irrelevant string.

+ if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {

+ return pos; // There is a set element at pos.

+ }

+ s8+=length8;

+ }

+ // The span(while not contained) ended on a string start/end which is

+ // not in the original set. Skip this code point and continue.

+ // cpLength<0

+ pos+=cpLength;

+ } while(pos!=0);

+ return 0; // Reached the start of the string.

+U_NAMESPACE_END

Property changes on: icu51/source/common/unisetspan.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu51/source/common/unisetspan.h ('k') | icu51/source/common/unistr.cpp » ('j') | no next file with comments »