Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 qsufsort.h -- Suffix array generation. | 2 qsufsort.h -- Suffix array generation. |
| 3 | 3 |
| 4 Copyright 2003 Colin Percival | 4 Copyright 2003 Colin Percival |
| 5 | 5 |
| 6 For the terms under which this work may be distributed, please see | 6 For the terms under which this work may be distributed, please see |
| 7 the adjoining file "LICENSE". | 7 the adjoining file "LICENSE". |
| 8 | 8 |
| 9 ChangeLog: | 9 ChangeLog: |
| 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit | 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit |
| 11 values throughout. | 11 values throughout. |
| 12 --Benjamin Smedberg <benjamin@smedbergs.us> | 12 --Benjamin Smedberg <benjamin@smedbergs.us> |
| 13 2010-05-26 - Use a paged array for V and I. The address space may be too | 13 2010-05-26 - Use a paged array for V and I. The address space may be too |
| 14 fragmented for these big arrays to be contiguous. | 14 fragmented for these big arrays to be contiguous. |
| 15 --Stephen Adams <sra@chromium.org> | 15 --Stephen Adams <sra@chromium.org> |
| 16 2015-08-03 - Extract QSufSort to a separate file as template. | 16 2015-08-03 - Extract QSufSort to a separate file as template. |
| 17 --Samuel Huang <huangs@chromium.org> | 17 --Samuel Huang <huangs@chromium.org> |
| 18 2015-08-19 - Optimize split() and search(), add comments. | 18 2015-08-19 - Optimize split() and search(), add comments. |
| 19 --Samuel Huang <huangs@chromium.org> | 19 --Samuel Huang <huangs@chromium.org> |
| 20 2016-04-27 - Change split() to use Bentley & McIlroy's pivot selection | 20 2016-04-27 - Change split() to use Bentley & McIlroy's pivot selection |
| 21 algorithm, which QSufSort originally used. Reference: | 21 algorithm, which QSufSort originally used. Reference: |
| 22 http://www.larsson.dogma.net/qsufsort.c | 22 http://www.larsson.dogma.net/qsufsort.c |
| 23 --Samuel Huang <huangs@chromium.org> | 23 --Samuel Huang <huangs@chromium.org> |
| 24 */ | 24 */ |
| 25 | 25 |
|
huangs
2016/05/13 03:56:47
Please add
#ifdef ???
#define ???
...
#endif //
altimin
2016/05/13 13:16:57
Done.
| |
| 26 #include <algorithm> | 26 #include <algorithm> |
| 27 #include <cstring> | 27 #include <cstring> |
| 28 | 28 |
| 29 namespace courgette { | 29 namespace courgette { |
| 30 namespace qsuf { | 30 namespace qsuf { |
| 31 | 31 |
| 32 // ------------------------------------------------------------------------ | 32 // ------------------------------------------------------------------------ |
| 33 // | 33 // |
| 34 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the | 34 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the |
| 35 // code formatting and variable names. The changes from the original are: | 35 // code formatting and variable names. The changes from the original are: |
| 36 // (1) replacing tabs with spaces, | 36 // (1) replacing tabs with spaces, |
| 37 // (2) indentation, | 37 // (2) indentation and spacing, |
| 38 // (3) using 'const', | 38 // (3) using 'const', |
| 39 // (4) changing the V and I parameters from int* to template <typename T>. | 39 // (4) changing the V and I parameters from int* to template <typename T>. |
| 40 // (5) optimizing split() and search(); fix styles. | 40 // (5) optimizing split() and search(); fix styles. |
| 41 // | 41 // |
| 42 // The code appears to be a rewritten version of the suffix array algorithm | 42 // The code appears to be a rewritten version of the suffix array algorithm |
| 43 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko | 43 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko |
| 44 // Sadakane, special cased for bytes. | 44 // Sadakane, special cased for bytes. |
| 45 | 45 |
| 46 namespace { | 46 namespace { |
| 47 | 47 |
| 48 template <typename T> T median3(const T& a, const T& b, const T& c) { | 48 template <typename T> |
| 49 T median3(const T& a, const T& b, const T& c) { | |
| 49 if (a < b) | 50 if (a < b) |
| 50 return b < c ? b : (a < c ? c : a); | 51 return b < c ? b : (a < c ? c : a); |
| 51 return b > c ? b : (a > c ? c : a); | 52 return b > c ? b : (a > c ? c : a); |
| 52 } | 53 } |
| 53 | 54 |
| 54 } // namespace | 55 } // namespace |
| 55 | 56 |
| 56 template <typename T> void split(T I, T V, int start, int end, int h) { | 57 template <typename T> |
| 58 void split(T I, T V, int start, int end, int h) { | |
| 57 // For small interval, apply selection sort. | 59 // For small interval, apply selection sort. |
| 58 if (end - start < 16) { | 60 if (end - start < 16) { |
| 59 for (int i = start; i < end; ) { | 61 for (int i = start; i < end;) { |
| 60 int skip = 1; | 62 int skip = 1; |
| 61 int best = V[I[i] + h]; | 63 int best = V[I[i] + h]; |
| 62 for (int j = i + 1; j < end; j++) { | 64 for (int j = i + 1; j < end; j++) { |
| 63 int cur = V[I[j] + h]; | 65 int cur = V[I[j] + h]; |
| 64 if (best > cur) { | 66 if (best > cur) { |
| 65 best = cur; | 67 best = cur; |
| 66 int tmp = I[i]; | 68 int tmp = I[i]; |
| 67 I[i] = I[j]; | 69 I[i] = I[j]; |
| 68 I[j] = tmp; | 70 I[j] = tmp; |
| 69 skip = 1; | 71 skip = 1; |
| (...skipping 29 matching lines...) Expand all Loading... | |
| 99 p2 = median3(p2, V[I[end - 1 - s] + h], V[I[end - 1 - s - s] + h]); | 101 p2 = median3(p2, V[I[end - 1 - s] + h], V[I[end - 1 - s - s] + h]); |
| 100 } // Else medium array: Pseudomedian of 3. | 102 } // Else medium array: Pseudomedian of 3. |
| 101 pivot = median3(pivot, p1, p2); | 103 pivot = median3(pivot, p1, p2); |
| 102 | 104 |
| 103 // Split [start, end) into 3 intervals: | 105 // Split [start, end) into 3 intervals: |
| 104 // [start, j) with secondary keys < pivot, | 106 // [start, j) with secondary keys < pivot, |
| 105 // [j, k) with secondary keys == pivot, | 107 // [j, k) with secondary keys == pivot, |
| 106 // [k, end) with secondary keys > pivot. | 108 // [k, end) with secondary keys > pivot. |
| 107 int j = start; | 109 int j = start; |
| 108 int k = end; | 110 int k = end; |
| 109 for (int i = start; i < k; ) { | 111 for (int i = start; i < k;) { |
| 110 int cur = V[I[i] + h]; | 112 int cur = V[I[i] + h]; |
| 111 if (cur < pivot) { | 113 if (cur < pivot) { |
| 112 if (i != j) { | 114 if (i != j) { |
| 113 int tmp = I[i]; | 115 int tmp = I[i]; |
| 114 I[i] = I[j]; | 116 I[i] = I[j]; |
| 115 I[j] = tmp; | 117 I[j] = tmp; |
| 116 } | 118 } |
| 117 ++i; | 119 ++i; |
| 118 ++j; | 120 ++j; |
| 119 } else if (cur > pivot) { | 121 } else if (cur > pivot) { |
| (...skipping 18 matching lines...) Expand all Loading... | |
| 138 for (int i = j; i < k; ++i) | 140 for (int i = j; i < k; ++i) |
| 139 V[I[i]] = k - 1; | 141 V[I[i]] = k - 1; |
| 140 } | 142 } |
| 141 | 143 |
| 142 // Recurse on the "> pivot" piece. | 144 // Recurse on the "> pivot" piece. |
| 143 if (k < end) | 145 if (k < end) |
| 144 split<T>(I, V, k, end, h); | 146 split<T>(I, V, k, end, h); |
| 145 } | 147 } |
| 146 | 148 |
| 147 template <class T> | 149 template <class T> |
| 148 static void | 150 static void qsufsort(T I, T V, const unsigned char* old, int oldsize) { |
| 149 qsufsort(T I, T V,const unsigned char *old,int oldsize) | |
| 150 { | |
| 151 int buckets[256]; | 151 int buckets[256]; |
| 152 int i,h,len; | 152 int i, h, len; |
| 153 | 153 |
| 154 for(i=0;i<256;i++) buckets[i]=0; | 154 for (i = 0; i < 256; i++) |
| 155 for(i=0;i<oldsize;i++) buckets[old[i]]++; | 155 buckets[i] = 0; |
| 156 for(i=1;i<256;i++) buckets[i]+=buckets[i-1]; | 156 for (i = 0; i < oldsize; i++) |
| 157 for(i=255;i>0;i--) buckets[i]=buckets[i-1]; | 157 buckets[old[i]]++; |
| 158 buckets[0]=0; | 158 for (i = 1; i < 256; i++) |
| 159 buckets[i] += buckets[i - 1]; | |
| 160 for (i = 255; i > 0; i--) | |
| 161 buckets[i] = buckets[i - 1]; | |
| 162 buckets[0] = 0; | |
| 159 | 163 |
| 160 for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i; | 164 for (i = 0; i < oldsize; i++) |
| 161 I[0]=oldsize; | 165 I[++buckets[old[i]]] = i; |
| 162 for(i=0;i<oldsize;i++) V[i]=buckets[old[i]]; | 166 I[0] = oldsize; |
| 163 V[oldsize]=0; | 167 for (i = 0; i < oldsize; i++) |
| 164 for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1; | 168 V[i] = buckets[old[i]]; |
| 165 I[0]=-1; | 169 V[oldsize] = 0; |
| 170 for (i = 1; i < 256; i++) | |
| 171 if (buckets[i] == buckets[i - 1] + 1) | |
| 172 I[buckets[i]] = -1; | |
| 173 I[0] = -1; | |
| 166 | 174 |
| 167 for(h=1;I[0]!=-(oldsize+1);h+=h) { | 175 for (h = 1; I[0] != -(oldsize + 1); h += h) { |
| 168 len=0; | 176 len = 0; |
| 169 for(i=0;i<oldsize+1;) { | 177 for (i = 0; i < oldsize + 1;) { |
| 170 if(I[i]<0) { | 178 if (I[i] < 0) { |
| 171 len-=I[i]; | 179 len -= I[i]; |
| 172 i-=I[i]; | 180 i -= I[i]; |
| 173 } else { | 181 } else { |
| 174 if(len) I[i-len]=-len; | 182 if (len) |
| 175 len=V[I[i]]+1-i; | 183 I[i - len] = -len; |
| 176 split<T>(I,V,i,i+len,h); | 184 len = V[I[i]] + 1 - i; |
| 177 i+=len; | 185 split<T>(I, V, i, i + len, h); |
| 178 len=0; | 186 i += len; |
| 187 len = 0; | |
| 179 }; | 188 }; |
| 180 }; | 189 }; |
| 181 if(len) I[i-len]=-len; | 190 if (len) |
| 191 I[i - len] = -len; | |
| 182 }; | 192 }; |
| 183 | 193 |
| 184 for(i=0;i<oldsize+1;i++) I[V[i]]=i; | 194 for (i = 0; i < oldsize + 1; i++) |
| 195 I[V[i]] = i; | |
| 185 } | 196 } |
| 186 | 197 |
| 187 static int | 198 static int matchlen(const unsigned char* old, |
| 188 matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int ne wsize) | 199 int oldsize, |
| 189 { | 200 const unsigned char* newbuf, |
| 201 int newsize) { | |
| 190 int i; | 202 int i; |
| 191 | 203 |
| 192 for(i=0;(i<oldsize)&&(i<newsize);i++) | 204 for (i = 0; (i < oldsize) && (i < newsize); i++) |
| 193 if(old[i]!=newbuf[i]) break; | 205 if (old[i] != newbuf[i]) |
| 206 break; | |
| 194 | 207 |
| 195 return i; | 208 return i; |
| 196 } | 209 } |
| 197 | 210 |
| 198 template <class T> | 211 template <class T> |
| 199 static int search(T I, const unsigned char *old, int oldsize, | 212 static int search(T I, |
| 200 const unsigned char *newbuf, int newsize, int *pos) { | 213 const unsigned char* old, |
| 214 int oldsize, | |
| 215 const unsigned char* newbuf, | |
| 216 int newsize, | |
| 217 int* pos) { | |
| 201 int lo = 0; | 218 int lo = 0; |
| 202 int hi = oldsize; | 219 int hi = oldsize; |
| 203 while (hi - lo >= 2) { | 220 while (hi - lo >= 2) { |
| 204 int mid = (lo + hi) / 2; | 221 int mid = (lo + hi) / 2; |
| 205 if(memcmp(old+I[mid],newbuf,std::min(oldsize-I[mid],newsize))<0) { | 222 if (memcmp(old + I[mid], newbuf, std::min(oldsize - I[mid], newsize)) < 0) { |
| 206 lo = mid; | 223 lo = mid; |
| 207 } else { | 224 } else { |
| 208 hi = mid; | 225 hi = mid; |
| 209 } | 226 } |
| 210 } | 227 } |
| 211 | 228 |
| 212 int x = matchlen(old + I[lo], oldsize - I[lo], newbuf, newsize); | 229 int x = matchlen(old + I[lo], oldsize - I[lo], newbuf, newsize); |
| 213 int y = matchlen(old + I[hi], oldsize - I[hi], newbuf, newsize); | 230 int y = matchlen(old + I[hi], oldsize - I[hi], newbuf, newsize); |
| 214 if(x > y) { | 231 if (x > y) { |
| 215 *pos = I[lo]; | 232 *pos = I[lo]; |
| 216 return x; | 233 return x; |
| 217 } | 234 } |
| 218 *pos = I[hi]; | 235 *pos = I[hi]; |
| 219 return y; | 236 return y; |
| 220 } | 237 } |
| 221 | 238 |
| 222 // End of 'verbatim' code. | 239 // End of 'verbatim' code. |
| 223 // ------------------------------------------------------------------------ | 240 // ------------------------------------------------------------------------ |
| 224 | 241 |
| 225 } // namespace qsuf | 242 } // namespace qsuf |
| 226 } // namespace courgette | 243 } // namespace courgette |
| OLD | NEW |