Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2003, 2004 Colin Percival | |
| 2 // All rights reserved | |
| 3 // | |
| 4 // Redistribution and use in source and binary forms, with or without | |
| 5 // modification, are permitted providing that the following conditions | |
| 6 // are met: | |
| 7 // 1. Redistributions of source code must retain the above copyright | |
| 8 // notice, this list of conditions and the following disclaimer. | |
| 9 // 2. Redistributions in binary form must reproduce the above copyright | |
| 10 // notice, this list of conditions and the following disclaimer in the | |
| 11 // documentation and/or other materials provided with the distribution. | |
| 12 // | |
| 13 // For the terms under which this work may be distributed, please see | |
| 14 // the adjoining file "LICENSE". | |
| 15 // | |
| 16 // bsdiff_search.h -- Suffix array search. | |
| 17 // | |
| 18 // ChangeLog: | |
| 19 // 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit | |
| 20 // values throughout. | |
| 21 // --Benjamin Smedberg <benjamin@smedbergs.us> | |
| 22 // 2015-08-03 - Change search() to template to allow PagedArray usage. | |
| 23 // --Samuel Huang <huangs@chromium.org> | |
| 24 // 2015-08-19 - Optimized search(). | |
| 25 // --Samuel Huang <huangs@chromium.org> | |
| 26 // 2016-06-02 - Move matchlen() and search() to a new file; comment; format. | |
| 27 // --Samuel Huang <huangs@chromium.org> | |
| 28 | |
| 29 // Copyright 2016 The Chromium Authors. All rights reserved. | |
| 30 // Use of this source code is governed by a BSD-style license that can be | |
| 31 // found in the LICENSE file. | |
| 32 | |
| 33 #include <algorithm> | |
| 34 #include <cstring> | |
| 35 | |
| 36 namespace bsdiff { | |
| 37 | |
| 38 // Similar to ::memcmp(), but returns match length instead. | |
| 39 inline int matchlen(const unsigned char *srcbuf, const unsigned char *keybuf, | |
| 40 int size) { | |
| 41 for (int i = 0; i < size; ++i) | |
| 42 if (srcbuf[i] != keybuf[i]) | |
| 43 return i; | |
| 44 return size; | |
| 45 } | |
| 46 | |
| 47 // Finds a suffix in |srcbuf| that has the longest common prefix with |keybuf|, | |
| 48 // aided by suffix array |I| of |srcbuf|. Returns the length of the match, and | |
| 49 // writes the position within |srcbuf| to |pos|. | |
| 50 template <class T> | |
| 51 int search(T I, const unsigned char *srcbuf, int srcize, | |
| 52 const unsigned char *keybuf, int keysize, int *pos) { | |
| 53 // We use binary search here, but the desired position is not the usual | |
|
huangs
2016/06/06 04:53:35
In retrospect this is ridiculously verbose. Will
| |
| 54 // position from std::lower_bound()-like binary search. | |
| 55 // For example, given sorted suffixes of |srcbuf|: | |
| 56 // SAAA... | |
| 57 // SNNN... | |
| 58 // TAAA... | |
| 59 // Applying std::lower_bound() binary search results in: | |
| 60 // |keybuf| = "SNBB" => "SNNN...". | |
| 61 // |keybuf| = "SNZZ" => "TAAA...". | |
| 62 // But the desired result for both cases is "SNNN...", with "SN" as the | |
| 63 // longest common prefix. To solve this: | |
| 64 // Case 1: |keysize| == 0: We'd return 0, although |pos| is arbitrary. | |
| 65 // Case 2: |srcsize| == 0: We'd return 0, and |pos| = 0. | |
| 66 // Case 3: |srcsize| > 0 and |keysize| > 0: We'd assert rank-|lo| suffix | |
| 67 // of |srcbuf| is always strictly lexicographically less than |keybuf|. This | |
| 68 // holds for |lo| = 0, which corresponds to "". The search terminates when | |
| 69 // hi == lo + 1. At this point, suffixes at rank |lo| and rank |hi| both | |
| 70 // may be the desired match, so we check both. In case of tie, there's NO | |
| 71 // guarantee that we take the solution with minimum rank. | |
| 72 // Finding the minimum rank solution is not needed, and it would create more | |
| 73 // complexity. For example, with suffixes "MA...", "MB...", "MC...", "ME...", | |
| 74 // searching look for "MD" would yield |lo| -> "MC...", and the minimum rank | |
| 75 // solution "MA...", would be rather far away. | |
| 76 // | |
| 77 // TODO(huangs): Optimize this: match length at |t| is >= match length at | |
| 78 // |lo| and at |hi|. If we track this then we can skip comparisons! | |
| 79 int lo = 0; | |
| 80 int hi = srcize; | |
| 81 while (hi - lo > 1) { | |
| 82 int t = (lo + hi) / 2; | |
| 83 if (::memcmp(srcbuf + I[t], keybuf, std::min(srcize - I[t], keysize)) < 0) | |
| 84 lo = t; | |
| 85 else | |
| 86 hi = t; | |
| 87 } | |
| 88 | |
| 89 int x = matchlen(srcbuf + I[lo], keybuf, std::min(srcize - I[lo], keysize)); | |
| 90 int y = matchlen(srcbuf + I[hi], keybuf, std::min(srcize - I[hi], keysize)); | |
| 91 if (x > y) { | |
| 92 *pos = I[lo]; | |
| 93 return x; | |
| 94 } | |
| 95 *pos = I[hi]; | |
| 96 return y; | |
| 97 } | |
| 98 | |
| 99 } // namespace bsdiff | |
| OLD | NEW |