OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2003, 2004 Colin Percival | |
2 // All rights reserved | |
3 // | |
4 // Redistribution and use in source and binary forms, with or without | |
5 // modification, are permitted providing that the following conditions | |
6 // are met: | |
7 // 1. Redistributions of source code must retain the above copyright | |
8 // notice, this list of conditions and the following disclaimer. | |
9 // 2. Redistributions in binary form must reproduce the above copyright | |
10 // notice, this list of conditions and the following disclaimer in the | |
11 // documentation and/or other materials provided with the distribution. | |
12 // | |
13 // For the terms under which this work may be distributed, please see | |
14 // the adjoining file "LICENSE". | |
15 // | |
16 // bsdiff_search.h -- Suffix array search. | |
17 // | |
18 // ChangeLog: | |
19 // 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit | |
20 // values throughout. | |
21 // --Benjamin Smedberg <benjamin@smedbergs.us> | |
22 // 2015-08-03 - Change search() to template to allow PagedArray usage. | |
23 // --Samuel Huang <huangs@chromium.org> | |
24 // 2015-08-19 - Optimized search(). | |
25 // --Samuel Huang <huangs@chromium.org> | |
26 // 2016-06-02 - Move matchlen() and search() to a new file; comment; format. | |
27 // --Samuel Huang <huangs@chromium.org> | |
28 | |
29 // Copyright 2016 The Chromium Authors. All rights reserved. | |
30 // Use of this source code is governed by a BSD-style license that can be | |
31 // found in the LICENSE file. | |
32 | |
33 #include <algorithm> | |
34 #include <cstring> | |
35 | |
36 namespace bsdiff { | |
37 | |
38 // Similar to ::memcmp(), but returns match length instead. | |
39 inline int matchlen(const unsigned char *srcbuf, const unsigned char *keybuf, | |
40 int size) { | |
41 for (int i = 0; i < size; ++i) | |
42 if (srcbuf[i] != keybuf[i]) | |
43 return i; | |
44 return size; | |
45 } | |
46 | |
47 // Finds a suffix in |srcbuf| that has the longest common prefix with |keybuf|, | |
48 // aided by suffix array |I| of |srcbuf|. Returns the length of the match, and | |
49 // writes the position within |srcbuf| to |pos|. | |
50 template <class T> | |
51 int search(T I, const unsigned char *srcbuf, int srcize, | |
52 const unsigned char *keybuf, int keysize, int *pos) { | |
53 // We use binary search here, but the desired position is not the usual | |
huangs
2016/06/06 04:53:35
In retrospect this is ridiculously verbose. Will
| |
54 // position from std::lower_bound()-like binary search. | |
55 // For example, given sorted suffixes of |srcbuf|: | |
56 // SAAA... | |
57 // SNNN... | |
58 // TAAA... | |
59 // Applying std::lower_bound() binary search results in: | |
60 // |keybuf| = "SNBB" => "SNNN...". | |
61 // |keybuf| = "SNZZ" => "TAAA...". | |
62 // But the desired result for both cases is "SNNN...", with "SN" as the | |
63 // longest common prefix. To solve this: | |
64 // Case 1: |keysize| == 0: We'd return 0, although |pos| is arbitrary. | |
65 // Case 2: |srcsize| == 0: We'd return 0, and |pos| = 0. | |
66 // Case 3: |srcsize| > 0 and |keysize| > 0: We'd assert rank-|lo| suffix | |
67 // of |srcbuf| is always strictly lexicographically less than |keybuf|. This | |
68 // holds for |lo| = 0, which corresponds to "". The search terminates when | |
69 // hi == lo + 1. At this point, suffixes at rank |lo| and rank |hi| both | |
70 // may be the desired match, so we check both. In case of tie, there's NO | |
71 // guarantee that we take the solution with minimum rank. | |
72 // Finding the minimum rank solution is not needed, and it would create more | |
73 // complexity. For example, with suffixes "MA...", "MB...", "MC...", "ME...", | |
74 // searching look for "MD" would yield |lo| -> "MC...", and the minimum rank | |
75 // solution "MA...", would be rather far away. | |
76 // | |
77 // TODO(huangs): Optimize this: match length at |t| is >= match length at | |
78 // |lo| and at |hi|. If we track this then we can skip comparisons! | |
79 int lo = 0; | |
80 int hi = srcize; | |
81 while (hi - lo > 1) { | |
82 int t = (lo + hi) / 2; | |
83 if (::memcmp(srcbuf + I[t], keybuf, std::min(srcize - I[t], keysize)) < 0) | |
84 lo = t; | |
85 else | |
86 hi = t; | |
87 } | |
88 | |
89 int x = matchlen(srcbuf + I[lo], keybuf, std::min(srcize - I[lo], keysize)); | |
90 int y = matchlen(srcbuf + I[hi], keybuf, std::min(srcize - I[hi], keysize)); | |
91 if (x > y) { | |
92 *pos = I[lo]; | |
93 return x; | |
94 } | |
95 *pos = I[hi]; | |
96 return y; | |
97 } | |
98 | |
99 } // namespace bsdiff | |
OLD | NEW |