sync/internal_api/public/base/unique_position.cc - Issue 2130453004: [Sync] Move //sync to //components/sync.

Unified Diff: sync/internal_api/public/base/unique_position.cc

Issue 2130453004: [Sync] Move //sync to //components/sync. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Rebase. Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: sync/internal_api/public/base/unique_position.cc

diff --git a/sync/internal_api/public/base/unique_position.cc b/sync/internal_api/public/base/unique_position.cc

deleted file mode 100644

index 9924cb4396197527d8386899cc12d13e25db98ec..0000000000000000000000000000000000000000

--- a/sync/internal_api/public/base/unique_position.cc

+++ /dev/null

@@ -1,633 +0,0 @@

-// Use of this source code is governed by a BSD-style license that can be

-// found in the LICENSE file.

-#include "sync/internal_api/public/base/unique_position.h"

-#include <stddef.h>

-#include <stdint.h>

-#include <algorithm>

-#include <limits>

-#include "base/logging.h"

-#include "base/rand_util.h"

-#include "base/stl_util.h"

-#include "base/strings/string_number_conversions.h"

-#include "sync/protocol/unique_position.pb.h"

-#include "third_party/zlib/zlib.h"

-namespace syncer {

-const size_t UniquePosition::kSuffixLength = 28;

-const size_t UniquePosition::kCompressBytesThreshold = 128;

-// static.

-bool UniquePosition::IsValidSuffix(const std::string& suffix) {

- // The suffix must be exactly the specified length, otherwise unique suffixes

- // are not sufficient to guarantee unique positions (because prefix + suffix

- // == p + refixsuffix).

- return suffix.length() == kSuffixLength

- && suffix[kSuffixLength-1] != 0;

-// static.

-bool UniquePosition::IsValidBytes(const std::string& bytes) {

- // The first condition ensures that our suffix uniqueness is sufficient to

- // guarantee position uniqueness. Otherwise, it's possible the end of some

- // prefix + some short suffix == some long suffix.

- // The second condition ensures that FindSmallerWithSuffix can always return a

- // result.

- return bytes.length() >= kSuffixLength

- && bytes[bytes.length()-1] != 0;

-// static.

-std::string UniquePosition::RandomSuffix() {

- // Users random data for all but the last byte. The last byte must not be

- // zero. We arbitrarily set it to 0x7f.

- return base::RandBytesAsString(kSuffixLength - 1) + "\x7f";

-// static.

-UniquePosition UniquePosition::CreateInvalid() {

- UniquePosition pos;

- DCHECK(!pos.IsValid());

- return pos;

-// static.

-UniquePosition UniquePosition::FromProto(const sync_pb::UniquePosition& proto) {

- if (proto.has_custom_compressed_v1()) {

- return UniquePosition(proto.custom_compressed_v1());

- } else if (proto.has_value() && !proto.value().empty()) {

- return UniquePosition(Compress(proto.value()));

- } else if (proto.has_compressed_value() && proto.has_uncompressed_length()) {

- uLongf uncompressed_len = proto.uncompressed_length();

- std::string un_gzipped;

- un_gzipped.resize(uncompressed_len);

- int result = uncompress(

- reinterpret_cast<Bytef*>(string_as_array(&un_gzipped)),

- &uncompressed_len,

- reinterpret_cast<const Bytef*>(proto.compressed_value().data()),

- proto.compressed_value().size());

- if (result != Z_OK) {

- DLOG(ERROR) << "Unzip failed " << result;

- return UniquePosition::CreateInvalid();

- }

- if (uncompressed_len != proto.uncompressed_length()) {

- DLOG(ERROR)

- << "Uncompressed length " << uncompressed_len

- << " did not match specified length " << proto.uncompressed_length();

- return UniquePosition::CreateInvalid();

- }

- return UniquePosition(Compress(un_gzipped));

- } else {

- return UniquePosition::CreateInvalid();

- }

-// static.

-UniquePosition UniquePosition::FromInt64(int64_t x, const std::string& suffix) {

- uint64_t y = static_cast<uint64_t>(x);

- y ^= 0x8000000000000000ULL; // Make it non-negative.

- std::string bytes(8, 0);

- for (int i = 7; i >= 0; --i) {

- bytes[i] = static_cast<uint8_t>(y);

- y >>= 8;

- }

- return UniquePosition(bytes + suffix, suffix);

-// static.

-UniquePosition UniquePosition::InitialPosition(

- const std::string& suffix) {

- DCHECK(IsValidSuffix(suffix));

- return UniquePosition(suffix, suffix);

-// static.

-UniquePosition UniquePosition::Before(

- const UniquePosition& x,

- const std::string& suffix) {

- DCHECK(IsValidSuffix(suffix));

- DCHECK(x.IsValid());

- const std::string& before = FindSmallerWithSuffix(

- Uncompress(x.compressed_), suffix);

- return UniquePosition(before + suffix, suffix);

-// static.

-UniquePosition UniquePosition::After(

- const UniquePosition& x,

- const std::string& suffix) {

- DCHECK(IsValidSuffix(suffix));

- DCHECK(x.IsValid());

- const std::string& after = FindGreaterWithSuffix(

- Uncompress(x.compressed_), suffix);

- return UniquePosition(after + suffix, suffix);

-// static.

-UniquePosition UniquePosition::Between(

- const UniquePosition& before,

- const UniquePosition& after,

- const std::string& suffix) {

- DCHECK(before.IsValid());

- DCHECK(after.IsValid());

- DCHECK(before.LessThan(after));

- DCHECK(IsValidSuffix(suffix));

- const std::string& mid = FindBetweenWithSuffix(

- Uncompress(before.compressed_),

- Uncompress(after.compressed_),

- suffix);

- return UniquePosition(mid + suffix, suffix);

-UniquePosition::UniquePosition() : is_valid_(false) {}

-bool UniquePosition::LessThan(const UniquePosition& other) const {

- DCHECK(this->IsValid());

- DCHECK(other.IsValid());

- return compressed_ < other.compressed_;

-bool UniquePosition::Equals(const UniquePosition& other) const {

- if (!this->IsValid() && !other.IsValid())

- return true;

- return compressed_ == other.compressed_;

-void UniquePosition::ToProto(sync_pb::UniquePosition* proto) const {

- proto->Clear();

- // This is the current preferred foramt.

- proto->set_custom_compressed_v1(compressed_);

- // Older clients used to write other formats. We don't bother doing that

- // anymore because that form of backwards compatibility is expensive. We no

- // longer want to pay that price just too support clients that have been

- // obsolete for a long time. See the proto definition for details.

-void UniquePosition::SerializeToString(std::string* blob) const {

- DCHECK(blob);

- sync_pb::UniquePosition proto;

- ToProto(&proto);

- proto.SerializeToString(blob);

-int64_t UniquePosition::ToInt64() const {

- uint64_t y = 0;

- const std::string& s = Uncompress(compressed_);

- size_t l = sizeof(int64_t);

- if (s.length() < l) {

- NOTREACHED();

- l = s.length();

- }

- for (size_t i = 0; i < l; ++i) {

- const uint8_t byte = s[l - i - 1];

- y |= static_cast<uint64_t>(byte) << (i * 8);

- }

- y ^= 0x8000000000000000ULL;

- // This is technically implementation-defined if y > INT64_MAX, so

- // we're assuming that we're on a twos-complement machine.

- return static_cast<int64_t>(y);

-bool UniquePosition::IsValid() const {

- return is_valid_;

-std::string UniquePosition::ToDebugString() const {

- const std::string bytes = Uncompress(compressed_);

- if (bytes.empty())

- return std::string("INVALID[]");

- std::string debug_string = base::HexEncode(bytes.data(), bytes.length());

- if (!IsValid()) {

- debug_string = "INVALID[" + debug_string + "]";

- }

- std::string compressed_string =

- base::HexEncode(compressed_.data(), compressed_.length());

- debug_string.append(", compressed: " + compressed_string);

- return debug_string;

-std::string UniquePosition::GetSuffixForTest() const {

- const std::string bytes = Uncompress(compressed_);

- const size_t prefix_len = bytes.length() - kSuffixLength;

- return bytes.substr(prefix_len, std::string::npos);

-std::string UniquePosition::FindSmallerWithSuffix(

- const std::string& reference,

- const std::string& suffix) {

- size_t ref_zeroes = reference.find_first_not_of('\0');

- size_t suffix_zeroes = suffix.find_first_not_of('\0');

- // Neither of our inputs are allowed to have trailing zeroes, so the following

- // must be true.

- DCHECK_NE(ref_zeroes, std::string::npos);

- DCHECK_NE(suffix_zeroes, std::string::npos);

- if (suffix_zeroes > ref_zeroes) {

- // Implies suffix < ref.

- return std::string();

- }

- if (suffix.substr(suffix_zeroes) < reference.substr(ref_zeroes)) {

- // Prepend zeroes so the result has as many zero digits as |reference|.

- return std::string(ref_zeroes - suffix_zeroes, '\0');

- } else if (suffix_zeroes > 1) {

- // Prepend zeroes so the result has one more zero digit than |reference|.

- // We could also take the "else" branch below, but taking this branch will

- // give us a smaller result.

- return std::string(ref_zeroes - suffix_zeroes + 1, '\0');

- } else {

- // Prepend zeroes to match those in the |reference|, then something smaller

- // than the first non-zero digit in |reference|.

- char lt_digit = static_cast<uint8_t>(reference[ref_zeroes]) / 2;

- return std::string(ref_zeroes, '\0') + lt_digit;

- }

-// static

-std::string UniquePosition::FindGreaterWithSuffix(

- const std::string& reference,

- const std::string& suffix) {

- size_t ref_FFs =

- reference.find_first_not_of(std::numeric_limits<uint8_t>::max());

- size_t suffix_FFs =

- suffix.find_first_not_of(std::numeric_limits<uint8_t>::max());

- if (ref_FFs == std::string::npos) {

- ref_FFs = reference.length();

- }

- if (suffix_FFs == std::string::npos) {

- suffix_FFs = suffix.length();

- }

- if (suffix_FFs > ref_FFs) {

- // Implies suffix > reference.

- return std::string();

- }

- if (suffix.substr(suffix_FFs) > reference.substr(ref_FFs)) {

- // Prepend FF digits to match those in |reference|.

- return std::string(ref_FFs - suffix_FFs,

- std::numeric_limits<uint8_t>::max());

- } else if (suffix_FFs > 1) {

- // Prepend enough leading FF digits so result has one more of them than

- // |reference| does. We could also take the "else" branch below, but this

- // gives us a smaller result.

- return std::string(ref_FFs - suffix_FFs + 1,

- std::numeric_limits<uint8_t>::max());

- } else {

- // Prepend FF digits to match those in |reference|, then something larger

- // than the first non-FF digit in |reference|.

- char gt_digit = static_cast<uint8_t>(reference[ref_FFs]) +

- (std::numeric_limits<uint8_t>::max() -

- static_cast<uint8_t>(reference[ref_FFs]) + 1) /

- 2;

- return std::string(ref_FFs, std::numeric_limits<uint8_t>::max()) + gt_digit;

- }

-// static

-std::string UniquePosition::FindBetweenWithSuffix(

- const std::string& before,

- const std::string& after,

- const std::string& suffix) {

- DCHECK(IsValidSuffix(suffix));

- DCHECK_NE(before, after);

- DCHECK_LT(before, after);

- std::string mid;

- // Sometimes our suffix puts us where we want to be.

- if (before < suffix && suffix < after) {

- return std::string();

- }

- size_t i = 0;

- for ( ; i < std::min(before.length(), after.length()); ++i) {

- uint8_t a_digit = before[i];

- uint8_t b_digit = after[i];

- if (b_digit - a_digit >= 2) {

- mid.push_back(a_digit + (b_digit - a_digit)/2);

- return mid;

- } else if (a_digit == b_digit) {

- mid.push_back(a_digit);

- // Both strings are equal so far. Will appending the suffix at this point

- // give us the comparison we're looking for?

- if (before.substr(i+1) < suffix && suffix < after.substr(i+1)) {

- return mid;

- }

- } else {

- DCHECK_EQ(b_digit - a_digit, 1); // Implied by above if branches.

- // The two options are off by one digit. The choice of whether to round

- // up or down here will have consequences on what we do with the remaining

- // digits. Exploring both options is an optimization and is not required

- // for the correctness of this algorithm.

- // Option A: Round down the current digit. This makes our |mid| <

- // |after|, no matter what we append afterwards. We then focus on

- // appending digits until |mid| > |before|.

- std::string mid_a = mid;

- mid_a.push_back(a_digit);

- mid_a.append(FindGreaterWithSuffix(before.substr(i+1), suffix));

- // Option B: Round up the current digit. This makes our |mid| > |before|,

- // no matter what we append afterwards. We then focus on appending digits

- // until |mid| < |after|. Note that this option may not be viable if the

- // current digit is the last one in |after|, so we skip the option in that

- // case.

- if (after.length() > i+1) {

- std::string mid_b = mid;

- mid_b.push_back(b_digit);

- mid_b.append(FindSmallerWithSuffix(after.substr(i+1), suffix));

- // Does this give us a shorter position value? If so, use it.

- if (mid_b.length() < mid_a.length()) {

- return mid_b;

- }

- return mid_a;

- }

- // If we haven't found a midpoint yet, the following must be true:

- DCHECK_EQ(before.substr(0, i), after.substr(0, i));

- DCHECK_EQ(before, mid);

- DCHECK_LT(before.length(), after.length());

- // We know that we'll need to append at least one more byte to |mid| in the

- // process of making it < |after|. Appending any digit, regardless of the

- // value, will make |before| < |mid|. Therefore, the following will get us a

- // valid position.

- mid.append(FindSmallerWithSuffix(after.substr(i), suffix));

- return mid;

-UniquePosition::UniquePosition(const std::string& internal_rep)

- : compressed_(internal_rep),

- is_valid_(IsValidBytes(Uncompress(internal_rep))) {

-UniquePosition::UniquePosition(

- const std::string& uncompressed,

- const std::string& suffix)

- : compressed_(Compress(uncompressed)),

- is_valid_(IsValidBytes(uncompressed)) {

- DCHECK(uncompressed.rfind(suffix) + kSuffixLength == uncompressed.length());

- DCHECK(IsValidSuffix(suffix));

- DCHECK(IsValid());

-// On custom compression:

-//

-// Let C(x) be the compression function and U(x) be the uncompression function.

-//

-// This compression scheme has a few special properties. For one, it is

-// order-preserving. For any two valid position strings x and y:

-// x < y <=> C(x) < C(y)

-// This allows us keep the position strings compressed as we sort them.

-//

-// The compressed format and the decode algorithm:

-//

-// The compressed string is a series of blocks, almost all of which are 8 bytes

-// in length. The only exception is the last block in the compressed string,

-// which may be a remainder block, which has length no greater than 7. The

-// full-length blocks are either repeated character blocks or plain data blocks.

-// All blocks are entirely self-contained. Their decoded values are independent

-// from that of their neighbours.

-//

-// A repeated character block is encoded into eight bytes and represents between

-// 4 and 2^31 repeated instances of a given character in the unencoded stream.

-// The encoding consists of a single character repeated four times, followed by

-// an encoded count. The encoded count is stored as a big-endian 32 bit

-// integer. There are 2^31 possible count values, and two encodings for each.

-// The high encoding is 'enc = kuint32max - count'; the low encoding is 'enc =

-// count'. At compression time, the algorithm will choose between the two

-// encodings based on which of the two will maintain the appropriate sort

-// ordering (by a process which will be described below). The decompression

-// algorithm need not concern itself with which encoding was used; it needs only

-// to decode it. The decoded value of this block is "count" instances of the

-// character that was repeated four times in the first half of this block.

-//

-// A plain data block is encoded into eight bytes and represents exactly eight

-// bytes of data in the unencoded stream. The plain data block must not begin

-// with the same character repeated four times. It is allowed to contain such a

-// four-character sequence, just not at the start of the block. The decoded

-// value of a plain data block is identical to its encoded value.

-//

-// A remainder block has length of at most seven. It is a shorter version of

-// the plain data block. It occurs only at the end of the encoded stream and

-// represents exactly as many bytes of unencoded data as its own length. Like a

-// plain data block, the remainder block never begins with the same character

-// repeated four times. The decoded value of this block is identical to its

-// encoded value.

-//

-// The encode algorithm:

-//

-// From the above description, it can be seen that there may be more than one

-// way to encode a given input string. The encoder must be careful to choose

-// the encoding that guarantees sort ordering.

-//

-// The rules for the encoder are as follows:

-// 1. Iterate through the input string and produce output blocks one at a time.

-// 2. Where possible (ie. where the next four bytes of input consist of the

-// same character repeated four times), produce a repeated data block of

-// maximum possible length.

-// 3. If there is at least 8 bytes of data remaining and it is not possible

-// to produce a repeated character block, produce a plain data block.

-// 4. If there are less than 8 bytes of data remaining and it is not possible

-// to produce a repeated character block, produce a remainder block.

-// 5. When producing a repeated character block, the count encoding must be

-// chosen in such a way that the sort ordering is maintained. The choice is

-// best illustrated by way of example:

-//

-// When comparing two strings, the first of which begins with of 8

-// instances of the letter 'B' and the second with 10 instances of the

-// letter 'B', which of the two should compare lower? The result depends

-// on the 9th character of the first string, since it will be compared

-// against the 9th 'B' in the second string. If that character is an 'A',

-// then the first string will compare lower. If it is a 'C', then the

-// first string will compare higher.

-//

-// The key insight is that the comparison value of a repeated character block

-// depends on the value of the character that follows it. If the character

-// follows the repeated character has a value greater than the repeated

-// character itself, then a shorter run length should translate to a higher

-// comparison value. Therefore, we encode its count using the low encoding.

-// Similarly, if the following character is lower, we use the high encoding.

-namespace {

-// Appends an encoded run length to |output_str|.

-static void WriteEncodedRunLength(uint32_t length,

- bool high_encoding,

- std::string* output_str) {

- CHECK_GE(length, 4U);

- CHECK_LT(length, 0x80000000);

- // Step 1: Invert the count, if necessary, to account for the following digit.

- uint32_t encoded_length;

- if (high_encoding) {

- encoded_length = 0xffffffff - length;

- } else {

- encoded_length = length;

- }

- // Step 2: Write it as big-endian so it compares correctly with memcmp(3).

- output_str->append(1, 0xff & (encoded_length >> 24U));

- output_str->append(1, 0xff & (encoded_length >> 16U));

- output_str->append(1, 0xff & (encoded_length >> 8U));

- output_str->append(1, 0xff & (encoded_length >> 0U));

-// Reads an encoded run length for |str| at position |i|.

-static uint32_t ReadEncodedRunLength(const std::string& str, size_t i) {

- DCHECK_LE(i + 4, str.length());

- // Step 1: Extract the big-endian count.

- uint32_t encoded_length =

- ((uint8_t)(str[i + 3]) << 0) | ((uint8_t)(str[i + 2]) << 8) |

- ((uint8_t)(str[i + 1]) << 16) | ((uint8_t)(str[i + 0]) << 24);

- // Step 2: If this was an inverted count, un-invert it.

- uint32_t length;

- if (encoded_length & 0x80000000) {

- length = 0xffffffff - encoded_length;

- } else {

- length = encoded_length;

- }

- return length;

-// A series of four identical chars at the beginning of a block indicates

-// the beginning of a repeated character block.

-static bool IsRepeatedCharPrefix(const std::string& chars, size_t start_index) {

- return chars[start_index] == chars[start_index+1]

- && chars[start_index] == chars[start_index+2]

- && chars[start_index] == chars[start_index+3];

-} // namespace

-// static

-// Wraps the CompressImpl function with a bunch of DCHECKs.

-std::string UniquePosition::Compress(const std::string& str) {

- DCHECK(IsValidBytes(str));

- std::string compressed = CompressImpl(str);

- DCHECK(IsValidCompressed(compressed));

- DCHECK_EQ(str, Uncompress(compressed));

- return compressed;

-// static

-// Performs the order preserving run length compression of a given input string.

-std::string UniquePosition::CompressImpl(const std::string& str) {

- std::string output;

- // The compressed length will usually be at least as long as the suffix (28),

- // since the suffix bytes are mostly random. Most are a few bytes longer; a

- // small few are tens of bytes longer. Some early tests indicated that

- // roughly 99% had length 40 or smaller. We guess that pre-sizing for 48 is a

- // good trade-off, but that has not been confirmed through profiling.

- output.reserve(48);

- // Each loop iteration will consume 8, or N bytes, where N >= 4 and is the

- // length of a string of identical digits starting at i.

- for (size_t i = 0; i < str.length(); ) {

- if (i + 4 <= str.length() && IsRepeatedCharPrefix(str, i)) {

- // Four identical bytes in a row at this position means that we must start

- // a repeated character block. Begin by outputting those four bytes.

- output.append(str, i, 4);

- // Determine the size of the run.

- const char rep_digit = str[i];

- const size_t runs_until = str.find_first_not_of(rep_digit, i+4);

- // Handle the 'runs until end' special case specially.

- size_t run_length;

- bool encode_high; // True if the next byte is greater than |rep_digit|.

- if (runs_until == std::string::npos) {

- run_length = str.length() - i;

- encode_high = false;

- } else {

- run_length = runs_until - i;

- encode_high = static_cast<uint8_t>(str[runs_until]) >

- static_cast<uint8_t>(rep_digit);

- }

- DCHECK_LT(run_length,

- static_cast<size_t>(std::numeric_limits<int32_t>::max()))

- << "This implementation can't encode run-lengths greater than 2^31.";

- WriteEncodedRunLength(run_length, encode_high, &output);

- i += run_length; // Jump forward by the size of the run length.

- } else {

- // Output up to eight bytes without any encoding.

- const size_t len = std::min(static_cast<size_t>(8), str.length() - i);

- output.append(str, i, len);

- i += len; // Jump forward by the amount of input consumed (usually 8).

- }

- return output;

-// static

-// Uncompresses strings that were compresed with UniquePosition::Compress.

-std::string UniquePosition::Uncompress(const std::string& str) {

- std::string output;

- size_t i = 0;

- // Iterate through the compressed string one block at a time.

- for (i = 0; i + 8 <= str.length(); i += 8) {

- if (IsRepeatedCharPrefix(str, i)) {

- // Found a repeated character block. Expand it.

- const char rep_digit = str[i];

- uint32_t length = ReadEncodedRunLength(str, i + 4);

- output.append(length, rep_digit);

- } else {

- // Found a regular block. Copy it.

- output.append(str, i, 8);

- }

- // Copy the remaining bytes that were too small to form a block.

- output.append(str, i, std::string::npos);

- return output;

-bool UniquePosition::IsValidCompressed(const std::string& str) {

- for (size_t i = 0; i + 8 <= str.length(); i += 8) {

- if (IsRepeatedCharPrefix(str, i)) {

- uint32_t count = ReadEncodedRunLength(str, i + 4);

- if (count < 4) {

- // A repeated character block should at least represent the four

- // characters that started it.

- return false;

- }

- if (str[i] == str[i+4]) {

- // Does the next digit after a count match the repeated character? Then

- // this is not the highest possible count.

- return false;

- }

- // We don't bother looking for the existence or checking the validity of

- // any partial blocks. There's no way they could be invalid anyway.

- return true;

-} // namespace syncer

« no previous file with comments | « sync/internal_api/public/base/unique_position.h ('k') | sync/internal_api/public/base/unique_position_unittest.cc » ('j') | no next file with comments »