// Copyright 2008 Google Inc. All Rights Reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Internals shared between the Snappy implementation and its unittest.
#if SNAPPY_HAVE_SSSE3 // Please do not replace with <x86intrin.h> or with headers that assume more // advanced SSE versions without checking with all the OWNERS. #include <emmintrin.h> #include <tmmintrin.h> #endif
#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE #if SNAPPY_HAVE_SSSE3 using V128 = __m128i; #elif SNAPPY_HAVE_NEON using V128 = uint8x16_t; #endif
// Load 128 bits of integer data. `src` must be 16-byte aligned. inline V128 V128_Load(const V128* src);
// Load 128 bits of integer data. `src` does not need to be aligned. inline V128 V128_LoadU(const V128* src);
// Store 128 bits of integer data. `dst` does not need to be aligned. inlinevoid V128_StoreU(V128* dst, V128 val);
// Shuffle packed 8-bit integers using a shuffle mask. // Each packed integer in the shuffle mask must be in [0,16). inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);
// Working memory performs a single allocation to hold all scratch space
// required for compression.
class WorkingMemory {
 public:
  explicit WorkingMemory(size_t input_size);
  ~WorkingMemory();

  // Allocates and clears a hash table using memory in "*this",
  // stores the number of buckets in "*table_size" and returns a pointer to
  // the base of the hash table.
  uint16_t* GetHashTable(size_t fragment_size, int* table_size) const;
  char* GetScratchInput() const { return input_; }
  char* GetScratchOutput() const { return output_; }

 private:
  char* mem_;        // the allocated memory, never nullptr
  size_t size_;      // the size of the allocated memory, never 0
  uint16_t* table_;  // the pointer to the hashtable
  char* input_;      // the pointer to the input scratch buffer
  char* output_;     // the pointer to the output scratch buffer

  // No copying.
  // Fix: original had "voidoperator=" (missing space), which does not compile.
  WorkingMemory(const WorkingMemory&);
  void operator=(const WorkingMemory&);
};
// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input_length <= kBlockSize"
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input_length)" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
// Fix: original had "constchar*" and "constint" (missing spaces), which do
// not compile.
char* CompressFragment(const char* input, size_t input_length, char* op,
                       uint16_t* table, const int table_size);
// Find the largest n such that // // s1[0,n-1] == s2[0,n-1] // and n <= (s2_limit - s2). // // Return make_pair(n, n < 8). // Does not read *s2_limit or beyond. // Does not read *(s1 + (s2_limit - s2)) or beyond. // Requires that s2_limit >= s2. // // In addition populate *data with the next 5 bytes from the end of the match. // This is only done if 8 bytes are available (s2_limit - s2 >= 8). The point is // that on some arch's this can be done faster in this routine than subsequent // loading from s2 + n. // // Separate implementation for 64-bit, little-endian cpus. #if !SNAPPY_IS_BIG_ENDIAN && \
(defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \ defined(ARCH_ARM)) staticinline std::pair<size_t, bool> FindMatchLength(constchar* s1, constchar* s2, constchar* s2_limit,
uint64_t* data) {
assert(s2_limit >= s2);
size_t matched = 0;
// This block isn't necessary for correctness; we could just start looping // immediately. As an optimization though, it is useful. It creates some not // uncommon code paths that determine, without extra effort, whether the match // length is less than 8. In short, we are hoping to avoid a conditional // branch, and perhaps get better code layout from the C++ compiler. if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
uint64_t a1 = UNALIGNED_LOAD64(s1);
uint64_t a2 = UNALIGNED_LOAD64(s2); if (SNAPPY_PREDICT_TRUE(a1 != a2)) { // This code is critical for performance. The reason is that it determines // how much to advance `ip` (s2). This obviously depends on both the loads // from the `candidate` (s1) and `ip`. Furthermore the next `candidate` // depends on the advanced `ip` calculated here through a load, hash and // new candidate hash lookup (a lot of cycles). This makes s1 (ie. // `candidate`) the variable that limits throughput. This is the reason we // go through hoops to have this function update `data` for the next iter. // The straightforward code would use *data, given by // // *data = UNALIGNED_LOAD64(s2 + matched_bytes) (Latency of 5 cycles), // // as input for the hash table lookup to find next candidate. However // this forces the load on the data dependency chain of s1, because // matched_bytes directly depends on s1. However matched_bytes is 0..7, so // we can also calculate *data by // // *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8), // matched_bytes); // // The loads do not depend on s1 anymore and are thus off the bottleneck. // The straightforward implementation on x86_64 would be to use // // shrd rax, rdx, cl (cl being matched_bytes * 8) // // unfortunately shrd with a variable shift has a 4 cycle latency. So this // only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle variable // shift instruction but can only shift 64 bits. If we focus on just // obtaining the least significant 4 bytes, we can obtain this by // // *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2), // UNALIGNED_LOAD64(s2 + 4) >> ((matched_bytes & 3) * 8); // // Writen like above this is not a big win, the conditional move would be // a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle). // However matched_bytes < 4 is equal to // static_cast<uint32_t>(xorval) != 0. 
Writen that way, the conditional // move (2 cycles) can execute in parallel with FindLSBSetNonZero64 // (tzcnt), which takes 3 cycles.
uint64_t xorval = a1 ^ a2; int shift = Bits::FindLSBSetNonZero64(xorval);
size_t matched_bytes = shift >> 3;
uint64_t a3 = UNALIGNED_LOAD64(s2 + 4); #ifndef __x86_64__
a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2; #else // Ideally this would just be // // a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2; // // However clang correctly infers that the above statement participates on // a critical data dependency chain and thus, unfortunately, refuses to // use a conditional move (it's tuned to cut data dependencies). In this // case there is a longer parallel chain anyway AND this will be fairly // unpredictable. asm("testl %k2, %k2\n\t" "cmovzq %1, %0\n\t"
: "+r"(a2)
: "r"(a3), "r"(xorval)
: "cc"); #endif
*data = a2 >> (shift & (3 * 8)); return std::pair<size_t, bool>(matched_bytes, true);
} else {
matched = 8;
s2 += 8;
}
}
SNAPPY_PREFETCH(s1 + 64);
SNAPPY_PREFETCH(s2 + 64);
// Find out how long the match is. We loop over the data 64 bits at a // time until we find a 64-bit block that doesn't match; then we find // the first non-matching bit and use that to calculate the total // length of the match. while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
uint64_t a1 = UNALIGNED_LOAD64(s1 + matched);
uint64_t a2 = UNALIGNED_LOAD64(s2); if (a1 == a2) {
s2 += 8;
matched += 8;
} else {
uint64_t xorval = a1 ^ a2; int shift = Bits::FindLSBSetNonZero64(xorval);
size_t matched_bytes = shift >> 3;
uint64_t a3 = UNALIGNED_LOAD64(s2 + 4); #ifndef __x86_64__
a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2; #else asm("testl %k2, %k2\n\t" "cmovzq %1, %0\n\t"
: "+r"(a2)
: "r"(a3), "r"(xorval)
: "cc"); #endif
*data = a2 >> (shift & (3 * 8));
matched += matched_bytes;
assert(matched >= 8); return std::pair<size_t, bool>(matched, false);
}
} while (SNAPPY_PREDICT_TRUE(s2 < s2_limit)) { if (s1[matched] == *s2) {
++s2;
++matched;
} else { if (s2 <= s2_limit - 8) {
*data = UNALIGNED_LOAD64(s2);
} return std::pair<size_t, bool>(matched, matched < 8);
}
} return std::pair<size_t, bool>(matched, matched < 8);
} #else staticinline std::pair<size_t, bool> FindMatchLength(constchar* s1, constchar* s2, constchar* s2_limit,
uint64_t* data) { // Implementation based on the x86-64 version, above.
assert(s2_limit >= s2); int matched = 0;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.