Fingerprint.h 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  1. /*
  2. * Copyright 2012-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /**
  17. * Compute 64-, 96-, and 128-bit Rabin fingerprints, as described in
  18. * Michael O. Rabin (1981)
  19. * Fingerprinting by Random Polynomials
  20. * Center for Research in Computing Technology, Harvard University
  21. * Tech Report TR-CSE-03-01
  22. *
  23. * The implementation follows the optimization described in
  24. * Andrei Z. Broder (1993)
  25. * Some applications of Rabin's fingerprinting method
  26. *
  27. * extended for fingerprints larger than 64 bits, and modified to use
  28. * 64-bit instead of 32-bit integers for computation.
  29. *
  30. * The precomputed tables are in Fingerprint.cpp.
  31. *
  32. * Benchmarked on 10/13/2009 on a 2.5GHz quad-core Xeon L5420,
  33. * - Fingerprint<64>::update64() takes about 12ns
  34. * - Fingerprint<96>::update64() takes about 30ns
  35. * - Fingerprint<128>::update128() takes about 30ns
  36. * (unsurprisingly, Fingerprint<96> and Fingerprint<128> take the
  37. * same amount of time, as they both use 128-bit operations; the least
  38. * significant 32 bits of Fingerprint<96> will always be 0)
  39. *
  40. * @author Tudor Bosman (tudorb@facebook.com)
  41. */
  42. #pragma once
  43. #include <array>
  44. #include <cstdint>
  45. #include <folly/Range.h>
  46. namespace folly {
  47. namespace detail {
  48. constexpr size_t poly_size(size_t bits) {
  49. return 1 + (bits - 1) / 64;
  50. }
  51. template <size_t Deg>
  52. using poly_table =
  53. std::array<std::array<std::array<uint64_t, poly_size(Deg)>, 256>, 8>;
  54. template <int BITS>
  55. struct FingerprintTable {
  56. static const uint64_t poly[poly_size(BITS)];
  57. static const poly_table<BITS> table;
  58. };
  59. template <int BITS>
  60. const uint64_t FingerprintTable<BITS>::poly[poly_size(BITS)] = {};
  61. template <int BITS>
  62. const poly_table<BITS> FingerprintTable<BITS>::table = {};
  63. #ifndef _MSC_VER
  64. // MSVC 2015 can't handle these extern specialization declarations,
  65. // but they aren't needed for things to work right, so we just don't
  66. // declare them in the header for MSVC.
  67. #define FOLLY_DECLARE_FINGERPRINT_TABLES(BITS) \
  68. template <> \
  69. const uint64_t FingerprintTable<BITS>::poly[poly_size(BITS)]; \
  70. template <> \
  71. const poly_table<BITS> FingerprintTable<BITS>::table
  72. FOLLY_DECLARE_FINGERPRINT_TABLES(64);
  73. FOLLY_DECLARE_FINGERPRINT_TABLES(96);
  74. FOLLY_DECLARE_FINGERPRINT_TABLES(128);
  75. #undef FOLLY_DECLARE_FINGERPRINT_TABLES
  76. #endif
  77. } // namespace detail
  78. /**
  79. * Compute the Rabin fingerprint.
  80. *
  81. * TODO(tudorb): Extend this to allow removing values from the computed
  82. * fingerprint (so we can fingerprint a sliding window, as in the Rabin-Karp
  83. * string matching algorithm)
  84. *
  85. * update* methods return *this, so you can chain them together:
  86. * Fingerprint<96>().update8(x).update(str).update64(val).write(output);
  87. */
  88. template <int BITS>
  89. class Fingerprint {
  90. public:
  91. Fingerprint() {
  92. // Use a non-zero starting value. We'll use (1 << (BITS-1))
  93. fp_[0] = 1ULL << 63;
  94. for (int i = 1; i < size(); i++) {
  95. fp_[i] = 0;
  96. }
  97. }
  98. Fingerprint& update8(uint8_t v) {
  99. uint8_t out = shlor8(v);
  100. xortab(detail::FingerprintTable<BITS>::table[0][out]);
  101. return *this;
  102. }
  103. // update32 and update64 are convenience functions to update the fingerprint
  104. // with 4 and 8 bytes at a time. They are faster than calling update8
  105. // in a loop. They process the bytes in big-endian order.
  106. Fingerprint& update32(uint32_t v) {
  107. uint32_t out = shlor32(v);
  108. for (int i = 0; i < 4; i++) {
  109. xortab(detail::FingerprintTable<BITS>::table[i][out & 0xff]);
  110. out >>= 8;
  111. }
  112. return *this;
  113. }
  114. Fingerprint& update64(uint64_t v) {
  115. uint64_t out = shlor64(v);
  116. for (int i = 0; i < 8; i++) {
  117. xortab(detail::FingerprintTable<BITS>::table[i][out & 0xff]);
  118. out >>= 8;
  119. }
  120. return *this;
  121. }
  122. Fingerprint& update(StringPiece str) {
  123. // TODO(tudorb): We could be smart and do update64 or update32 if aligned
  124. for (auto c : str) {
  125. update8(uint8_t(c));
  126. }
  127. return *this;
  128. }
  129. /**
  130. * Return the number of uint64s needed to hold the fingerprint value.
  131. */
  132. constexpr static int size() {
  133. return detail::poly_size(BITS);
  134. }
  135. /**
  136. * Write the computed fingeprint to an array of size() uint64_t's.
  137. * For Fingerprint<64>, size()==1; we write 64 bits in out[0]
  138. * For Fingerprint<96>, size()==2; we write 64 bits in out[0] and
  139. * the most significant 32 bits of out[1]
  140. * For Fingerprint<128>, size()==2; we write 64 bits in out[0] and
  141. * 64 bits in out[1].
  142. */
  143. void write(uint64_t* out) const {
  144. for (int i = 0; i < size(); i++) {
  145. out[i] = fp_[i];
  146. }
  147. }
  148. private:
  149. // XOR the fingerprint with a value from one of the tables.
  150. void xortab(std::array<uint64_t, detail::poly_size(BITS)> const& tab) {
  151. for (int i = 0; i < size(); i++) {
  152. fp_[i] ^= tab[i];
  153. }
  154. }
  155. // Helper functions: shift the fingerprint value left by 8/32/64 bits,
  156. // return the "out" value (the bits that were shifted out), and add "v"
  157. // in the bits on the right.
  158. uint8_t shlor8(uint8_t v);
  159. uint32_t shlor32(uint32_t v);
  160. uint64_t shlor64(uint64_t v);
  161. uint64_t fp_[detail::poly_size(BITS)];
  162. };
  163. // Convenience functions
  164. /**
  165. * Return the 64-bit Rabin fingerprint of a string.
  166. */
  167. inline uint64_t fingerprint64(StringPiece str) {
  168. uint64_t fp;
  169. Fingerprint<64>().update(str).write(&fp);
  170. return fp;
  171. }
  172. /**
  173. * Compute the 96-bit Rabin fingerprint of a string.
  174. * Return the 64 most significant bits in *msb, and the 32 least significant
  175. * bits in *lsb.
  176. */
  177. inline void fingerprint96(StringPiece str, uint64_t* msb, uint32_t* lsb) {
  178. uint64_t fp[2];
  179. Fingerprint<96>().update(str).write(fp);
  180. *msb = fp[0];
  181. *lsb = (uint32_t)(fp[1] >> 32);
  182. }
  183. /**
  184. * Compute the 128-bit Rabin fingerprint of a string.
  185. * Return the 64 most significant bits in *msb, and the 64 least significant
  186. * bits in *lsb.
  187. */
  188. inline void fingerprint128(StringPiece str, uint64_t* msb, uint64_t* lsb) {
  189. uint64_t fp[2];
  190. Fingerprint<128>().update(str).write(fp);
  191. *msb = fp[0];
  192. *lsb = fp[1];
  193. }
  194. template <>
  195. inline uint8_t Fingerprint<64>::shlor8(uint8_t v) {
  196. uint8_t out = (uint8_t)(fp_[0] >> 56);
  197. fp_[0] = (fp_[0] << 8) | ((uint64_t)v);
  198. return out;
  199. }
  200. template <>
  201. inline uint32_t Fingerprint<64>::shlor32(uint32_t v) {
  202. uint32_t out = (uint32_t)(fp_[0] >> 32);
  203. fp_[0] = (fp_[0] << 32) | ((uint64_t)v);
  204. return out;
  205. }
  206. template <>
  207. inline uint64_t Fingerprint<64>::shlor64(uint64_t v) {
  208. uint64_t out = fp_[0];
  209. fp_[0] = v;
  210. return out;
  211. }
  212. template <>
  213. inline uint8_t Fingerprint<96>::shlor8(uint8_t v) {
  214. uint8_t out = (uint8_t)(fp_[0] >> 56);
  215. fp_[0] = (fp_[0] << 8) | (fp_[1] >> 56);
  216. fp_[1] = (fp_[1] << 8) | ((uint64_t)v << 32);
  217. return out;
  218. }
  219. template <>
  220. inline uint32_t Fingerprint<96>::shlor32(uint32_t v) {
  221. uint32_t out = (uint32_t)(fp_[0] >> 32);
  222. fp_[0] = (fp_[0] << 32) | (fp_[1] >> 32);
  223. fp_[1] = ((uint64_t)v << 32);
  224. return out;
  225. }
  226. template <>
  227. inline uint64_t Fingerprint<96>::shlor64(uint64_t v) {
  228. uint64_t out = fp_[0];
  229. fp_[0] = fp_[1] | (v >> 32);
  230. fp_[1] = v << 32;
  231. return out;
  232. }
  233. template <>
  234. inline uint8_t Fingerprint<128>::shlor8(uint8_t v) {
  235. uint8_t out = (uint8_t)(fp_[0] >> 56);
  236. fp_[0] = (fp_[0] << 8) | (fp_[1] >> 56);
  237. fp_[1] = (fp_[1] << 8) | ((uint64_t)v);
  238. return out;
  239. }
  240. template <>
  241. inline uint32_t Fingerprint<128>::shlor32(uint32_t v) {
  242. uint32_t out = (uint32_t)(fp_[0] >> 32);
  243. fp_[0] = (fp_[0] << 32) | (fp_[1] >> 32);
  244. fp_[1] = (fp_[1] << 32) | ((uint64_t)v);
  245. return out;
  246. }
  247. template <>
  248. inline uint64_t Fingerprint<128>::shlor64(uint64_t v) {
  249. uint64_t out = fp_[0];
  250. fp_[0] = fp_[1];
  251. fp_[1] = v;
  252. return out;
  253. }
  254. } // namespace folly