// RangeSse42.cpp
  1. /*
  2. * Copyright 2015-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <folly/detail/RangeSse42.h>
  17. #include <glog/logging.h>
  18. #include <folly/Portability.h>
  19. // Essentially, two versions of this file: one with an SSE42 implementation
  20. // and one with a fallback implementation. We determine which version to use by
  21. // testing for the presence of the required headers.
  22. //
  23. // TODO: Maybe this should be done by the build system....
  24. #if !FOLLY_SSE_PREREQ(4, 2)
namespace folly {
namespace detail {
// Fallback path: without compile-time SSE4.2 support, the "sse42" entry point
// simply forwards to the portable implementation so callers link against a
// single symbol regardless of the target ISA.
size_t qfind_first_byte_of_sse42(
    const StringPieceLite haystack,
    const StringPieceLite needles) {
  return qfind_first_byte_of_nosse(haystack, needles);
}
} // namespace detail
} // namespace folly
  34. #else
  35. #include <cstdint>
  36. #include <limits>
  37. #include <string>
  38. #include <emmintrin.h>
  39. #include <nmmintrin.h>
  40. #include <smmintrin.h>
  41. #include <folly/Likely.h>
  42. #include <folly/detail/Sse.h>
  43. namespace folly {
  44. namespace detail {
// It's okay if pages are bigger than this (as powers of two), but they should
// not be smaller. The page-boundary checks below only need a lower bound on
// the true page size: if two addresses share a kMinPageSize-sized page they
// certainly share a real (>=) page, so a 16-byte load spanning them is safe.
static constexpr size_t kMinPageSize = 4096;
static_assert(
    kMinPageSize >= 16,
    "kMinPageSize must be at least SSE register size");
  51. template <typename T>
  52. static inline uintptr_t page_for(T* addr) {
  53. return reinterpret_cast<uintptr_t>(addr) / kMinPageSize;
  54. }
  55. static inline size_t nextAlignedIndex(const char* arr) {
  56. auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1;
  57. return 1 + // add 1 because the index starts at 'arr'
  58. ((firstPossible + 15) & ~0xF) // round up to next multiple of 16
  59. - firstPossible;
  60. }
// Helper for the case where needles.size() <= 16: the whole needle set fits
// in one SSE register, so each 16-byte haystack block costs a single
// PCMPESTRI ("equal any" mode, mode flags = 0), which returns the index of
// the first haystack byte that matches any needle byte, or 16 if none does.
// Returns the index of the first match, or std::string::npos.
size_t qfind_first_byte_of_needles16(
    const StringPieceLite haystack,
    const StringPieceLite needles) {
  DCHECK_GT(haystack.size(), 0u);
  DCHECK_GT(needles.size(), 0u);
  DCHECK_LE(needles.size(), 16u);
  // Heuristic: for 1-2 needles over a long haystack the non-SSE path is
  // presumably faster — tuned empirically upstream; confirm before changing.
  if ((needles.size() <= 2 && haystack.size() >= 256) ||
      // must bail if we can't even SSE-load a single segment of haystack
      // (i.e. a 16-byte load starting at data() could cross into an
      // unmapped page beyond end())
      (haystack.size() < 16 &&
       page_for(haystack.end() - 1) != page_for(haystack.data() + 15)) ||
      // can't load needles into SSE register if it could cross page boundary
      page_for(needles.end() - 1) != page_for(needles.data() + 15)) {
    return detail::qfind_first_byte_of_nosse(haystack, needles);
  }
  // Safe per the page checks above even if it reads past needles.end().
  auto arr2 = _mm_loadu_si128_unchecked(
      reinterpret_cast<const __m128i*>(needles.data()));
  // do an unaligned load for first block of haystack
  auto arr1 = _mm_loadu_si128_unchecked(
      reinterpret_cast<const __m128i*>(haystack.data()));
  auto index =
      _mm_cmpestri(arr2, int(needles.size()), arr1, int(haystack.size()), 0);
  if (index < 16) {
    return size_t(index);
  }
  // Now, we can do aligned loads hereafter... (aligned 16-byte loads never
  // cross a page boundary, so no further page checks are needed)
  size_t i = nextAlignedIndex(haystack.data());
  for (; i < haystack.size(); i += 16) {
    arr1 = _mm_load_si128_unchecked(
        reinterpret_cast<const __m128i*>(haystack.data() + i));
    // The explicit length argument makes PCMPESTRI ignore any bytes loaded
    // from beyond haystack.end().
    index = _mm_cmpestri(
        arr2, int(needles.size()), arr1, int(haystack.size() - i), 0);
    if (index < 16) {
      return i + index;
    }
  }
  return std::string::npos;
}
// Scans a 16-byte block of haystack (starting at blockStartIdx) to find first
// needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned.
// If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the
// block (the DCHECK below spells out the requirement: either the block is
// fully inside the haystack, or it does not cross a page boundary).
// Returns the in-haystack index of the first match within this block, or
// std::string::npos if the block contains no needle byte.
template <bool HAYSTACK_ALIGNED>
size_t scanHaystackBlock(
    const StringPieceLite haystack,
    const StringPieceLite needles,
    uint64_t blockStartIdx) {
  DCHECK_GT(needles.size(), 16u); // should be handled by *needles16() method
  DCHECK(
      blockStartIdx + 16 <= haystack.size() ||
      (page_for(haystack.data() + blockStartIdx) ==
       page_for(haystack.data() + blockStartIdx + 15)));
  __m128i arr1;
  if (HAYSTACK_ALIGNED) {
    arr1 = _mm_load_si128_unchecked(
        reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
  } else {
    arr1 = _mm_loadu_si128_unchecked(
        reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
  }
  // This load is safe because needles.size() >= 16
  auto arr2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needles.data()));
  // b tracks the smallest match index found across all needle chunks;
  // PCMPESTRI yields 16 when a chunk matches nothing in this block.
  auto b =
      _mm_cmpestri(arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0);
  // Walk the remaining needles in aligned 16-byte chunks (aligned loads
  // cannot cross a page boundary, so they are always safe).
  size_t j = nextAlignedIndex(needles.data());
  for (; j < needles.size(); j += 16) {
    arr2 = _mm_load_si128_unchecked(
        reinterpret_cast<const __m128i*>(needles.data() + j));
    auto index = _mm_cmpestri(
        arr2,
        int(needles.size() - j),
        arr1,
        int(haystack.size() - blockStartIdx),
        0);
    b = std::min(index, b);
  }
  if (b < 16) {
    return blockStartIdx + b;
  }
  return std::string::npos;
}
  142. size_t qfind_first_byte_of_sse42(
  143. const StringPieceLite haystack,
  144. const StringPieceLite needles);
  145. size_t qfind_first_byte_of_sse42(
  146. const StringPieceLite haystack,
  147. const StringPieceLite needles) {
  148. if (UNLIKELY(needles.empty() || haystack.empty())) {
  149. return std::string::npos;
  150. } else if (needles.size() <= 16) {
  151. // we can save some unnecessary load instructions by optimizing for
  152. // the common case of needles.size() <= 16
  153. return qfind_first_byte_of_needles16(haystack, needles);
  154. }
  155. if (haystack.size() < 16 &&
  156. page_for(haystack.end() - 1) != page_for(haystack.data() + 16)) {
  157. // We can't safely SSE-load haystack. Use a different approach.
  158. if (haystack.size() <= 2) {
  159. return qfind_first_byte_of_std(haystack, needles);
  160. }
  161. return qfind_first_byte_of_byteset(haystack, needles);
  162. }
  163. auto ret = scanHaystackBlock<false>(haystack, needles, 0);
  164. if (ret != std::string::npos) {
  165. return ret;
  166. }
  167. size_t i = nextAlignedIndex(haystack.data());
  168. for (; i < haystack.size(); i += 16) {
  169. ret = scanHaystackBlock<true>(haystack, needles, i);
  170. if (ret != std::string::npos) {
  171. return ret;
  172. }
  173. }
  174. return std::string::npos;
  175. }
  176. } // namespace detail
  177. } // namespace folly
  178. #endif