Instructions.h 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. /*
  2. * Copyright 2015-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #pragma once
  17. #include <glog/logging.h>
  18. #ifdef _MSC_VER
  19. #include <immintrin.h>
  20. #endif
  21. #include <folly/CpuId.h>
  22. #include <folly/Portability.h>
  23. #include <folly/lang/Assume.h>
  24. #include <folly/portability/Builtins.h>
  25. namespace folly {
  26. namespace compression {
  27. namespace instructions {
  28. // NOTE: It's recommended to compile EF coding with -msse4.2, starting
  29. // with Nehalem, Intel CPUs support POPCNT instruction and gcc will emit
  30. // it for __builtin_popcountll intrinsic.
  31. // But we provide an alternative way for the client code: it can switch to
  32. // the appropriate version of EliasFanoReader<> at runtime (client should
  33. // implement this switching logic itself) by specifying instruction set to
  34. // use explicitly.
  35. struct Default {
  36. static bool supported(const folly::CpuId& /* cpuId */ = {}) {
  37. return true;
  38. }
  39. static FOLLY_ALWAYS_INLINE uint64_t popcount(uint64_t value) {
  40. return uint64_t(__builtin_popcountll(value));
  41. }
  42. static FOLLY_ALWAYS_INLINE int ctz(uint64_t value) {
  43. DCHECK_GT(value, 0u);
  44. return __builtin_ctzll(value);
  45. }
  46. static FOLLY_ALWAYS_INLINE int clz(uint64_t value) {
  47. DCHECK_GT(value, 0u);
  48. return __builtin_clzll(value);
  49. }
  50. static FOLLY_ALWAYS_INLINE uint64_t blsr(uint64_t value) {
  51. return value & (value - 1);
  52. }
  53. // Extract `length` bits starting from `start` from value. Only bits [0:63]
  54. // will be extracted. All higher order bits in the
  55. // result will be zeroed. If no bits are extracted, return 0.
  56. static FOLLY_ALWAYS_INLINE uint64_t
  57. bextr(uint64_t value, uint32_t start, uint32_t length) {
  58. if (start > 63) {
  59. return 0ULL;
  60. }
  61. if (start + length > 64) {
  62. length = 64 - start;
  63. }
  64. return (value >> start) &
  65. ((length == 64) ? (~0ULL) : ((1ULL << length) - 1ULL));
  66. }
  67. // Clear high bits starting at position index.
  68. static FOLLY_ALWAYS_INLINE uint64_t bzhi(uint64_t value, uint32_t index) {
  69. if (index > 63) {
  70. return 0;
  71. }
  72. return value & ((uint64_t(1) << index) - 1);
  73. }
  74. };
  75. struct Nehalem : public Default {
  76. static bool supported(const folly::CpuId& cpuId = {}) {
  77. return cpuId.popcnt();
  78. }
  79. static FOLLY_ALWAYS_INLINE uint64_t popcount(uint64_t value) {
  80. // POPCNT is supported starting with Intel Nehalem, AMD K10.
  81. #if defined(__GNUC__) || defined(__clang__)
  82. // GCC and Clang won't inline the intrinsics.
  83. uint64_t result;
  84. asm("popcntq %1, %0" : "=r"(result) : "r"(value));
  85. return result;
  86. #else
  87. return uint64_t(_mm_popcnt_u64(value));
  88. #endif
  89. }
  90. };
  91. struct Haswell : public Nehalem {
  92. static bool supported(const folly::CpuId& cpuId = {}) {
  93. return Nehalem::supported(cpuId) && cpuId.bmi1() && cpuId.bmi2();
  94. }
  95. static FOLLY_ALWAYS_INLINE uint64_t blsr(uint64_t value) {
  96. // BMI1 is supported starting with Intel Haswell, AMD Piledriver.
  97. // BLSR combines two instructions into one and reduces register pressure.
  98. #if defined(__GNUC__) || defined(__clang__)
  99. // GCC and Clang won't inline the intrinsics.
  100. uint64_t result;
  101. asm("blsrq %1, %0" : "=r"(result) : "r"(value));
  102. return result;
  103. #else
  104. return _blsr_u64(value);
  105. #endif
  106. }
  107. static FOLLY_ALWAYS_INLINE uint64_t
  108. bextr(uint64_t value, uint32_t start, uint32_t length) {
  109. #if defined(__GNUC__) || defined(__clang__)
  110. // GCC and Clang won't inline the intrinsics.
  111. // Encode parameters in `pattern` where `pattern[0:7]` is `start` and
  112. // `pattern[8:15]` is `length`.
  113. // Ref: Intel Advanced Vector Extensions Programming Reference
  114. uint64_t pattern = start & 0xFF;
  115. pattern = pattern | ((length & 0xFF) << 8);
  116. uint64_t result;
  117. asm("bextrq %2, %1, %0" : "=r"(result) : "r"(value), "r"(pattern));
  118. return result;
  119. #else
  120. return _bextr_u64(value, start, length);
  121. #endif
  122. }
  123. static FOLLY_ALWAYS_INLINE uint64_t bzhi(uint64_t value, uint32_t index) {
  124. #if defined(__GNUC__) || defined(__clang__)
  125. // GCC and Clang won't inline the intrinsics.
  126. const uint64_t index64 = index;
  127. uint64_t result;
  128. asm("bzhiq %2, %1, %0" : "=r"(result) : "r"(value), "r"(index64));
  129. return result;
  130. #else
  131. return _bzhi_u64(value, index);
  132. #endif
  133. }
  134. };
  135. enum class Type {
  136. DEFAULT,
  137. NEHALEM,
  138. HASWELL,
  139. };
  140. inline Type detect() {
  141. const static Type type = [] {
  142. if (instructions::Haswell::supported()) {
  143. VLOG(2) << "Will use folly::compression::instructions::Haswell";
  144. return Type::HASWELL;
  145. } else if (instructions::Nehalem::supported()) {
  146. VLOG(2) << "Will use folly::compression::instructions::Nehalem";
  147. return Type::NEHALEM;
  148. } else {
  149. VLOG(2) << "Will use folly::compression::instructions::Default";
  150. return Type::DEFAULT;
  151. }
  152. }();
  153. return type;
  154. }
  155. template <class F>
  156. auto dispatch(Type type, F&& f) -> decltype(f(std::declval<Default>())) {
  157. switch (type) {
  158. case Type::HASWELL:
  159. return f(Haswell());
  160. case Type::NEHALEM:
  161. return f(Nehalem());
  162. case Type::DEFAULT:
  163. return f(Default());
  164. }
  165. assume_unreachable();
  166. }
  167. template <class F>
  168. auto dispatch(F&& f) -> decltype(f(std::declval<Default>())) {
  169. return dispatch(detect(), std::forward<F>(f));
  170. }
  171. } // namespace instructions
  172. } // namespace compression
  173. } // namespace folly