MathBenchmark.cpp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478
  1. /*
  2. * Copyright 2016-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <folly/Math.h>
  17. #include <algorithm>
  18. #include <random>
  19. #include <folly/Benchmark.h>
  20. namespace {
  21. template <typename T>
  22. T brokenButWidespreadDivCeil(T num, T denom) {
  23. return (num + denom - 1) / denom;
  24. }
  25. template <typename T>
  26. T viaFloatDivCeil(T num, T denom) {
  27. return static_cast<T>(ceilf(static_cast<float>(num) / denom));
  28. }
  29. template <typename T>
  30. T viaDoubleDivCeil(T num, T denom) {
  31. return static_cast<T>(ceil(static_cast<double>(num) / denom));
  32. }
  33. template <typename T>
  34. T viaLongDoubleDivCeil(T num, T denom) {
  35. return static_cast<T>(ceill(static_cast<long double>(num) / denom));
  36. }
  37. template <typename T>
  38. std::vector<T> divValues() {
  39. std::vector<T> rv;
  40. for (T i = 1; i < std::numeric_limits<T>::max() && i <= 1000; ++i) {
  41. rv.push_back(i);
  42. rv.push_back(-i);
  43. rv.push_back(std::numeric_limits<T>::max() / i);
  44. auto x = std::numeric_limits<T>::min() / i;
  45. if (x != 0) {
  46. rv.push_back(x);
  47. }
  48. }
  49. return rv;
  50. }
  51. template <typename T, typename F>
  52. void runDivTests(const F& func, size_t iters) {
  53. std::vector<T> denoms;
  54. std::vector<T> numers;
  55. BENCHMARK_SUSPEND {
  56. denoms = divValues<T>();
  57. numers = denoms;
  58. numers.push_back(0);
  59. std::mt19937 rnd(1234);
  60. std::shuffle(denoms.begin(), denoms.end(), rnd);
  61. std::shuffle(numers.begin(), numers.end(), rnd);
  62. }
  63. T dep = 0;
  64. while (true) {
  65. for (T d : denoms) {
  66. for (T n : numers) {
  67. n ^= dep;
  68. if (std::is_signed<T>::value && n == std::numeric_limits<T>::min() &&
  69. d == -1) {
  70. // min / -1 overflows in two's complement
  71. d = -2;
  72. }
  73. dep = func(n, d);
  74. if (--iters == 0) {
  75. folly::doNotOptimizeAway(dep);
  76. return;
  77. }
  78. }
  79. }
  80. }
  81. }
  82. } // namespace
  83. BENCHMARK_DRAW_LINE();
  84. BENCHMARK(divTruncInt8, iters) {
  85. runDivTests<int8_t>(&folly::divTrunc<int8_t, int8_t>, iters);
  86. }
  87. BENCHMARK(divFloorInt8, iters) {
  88. runDivTests<int8_t>(&folly::divFloor<int8_t, int8_t>, iters);
  89. }
  90. BENCHMARK(divCeilInt8, iters) {
  91. runDivTests<int8_t>(&folly::divCeil<int8_t, int8_t>, iters);
  92. }
  93. BENCHMARK_RELATIVE(branchlessDivCeilInt8, iters) {
  94. runDivTests<int8_t>(&folly::detail::divCeilBranchless<int8_t>, iters);
  95. }
  96. BENCHMARK_RELATIVE(branchfulDivCeilInt8, iters) {
  97. runDivTests<int8_t>(&folly::detail::divCeilBranchful<int8_t>, iters);
  98. }
  99. BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt8, iters) {
  100. runDivTests<int8_t>(&brokenButWidespreadDivCeil<int8_t>, iters);
  101. }
  102. BENCHMARK_RELATIVE(viaFloatDivCeilInt8, iters) {
  103. runDivTests<int8_t>(&viaFloatDivCeil<int8_t>, iters);
  104. }
  105. BENCHMARK_RELATIVE(viaDoubleDivCeilInt8, iters) {
  106. runDivTests<int8_t>(&viaDoubleDivCeil<int8_t>, iters);
  107. }
  108. BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt8, iters) {
  109. runDivTests<int8_t>(&viaLongDoubleDivCeil<int8_t>, iters);
  110. }
  111. BENCHMARK(divRoundAwayInt8, iters) {
  112. runDivTests<int8_t>(&folly::divRoundAway<int8_t, int8_t>, iters);
  113. }
  114. BENCHMARK_DRAW_LINE();
  115. BENCHMARK(divTruncInt16, iters) {
  116. runDivTests<int16_t>(&folly::divTrunc<int16_t, int16_t>, iters);
  117. }
  118. BENCHMARK(divFloorInt16, iters) {
  119. runDivTests<int16_t>(&folly::divFloor<int16_t, int16_t>, iters);
  120. }
  121. BENCHMARK(divCeilInt16, iters) {
  122. runDivTests<int16_t>(&folly::divCeil<int16_t, int16_t>, iters);
  123. }
  124. BENCHMARK_RELATIVE(branchlessDivCeilInt16, iters) {
  125. runDivTests<int16_t>(&folly::detail::divCeilBranchless<int16_t>, iters);
  126. }
  127. BENCHMARK_RELATIVE(branchfulDivCeilInt16, iters) {
  128. runDivTests<int16_t>(&folly::detail::divCeilBranchful<int16_t>, iters);
  129. }
  130. BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt16, iters) {
  131. runDivTests<int16_t>(&brokenButWidespreadDivCeil<int16_t>, iters);
  132. }
  133. BENCHMARK_RELATIVE(viaFloatDivCeilInt16, iters) {
  134. runDivTests<int16_t>(&viaFloatDivCeil<int16_t>, iters);
  135. }
  136. BENCHMARK_RELATIVE(viaDoubleDivCeilInt16, iters) {
  137. runDivTests<int16_t>(&viaDoubleDivCeil<int16_t>, iters);
  138. }
  139. BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt16, iters) {
  140. runDivTests<int16_t>(&viaLongDoubleDivCeil<int16_t>, iters);
  141. }
  142. BENCHMARK(divRoundAwayInt16, iters) {
  143. runDivTests<int16_t>(&folly::divRoundAway<int16_t, int16_t>, iters);
  144. }
  145. BENCHMARK_DRAW_LINE();
  146. BENCHMARK(divTruncInt32, iters) {
  147. runDivTests<int32_t>(&folly::divTrunc<int32_t, int32_t>, iters);
  148. }
  149. BENCHMARK(divFloorInt32, iters) {
  150. runDivTests<int32_t>(&folly::divFloor<int32_t, int32_t>, iters);
  151. }
  152. BENCHMARK(divCeilInt32, iters) {
  153. runDivTests<int32_t>(&folly::divCeil<int32_t, int32_t>, iters);
  154. }
  155. BENCHMARK_RELATIVE(branchlessDivCeilInt32, iters) {
  156. runDivTests<int32_t>(&folly::detail::divCeilBranchless<int32_t>, iters);
  157. }
  158. BENCHMARK_RELATIVE(branchfulDivCeilInt32, iters) {
  159. runDivTests<int32_t>(&folly::detail::divCeilBranchful<int32_t>, iters);
  160. }
  161. BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt32, iters) {
  162. runDivTests<int32_t>(&brokenButWidespreadDivCeil<int32_t>, iters);
  163. }
  164. BENCHMARK_RELATIVE(approxViaFloatDivCeilInt32, iters) {
  165. runDivTests<int32_t>(&viaFloatDivCeil<int32_t>, iters);
  166. }
  167. BENCHMARK_RELATIVE(viaDoubleDivCeilInt32, iters) {
  168. runDivTests<int32_t>(&viaDoubleDivCeil<int32_t>, iters);
  169. }
  170. BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt32, iters) {
  171. runDivTests<int32_t>(&viaLongDoubleDivCeil<int32_t>, iters);
  172. }
  173. BENCHMARK(divRoundAwayInt32, iters) {
  174. runDivTests<int32_t>(&folly::divRoundAway<int32_t, int32_t>, iters);
  175. }
  176. BENCHMARK_DRAW_LINE();
  177. BENCHMARK(divTruncInt64, iters) {
  178. runDivTests<int64_t>(&folly::divTrunc<int64_t, int64_t>, iters);
  179. }
  180. BENCHMARK(divFloorInt64, iters) {
  181. runDivTests<int64_t>(&folly::divFloor<int64_t, int64_t>, iters);
  182. }
  183. BENCHMARK(divCeilInt64, iters) {
  184. runDivTests<int64_t>(&folly::divCeil<int64_t, int64_t>, iters);
  185. }
  186. BENCHMARK_RELATIVE(branchlessDivCeilInt64, iters) {
  187. runDivTests<int64_t>(&folly::detail::divCeilBranchless<int64_t>, iters);
  188. }
  189. BENCHMARK_RELATIVE(branchfulDivCeilInt64, iters) {
  190. runDivTests<int64_t>(&folly::detail::divCeilBranchful<int64_t>, iters);
  191. }
  192. BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt64, iters) {
  193. runDivTests<int64_t>(&brokenButWidespreadDivCeil<int64_t>, iters);
  194. }
  195. BENCHMARK_RELATIVE(approxViaFloatDivCeilInt64, iters) {
  196. runDivTests<int64_t>(&viaFloatDivCeil<int64_t>, iters);
  197. }
  198. BENCHMARK_RELATIVE(approxViaDoubleDivCeilInt64, iters) {
  199. runDivTests<int64_t>(&viaDoubleDivCeil<int64_t>, iters);
  200. }
  201. BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt64, iters) {
  202. runDivTests<int64_t>(&viaLongDoubleDivCeil<int64_t>, iters);
  203. }
  204. BENCHMARK(divRoundAwayInt64, iters) {
  205. runDivTests<int64_t>(&folly::divRoundAway<int64_t, int64_t>, iters);
  206. }
  207. BENCHMARK_DRAW_LINE();
  208. BENCHMARK(divTruncUint8, iters) {
  209. runDivTests<uint8_t>(&folly::divTrunc<uint8_t, uint8_t>, iters);
  210. }
  211. BENCHMARK(divFloorUint8, iters) {
  212. runDivTests<uint8_t>(&folly::divFloor<uint8_t, uint8_t>, iters);
  213. }
  214. BENCHMARK(divCeilUint8, iters) {
  215. runDivTests<uint8_t>(&folly::divCeil<uint8_t, uint8_t>, iters);
  216. }
  217. BENCHMARK_RELATIVE(branchlessDivCeilUint8, iters) {
  218. runDivTests<uint8_t>(&folly::detail::divCeilBranchless<uint8_t>, iters);
  219. }
  220. BENCHMARK_RELATIVE(branchfulDivCeilUint8, iters) {
  221. runDivTests<uint8_t>(&folly::detail::divCeilBranchful<uint8_t>, iters);
  222. }
  223. BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint8, iters) {
  224. runDivTests<uint8_t>(&brokenButWidespreadDivCeil<uint8_t>, iters);
  225. }
  226. BENCHMARK_RELATIVE(viaFloatDivCeilUint8, iters) {
  227. runDivTests<uint8_t>(&viaFloatDivCeil<uint8_t>, iters);
  228. }
  229. BENCHMARK_RELATIVE(viaDoubleDivCeilUint8, iters) {
  230. runDivTests<uint8_t>(&viaDoubleDivCeil<uint8_t>, iters);
  231. }
  232. BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint8, iters) {
  233. runDivTests<uint8_t>(&viaLongDoubleDivCeil<uint8_t>, iters);
  234. }
  235. BENCHMARK(divRoundAwayUint8, iters) {
  236. runDivTests<uint8_t>(&folly::divRoundAway<uint8_t, uint8_t>, iters);
  237. }
  238. BENCHMARK_DRAW_LINE();
  239. BENCHMARK(divTruncUint16, iters) {
  240. runDivTests<uint16_t>(&folly::divTrunc<uint16_t, uint16_t>, iters);
  241. }
  242. BENCHMARK(divFloorUint16, iters) {
  243. runDivTests<uint16_t>(&folly::divFloor<uint16_t, uint16_t>, iters);
  244. }
  245. BENCHMARK(divCeilUint16, iters) {
  246. runDivTests<uint16_t>(&folly::divCeil<uint16_t, uint16_t>, iters);
  247. }
  248. BENCHMARK_RELATIVE(branchlessDivCeilUint16, iters) {
  249. runDivTests<uint16_t>(&folly::detail::divCeilBranchless<uint16_t>, iters);
  250. }
  251. BENCHMARK_RELATIVE(branchfulDivCeilUint16, iters) {
  252. runDivTests<uint16_t>(&folly::detail::divCeilBranchful<uint16_t>, iters);
  253. }
  254. BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint16, iters) {
  255. runDivTests<uint16_t>(&brokenButWidespreadDivCeil<uint16_t>, iters);
  256. }
  257. BENCHMARK_RELATIVE(viaFloatDivCeilUint16, iters) {
  258. runDivTests<uint16_t>(&viaFloatDivCeil<uint16_t>, iters);
  259. }
  260. BENCHMARK_RELATIVE(viaDoubleDivCeilUint16, iters) {
  261. runDivTests<uint16_t>(&viaDoubleDivCeil<uint16_t>, iters);
  262. }
  263. BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint16, iters) {
  264. runDivTests<uint16_t>(&viaLongDoubleDivCeil<uint16_t>, iters);
  265. }
  266. BENCHMARK(divRoundAwayUint16, iters) {
  267. runDivTests<uint16_t>(&folly::divRoundAway<uint16_t, uint16_t>, iters);
  268. }
  269. BENCHMARK_DRAW_LINE();
  270. BENCHMARK(divTruncUint32, iters) {
  271. runDivTests<uint32_t>(&folly::divTrunc<uint32_t, uint32_t>, iters);
  272. }
  273. BENCHMARK(divFloorUint32, iters) {
  274. runDivTests<uint32_t>(&folly::divFloor<uint32_t, uint32_t>, iters);
  275. }
  276. BENCHMARK(divCeilUint32, iters) {
  277. runDivTests<uint32_t>(&folly::divCeil<uint32_t, uint32_t>, iters);
  278. }
  279. BENCHMARK_RELATIVE(branchlessDivCeilUint32, iters) {
  280. runDivTests<uint32_t>(&folly::detail::divCeilBranchless<uint32_t>, iters);
  281. }
  282. BENCHMARK_RELATIVE(branchfulDivCeilUint32, iters) {
  283. runDivTests<uint32_t>(&folly::detail::divCeilBranchful<uint32_t>, iters);
  284. }
  285. BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint32, iters) {
  286. runDivTests<uint32_t>(&brokenButWidespreadDivCeil<uint32_t>, iters);
  287. }
  288. BENCHMARK_RELATIVE(approxViaFloatDivCeilUint32, iters) {
  289. runDivTests<uint32_t>(&viaFloatDivCeil<uint32_t>, iters);
  290. }
  291. BENCHMARK_RELATIVE(viaDoubleDivCeilUint32, iters) {
  292. runDivTests<uint32_t>(&viaDoubleDivCeil<uint32_t>, iters);
  293. }
  294. BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint32, iters) {
  295. runDivTests<uint32_t>(&viaLongDoubleDivCeil<uint32_t>, iters);
  296. }
  297. BENCHMARK(divRoundAwayUint32, iters) {
  298. runDivTests<uint32_t>(&folly::divRoundAway<uint32_t, uint32_t>, iters);
  299. }
  300. BENCHMARK_DRAW_LINE();
  301. BENCHMARK(divTruncUint64, iters) {
  302. runDivTests<uint64_t>(&folly::divTrunc<uint64_t, uint64_t>, iters);
  303. }
  304. BENCHMARK(divFloorUint64, iters) {
  305. runDivTests<uint64_t>(&folly::divFloor<uint64_t, uint64_t>, iters);
  306. }
  307. BENCHMARK(divCeilUint64, iters) {
  308. runDivTests<uint64_t>(&folly::divCeil<uint64_t, uint64_t>, iters);
  309. }
  310. BENCHMARK_RELATIVE(branchlessDivCeilUint64, iters) {
  311. runDivTests<uint64_t>(&folly::detail::divCeilBranchless<uint64_t>, iters);
  312. }
  313. BENCHMARK_RELATIVE(branchfulDivCeilUint64, iters) {
  314. runDivTests<uint64_t>(&folly::detail::divCeilBranchful<uint64_t>, iters);
  315. }
  316. BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint64, iters) {
  317. runDivTests<uint64_t>(&brokenButWidespreadDivCeil<uint64_t>, iters);
  318. }
  319. BENCHMARK_RELATIVE(approxViaFloatDivCeilUint64, iters) {
  320. runDivTests<uint64_t>(&viaFloatDivCeil<uint64_t>, iters);
  321. }
  322. BENCHMARK_RELATIVE(approxViaDoubleDivCeilUint64, iters) {
  323. runDivTests<uint64_t>(&viaDoubleDivCeil<uint64_t>, iters);
  324. }
  325. BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint64, iters) {
  326. runDivTests<uint64_t>(&viaLongDoubleDivCeil<uint64_t>, iters);
  327. }
  328. BENCHMARK(divRoundAwayUint64, iters) {
  329. runDivTests<uint64_t>(&folly::divRoundAway<uint64_t, uint64_t>, iters);
  330. }
  331. int main(int argc, char** argv) {
  332. gflags::ParseCommandLineFlags(&argc, &argv, true);
  333. folly::runBenchmarks();
  334. return 0;
  335. }
  336. /*
  337. Benchmarks run single-threaded on a dual Xeon E5-2660 @ 2.2 GHz with
  338. hyperthreading (16 physical cores, 20 MB cache per socket, 256 GB RAM).
  339. Benchmarks used --bm_min_iters=10000000.
  340. divTrunc is just a native integral division. viaDoubleDivCeil doesn't
  341. have full accuracy for Int64 or Uint64. There is a loop-carried
  342. dependency for all of the div* tests, but there is a bit of extra slack
  343. (a predictable call, a load that should be from the L1, and a predictable
  344. not-taken branch in addition to the loop's branch) in the driving loop,
  345. so the benchmark driver's attempt to subtract the overhead of the loop
  346. might mean that the latency numbers here are slightly too low or too high.
  347. The branchful implementation's branch is very predictable in this
  348. microbenchmark for unsigned types, since it only needs to predict a
  349. zero numerator. That's likely to be true in real life as well, so we
  350. make this the default.
  351. I was surprised at the speed of float and double division, but
  352. the only case where it actually wins by much and is correct is for
  353. int16_t. (float + ceil is faster for the 32-bit case, but is only
  354. an approximation.) I ran a similar benchmark setup for ARM and ARM64.
  355. On ARM the conditional versions win by quite a bit. 32-bit ARM doesn't
  356. have a native integer divide, so getting the remainder after a division
  357. (to see if truncation occurred) is more work than preconditioning the
  358. numerator to make truncation go in the correct direction. 64-bit ARM
  359. had the same winners and losers as x86_64, at least on the two physical
  360. instances I tested.
  361. ============================================================================
  362. folly/test/MathBenchmark.cpp relative time/iter iters/s
  363. ============================================================================
  364. ----------------------------------------------------------------------------
  365. divTruncInt8 8.89ns 112.44M
  366. divFloorInt8 10.99ns 91.00M
  367. divCeilInt8 10.95ns 91.33M
  368. branchlessDivCeilInt8 100.40% 10.91ns 91.69M
  369. branchfulDivCeilInt8 88.87% 12.32ns 81.16M
  370. brokenButWidespreadDivCeilInt8 109.20% 10.03ns 99.73M
  371. viaFloatDivCeilInt8 109.68% 9.98ns 100.17M
  372. viaDoubleDivCeilInt8 95.47% 11.47ns 87.19M
  373. viaLongDoubleDivCeilInt8 31.65% 34.59ns 28.91M
  374. divRoundAwayInt8 10.42ns 95.97M
  375. ----------------------------------------------------------------------------
  376. divTruncInt16 8.68ns 115.17M
  377. divFloorInt16 10.94ns 91.38M
  378. divCeilInt16 10.91ns 91.70M
  379. branchlessDivCeilInt16 99.44% 10.97ns 91.18M
  380. branchfulDivCeilInt16 81.68% 13.35ns 74.90M
  381. brokenButWidespreadDivCeilInt16 109.50% 9.96ns 100.40M
  382. viaFloatDivCeilInt16 108.04% 10.09ns 99.07M
  383. viaDoubleDivCeilInt16 85.38% 12.77ns 78.29M
  384. viaLongDoubleDivCeilInt16 29.99% 36.36ns 27.50M
  385. divRoundAwayInt16 10.59ns 94.46M
  386. ----------------------------------------------------------------------------
  387. divTruncInt32 8.38ns 119.29M
  388. divFloorInt32 11.01ns 90.84M
  389. divCeilInt32 11.12ns 89.91M
  390. branchlessDivCeilInt32 101.94% 10.91ns 91.66M
  391. branchfulDivCeilInt32 84.67% 13.14ns 76.12M
  392. brokenButWidespreadDivCeilInt32 117.61% 9.46ns 105.75M
  393. approxViaFloatDivCeilInt32 115.98% 9.59ns 104.28M
  394. viaDoubleDivCeilInt32 89.86% 12.38ns 80.79M
  395. viaLongDoubleDivCeilInt32 30.84% 36.06ns 27.73M
  396. divRoundAwayInt32 11.30ns 88.50M
  397. ----------------------------------------------------------------------------
  398. divTruncInt64 16.07ns 62.21M
  399. divFloorInt64 18.37ns 54.45M
  400. divCeilInt64 18.61ns 53.74M
  401. branchlessDivCeilInt64 100.43% 18.53ns 53.97M
  402. branchfulDivCeilInt64 84.65% 21.98ns 45.49M
  403. brokenButWidespreadDivCeilInt64 108.47% 17.16ns 58.29M
  404. approxViaFloatDivCeilInt64 190.99% 9.74ns 102.64M
  405. approxViaDoubleDivCeilInt64 148.64% 12.52ns 79.88M
  406. viaLongDoubleDivCeilInt64 52.01% 35.77ns 27.95M
  407. divRoundAwayInt64 18.79ns 53.21M
  408. ----------------------------------------------------------------------------
  409. divTruncUint8 7.76ns 128.89M
  410. divFloorUint8 8.29ns 120.61M
  411. divCeilUint8 9.61ns 104.09M
  412. branchlessDivCeilUint8 112.00% 8.58ns 116.58M
  413. branchfulDivCeilUint8 114.01% 8.43ns 118.67M
  414. brokenButWidespreadDivCeilUint8 100.48% 9.56ns 104.58M
  415. viaFloatDivCeilUint8 103.53% 9.28ns 107.76M
  416. viaDoubleDivCeilUint8 85.75% 11.20ns 89.26M
  417. viaLongDoubleDivCeilUint8 27.72% 34.65ns 28.86M
  418. divRoundAwayUint8 9.60ns 104.11M
  419. ----------------------------------------------------------------------------
  420. divTruncUint16 8.39ns 119.19M
  421. divFloorUint16 8.28ns 120.82M
  422. divCeilUint16 9.90ns 100.96M
  423. branchlessDivCeilUint16 100.23% 9.88ns 101.19M
  424. branchfulDivCeilUint16 107.83% 9.19ns 108.87M
  425. brokenButWidespreadDivCeilUint16 99.89% 9.92ns 100.85M
  426. viaFloatDivCeilUint16 100.54% 9.85ns 101.50M
  427. viaDoubleDivCeilUint16 77.38% 12.80ns 78.13M
  428. viaLongDoubleDivCeilUint16 27.30% 36.28ns 27.56M
  429. divRoundAwayUint16 9.82ns 101.85M
  430. ----------------------------------------------------------------------------
  431. divTruncUint32 8.12ns 123.20M
  432. divFloorUint32 8.09ns 123.58M
  433. divCeilUint32 8.44ns 118.55M
  434. branchlessDivCeilUint32 88.27% 9.56ns 104.64M
  435. branchfulDivCeilUint32 98.91% 8.53ns 117.25M
  436. brokenButWidespreadDivCeilUint32 93.48% 9.02ns 110.82M
  437. approxViaFloatDivCeilUint32 86.29% 9.78ns 102.30M
  438. viaDoubleDivCeilUint32 66.76% 12.63ns 79.15M
  439. viaLongDoubleDivCeilUint32 23.35% 36.13ns 27.68M
  440. divRoundAwayUint32 8.47ns 118.03M
  441. ----------------------------------------------------------------------------
  442. divTruncUint64 12.38ns 80.79M
  443. divFloorUint64 12.27ns 81.47M
  444. divCeilUint64 12.66ns 78.99M
  445. branchlessDivCeilUint64 93.46% 13.55ns 73.83M
  446. branchfulDivCeilUint64 100.30% 12.62ns 79.23M
  447. brokenButWidespreadDivCeilUint64 99.41% 12.73ns 78.53M
  448. approxViaFloatDivCeilUint64 106.59% 11.88ns 84.19M
  449. approxViaDoubleDivCeilUint64 92.14% 13.74ns 72.78M
  450. viaLongDoubleDivCeilUint64 33.51% 37.78ns 26.47M
  451. divRoundAwayUint64 12.34ns 81.02M
  452. ============================================================================
  453. */