Histogram-defs.h 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. /*
  2. * Copyright 2013-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #pragma once
  17. #include <folly/Conv.h>
  18. #include <folly/stats/Histogram.h>
  19. #include <glog/logging.h>
  20. namespace folly {
  21. namespace detail {
  22. template <typename T, typename BucketT>
  23. HistogramBuckets<T, BucketT>::HistogramBuckets(
  24. ValueType bucketSize,
  25. ValueType min,
  26. ValueType max,
  27. const BucketType& defaultBucket)
  28. : bucketSize_(bucketSize), min_(min), max_(max) {
  29. CHECK_GT(bucketSize_, ValueType(0));
  30. CHECK_LT(min_, max_);
  31. // Deliberately make this a signed type, because we're about
  32. // to compare it against max-min, which is nominally signed, too.
  33. int64_t numBuckets = int64_t((max - min) / bucketSize);
  34. // Round up if the bucket size does not fit evenly
  35. if (numBuckets * bucketSize < max - min) {
  36. ++numBuckets;
  37. }
  38. // Add 2 for the extra 'below min' and 'above max' buckets
  39. numBuckets += 2;
  40. buckets_.assign(size_t(numBuckets), defaultBucket);
  41. }
  42. template <typename T, typename BucketType>
  43. size_t HistogramBuckets<T, BucketType>::getBucketIdx(ValueType value) const {
  44. if (value < min_) {
  45. return 0;
  46. } else if (value >= max_) {
  47. return buckets_.size() - 1;
  48. } else {
  49. // the 1 is the below_min bucket
  50. return size_t(((value - min_) / bucketSize_) + 1);
  51. }
  52. }
  53. template <typename T, typename BucketType>
  54. template <typename CountFn>
  55. uint64_t HistogramBuckets<T, BucketType>::computeTotalCount(
  56. CountFn countFromBucket) const {
  57. uint64_t count = 0;
  58. for (size_t n = 0; n < buckets_.size(); ++n) {
  59. count += countFromBucket(const_cast<const BucketType&>(buckets_[n]));
  60. }
  61. return count;
  62. }
  63. template <typename T, typename BucketType>
  64. template <typename CountFn>
  65. size_t HistogramBuckets<T, BucketType>::getPercentileBucketIdx(
  66. double pct,
  67. CountFn countFromBucket,
  68. double* lowPct,
  69. double* highPct) const {
  70. CHECK_GE(pct, 0.0);
  71. CHECK_LE(pct, 1.0);
  72. auto numBuckets = buckets_.size();
  73. // Compute the counts in each bucket
  74. std::vector<uint64_t> counts(numBuckets);
  75. uint64_t totalCount = 0;
  76. for (size_t n = 0; n < numBuckets; ++n) {
  77. uint64_t bucketCount =
  78. countFromBucket(const_cast<const BucketType&>(buckets_[n]));
  79. counts[n] = bucketCount;
  80. totalCount += bucketCount;
  81. }
  82. // If there are no elements, just return the lowest bucket.
  83. // Note that we return bucket 1, which is the first bucket in the
  84. // histogram range; bucket 0 is for all values below min_.
  85. if (totalCount == 0) {
  86. // Set lowPct and highPct both to 0.
  87. // getPercentileEstimate() will recognize this to mean that the histogram
  88. // is empty.
  89. if (lowPct) {
  90. *lowPct = 0.0;
  91. }
  92. if (highPct) {
  93. *highPct = 0.0;
  94. }
  95. return 1;
  96. }
  97. // Loop through all the buckets, keeping track of each bucket's
  98. // percentile range: [0,10], [10,17], [17,45], etc. When we find a range
  99. // that includes our desired percentile, we return that bucket index.
  100. double prevPct = 0.0;
  101. double curPct = 0.0;
  102. uint64_t curCount = 0;
  103. size_t idx;
  104. for (idx = 0; idx < numBuckets; ++idx) {
  105. if (counts[idx] == 0) {
  106. // skip empty buckets
  107. continue;
  108. }
  109. prevPct = curPct;
  110. curCount += counts[idx];
  111. curPct = static_cast<double>(curCount) / totalCount;
  112. if (pct <= curPct) {
  113. // This is the desired bucket
  114. break;
  115. }
  116. }
  117. if (lowPct) {
  118. *lowPct = prevPct;
  119. }
  120. if (highPct) {
  121. *highPct = curPct;
  122. }
  123. return idx;
  124. }
  125. template <typename T, typename BucketType>
  126. template <typename CountFn, typename AvgFn>
  127. T HistogramBuckets<T, BucketType>::getPercentileEstimate(
  128. double pct,
  129. CountFn countFromBucket,
  130. AvgFn avgFromBucket) const {
  131. // Find the bucket where this percentile falls
  132. double lowPct;
  133. double highPct;
  134. size_t bucketIdx =
  135. getPercentileBucketIdx(pct, countFromBucket, &lowPct, &highPct);
  136. if (lowPct == 0.0 && highPct == 0.0) {
  137. // Invalid range -- the buckets must all be empty
  138. // Return the default value for ValueType.
  139. return ValueType();
  140. }
  141. if (lowPct == highPct) {
  142. // Unlikely to have exact equality,
  143. // but just return the bucket average in this case.
  144. // We handle this here to avoid division by 0 below.
  145. return avgFromBucket(buckets_[bucketIdx]);
  146. }
  147. CHECK_GE(pct, lowPct);
  148. CHECK_LE(pct, highPct);
  149. CHECK_LT(lowPct, highPct);
  150. // Compute information about this bucket
  151. ValueType avg = avgFromBucket(buckets_[bucketIdx]);
  152. ValueType low;
  153. ValueType high;
  154. if (bucketIdx == 0) {
  155. if (avg > min_) {
  156. // This normally shouldn't happen. This bucket is only supposed to track
  157. // values less than min_. Most likely this means that integer overflow
  158. // occurred, and the code in avgFromBucket() returned a huge value
  159. // instead of a small one. Just return the minimum possible value for
  160. // now.
  161. //
  162. // (Note that if the counter keeps being decremented, eventually it will
  163. // wrap and become small enough that we won't detect this any more, and
  164. // we will return bogus information.)
  165. LOG(ERROR) << "invalid average value in histogram minimum bucket: " << avg
  166. << " > " << min_ << ": possible integer overflow?";
  167. return getBucketMin(bucketIdx);
  168. }
  169. // For the below-min bucket, just assume the lowest value ever seen is
  170. // twice as far away from min_ as avg.
  171. high = min_;
  172. low = high - (2 * (high - avg));
  173. // Adjust low in case it wrapped
  174. if (low > avg) {
  175. low = std::numeric_limits<ValueType>::min();
  176. }
  177. } else if (bucketIdx == buckets_.size() - 1) {
  178. if (avg < max_) {
  179. // Most likely this means integer overflow occurred. See the comments
  180. // above in the minimum case.
  181. LOG(ERROR) << "invalid average value in histogram maximum bucket: " << avg
  182. << " < " << max_ << ": possible integer overflow?";
  183. return getBucketMax(bucketIdx);
  184. }
  185. // Similarly for the above-max bucket, assume the highest value ever seen
  186. // is twice as far away from max_ as avg.
  187. low = max_;
  188. high = low + (2 * (avg - low));
  189. // Adjust high in case it wrapped
  190. if (high < avg) {
  191. high = std::numeric_limits<ValueType>::max();
  192. }
  193. } else {
  194. low = getBucketMin(bucketIdx);
  195. high = getBucketMax(bucketIdx);
  196. if (avg < low || avg > high) {
  197. // Most likely this means an integer overflow occurred.
  198. // See the comments above. Return the midpoint between low and high
  199. // as a best guess, since avg is meaningless.
  200. LOG(ERROR) << "invalid average value in histogram bucket: " << avg
  201. << " not in range [" << low << ", " << high
  202. << "]: possible integer overflow?";
  203. return (low + high) / 2;
  204. }
  205. }
  206. // Since we know the average value in this bucket, we can do slightly better
  207. // than just assuming the data points in this bucket are uniformly
  208. // distributed between low and high.
  209. //
  210. // Assume that the median value in this bucket is the same as the average
  211. // value.
  212. double medianPct = (lowPct + highPct) / 2.0;
  213. if (pct < medianPct) {
  214. // Assume that the data points lower than the median of this bucket
  215. // are uniformly distributed between low and avg
  216. double pctThroughSection = (pct - lowPct) / (medianPct - lowPct);
  217. return T(low + ((avg - low) * pctThroughSection));
  218. } else {
  219. // Assume that the data points greater than the median of this bucket
  220. // are uniformly distributed between avg and high
  221. double pctThroughSection = (pct - medianPct) / (highPct - medianPct);
  222. return T(avg + ((high - avg) * pctThroughSection));
  223. }
  224. }
  225. } // namespace detail
  226. template <typename T>
  227. std::string Histogram<T>::debugString() const {
  228. std::string ret = folly::to<std::string>(
  229. "num buckets: ",
  230. buckets_.getNumBuckets(),
  231. ", bucketSize: ",
  232. buckets_.getBucketSize(),
  233. ", min: ",
  234. buckets_.getMin(),
  235. ", max: ",
  236. buckets_.getMax(),
  237. "\n");
  238. for (size_t i = 0; i < buckets_.getNumBuckets(); ++i) {
  239. folly::toAppend(
  240. " ",
  241. buckets_.getBucketMin(i),
  242. ": ",
  243. buckets_.getByIndex(i).count,
  244. "\n",
  245. &ret);
  246. }
  247. return ret;
  248. }
  249. template <typename T>
  250. void Histogram<T>::toTSV(std::ostream& out, bool skipEmptyBuckets) const {
  251. for (size_t i = 0; i < buckets_.getNumBuckets(); ++i) {
  252. // Do not output empty buckets in order to reduce data file size.
  253. if (skipEmptyBuckets && getBucketByIndex(i).count == 0) {
  254. continue;
  255. }
  256. const auto& bucket = getBucketByIndex(i);
  257. out << getBucketMin(i) << '\t' << getBucketMax(i) << '\t' << bucket.count
  258. << '\t' << bucket.sum << '\n';
  259. }
  260. }
  261. } // namespace folly