TDigest.h 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. /*
  2. * Copyright 2012-present Facebook, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #pragma once
  17. #include <cmath>
  18. #include <vector>
  19. #include <folly/Range.h>
  20. #include <folly/Utility.h>
  21. namespace folly {
  22. /*
  23. * TDigests are a biased quantile estimator designed to estimate the values of
  24. * the quantiles of streaming data with high accuracy and low memory,
  25. * particularly for quantiles at the tails (p0.1, p1, p99, p99.9). See
  26. * https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf
  27. * for an explanation of what the purpose of TDigests is, and how they work.
  28. *
  29. * There is a notable difference between the implementation here and the
  30. * implementation in the paper. In the paper, the recommended scaling function
  31. * for bucketing centroids is an arcsin function. The arcsin function provides
  32. * high accuracy for low memory, but comes at a relatively high compute cost.
  33. * A good choice algorithm has the following properties:
  34. * - The value of the function k(0, delta) = 0, and k(1, delta) = delta.
  35. * This is a requirement for any t-digest function.
  36. * - The limit of the derivative of the function dk/dq at 0 is inf, and at
  37. * 1 is inf. This provides bias to improve accuracy at the tails.
  38. * - For any q <= 0.5, dk/dq(q) = dk/dq(1-q). This ensures that the accuracy
  39. * of upper and lower quantiles are equivalent.
  40. * As such, TDigest uses a sqrt function with these properties, which is faster
  41. * than arcsin. There is a small, but relatively negligible impact to accuracy
  42. * at the tail. In empirical tests, accuracy of the sqrt approach has been
  43. * adequate.
  44. */
  45. class TDigest {
  46. public:
  47. class Centroid {
  48. public:
  49. explicit Centroid(double mean = 0.0, double weight = 1.0)
  50. : mean_(mean), weight_(weight) {
  51. DCHECK_GT(weight, 0);
  52. }
  53. inline double mean() const {
  54. return mean_;
  55. }
  56. inline double weight() const {
  57. return weight_;
  58. }
  59. /*
  60. * Adds the sum/weight to this centroid, and returns the new sum.
  61. */
  62. inline double add(double sum, double weight);
  63. inline bool operator<(const Centroid& other) const {
  64. return mean() < other.mean();
  65. }
  66. private:
  67. double mean_;
  68. double weight_;
  69. };
  70. explicit TDigest(size_t maxSize = 100)
  71. : maxSize_(maxSize), sum_(0.0), count_(0.0), max_(NAN), min_(NAN) {}
  72. explicit TDigest(
  73. std::vector<Centroid> centroids,
  74. double sum,
  75. double count,
  76. double max_val,
  77. double min_val,
  78. size_t maxSize = 100);
  79. /*
  80. * Returns a new TDigest constructed with values merged from the current
  81. * digest and the given sortedValues.
  82. */
  83. TDigest merge(presorted_t, Range<const double*> sortedValues) const;
  84. TDigest merge(Range<const double*> unsortedValues) const;
  85. /*
  86. * Returns a new TDigest constructed with values merged from the given
  87. * digests.
  88. */
  89. static TDigest merge(Range<const TDigest*> digests);
  90. /*
  91. * Estimates the value of the given quantile.
  92. */
  93. double estimateQuantile(double q) const;
  94. double mean() const {
  95. return count_ ? sum_ / count_ : 0;
  96. }
  97. double sum() const {
  98. return sum_;
  99. }
  100. double count() const {
  101. return count_;
  102. }
  103. double min() const {
  104. return min_;
  105. }
  106. double max() const {
  107. return max_;
  108. }
  109. bool empty() const {
  110. return centroids_.empty();
  111. }
  112. const std::vector<Centroid>& getCentroids() const {
  113. return centroids_;
  114. }
  115. size_t maxSize() const {
  116. return maxSize_;
  117. }
  118. private:
  119. std::vector<Centroid> centroids_;
  120. size_t maxSize_;
  121. double sum_;
  122. double count_;
  123. double max_;
  124. double min_;
  125. };
  126. } // namespace folly