caio.co/de/go-tdigest

Introduce a parameter-less New()

Now `tdigest.New()` gives a sane ready-to-use-in-most-cases digest.

Configuration should be done via self-referential functions. For example:

    // create a digest with compression of 200
    tdigest.New(tdigest.Compression(200))

Notice that New() can still panic, which means that deserialization
is still more dangerous than it should be.
Id
e3cc95f78bce83e7f57ca929f9fede9aa63f1a21
Author
Caio
Commit time
2017-10-25T14:19:04+02:00

Modified README.md

@@ -41,7 +41,7
)

func main() {
- var t = tdigest.New(100)
+ var t = tdigest.New()

for i := 0; i < 10000; i++ {
t.Add(rand.Float64(), 1)

Modified serialization.go

@@ -76,7 +76,7
return nil, err
}

- t := New(compression)
+ t := New(Compression(uint32(compression)))

var numCentroids int32
err = binary.Read(buf, endianess, &numCentroids)

Modified serialization_test.go

@@ -26,9 +26,7
}

func TestSerialization(t *testing.T) {
- // NOTE Using a high compression value and adding few items
- // so we don't end up compressing automatically
- t1 := New(100)
+ t1 := New()
for i := 0; i < 100; i++ {
_ = t1.Add(rand.Float64(), 1)
}

Modified tdigest.go

@@ -21,24 +21,21
}

// New creates a new digest.
-// The compression parameter rules the threshold in which samples are
-// merged together - the more often distinct samples are merged the more
-// precision is lost. Compression should be tuned according to your data
-// distribution, but a value of 100 is often good enough. A higher
-// compression value means holding more centroids in memory (thus: better
-// precision), which means a bigger serialization payload and higher
-// memory footprint.
-// Compression must be a value greater of equal to 1, will panic
-// otherwise.
-func New(compression float64) *TDigest {
- if compression < 1 {
- panic("Compression must be >= 1.0")
- }
- return &TDigest{
- compression: compression,
- summary: newSummary(estimateCapacity(compression)),
+//
+// By default the digest is constructed with a configuration that
+// should be useful for most use-cases.
+func New(options ...tdigestOption) *TDigest {
+ tdigest := &TDigest{
+ compression: 100,
count: 0,
}
+
+ for _, option := range options {
+ option(tdigest)
+ }
+
+ tdigest.summary = newSummary(estimateCapacity(tdigest.compression))
+ return tdigest
}

func _quantile(index float64, previousIndex float64, nextIndex float64, previousMean float64, nextMean float64) float64 {

Modified tdigest_test.go

@@ -18,7 +18,7
// for all tests. So, no test concurrency here.

func TestTInternals(t *testing.T) {
- tdigest := New(100)
+ tdigest := New()

if !math.IsNaN(tdigest.Quantile(0.1)) {
t.Errorf("Quantile() on an empty digest should return NaN. Got: %.4f", tdigest.Quantile(0.1))
@@ -59,7 +59,7
}

func TestUniformDistribution(t *testing.T) {
- tdigest := New(100)
+ tdigest := New()

for i := 0; i < 100000; i++ {
_ = tdigest.Add(rand.Float64(), 1)
@@ -87,7 +87,7
}

func TestSequentialInsertion(t *testing.T) {
- tdigest := New(100)
+ tdigest := New()

data := make([]float64, 10000)
for i := 0; i < len(data); i++ {
@@ -110,7 +110,7
}

func TestNonSequentialInsertion(t *testing.T) {
- tdigest := New(100)
+ tdigest := New()

// Not quite a uniform distribution, but close.
data := make([]float64, 1000)
@@ -150,7 +150,7
}

func TestSingletonInACrowd(t *testing.T) {
- tdigest := New(100)
+ tdigest := New()
for i := 0; i < 10000; i++ {
tdigest.Add(10, 1)
}
@@ -176,7 +176,7
}

func TestRespectBounds(t *testing.T) {
- tdigest := New(10)
+ tdigest := New(Compression(10))

data := []float64{0, 279, 2, 281}
for _, f := range data {
@@ -196,7 +196,7
}

func TestWeights(t *testing.T) {
- tdigest := New(10)
+ tdigest := New(Compression(10))

// Create data slice with repeats matching weights we gave to tdigest
data := []float64{}
@@ -220,7 +220,7
}

func TestIntegers(t *testing.T) {
- tdigest := New(100)
+ tdigest := New()

_ = tdigest.Add(1, 1)
_ = tdigest.Add(2, 1)
@@ -230,7 +230,7
t.Errorf("Expected p(0.5) = 2, Got %.2f instead", tdigest.Quantile(0.5))
}

- tdigest = New(100)
+ tdigest = New()

for _, i := range []float64{1, 2, 2, 2, 2, 2, 2, 2, 3} {
_ = tdigest.Add(i, 1)
@@ -276,10 +276,10

subs := make([]*TDigest, numSubs)
for i := 0; i < numSubs; i++ {
- subs[i] = New(100)
+ subs[i] = New()
}

- dist := New(100)
+ dist := New()
for i := 0; i < numItems; i++ {
num := rand.Float64()

@@ -290,7 +290,7

dist.Compress()

- dist2 := New(100)
+ dist2 := New()
for i := 0; i < numSubs; i++ {
dist2.Merge(subs[i])
}
@@ -326,7 +326,7
}

func TestCompressDoesntChangeCount(t *testing.T) {
- tdigest := New(100)
+ tdigest := New()

for i := 0; i < 1000; i++ {
_ = tdigest.Add(rand.Float64(), 1)
@@ -355,11 +355,7
}

func TestPanic(t *testing.T) {
- shouldPanic(func() {
- New(0.5)
- }, t, "Compression < 1 should panic!")
-
- tdigest := New(100)
+ tdigest := New()

shouldPanic(func() {
tdigest.Quantile(-42)
@@ -371,7 +367,7
}

func TestForEachCentroid(t *testing.T) {
- tdigest := New(10)
+ tdigest := New(Compression(10))

for i := 0; i < 100; i++ {
_ = tdigest.Add(float64(i), 1)
@@ -398,8 +394,8
}
}

-func benchmarkAdd(compression float64, b *testing.B) {
- t := New(compression)
+func benchmarkAdd(compression uint32, b *testing.B) {
+ t := New(Compression(compression))

data := make([]float64, b.N)
for n := 0; n < b.N; n++ {

Created options.go

@@ -1,0 +1,25
+package tdigest
+
+type tdigestOption func(*TDigest)
+
+// Compression sets the digest compression
+//
+// The compression parameter rules the threshold in which samples are
+// merged together - the more often distinct samples are merged the more
+// precision is lost. Compression should be tuned according to your data
+// distribution, but a value of 100 (the default) is often good enough.
+//
+// A higher compression value means holding more centroids in memory
+// (thus: better precision), which means a bigger serialization payload,
+// higher memory footprint and slower addition of new samples.
+//
+// Compression must be a value greater than or equal to 1; it panics
+// otherwise.
+func Compression(compression uint32) tdigestOption {
+ if compression < 1 {
+ panic("Compression should be >= 1")
+ }
+ return func(t *TDigest) {
+ t.compression = float64(compression)
+ }
+}

Created options_test.go

@@ -1,0 +1,19
+package tdigest
+
+import "testing"
+
+func TestDefaults(t *testing.T) {
+ digest := New()
+
+ if digest.compression != 100 {
+ t.Errorf("The default compression should be 100")
+ }
+}
+
+func TestCompression(t *testing.T) {
+ if New(Compression(40)).compression != 40 {
+ t.Errorf("The compression option should change the new digest compression")
+ }
+
+ shouldPanic(func() { Compression(0) }, t, "Compression < 1 should panic")
+}