Use rand.Perm to shuffle data
- Id
- c401c2adc6f7963e34de06574e5434c2ce190ca4
- Author
- Vladimir Mihailenco
- Commit time
- 2018-11-08T14:32:53+02:00
Modified rng.go
type RNG interface {
Float32() float32
Intn(int) int
+ Perm(n int) []int
}
type globalRNG struct{}
func (r *globalRNG) Intn(i int) int {
return rand.Intn(i)
+}
+
+func (r *globalRNG) Perm(n int) []int {
+ return rand.Perm(n)
}
type localRNG struct {
func (r *localRNG) Intn(i int) int {
return r.localRand.Intn(i)
+}
+
+// Perm returns a pseudo-random permutation of the integers [0, n), drawn from
+// this instance's own generator (the same r.localRand that Intn uses) rather
+// than the package-level math/rand source, so a locally seeded RNG keeps
+// control of the shuffle order.
+// NOTE(review): assumes r.localRand is a *rand.Rand (it already provides Intn) — confirm.
+func (r *localRNG) Perm(n int) []int {
+ return r.localRand.Perm(n)
}
Modified summary.go
return s
}
-func (s summary) Len() int {
+func (s *summary) Len() int {
return len(s.means)
}
func (s *summary) Add(key float64, value uint32) error {
-
if math.IsNaN(key) {
return fmt.Errorf("Key must not be NaN")
}
-
if value == 0 {
return fmt.Errorf("Count must be >0")
}
- idx := s.FindInsertionIndex(key)
+ idx := s.findInsertionIndex(key)
s.means = append(s.means, math.NaN())
s.counts = append(s.counts, 0)
return nil
}
-func (s summary) Floor(x float64) int {
- return sort.Search(len(s.means), func(i int) bool {
- return s.means[i] >= x
- }) - 1
-}
-
// Always insert to the right
-func (s summary) FindInsertionIndex(x float64) int {
+func (s *summary) findInsertionIndex(x float64) int {
+ // Binary search is only worthwhile if we have a lot of keys.
+ if len(s.means) < 250 {
+ for i, mean := range s.means {
+ if mean > x {
+ return i
+ }
+ }
+ return len(s.means)
+ }
+
return sort.Search(len(s.means), func(i int) bool {
return s.means[i] > x
})
// This method is the hotspot when calling Add(), which in turn is called by
// Compress() and Merge().
-func (s summary) HeadSum(idx int) (sum float64) {
+func (s *summary) HeadSum(idx int) (sum float64) {
return float64(sumUntilIndex(s.counts, idx))
}
-func (s summary) FindIndex(x float64) int {
- idx := sort.Search(len(s.means), func(i int) bool {
- return s.means[i] >= x
- })
- if idx < s.Len() && s.means[idx] == x {
- return idx
- }
- return s.Len()
+func (s *summary) Floor(x float64) int {
+ return s.findIndex(x) - 1
}
-func (s summary) Mean(uncheckedIndex int) float64 {
+func (s *summary) findIndex(x float64) int {
+ // Binary search is only worthwhile if we have a lot of keys.
+ if len(s.means) < 250 {
+ for i, mean := range s.means {
+ if mean >= x {
+ return i
+ }
+ }
+ return len(s.means)
+ }
+
+ return sort.Search(len(s.means), func(i int) bool {
+ return s.means[i] >= x
+ })
+}
+
+func (s *summary) Mean(uncheckedIndex int) float64 {
return s.means[uncheckedIndex]
}
-func (s summary) Count(uncheckedIndex int) uint32 {
+func (s *summary) Count(uncheckedIndex int) uint32 {
return s.counts[uncheckedIndex]
}
// case no centroid satisfies the requirement.
// Since it's cheap, this also returns the `HeadSum` until
// the found index (i.e. cumSum = HeadSum(FloorSum(x)))
-func (s summary) FloorSum(sum float64) (index int, cumSum float64) {
+func (s *summary) FloorSum(sum float64) (index int, cumSum float64) {
index = -1
- for i := 0; i < s.Len(); i++ {
+ for i, count := range s.counts {
if cumSum <= sum {
index = i
} else {
break
}
- cumSum += float64(s.counts[i])
+ cumSum += float64(count)
}
if index != -1 {
cumSum -= float64(s.counts[index])
}
}
-func (s summary) ForEach(f func(float64, uint32) bool) {
- for i := 0; i < len(s.means); i++ {
+func (s *summary) ForEach(f func(float64, uint32) bool) {
+ for i, mean := range s.means {
+ if !f(mean, s.counts[i]) {
+ break
+ }
+ }
+}
+
+func (s *summary) Perm(rng RNG, f func(float64, uint32) bool) {
+ for _, i := range rng.Perm(s.Len()) {
if !f(s.means[i], s.counts[i]) {
break
}
}
}
-func (s summary) Clone() *summary {
+func (s *summary) Clone() *summary {
return &summary{
means: append([]float64{}, s.means...),
counts: append([]uint32{}, s.counts...),
Modified summary_test.go
}
for k, v := range testData {
- i := s.FindIndex(k)
+ i := s.findIndex(k)
if i == s.Len() {
t.Errorf("Couldn't find previously added key on summary")
Modified tdigest.go
}
oldTree := t.summary
- t.summary = newSummary(t.summary.Len())
+ t.summary = newSummary(estimateCapacity(t.compression))
t.count = 0
- shuffle(oldTree.means, oldTree.counts, t.rng)
- oldTree.ForEach(func(mean float64, count uint32) bool {
+ oldTree.Perm(t.rng, func(mean float64, count uint32) bool {
err = t.AddWeighted(mean, count)
return err == nil
})
-
return err
}
return nil
}
- // We must keep the other digest intact
- data := other.summary.Clone()
- shuffle(data.means, data.counts, t.rng)
-
- data.ForEach(func(mean float64, count uint32) bool {
+ other.summary.Perm(t.rng, func(mean float64, count uint32) bool {
err = t.AddWeighted(mean, count)
return err == nil
})
return 0
}
return trimmedSum / trimmedCount
-}
-
-func shuffle(means []float64, counts []uint32, rng RNG) {
- for i := len(means) - 1; i > 1; i-- {
- j := rng.Intn(i + 1)
- means[i], means[j], counts[i], counts[j] = means[j], means[i], counts[j], counts[i]
- }
}
func estimateCapacity(compression float64) int {