optimize allocations by using pre-allocated buffer for priorities (#2006)

## Problem
Badger allocates a lot of objects over time. I created a simple reproducer (included at the end of this description) and measured allocations after 10 minutes of running it.

```
(pprof) top
Showing nodes accounting for 267006, 99.54% of 268253 total
Dropped 71 nodes (cum <= 1341)
Showing top 10 nodes out of 14
      flat  flat%   sum%        cum   cum%
    155255 57.88% 57.88%     155255 57.88%  github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels.func1 (inline)
     65539 24.43% 82.31%      65539 24.43%  github.com/dgraph-io/badger/v4.(*levelsController).levelTargets
     43691 16.29% 98.60%     264485 98.60%  github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels
      2521  0.94% 99.54%       2521  0.94%  os.(*File).Stat
         0     0% 99.54%     264485 98.60%  github.com/dgraph-io/badger/v4.(*levelsController).runCompactor
         0     0% 99.54%     264485 98.60%  github.com/dgraph-io/badger/v4.(*levelsController).runCompactor.func3
         0     0% 99.54%       2521  0.94%  github.com/dgraph-io/badger/v4.(*logFile).open
         0     0% 99.54%       2521  0.94%  github.com/dgraph-io/badger/v4.(*valueLog).open
         0     0% 99.54%       2528  0.94%  github.com/dgraph-io/badger/v4.Open
         0     0% 99.54%       2521  0.94%  github.com/dgraph-io/ristretto/z.OpenMmapFile
(pprof) sample_index=alloc_space
(pprof) top
Showing nodes accounting for 238.72MB, 98.59% of 242.14MB total
Dropped 51 nodes (cum <= 1.21MB)
Showing top 10 nodes out of 34
      flat  flat%   sum%        cum   cum%
  166.41MB 68.72% 68.72%   166.41MB 68.72%  github.com/dgraph-io/badger/v4/skl.newArena (inline)
   59.04MB 24.38% 93.10%    59.04MB 24.38%  github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels.func1 (inline)
       4MB  1.65% 94.75%        4MB  1.65%  github.com/dgraph-io/ristretto/z.Calloc (inline)
       4MB  1.65% 96.41%        4MB  1.65%  github.com/dgraph-io/badger/v4.(*levelsController).levelTargets
    3.01MB  1.24% 97.65%     3.01MB  1.24%  github.com/google/flatbuffers/go.NewBuilder (inline)
    1.27MB  0.52% 98.17%     1.27MB  0.52%  github.com/dgraph-io/ristretto.newCmRow
       1MB  0.41% 98.59%    64.04MB 26.45%  github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels
         0     0% 98.59%     7.01MB  2.89%  github.com/dgraph-io/badger/v4.(*DB).flushMemtable
         0     0% 98.59%     7.01MB  2.89%  github.com/dgraph-io/badger/v4.(*DB).handleMemTableFlush
         0     0% 98.59%    83.20MB 34.36%  github.com/dgraph-io/badger/v4.(*DB).newMemTable
```

We can see that `pickCompactLevels` is responsible for most of these allocations: every call rebuilds the `prios` slice from scratch, appending one entry per level into a slice that starts with zero capacity, so `append` has to grow the backing array over and over again:
```
(pprof) list pickCompactLevels
Total: 268253
ROUTINE ======================== github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels in /Users/deff/go/pkg/mod/github.com/dgraph-io/badger/v4@v4.2.0/levels.go
     43691     264485 (flat, cum) 98.60% of Total
         .          .    539:func (s *levelsController) pickCompactLevels() (prios []compactionPriority) {
         .      65539    540:	t := s.levelTargets()
         .          .    541:	addPriority := func(level int, score float64) {
         .          .    542:		pri := compactionPriority{
         .          .    543:			level:    level,
         .          .    544:			score:    score,
         .          .    545:			adjusted: score,
         .          .    546:			t:        t,
         .          .    547:		}
         .          .    548:		prios = append(prios, pri)
         .          .    549:	}
         .          .    550:
         .          .    551:	// Add L0 priority based on the number of tables.
         .      42134    552:	addPriority(0, float64(s.levels[0].numTables())/float64(s.kv.opt.NumLevelZeroTables))
         .          .    553:
         .          .    554:	// All other levels use size to calculate priority.
         .          .    555:	for i := 1; i < len(s.levels); i++ {
         .          .    556:		// Don't consider those tables that are already being compacted right now.
         .          .    557:		delSize := s.cstatus.delSize(i)
         .          .    558:
         .          .    559:		l := s.levels[i]
         .          .    560:		sz := l.getTotalSize() - delSize
         .     113121    561:		addPriority(i, float64(sz)/float64(t.targetSz[i]))
         .          .    562:	}
         .          .    563:	y.AssertTrue(len(prios) == len(s.levels))
         .          .    564:
         .          .    565:	// The following code is borrowed from PebbleDB and results in healthier LSM tree structure.
         .          .    566:	// If Li-1 has score > 1.0, then we'll divide Li-1 score by Li. If Li score is >= 1.0, then Li-1
         .          .    567:	// score is reduced, which means we'll prioritize the compaction of lower levels (L5, L4 and so
         .          .    568:	// on) over the higher levels (L0, L1 and so on). On the other hand, if Li score is < 1.0, then
         .          .    569:	// we'll increase the priority of Li-1.
         .          .    570:	// Overall what this means is, if the bottom level is already overflowing, then de-prioritize
         .          .    571:	// compaction of the above level. If the bottom level is not full, then increase the priority of
         .          .    572:	// above level.
         .          .    573:	var prevLevel int
         .          .    574:	for level := t.baseLevel; level < len(s.levels); level++ {
         .          .    575:		if prios[prevLevel].adjusted >= 1 {
         .          .    576:			// Avoid absurdly large scores by placing a floor on the score that we'll
         .          .    577:			// adjust a level by. The value of 0.01 was chosen somewhat arbitrarily
         .          .    578:			const minScore = 0.01
         .          .    579:			if prios[level].score >= minScore {
         .          .    580:				prios[prevLevel].adjusted /= prios[level].adjusted
         .          .    581:			} else {
         .          .    582:				prios[prevLevel].adjusted /= minScore
         .          .    583:			}
         .          .    584:		}
         .          .    585:		prevLevel = level
         .          .    586:	}
         .          .    587:
         .          .    588:	// Pick all the levels whose original score is >= 1.0, irrespective of their adjusted score.
         .          .    589:	// We'll still sort them by their adjusted score below. Having both these scores allows us to
         .          .    590:	// make better decisions about compacting L0. If we see a score >= 1.0, we can do L0->L0
         .          .    591:	// compactions. If the adjusted score >= 1.0, then we can do L0->Lbase compactions.
         .          .    592:	out := prios[:0]
         .          .    593:	for _, p := range prios[:len(prios)-1] {
         .          .    594:		if p.score >= 1.0 {
         .          .    595:			out = append(out, p)
         .          .    596:		}
         .          .    597:	}
         .          .    598:	prios = out
         .          .    599:
         .          .    600:	// Sort by the adjusted score.
     43691      43691    601:	sort.Slice(prios, func(i, j int) bool {
         .          .    602:		return prios[i].adjusted > prios[j].adjusted
         .          .    603:	})
         .          .    604:	return prios
         .          .    605:}
         .          .    606:
ROUTINE ======================== github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels.func1 in /Users/deff/go/pkg/mod/github.com/dgraph-io/badger/v4@v4.2.0/levels.go
    155255     155255 (flat, cum) 57.88% of Total
         .          .    541:	addPriority := func(level int, score float64) {
         .          .    542:		pri := compactionPriority{
         .          .    543:			level:    level,
         .          .    544:			score:    score,
         .          .    545:			adjusted: score,
         .          .    546:			t:        t,
         .          .    547:		}
    155255     155255    548:		prios = append(prios, pri)
         .          .    549:	}
         .          .    550:
         .          .    551:	// Add L0 priority based on the number of tables.
         .          .    552:	addPriority(0, float64(s.levels[0].numTables())/float64(s.kv.opt.NumLevelZeroTables))
         .          .    553:
```


## Solution
I suggest two optimizations:
1. Pre-allocate the capacity of `prios` based on the number of levels in `s.levels`.
2. Reuse the `prios` buffer between compaction runs; this is safe because each compactor worker calls `pickCompactLevels` from a single goroutine (a standalone micro-benchmark sketching the effect follows below).
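
The benchmark below is only an illustration of the pattern, not Badger's actual compaction code: `prio`, `buildFresh`, and `buildReused` are made-up names, and the 7-entry size merely mirrors the default number of LSM levels.
```
// prios_bench_test.go (hypothetical file name) — standalone illustration.
package main

import "testing"

// prio is a stand-in for compactionPriority; the real struct carries more fields.
type prio struct {
	level    int
	score    float64
	adjusted float64
}

const numLevels = 7 // assumed level count, used only to size the example

// buildFresh mimics the old behaviour: every call starts from a nil slice,
// so append has to grow the backing array several times per call.
func buildFresh() []prio {
	var prios []prio
	for i := 0; i < numLevels; i++ {
		prios = append(prios, prio{level: i, score: 1, adjusted: 1})
	}
	return prios
}

// buildReused mimics the patched behaviour: the result is pre-sized to the
// number of levels, and the caller-supplied buffer is reused when it is
// already big enough.
func buildReused(buf []prio) []prio {
	if cap(buf) < numLevels {
		buf = make([]prio, 0, numLevels)
	}
	prios := buf[:0]
	for i := 0; i < numLevels; i++ {
		prios = append(prios, prio{level: i, score: 1, adjusted: 1})
	}
	return prios
}

func BenchmarkFresh(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_ = buildFresh() // several allocations every call
	}
}

func BenchmarkReused(b *testing.B) {
	b.ReportAllocs()
	var buf []prio // one buffer per "worker", kept across iterations
	for i := 0; i < b.N; i++ {
		buf = buildReused(buf) // allocates only on the first iteration
	}
}
```
Running it with `go test -bench=. -benchmem` should show allocations per operation drop to zero for the reused variant, which is the same effect visible in the profiles below.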

Results after optimization (10 min run of reproducer):
```
(pprof) top
Showing nodes accounting for 165466, 99.84% of 165735 total
Dropped 27 nodes (cum <= 828)
Showing top 10 nodes out of 48
      flat  flat%   sum%        cum   cum%
     40962 24.72% 24.72%      40962 24.72%  github.com/dgraph-io/badger/v4.(*levelsController).levelTargets
     32768 19.77% 44.49%      32768 19.77%  github.com/dgraph-io/badger/v4/skl.(*Arena).putNode
     32768 19.77% 64.26%      32768 19.77%  github.com/dgraph-io/badger/v4/y.KeyWithTs (inline)
     21845 13.18% 77.44%      62807 37.90%  github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels
     21845 13.18% 90.62%      21845 13.18%  github.com/dgraph-io/badger/v4.(*logFile).encodeEntry
      8192  4.94% 95.56%       8192  4.94%  github.com/dgraph-io/badger/v4/table.(*Builder).addHelper
      4681  2.82% 98.39%       4681  2.82%  regexp/syntax.(*Regexp).Simplify
      2341  1.41% 99.80%       2341  1.41%  runtime/pprof.allFrames
        64 0.039% 99.84%      32832 19.81%  github.com/dgraph-io/badger/v4.(*Txn).commitAndSend
         0     0% 99.84%      32832 19.81%  github.com/dgraph-io/badger/v4.(*DB).Update
(pprof) sample_index=alloc_space
(pprof) top
Showing nodes accounting for 180.47MB, 97.79% of 184.54MB total
Dropped 22 nodes (cum <= 0.92MB)
Showing top 10 nodes out of 53
      flat  flat%   sum%        cum   cum%
  166.41MB 90.17% 90.17%   166.41MB 90.17%  github.com/dgraph-io/badger/v4/skl.newArena
       4MB  2.17% 92.34%        4MB  2.17%  github.com/dgraph-io/ristretto/z.Calloc
    3.01MB  1.63% 93.97%     3.01MB  1.63%  github.com/google/flatbuffers/go.NewBuilder
    2.50MB  1.35% 95.32%     2.50MB  1.35%  github.com/dgraph-io/badger/v4.(*levelsController).levelTargets
    1.76MB  0.96% 96.28%     2.97MB  1.61%  compress/flate.NewWriter (inline)
    1.16MB  0.63% 96.91%     1.16MB  0.63%  github.com/dgraph-io/ristretto/z.(*Bloom).Size
    0.64MB  0.34% 97.25%     1.20MB  0.65%  compress/flate.(*compressor).init
    0.50MB  0.27% 97.52%        1MB  0.54%  github.com/dgraph-io/badger/v4.(*Txn).commitAndSend
    0.50MB  0.27% 97.79%        3MB  1.63%  github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels
         0     0% 97.79%     2.97MB  1.61%  compress/gzip.(*Writer).Write
```

And inside pickCompactLevels:
```
ROUTINE ======================== github.com/dgraph-io/badger/v4.(*levelsController).pickCompactLevels in /Users/deff/dev/work/badger/levels.go
     21845      62807 (flat, cum) 37.90% of Total
         .          .    544:func (s *levelsController) pickCompactLevels(prios []compactionPriority) []compactionPriority {
         .      40962    545:	t := s.levelTargets()
         .          .    546:	addPriority := func(level int, score float64) {
         .          .    547:		pri := compactionPriority{
         .          .    548:			level:    level,
         .          .    549:			score:    score,
         .          .    550:			adjusted: score,
         .          .    551:			t:        t,
         .          .    552:		}
         .          .    553:		prios = append(prios, pri)
         .          .    554:	}
         .          .    555:
         .          .    556:	if cap(prios) < len(s.levels) {
         .          .    557:		prios = make([]compactionPriority, 0, len(s.levels))
         .          .    558:	}
         .          .    559:	prios = prios[:0]
         .          .    560:
         .          .    561:	// Add L0 priority based on the number of tables.
         .          .    562:	addPriority(0, float64(s.levels[0].numTables())/float64(s.kv.opt.NumLevelZeroTables))
         .          .    563:
         .          .    564:	// All other levels use size to calculate priority.
         .          .    565:	for i := 1; i < len(s.levels); i++ {
         .          .    566:		// Don't consider those tables that are already being compacted right now.
         .          .    567:		delSize := s.cstatus.delSize(i)
         .          .    568:
         .          .    569:		l := s.levels[i]
         .          .    570:		sz := l.getTotalSize() - delSize
         .          .    571:		addPriority(i, float64(sz)/float64(t.targetSz[i]))
         .          .    572:	}
         .          .    573:	y.AssertTrue(len(prios) == len(s.levels))
         .          .    574:
         .          .    575:	// The following code is borrowed from PebbleDB and results in healthier LSM tree structure.
         .          .    576:	// If Li-1 has score > 1.0, then we'll divide Li-1 score by Li. If Li score is >= 1.0, then Li-1
         .          .    577:	// score is reduced, which means we'll prioritize the compaction of lower levels (L5, L4 and so
         .          .    578:	// on) over the higher levels (L0, L1 and so on). On the other hand, if Li score is < 1.0, then
         .          .    579:	// we'll increase the priority of Li-1.
         .          .    580:	// Overall what this means is, if the bottom level is already overflowing, then de-prioritize
         .          .    581:	// compaction of the above level. If the bottom level is not full, then increase the priority of
         .          .    582:	// above level.
         .          .    583:	var prevLevel int
         .          .    584:	for level := t.baseLevel; level < len(s.levels); level++ {
         .          .    585:		if prios[prevLevel].adjusted >= 1 {
         .          .    586:			// Avoid absurdly large scores by placing a floor on the score that we'll
         .          .    587:			// adjust a level by. The value of 0.01 was chosen somewhat arbitrarily
         .          .    588:			const minScore = 0.01
         .          .    589:			if prios[level].score >= minScore {
         .          .    590:				prios[prevLevel].adjusted /= prios[level].adjusted
         .          .    591:			} else {
         .          .    592:				prios[prevLevel].adjusted /= minScore
         .          .    593:			}
         .          .    594:		}
         .          .    595:		prevLevel = level
         .          .    596:	}
         .          .    597:
         .          .    598:	// Pick all the levels whose original score is >= 1.0, irrespective of their adjusted score.
         .          .    599:	// We'll still sort them by their adjusted score below. Having both these scores allows us to
         .          .    600:	// make better decisions about compacting L0. If we see a score >= 1.0, we can do L0->L0
         .          .    601:	// compactions. If the adjusted score >= 1.0, then we can do L0->Lbase compactions.
         .          .    602:	out := prios[:0]
         .          .    603:	for _, p := range prios[:len(prios)-1] {
         .          .    604:		if p.score >= 1.0 {
         .          .    605:			out = append(out, p)
         .          .    606:		}
         .          .    607:	}
         .          .    608:	prios = out
         .          .    609:
         .          .    610:	// Sort by the adjusted score.
     21845      21845    611:	sort.Slice(prios, func(i, j int) bool {
         .          .    612:		return prios[i].adjusted > prios[j].adjusted
         .          .    613:	})
         .          .    614:	return prios
         .          .    615:}
         .          .    616:
```

## Profiles from a real project
Both profiles were taken 30 minutes after application start. This project still uses badger/v3, hence the v3 import paths in the stacks.

### Before optimization:
```
(pprof) top
Showing nodes accounting for 621.02MB, 85.32% of 727.90MB total
Dropped 550 nodes (cum <= 3.64MB)
Showing top 10 nodes out of 146
      flat  flat%   sum%        cum   cum%
  380.72MB 52.30% 52.30%   380.72MB 52.30%  github.com/dgraph-io/badger/v3.(*levelsController).pickCompactLevels.func1
  104.01MB 14.29% 66.59%   104.01MB 14.29%  github.com/dgraph-io/badger/v3/skl.newArena
   33.27MB  4.57% 71.16%    33.27MB  4.57%  github.com/dgraph-io/ristretto.newCmRow
   27.05MB  3.72% 74.88%    27.05MB  3.72%  github.com/dgraph-io/badger/v3/y.SafeCopy
   23.50MB  3.23% 78.11%    23.50MB  3.23%  github.com/dgraph-io/badger/v3.(*levelsController).levelTargets
   18.31MB  2.52% 80.62%    18.31MB  2.52%  github.com/dgraph-io/ristretto/z.(*Bloom).Size
   18.02MB  2.48% 83.10%    18.02MB  2.48%  github.com/dgraph-io/badger/v3/y.(*Slice).Resize
       8MB  1.10% 84.20%   412.23MB 56.63%  github.com/dgraph-io/badger/v3.(*levelsController).pickCompactLevels
    4.12MB  0.57% 84.77%     8.13MB  1.12%  github.com/blevesearch/vellum.(*FSTIterator).next
       4MB  0.55% 85.32%        4MB  0.55%  github.com/blevesearch/vellum.(*decoderV1).stateAt
```
### After optimization:
```
Type: alloc_space
Time: Sep 11, 2023 at 5:50pm (+05)
Entering interactive mode (type "help" for commands, "o" for options)
(pprof) top
Showing nodes accounting for 262.17MB, 66.88% of 391.99MB total
Dropped 453 nodes (cum <= 1.96MB)
Showing top 10 nodes out of 290
      flat  flat%   sum%        cum   cum%
  104.01MB 26.53% 26.53%   104.01MB 26.53%  github.com/dgraph-io/badger/v3/skl.newArena
   33.91MB  8.65% 35.18%    33.91MB  8.65%  github.com/dgraph-io/ristretto.newCmRow
   28.50MB  7.27% 42.45%    28.50MB  7.27%  github.com/dgraph-io/badger/v3.(*levelsController).levelTargets
   26.52MB  6.77% 49.22%    26.52MB  6.77%  github.com/dgraph-io/badger/v3/y.(*Slice).Resize
   25.03MB  6.38% 55.61%    25.03MB  6.38%  github.com/dgraph-io/badger/v3/y.SafeCopy
   17.16MB  4.38% 59.98%    17.16MB  4.38%  github.com/dgraph-io/ristretto/z.(*Bloom).Size
    7.12MB  1.82% 61.80%     9.12MB  2.33%  github.com/anyproto/go-chash.(*cHash).addMembers
    6.72MB  1.72% 63.51%    12.22MB  3.12%  github.com/blevesearch/vellum.(*FSTIterator).next
    6.71MB  1.71% 65.22%     6.71MB  1.71%  bytes.growSlice
    6.50MB  1.66% 66.88%     6.50MB  1.66%  github.com/blevesearch/vellum.(*builderNodePool).Get
```
# Reproducer
```
package main

import (
	"fmt"
	"log"
	"net/http"
	_ "net/http/pprof" // registers the /debug/pprof handlers on the default mux
	"os"

	"github.com/dgraph-io/badger/v4"
)

func generateItems(db *badger.DB, n int) error {
	return db.Update(func(txn *badger.Txn) error {
		for i := 0; i < n; i++ {
			err := txn.Set([]byte(fmt.Sprintf("key-%d", i)), []byte(fmt.Sprintf("value-%d", i)))
			if err != nil {
				return err
			}
		}
		return nil
	})
}

func run() error {
	forever := make(chan struct{})

	db, err := badger.Open(badger.DefaultOptions("./tmp"))
	if err != nil {
		return fmt.Errorf("open badger: %w", err)
	}
	err = generateItems(db, 1000)
	if err != nil {
		return fmt.Errorf("generate items: %w", err)
	}

	go func() {
		log.Println(http.ListenAndServe("localhost:9000", nil))
	}()

	<-forever
	return nil
}

func main() {
	err := run()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```
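
The allocation profiles above can presumably be collected from the reproducer's embedded pprof endpoint, e.g. with `go tool pprof http://localhost:9000/debug/pprof/allocs` and then `top` / `sample_index=alloc_objects` at the interactive prompt; the exact invocation is an assumption, since only the resulting pprof output is shown here.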
deff7 committed Oct 13, 2023
1 parent b84bc01 commit fb1b009
Showing 2 changed files with 17 additions and 4 deletions.
db.go (1 addition & 1 deletion):
```
@@ -1617,7 +1617,7 @@ func (db *DB) Flatten(workers int) error {
 		}
 	}
 	if len(levels) <= 1 {
-		prios := db.lc.pickCompactLevels()
+		prios := db.lc.pickCompactLevels(nil)
 		if len(prios) == 0 || prios[0].score <= 1.0 {
 			db.opt.Infof("All tables consolidated into one level. Flattening done.\n")
 			return nil
```
levels.go (16 additions & 3 deletions):
```
@@ -473,8 +473,13 @@ func (s *levelsController) runCompactor(id int, lc *z.Closer) {
 		}
 		return false
 	}
+
+	var priosBuffer []compactionPriority
 	runOnce := func() bool {
-		prios := s.pickCompactLevels()
+		prios := s.pickCompactLevels(priosBuffer)
+		defer func() {
+			priosBuffer = prios
+		}()
 		if id == 0 {
 			// Worker ID zero prefers to compact L0 always.
 			prios = moveL0toFront(prios)
@@ -536,7 +541,9 @@ func (s *levelsController) lastLevel() *levelHandler {
 
 // pickCompactLevel determines which level to compact.
 // Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
-func (s *levelsController) pickCompactLevels() (prios []compactionPriority) {
+// It tries to reuse priosBuffer to reduce memory allocation,
+// passing nil is acceptable, then new memory will be allocated.
+func (s *levelsController) pickCompactLevels(priosBuffer []compactionPriority) (prios []compactionPriority) {
 	t := s.levelTargets()
 	addPriority := func(level int, score float64) {
 		pri := compactionPriority{
@@ -548,6 +555,12 @@ func (s *levelsController) pickCompactLevels() (prios []compactionPriority) {
 		prios = append(prios, pri)
 	}
 
+	// Grow buffer to fit all levels.
+	if cap(priosBuffer) < len(s.levels) {
+		priosBuffer = make([]compactionPriority, 0, len(s.levels))
+	}
+	prios = priosBuffer[:0]
+
 	// Add L0 priority based on the number of tables.
 	addPriority(0, float64(s.levels[0].numTables())/float64(s.kv.opt.NumLevelZeroTables))
 
@@ -1707,7 +1720,7 @@ type LevelInfo struct {
 
 func (s *levelsController) getLevelInfo() []LevelInfo {
 	t := s.levelTargets()
-	prios := s.pickCompactLevels()
+	prios := s.pickCompactLevels(nil)
 	result := make([]LevelInfo, len(s.levels))
 	for i, l := range s.levels {
 		l.RLock()
```
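
Note that each `runCompactor` worker keeps `priosBuffer` in a goroutine-local variable and simply stores whatever slice `pickCompactLevels` returns, so the reuse needs no locking or pooling; infrequent callers such as `Flatten` and `getLevelInfo` pass `nil` and fall back to a fresh allocation.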
