You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
841 lines
24 KiB
841 lines
24 KiB
/* |
|
* Copyright 2017 Dgraph Labs, Inc. and Contributors |
|
* |
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
|
* you may not use this file except in compliance with the License. |
|
* You may obtain a copy of the License at |
|
* |
|
* http://www.apache.org/licenses/LICENSE-2.0 |
|
* |
|
* Unless required by applicable law or agreed to in writing, software |
|
* distributed under the License is distributed on an "AS IS" BASIS, |
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
* See the License for the specific language governing permissions and |
|
* limitations under the License. |
|
*/ |
|
|
|
package badger |
|
|
|
import ( |
|
"fmt" |
|
"math" |
|
"math/rand" |
|
"os" |
|
"sort" |
|
"time" |
|
|
|
"golang.org/x/net/trace" |
|
|
|
"github.com/dgraph-io/badger/protos" |
|
"github.com/dgraph-io/badger/table" |
|
"github.com/dgraph-io/badger/y" |
|
"github.com/pkg/errors" |
|
) |
|
|
|
type levelsController struct { |
|
nextFileID uint64 // Atomic |
|
elog trace.EventLog |
|
|
|
// The following are initialized once and const. |
|
levels []*levelHandler |
|
kv *DB |
|
|
|
cstatus compactStatus |
|
} |
|
|
|
var ( |
|
// This is for getting timings between stalls. |
|
lastUnstalled time.Time |
|
) |
|
|
|
// revertToManifest checks that all necessary table files exist and removes all table files not |
|
// referenced by the manifest. idMap is a set of table file id's that were read from the directory |
|
// listing. |
|
func revertToManifest(kv *DB, mf *Manifest, idMap map[uint64]struct{}) error { |
|
// 1. Check all files in manifest exist. |
|
for id := range mf.Tables { |
|
if _, ok := idMap[id]; !ok { |
|
return fmt.Errorf("file does not exist for table %d", id) |
|
} |
|
} |
|
|
|
// 2. Delete files that shouldn't exist. |
|
for id := range idMap { |
|
if _, ok := mf.Tables[id]; !ok { |
|
kv.elog.Printf("Table file %d not referenced in MANIFEST\n", id) |
|
filename := table.NewFilename(id, kv.opt.Dir) |
|
if err := os.Remove(filename); err != nil { |
|
return y.Wrapf(err, "While removing table %d", id) |
|
} |
|
} |
|
} |
|
|
|
return nil |
|
} |
|
|
|
func newLevelsController(kv *DB, mf *Manifest) (*levelsController, error) { |
|
y.AssertTrue(kv.opt.NumLevelZeroTablesStall > kv.opt.NumLevelZeroTables) |
|
s := &levelsController{ |
|
kv: kv, |
|
elog: kv.elog, |
|
levels: make([]*levelHandler, kv.opt.MaxLevels), |
|
} |
|
s.cstatus.levels = make([]*levelCompactStatus, kv.opt.MaxLevels) |
|
|
|
for i := 0; i < kv.opt.MaxLevels; i++ { |
|
s.levels[i] = newLevelHandler(kv, i) |
|
if i == 0 { |
|
// Do nothing. |
|
} else if i == 1 { |
|
// Level 1 probably shouldn't be too much bigger than level 0. |
|
s.levels[i].maxTotalSize = kv.opt.LevelOneSize |
|
} else { |
|
s.levels[i].maxTotalSize = s.levels[i-1].maxTotalSize * int64(kv.opt.LevelSizeMultiplier) |
|
} |
|
s.cstatus.levels[i] = new(levelCompactStatus) |
|
} |
|
|
|
// Compare manifest against directory, check for existent/non-existent files, and remove. |
|
if err := revertToManifest(kv, mf, getIDMap(kv.opt.Dir)); err != nil { |
|
return nil, err |
|
} |
|
|
|
// Some files may be deleted. Let's reload. |
|
tables := make([][]*table.Table, kv.opt.MaxLevels) |
|
var maxFileID uint64 |
|
for fileID, tableManifest := range mf.Tables { |
|
fname := table.NewFilename(fileID, kv.opt.Dir) |
|
var flags uint32 = y.Sync |
|
if kv.opt.ReadOnly { |
|
flags |= y.ReadOnly |
|
} |
|
fd, err := y.OpenExistingFile(fname, flags) |
|
if err != nil { |
|
closeAllTables(tables) |
|
return nil, errors.Wrapf(err, "Opening file: %q", fname) |
|
} |
|
|
|
t, err := table.OpenTable(fd, kv.opt.TableLoadingMode) |
|
if err != nil { |
|
closeAllTables(tables) |
|
return nil, errors.Wrapf(err, "Opening table: %q", fname) |
|
} |
|
|
|
level := tableManifest.Level |
|
tables[level] = append(tables[level], t) |
|
|
|
if fileID > maxFileID { |
|
maxFileID = fileID |
|
} |
|
} |
|
s.nextFileID = maxFileID + 1 |
|
for i, tbls := range tables { |
|
s.levels[i].initTables(tbls) |
|
} |
|
|
|
// Make sure key ranges do not overlap etc. |
|
if err := s.validate(); err != nil { |
|
_ = s.cleanupLevels() |
|
return nil, errors.Wrap(err, "Level validation") |
|
} |
|
|
|
// Sync directory (because we have at least removed some files, or previously created the |
|
// manifest file). |
|
if err := syncDir(kv.opt.Dir); err != nil { |
|
_ = s.close() |
|
return nil, err |
|
} |
|
|
|
return s, nil |
|
} |
|
|
|
// Closes the tables, for cleanup in newLevelsController. (We Close() instead of using DecrRef() |
|
// because that would delete the underlying files.) We ignore errors, which is OK because tables |
|
// are read-only. |
|
func closeAllTables(tables [][]*table.Table) { |
|
for _, tableSlice := range tables { |
|
for _, table := range tableSlice { |
|
_ = table.Close() |
|
} |
|
} |
|
} |
|
|
|
func (s *levelsController) cleanupLevels() error { |
|
var firstErr error |
|
for _, l := range s.levels { |
|
if err := l.close(); err != nil && firstErr == nil { |
|
firstErr = err |
|
} |
|
} |
|
return firstErr |
|
} |
|
|
|
// This function picks all tables from all levels, creates a manifest changeset, |
|
// applies it, and then decrements the refs of these tables, which would result |
|
// in their deletion. It spares one table from L0, to keep the badgerHead key |
|
// persisted, so we don't lose where we are w.r.t. value log. |
|
// NOTE: This function in itself isn't sufficient to completely delete all the |
|
// data. After this, one would still need to iterate over the KV pairs and mark |
|
// them as deleted. |
|
func (s *levelsController) deleteLSMTree() (int, error) { |
|
var all []*table.Table |
|
var keepOne *table.Table |
|
for _, l := range s.levels { |
|
l.RLock() |
|
if l.level == 0 && len(l.tables) > 1 { |
|
// Skip the last table. We do this to keep the badgerMove key persisted. |
|
lastIdx := len(l.tables) - 1 |
|
keepOne = l.tables[lastIdx] |
|
all = append(all, l.tables[:lastIdx]...) |
|
} else { |
|
all = append(all, l.tables...) |
|
} |
|
l.RUnlock() |
|
} |
|
if len(all) == 0 { |
|
return 0, nil |
|
} |
|
|
|
// Generate the manifest changes. |
|
changes := []*protos.ManifestChange{} |
|
for _, table := range all { |
|
changes = append(changes, makeTableDeleteChange(table.ID())) |
|
} |
|
changeSet := protos.ManifestChangeSet{Changes: changes} |
|
if err := s.kv.manifest.addChanges(changeSet.Changes); err != nil { |
|
return 0, err |
|
} |
|
|
|
for _, l := range s.levels { |
|
l.Lock() |
|
l.totalSize = 0 |
|
if l.level == 0 && len(l.tables) > 1 { |
|
l.tables = []*table.Table{keepOne} |
|
l.totalSize += keepOne.Size() |
|
} else { |
|
l.tables = l.tables[:0] |
|
} |
|
l.Unlock() |
|
} |
|
// Now allow deletion of tables. |
|
for _, table := range all { |
|
if err := table.DecrRef(); err != nil { |
|
return 0, err |
|
} |
|
} |
|
return len(all), nil |
|
} |
|
|
|
func (s *levelsController) startCompact(lc *y.Closer) { |
|
n := s.kv.opt.NumCompactors |
|
lc.AddRunning(n - 1) |
|
for i := 0; i < n; i++ { |
|
go s.runWorker(lc) |
|
} |
|
} |
|
|
|
func (s *levelsController) runWorker(lc *y.Closer) { |
|
defer lc.Done() |
|
if s.kv.opt.DoNotCompact { |
|
return |
|
} |
|
|
|
time.Sleep(time.Duration(rand.Int31n(1000)) * time.Millisecond) |
|
ticker := time.NewTicker(time.Second) |
|
defer ticker.Stop() |
|
|
|
for { |
|
select { |
|
// Can add a done channel or other stuff. |
|
case <-ticker.C: |
|
prios := s.pickCompactLevels() |
|
for _, p := range prios { |
|
// TODO: Handle error. |
|
didCompact, _ := s.doCompact(p) |
|
if didCompact { |
|
break |
|
} |
|
} |
|
case <-lc.HasBeenClosed(): |
|
return |
|
} |
|
} |
|
} |
|
|
|
// Returns true if level zero may be compacted, without accounting for compactions that already |
|
// might be happening. |
|
func (s *levelsController) isLevel0Compactable() bool { |
|
return s.levels[0].numTables() >= s.kv.opt.NumLevelZeroTables |
|
} |
|
|
|
// Returns true if the non-zero level may be compacted. delSize provides the size of the tables |
|
// which are currently being compacted so that we treat them as already having started being |
|
// compacted (because they have been, yet their size is already counted in getTotalSize). |
|
func (l *levelHandler) isCompactable(delSize int64) bool { |
|
return l.getTotalSize()-delSize >= l.maxTotalSize |
|
} |
|
|
|
type compactionPriority struct { |
|
level int |
|
score float64 |
|
} |
|
|
|
// pickCompactLevel determines which level to compact. |
|
// Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction |
|
func (s *levelsController) pickCompactLevels() (prios []compactionPriority) { |
|
// This function must use identical criteria for guaranteeing compaction's progress that |
|
// addLevel0Table uses. |
|
|
|
// cstatus is checked to see if level 0's tables are already being compacted |
|
if !s.cstatus.overlapsWith(0, infRange) && s.isLevel0Compactable() { |
|
pri := compactionPriority{ |
|
level: 0, |
|
score: float64(s.levels[0].numTables()) / float64(s.kv.opt.NumLevelZeroTables), |
|
} |
|
prios = append(prios, pri) |
|
} |
|
|
|
for i, l := range s.levels[1:] { |
|
// Don't consider those tables that are already being compacted right now. |
|
delSize := s.cstatus.delSize(i + 1) |
|
|
|
if l.isCompactable(delSize) { |
|
pri := compactionPriority{ |
|
level: i + 1, |
|
score: float64(l.getTotalSize()-delSize) / float64(l.maxTotalSize), |
|
} |
|
prios = append(prios, pri) |
|
} |
|
} |
|
sort.Slice(prios, func(i, j int) bool { |
|
return prios[i].score > prios[j].score |
|
}) |
|
return prios |
|
} |
|
|
|
// compactBuildTables merge topTables and botTables to form a list of new tables. |
|
func (s *levelsController) compactBuildTables( |
|
l int, cd compactDef) ([]*table.Table, func() error, error) { |
|
topTables := cd.top |
|
botTables := cd.bot |
|
|
|
var hasOverlap bool |
|
{ |
|
kr := getKeyRange(cd.top) |
|
for i, lh := range s.levels { |
|
if i <= l { // Skip upper levels. |
|
continue |
|
} |
|
lh.RLock() |
|
left, right := lh.overlappingTables(levelHandlerRLocked{}, kr) |
|
lh.RUnlock() |
|
if right-left > 0 { |
|
hasOverlap = true |
|
break |
|
} |
|
} |
|
cd.elog.LazyPrintf("Key range overlaps with lower levels: %v", hasOverlap) |
|
} |
|
|
|
// Try to collect stats so that we can inform value log about GC. That would help us find which |
|
// value log file should be GCed. |
|
discardStats := make(map[uint32]int64) |
|
updateStats := func(vs y.ValueStruct) { |
|
if vs.Meta&bitValuePointer > 0 { |
|
var vp valuePointer |
|
vp.Decode(vs.Value) |
|
discardStats[vp.Fid] += int64(vp.Len) |
|
} |
|
} |
|
|
|
// Create iterators across all the tables involved first. |
|
var iters []y.Iterator |
|
if l == 0 { |
|
iters = appendIteratorsReversed(iters, topTables, false) |
|
} else { |
|
y.AssertTrue(len(topTables) == 1) |
|
iters = []y.Iterator{topTables[0].NewIterator(false)} |
|
} |
|
|
|
// Next level has level>=1 and we can use ConcatIterator as key ranges do not overlap. |
|
iters = append(iters, table.NewConcatIterator(botTables, false)) |
|
it := y.NewMergeIterator(iters, false) |
|
defer it.Close() // Important to close the iterator to do ref counting. |
|
|
|
it.Rewind() |
|
|
|
// Pick a discard ts, so we can discard versions below this ts. We should |
|
// never discard any versions starting from above this timestamp, because |
|
// that would affect the snapshot view guarantee provided by transactions. |
|
discardTs := s.kv.orc.discardAtOrBelow() |
|
|
|
// Start generating new tables. |
|
type newTableResult struct { |
|
table *table.Table |
|
err error |
|
} |
|
resultCh := make(chan newTableResult) |
|
var numBuilds, numVersions int |
|
var lastKey, skipKey []byte |
|
for it.Valid() { |
|
timeStart := time.Now() |
|
builder := table.NewTableBuilder() |
|
var numKeys, numSkips uint64 |
|
for ; it.Valid(); it.Next() { |
|
// See if we need to skip this key. |
|
if len(skipKey) > 0 { |
|
if y.SameKey(it.Key(), skipKey) { |
|
numSkips++ |
|
updateStats(it.Value()) |
|
continue |
|
} else { |
|
skipKey = skipKey[:0] |
|
} |
|
} |
|
|
|
if !y.SameKey(it.Key(), lastKey) { |
|
if builder.ReachedCapacity(s.kv.opt.MaxTableSize) { |
|
// Only break if we are on a different key, and have reached capacity. We want |
|
// to ensure that all versions of the key are stored in the same sstable, and |
|
// not divided across multiple tables at the same level. |
|
break |
|
} |
|
lastKey = y.SafeCopy(lastKey, it.Key()) |
|
numVersions = 0 |
|
} |
|
|
|
vs := it.Value() |
|
version := y.ParseTs(it.Key()) |
|
if version <= discardTs { |
|
// Keep track of the number of versions encountered for this key. Only consider the |
|
// versions which are below the minReadTs, otherwise, we might end up discarding the |
|
// only valid version for a running transaction. |
|
numVersions++ |
|
lastValidVersion := vs.Meta&bitDiscardEarlierVersions > 0 |
|
if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) || |
|
numVersions > s.kv.opt.NumVersionsToKeep || |
|
lastValidVersion { |
|
// If this version of the key is deleted or expired, skip all the rest of the |
|
// versions. Ensure that we're only removing versions below readTs. |
|
skipKey = y.SafeCopy(skipKey, it.Key()) |
|
|
|
if lastValidVersion { |
|
// Add this key. We have set skipKey, so the following key versions |
|
// would be skipped. |
|
} else if hasOverlap { |
|
// If this key range has overlap with lower levels, then keep the deletion |
|
// marker with the latest version, discarding the rest. We have set skipKey, |
|
// so the following key versions would be skipped. |
|
} else { |
|
// If no overlap, we can skip all the versions, by continuing here. |
|
numSkips++ |
|
updateStats(vs) |
|
continue // Skip adding this key. |
|
} |
|
} |
|
} |
|
numKeys++ |
|
y.Check(builder.Add(it.Key(), it.Value())) |
|
} |
|
// It was true that it.Valid() at least once in the loop above, which means we |
|
// called Add() at least once, and builder is not Empty(). |
|
cd.elog.LazyPrintf("Added %d keys. Skipped %d keys.", numKeys, numSkips) |
|
cd.elog.LazyPrintf("LOG Compact. Iteration took: %v\n", time.Since(timeStart)) |
|
if !builder.Empty() { |
|
numBuilds++ |
|
fileID := s.reserveFileID() |
|
go func(builder *table.Builder) { |
|
defer builder.Close() |
|
|
|
fd, err := y.CreateSyncedFile(table.NewFilename(fileID, s.kv.opt.Dir), true) |
|
if err != nil { |
|
resultCh <- newTableResult{nil, errors.Wrapf(err, "While opening new table: %d", fileID)} |
|
return |
|
} |
|
|
|
if _, err := fd.Write(builder.Finish()); err != nil { |
|
resultCh <- newTableResult{nil, errors.Wrapf(err, "Unable to write to file: %d", fileID)} |
|
return |
|
} |
|
|
|
tbl, err := table.OpenTable(fd, s.kv.opt.TableLoadingMode) |
|
// decrRef is added below. |
|
resultCh <- newTableResult{tbl, errors.Wrapf(err, "Unable to open table: %q", fd.Name())} |
|
}(builder) |
|
} |
|
} |
|
|
|
newTables := make([]*table.Table, 0, 20) |
|
// Wait for all table builders to finish. |
|
var firstErr error |
|
for x := 0; x < numBuilds; x++ { |
|
res := <-resultCh |
|
newTables = append(newTables, res.table) |
|
if firstErr == nil { |
|
firstErr = res.err |
|
} |
|
} |
|
|
|
if firstErr == nil { |
|
// Ensure created files' directory entries are visible. We don't mind the extra latency |
|
// from not doing this ASAP after all file creation has finished because this is a |
|
// background operation. |
|
firstErr = syncDir(s.kv.opt.Dir) |
|
} |
|
|
|
if firstErr != nil { |
|
// An error happened. Delete all the newly created table files (by calling DecrRef |
|
// -- we're the only holders of a ref). |
|
for j := 0; j < numBuilds; j++ { |
|
if newTables[j] != nil { |
|
newTables[j].DecrRef() |
|
} |
|
} |
|
errorReturn := errors.Wrapf(firstErr, "While running compaction for: %+v", cd) |
|
return nil, nil, errorReturn |
|
} |
|
|
|
sort.Slice(newTables, func(i, j int) bool { |
|
return y.CompareKeys(newTables[i].Biggest(), newTables[j].Biggest()) < 0 |
|
}) |
|
s.kv.vlog.updateGCStats(discardStats) |
|
cd.elog.LazyPrintf("Discard stats: %v", discardStats) |
|
return newTables, func() error { return decrRefs(newTables) }, nil |
|
} |
|
|
|
func buildChangeSet(cd *compactDef, newTables []*table.Table) protos.ManifestChangeSet { |
|
changes := []*protos.ManifestChange{} |
|
for _, table := range newTables { |
|
changes = append(changes, makeTableCreateChange(table.ID(), cd.nextLevel.level)) |
|
} |
|
for _, table := range cd.top { |
|
changes = append(changes, makeTableDeleteChange(table.ID())) |
|
} |
|
for _, table := range cd.bot { |
|
changes = append(changes, makeTableDeleteChange(table.ID())) |
|
} |
|
return protos.ManifestChangeSet{Changes: changes} |
|
} |
|
|
|
type compactDef struct { |
|
elog trace.Trace |
|
|
|
thisLevel *levelHandler |
|
nextLevel *levelHandler |
|
|
|
top []*table.Table |
|
bot []*table.Table |
|
|
|
thisRange keyRange |
|
nextRange keyRange |
|
|
|
thisSize int64 |
|
} |
|
|
|
func (cd *compactDef) lockLevels() { |
|
cd.thisLevel.RLock() |
|
cd.nextLevel.RLock() |
|
} |
|
|
|
func (cd *compactDef) unlockLevels() { |
|
cd.nextLevel.RUnlock() |
|
cd.thisLevel.RUnlock() |
|
} |
|
|
|
func (s *levelsController) fillTablesL0(cd *compactDef) bool { |
|
cd.lockLevels() |
|
defer cd.unlockLevels() |
|
|
|
cd.top = make([]*table.Table, len(cd.thisLevel.tables)) |
|
copy(cd.top, cd.thisLevel.tables) |
|
if len(cd.top) == 0 { |
|
return false |
|
} |
|
cd.thisRange = infRange |
|
|
|
kr := getKeyRange(cd.top) |
|
left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, kr) |
|
cd.bot = make([]*table.Table, right-left) |
|
copy(cd.bot, cd.nextLevel.tables[left:right]) |
|
|
|
if len(cd.bot) == 0 { |
|
cd.nextRange = kr |
|
} else { |
|
cd.nextRange = getKeyRange(cd.bot) |
|
} |
|
|
|
if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { |
|
return false |
|
} |
|
|
|
return true |
|
} |
|
|
|
func (s *levelsController) fillTables(cd *compactDef) bool { |
|
cd.lockLevels() |
|
defer cd.unlockLevels() |
|
|
|
tbls := make([]*table.Table, len(cd.thisLevel.tables)) |
|
copy(tbls, cd.thisLevel.tables) |
|
if len(tbls) == 0 { |
|
return false |
|
} |
|
|
|
// Find the biggest table, and compact that first. |
|
// TODO: Try other table picking strategies. |
|
sort.Slice(tbls, func(i, j int) bool { |
|
return tbls[i].Size() > tbls[j].Size() |
|
}) |
|
|
|
for _, t := range tbls { |
|
cd.thisSize = t.Size() |
|
cd.thisRange = keyRange{ |
|
// We pick all the versions of the smallest and the biggest key. |
|
left: y.KeyWithTs(y.ParseKey(t.Smallest()), math.MaxUint64), |
|
// Note that version zero would be the rightmost key. |
|
right: y.KeyWithTs(y.ParseKey(t.Biggest()), 0), |
|
} |
|
if s.cstatus.overlapsWith(cd.thisLevel.level, cd.thisRange) { |
|
continue |
|
} |
|
cd.top = []*table.Table{t} |
|
left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, cd.thisRange) |
|
|
|
cd.bot = make([]*table.Table, right-left) |
|
copy(cd.bot, cd.nextLevel.tables[left:right]) |
|
|
|
if len(cd.bot) == 0 { |
|
cd.bot = []*table.Table{} |
|
cd.nextRange = cd.thisRange |
|
if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { |
|
continue |
|
} |
|
return true |
|
} |
|
cd.nextRange = getKeyRange(cd.bot) |
|
|
|
if s.cstatus.overlapsWith(cd.nextLevel.level, cd.nextRange) { |
|
continue |
|
} |
|
|
|
if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { |
|
continue |
|
} |
|
return true |
|
} |
|
return false |
|
} |
|
|
|
func (s *levelsController) runCompactDef(l int, cd compactDef) (err error) { |
|
timeStart := time.Now() |
|
|
|
thisLevel := cd.thisLevel |
|
nextLevel := cd.nextLevel |
|
|
|
// Table should never be moved directly between levels, always be rewritten to allow discarding |
|
// invalid versions. |
|
|
|
newTables, decr, err := s.compactBuildTables(l, cd) |
|
if err != nil { |
|
return err |
|
} |
|
defer func() { |
|
// Only assign to err, if it's not already nil. |
|
if decErr := decr(); err == nil { |
|
err = decErr |
|
} |
|
}() |
|
changeSet := buildChangeSet(&cd, newTables) |
|
|
|
// We write to the manifest _before_ we delete files (and after we created files) |
|
if err := s.kv.manifest.addChanges(changeSet.Changes); err != nil { |
|
return err |
|
} |
|
|
|
// See comment earlier in this function about the ordering of these ops, and the order in which |
|
// we access levels when reading. |
|
if err := nextLevel.replaceTables(newTables); err != nil { |
|
return err |
|
} |
|
if err := thisLevel.deleteTables(cd.top); err != nil { |
|
return err |
|
} |
|
|
|
// Note: For level 0, while doCompact is running, it is possible that new tables are added. |
|
// However, the tables are added only to the end, so it is ok to just delete the first table. |
|
|
|
cd.elog.LazyPrintf("LOG Compact %d->%d, del %d tables, add %d tables, took %v\n", |
|
l, l+1, len(cd.top)+len(cd.bot), len(newTables), time.Since(timeStart)) |
|
return nil |
|
} |
|
|
|
// doCompact picks some table on level l and compacts it away to the next level. |
|
func (s *levelsController) doCompact(p compactionPriority) (bool, error) { |
|
l := p.level |
|
y.AssertTrue(l+1 < s.kv.opt.MaxLevels) // Sanity check. |
|
|
|
cd := compactDef{ |
|
elog: trace.New(fmt.Sprintf("Badger.L%d", l), "Compact"), |
|
thisLevel: s.levels[l], |
|
nextLevel: s.levels[l+1], |
|
} |
|
cd.elog.SetMaxEvents(100) |
|
defer cd.elog.Finish() |
|
|
|
cd.elog.LazyPrintf("Got compaction priority: %+v", p) |
|
|
|
// While picking tables to be compacted, both levels' tables are expected to |
|
// remain unchanged. |
|
if l == 0 { |
|
if !s.fillTablesL0(&cd) { |
|
cd.elog.LazyPrintf("fillTables failed for level: %d\n", l) |
|
return false, nil |
|
} |
|
|
|
} else { |
|
if !s.fillTables(&cd) { |
|
cd.elog.LazyPrintf("fillTables failed for level: %d\n", l) |
|
return false, nil |
|
} |
|
} |
|
defer s.cstatus.delete(cd) // Remove the ranges from compaction status. |
|
|
|
cd.elog.LazyPrintf("Running for level: %d\n", cd.thisLevel.level) |
|
s.cstatus.toLog(cd.elog) |
|
if err := s.runCompactDef(l, cd); err != nil { |
|
// This compaction couldn't be done successfully. |
|
cd.elog.LazyPrintf("\tLOG Compact FAILED with error: %+v: %+v", err, cd) |
|
return false, err |
|
} |
|
|
|
s.cstatus.toLog(cd.elog) |
|
cd.elog.LazyPrintf("Compaction for level: %d DONE", cd.thisLevel.level) |
|
return true, nil |
|
} |
|
|
|
func (s *levelsController) addLevel0Table(t *table.Table) error { |
|
// We update the manifest _before_ the table becomes part of a levelHandler, because at that |
|
// point it could get used in some compaction. This ensures the manifest file gets updated in |
|
// the proper order. (That means this update happens before that of some compaction which |
|
// deletes the table.) |
|
err := s.kv.manifest.addChanges([]*protos.ManifestChange{ |
|
makeTableCreateChange(t.ID(), 0), |
|
}) |
|
if err != nil { |
|
return err |
|
} |
|
|
|
for !s.levels[0].tryAddLevel0Table(t) { |
|
// Stall. Make sure all levels are healthy before we unstall. |
|
var timeStart time.Time |
|
{ |
|
s.elog.Printf("STALLED STALLED STALLED STALLED STALLED STALLED STALLED STALLED: %v\n", |
|
time.Since(lastUnstalled)) |
|
s.cstatus.RLock() |
|
for i := 0; i < s.kv.opt.MaxLevels; i++ { |
|
s.elog.Printf("level=%d. Status=%s Size=%d\n", |
|
i, s.cstatus.levels[i].debug(), s.levels[i].getTotalSize()) |
|
} |
|
s.cstatus.RUnlock() |
|
timeStart = time.Now() |
|
} |
|
// Before we unstall, we need to make sure that level 0 and 1 are healthy. Otherwise, we |
|
// will very quickly fill up level 0 again and if the compaction strategy favors level 0, |
|
// then level 1 is going to super full. |
|
for i := 0; ; i++ { |
|
// Passing 0 for delSize to compactable means we're treating incomplete compactions as |
|
// not having finished -- we wait for them to finish. Also, it's crucial this behavior |
|
// replicates pickCompactLevels' behavior in computing compactability in order to |
|
// guarantee progress. |
|
if !s.isLevel0Compactable() && !s.levels[1].isCompactable(0) { |
|
break |
|
} |
|
time.Sleep(10 * time.Millisecond) |
|
if i%100 == 0 { |
|
prios := s.pickCompactLevels() |
|
s.elog.Printf("Waiting to add level 0 table. Compaction priorities: %+v\n", prios) |
|
i = 0 |
|
} |
|
} |
|
{ |
|
s.elog.Printf("UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED UNSTALLED: %v\n", |
|
time.Since(timeStart)) |
|
lastUnstalled = time.Now() |
|
} |
|
} |
|
|
|
return nil |
|
} |
|
|
|
func (s *levelsController) close() error { |
|
err := s.cleanupLevels() |
|
return errors.Wrap(err, "levelsController.Close") |
|
} |
|
|
|
// get returns the found value if any. If not found, we return nil. |
|
func (s *levelsController) get(key []byte) (y.ValueStruct, error) { |
|
// It's important that we iterate the levels from 0 on upward. The reason is, if we iterated |
|
// in opposite order, or in parallel (naively calling all the h.RLock() in some order) we could |
|
// read level L's tables post-compaction and level L+1's tables pre-compaction. (If we do |
|
// parallelize this, we will need to call the h.RLock() function by increasing order of level |
|
// number.) |
|
for _, h := range s.levels { |
|
vs, err := h.get(key) // Calls h.RLock() and h.RUnlock(). |
|
if err != nil { |
|
return y.ValueStruct{}, errors.Wrapf(err, "get key: %q", key) |
|
} |
|
if vs.Value == nil && vs.Meta == 0 { |
|
continue |
|
} |
|
return vs, nil |
|
} |
|
return y.ValueStruct{}, nil |
|
} |
|
|
|
func appendIteratorsReversed(out []y.Iterator, th []*table.Table, reversed bool) []y.Iterator { |
|
for i := len(th) - 1; i >= 0; i-- { |
|
// This will increment the reference of the table handler. |
|
out = append(out, th[i].NewIterator(reversed)) |
|
} |
|
return out |
|
} |
|
|
|
// appendIterators appends iterators to an array of iterators, for merging. |
|
// Note: This obtains references for the table handlers. Remember to close these iterators. |
|
func (s *levelsController) appendIterators( |
|
iters []y.Iterator, reversed bool) []y.Iterator { |
|
// Just like with get, it's important we iterate the levels from 0 on upward, to avoid missing |
|
// data when there's a compaction. |
|
for _, level := range s.levels { |
|
iters = level.appendIterators(iters, reversed) |
|
} |
|
return iters |
|
} |
|
|
|
type TableInfo struct { |
|
ID uint64 |
|
Level int |
|
Left []byte |
|
Right []byte |
|
} |
|
|
|
func (s *levelsController) getTableInfo() (result []TableInfo) { |
|
for _, l := range s.levels { |
|
for _, t := range l.tables { |
|
info := TableInfo{ |
|
ID: t.ID(), |
|
Level: l.level, |
|
Left: t.Smallest(), |
|
Right: t.Biggest(), |
|
} |
|
result = append(result, info) |
|
} |
|
} |
|
sort.Slice(result, func(i, j int) bool { |
|
if result[i].Level != result[j].Level { |
|
return result[i].Level < result[j].Level |
|
} |
|
return result[i].ID < result[j].ID |
|
}) |
|
return |
|
}
|
|
|