You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
464 lines
11 KiB
464 lines
11 KiB
// Copyright (c) 2012, Suryandaru Triandana <[email protected]> |
|
// All rights reserved. |
|
// |
|
// Use of this source code is governed by a BSD-style license that can be |
|
// found in the LICENSE file. |
|
|
|
package leveldb |
|
|
|
import ( |
|
"sync/atomic" |
|
"time" |
|
|
|
"github.com/syndtr/goleveldb/leveldb/memdb" |
|
"github.com/syndtr/goleveldb/leveldb/opt" |
|
"github.com/syndtr/goleveldb/leveldb/util" |
|
) |
|
|
|
func (db *DB) writeJournal(batches []*Batch, seq uint64, sync bool) error { |
|
wr, err := db.journal.Next() |
|
if err != nil { |
|
return err |
|
} |
|
if err := writeBatchesWithHeader(wr, batches, seq); err != nil { |
|
return err |
|
} |
|
if err := db.journal.Flush(); err != nil { |
|
return err |
|
} |
|
if sync { |
|
return db.journalWriter.Sync() |
|
} |
|
return nil |
|
} |
|
|
|
func (db *DB) rotateMem(n int, wait bool) (mem *memDB, err error) { |
|
retryLimit := 3 |
|
retry: |
|
// Wait for pending memdb compaction. |
|
err = db.compTriggerWait(db.mcompCmdC) |
|
if err != nil { |
|
return |
|
} |
|
retryLimit-- |
|
|
|
// Create new memdb and journal. |
|
mem, err = db.newMem(n) |
|
if err != nil { |
|
if err == errHasFrozenMem { |
|
if retryLimit <= 0 { |
|
panic("BUG: still has frozen memdb") |
|
} |
|
goto retry |
|
} |
|
return |
|
} |
|
|
|
// Schedule memdb compaction. |
|
if wait { |
|
err = db.compTriggerWait(db.mcompCmdC) |
|
} else { |
|
db.compTrigger(db.mcompCmdC) |
|
} |
|
return |
|
} |
|
|
|
func (db *DB) flush(n int) (mdb *memDB, mdbFree int, err error) { |
|
delayed := false |
|
slowdownTrigger := db.s.o.GetWriteL0SlowdownTrigger() |
|
pauseTrigger := db.s.o.GetWriteL0PauseTrigger() |
|
flush := func() (retry bool) { |
|
mdb = db.getEffectiveMem() |
|
if mdb == nil { |
|
err = ErrClosed |
|
return false |
|
} |
|
defer func() { |
|
if retry { |
|
mdb.decref() |
|
mdb = nil |
|
} |
|
}() |
|
tLen := db.s.tLen(0) |
|
mdbFree = mdb.Free() |
|
switch { |
|
case tLen >= slowdownTrigger && !delayed: |
|
delayed = true |
|
time.Sleep(time.Millisecond) |
|
case mdbFree >= n: |
|
return false |
|
case tLen >= pauseTrigger: |
|
delayed = true |
|
// Set the write paused flag explicitly. |
|
atomic.StoreInt32(&db.inWritePaused, 1) |
|
err = db.compTriggerWait(db.tcompCmdC) |
|
// Unset the write paused flag. |
|
atomic.StoreInt32(&db.inWritePaused, 0) |
|
if err != nil { |
|
return false |
|
} |
|
default: |
|
// Allow memdb to grow if it has no entry. |
|
if mdb.Len() == 0 { |
|
mdbFree = n |
|
} else { |
|
mdb.decref() |
|
mdb, err = db.rotateMem(n, false) |
|
if err == nil { |
|
mdbFree = mdb.Free() |
|
} else { |
|
mdbFree = 0 |
|
} |
|
} |
|
return false |
|
} |
|
return true |
|
} |
|
start := time.Now() |
|
for flush() { |
|
} |
|
if delayed { |
|
db.writeDelay += time.Since(start) |
|
db.writeDelayN++ |
|
} else if db.writeDelayN > 0 { |
|
db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay) |
|
atomic.AddInt32(&db.cWriteDelayN, int32(db.writeDelayN)) |
|
atomic.AddInt64(&db.cWriteDelay, int64(db.writeDelay)) |
|
db.writeDelay = 0 |
|
db.writeDelayN = 0 |
|
} |
|
return |
|
} |
|
|
|
type writeMerge struct { |
|
sync bool |
|
batch *Batch |
|
keyType keyType |
|
key, value []byte |
|
} |
|
|
|
func (db *DB) unlockWrite(overflow bool, merged int, err error) { |
|
for i := 0; i < merged; i++ { |
|
db.writeAckC <- err |
|
} |
|
if overflow { |
|
// Pass lock to the next write (that failed to merge). |
|
db.writeMergedC <- false |
|
} else { |
|
// Release lock. |
|
<-db.writeLockC |
|
} |
|
} |
|
|
|
// ourBatch is batch that we can modify. |
|
func (db *DB) writeLocked(batch, ourBatch *Batch, merge, sync bool) error { |
|
// Try to flush memdb. This method would also trying to throttle writes |
|
// if it is too fast and compaction cannot catch-up. |
|
mdb, mdbFree, err := db.flush(batch.internalLen) |
|
if err != nil { |
|
db.unlockWrite(false, 0, err) |
|
return err |
|
} |
|
defer mdb.decref() |
|
|
|
var ( |
|
overflow bool |
|
merged int |
|
batches = []*Batch{batch} |
|
) |
|
|
|
if merge { |
|
// Merge limit. |
|
var mergeLimit int |
|
if batch.internalLen > 128<<10 { |
|
mergeLimit = (1 << 20) - batch.internalLen |
|
} else { |
|
mergeLimit = 128 << 10 |
|
} |
|
mergeCap := mdbFree - batch.internalLen |
|
if mergeLimit > mergeCap { |
|
mergeLimit = mergeCap |
|
} |
|
|
|
merge: |
|
for mergeLimit > 0 { |
|
select { |
|
case incoming := <-db.writeMergeC: |
|
if incoming.batch != nil { |
|
// Merge batch. |
|
if incoming.batch.internalLen > mergeLimit { |
|
overflow = true |
|
break merge |
|
} |
|
batches = append(batches, incoming.batch) |
|
mergeLimit -= incoming.batch.internalLen |
|
} else { |
|
// Merge put. |
|
internalLen := len(incoming.key) + len(incoming.value) + 8 |
|
if internalLen > mergeLimit { |
|
overflow = true |
|
break merge |
|
} |
|
if ourBatch == nil { |
|
ourBatch = db.batchPool.Get().(*Batch) |
|
ourBatch.Reset() |
|
batches = append(batches, ourBatch) |
|
} |
|
// We can use same batch since concurrent write doesn't |
|
// guarantee write order. |
|
ourBatch.appendRec(incoming.keyType, incoming.key, incoming.value) |
|
mergeLimit -= internalLen |
|
} |
|
sync = sync || incoming.sync |
|
merged++ |
|
db.writeMergedC <- true |
|
|
|
default: |
|
break merge |
|
} |
|
} |
|
} |
|
|
|
// Release ourBatch if any. |
|
if ourBatch != nil { |
|
defer db.batchPool.Put(ourBatch) |
|
} |
|
|
|
// Seq number. |
|
seq := db.seq + 1 |
|
|
|
// Write journal. |
|
if err := db.writeJournal(batches, seq, sync); err != nil { |
|
db.unlockWrite(overflow, merged, err) |
|
return err |
|
} |
|
|
|
// Put batches. |
|
for _, batch := range batches { |
|
if err := batch.putMem(seq, mdb.DB); err != nil { |
|
panic(err) |
|
} |
|
seq += uint64(batch.Len()) |
|
} |
|
|
|
// Incr seq number. |
|
db.addSeq(uint64(batchesLen(batches))) |
|
|
|
// Rotate memdb if it's reach the threshold. |
|
if batch.internalLen >= mdbFree { |
|
db.rotateMem(0, false) |
|
} |
|
|
|
db.unlockWrite(overflow, merged, nil) |
|
return nil |
|
} |
|
|
|
// Write apply the given batch to the DB. The batch records will be applied |
|
// sequentially. Write might be used concurrently, when used concurrently and |
|
// batch is small enough, write will try to merge the batches. Set NoWriteMerge |
|
// option to true to disable write merge. |
|
// |
|
// It is safe to modify the contents of the arguments after Write returns but |
|
// not before. Write will not modify content of the batch. |
|
func (db *DB) Write(batch *Batch, wo *opt.WriteOptions) error { |
|
if err := db.ok(); err != nil || batch == nil || batch.Len() == 0 { |
|
return err |
|
} |
|
|
|
// If the batch size is larger than write buffer, it may justified to write |
|
// using transaction instead. Using transaction the batch will be written |
|
// into tables directly, skipping the journaling. |
|
if batch.internalLen > db.s.o.GetWriteBuffer() && !db.s.o.GetDisableLargeBatchTransaction() { |
|
tr, err := db.OpenTransaction() |
|
if err != nil { |
|
return err |
|
} |
|
if err := tr.Write(batch, wo); err != nil { |
|
tr.Discard() |
|
return err |
|
} |
|
return tr.Commit() |
|
} |
|
|
|
merge := !wo.GetNoWriteMerge() && !db.s.o.GetNoWriteMerge() |
|
sync := wo.GetSync() && !db.s.o.GetNoSync() |
|
|
|
// Acquire write lock. |
|
if merge { |
|
select { |
|
case db.writeMergeC <- writeMerge{sync: sync, batch: batch}: |
|
if <-db.writeMergedC { |
|
// Write is merged. |
|
return <-db.writeAckC |
|
} |
|
// Write is not merged, the write lock is handed to us. Continue. |
|
case db.writeLockC <- struct{}{}: |
|
// Write lock acquired. |
|
case err := <-db.compPerErrC: |
|
// Compaction error. |
|
return err |
|
case <-db.closeC: |
|
// Closed |
|
return ErrClosed |
|
} |
|
} else { |
|
select { |
|
case db.writeLockC <- struct{}{}: |
|
// Write lock acquired. |
|
case err := <-db.compPerErrC: |
|
// Compaction error. |
|
return err |
|
case <-db.closeC: |
|
// Closed |
|
return ErrClosed |
|
} |
|
} |
|
|
|
return db.writeLocked(batch, nil, merge, sync) |
|
} |
|
|
|
func (db *DB) putRec(kt keyType, key, value []byte, wo *opt.WriteOptions) error { |
|
if err := db.ok(); err != nil { |
|
return err |
|
} |
|
|
|
merge := !wo.GetNoWriteMerge() && !db.s.o.GetNoWriteMerge() |
|
sync := wo.GetSync() && !db.s.o.GetNoSync() |
|
|
|
// Acquire write lock. |
|
if merge { |
|
select { |
|
case db.writeMergeC <- writeMerge{sync: sync, keyType: kt, key: key, value: value}: |
|
if <-db.writeMergedC { |
|
// Write is merged. |
|
return <-db.writeAckC |
|
} |
|
// Write is not merged, the write lock is handed to us. Continue. |
|
case db.writeLockC <- struct{}{}: |
|
// Write lock acquired. |
|
case err := <-db.compPerErrC: |
|
// Compaction error. |
|
return err |
|
case <-db.closeC: |
|
// Closed |
|
return ErrClosed |
|
} |
|
} else { |
|
select { |
|
case db.writeLockC <- struct{}{}: |
|
// Write lock acquired. |
|
case err := <-db.compPerErrC: |
|
// Compaction error. |
|
return err |
|
case <-db.closeC: |
|
// Closed |
|
return ErrClosed |
|
} |
|
} |
|
|
|
batch := db.batchPool.Get().(*Batch) |
|
batch.Reset() |
|
batch.appendRec(kt, key, value) |
|
return db.writeLocked(batch, batch, merge, sync) |
|
} |
|
|
|
// Put sets the value for the given key. It overwrites any previous value |
|
// for that key; a DB is not a multi-map. Write merge also applies for Put, see |
|
// Write. |
|
// |
|
// It is safe to modify the contents of the arguments after Put returns but not |
|
// before. |
|
func (db *DB) Put(key, value []byte, wo *opt.WriteOptions) error { |
|
return db.putRec(keyTypeVal, key, value, wo) |
|
} |
|
|
|
// Delete deletes the value for the given key. Delete will not returns error if |
|
// key doesn't exist. Write merge also applies for Delete, see Write. |
|
// |
|
// It is safe to modify the contents of the arguments after Delete returns but |
|
// not before. |
|
func (db *DB) Delete(key []byte, wo *opt.WriteOptions) error { |
|
return db.putRec(keyTypeDel, key, nil, wo) |
|
} |
|
|
|
func isMemOverlaps(icmp *iComparer, mem *memdb.DB, min, max []byte) bool { |
|
iter := mem.NewIterator(nil) |
|
defer iter.Release() |
|
return (max == nil || (iter.First() && icmp.uCompare(max, internalKey(iter.Key()).ukey()) >= 0)) && |
|
(min == nil || (iter.Last() && icmp.uCompare(min, internalKey(iter.Key()).ukey()) <= 0)) |
|
} |
|
|
|
// CompactRange compacts the underlying DB for the given key range. |
|
// In particular, deleted and overwritten versions are discarded, |
|
// and the data is rearranged to reduce the cost of operations |
|
// needed to access the data. This operation should typically only |
|
// be invoked by users who understand the underlying implementation. |
|
// |
|
// A nil Range.Start is treated as a key before all keys in the DB. |
|
// And a nil Range.Limit is treated as a key after all keys in the DB. |
|
// Therefore if both is nil then it will compact entire DB. |
|
func (db *DB) CompactRange(r util.Range) error { |
|
if err := db.ok(); err != nil { |
|
return err |
|
} |
|
|
|
// Lock writer. |
|
select { |
|
case db.writeLockC <- struct{}{}: |
|
case err := <-db.compPerErrC: |
|
return err |
|
case <-db.closeC: |
|
return ErrClosed |
|
} |
|
|
|
// Check for overlaps in memdb. |
|
mdb := db.getEffectiveMem() |
|
if mdb == nil { |
|
return ErrClosed |
|
} |
|
defer mdb.decref() |
|
if isMemOverlaps(db.s.icmp, mdb.DB, r.Start, r.Limit) { |
|
// Memdb compaction. |
|
if _, err := db.rotateMem(0, false); err != nil { |
|
<-db.writeLockC |
|
return err |
|
} |
|
<-db.writeLockC |
|
if err := db.compTriggerWait(db.mcompCmdC); err != nil { |
|
return err |
|
} |
|
} else { |
|
<-db.writeLockC |
|
} |
|
|
|
// Table compaction. |
|
return db.compTriggerRange(db.tcompCmdC, -1, r.Start, r.Limit) |
|
} |
|
|
|
// SetReadOnly makes DB read-only. It will stay read-only until reopened. |
|
func (db *DB) SetReadOnly() error { |
|
if err := db.ok(); err != nil { |
|
return err |
|
} |
|
|
|
// Lock writer. |
|
select { |
|
case db.writeLockC <- struct{}{}: |
|
db.compWriteLocking = true |
|
case err := <-db.compPerErrC: |
|
return err |
|
case <-db.closeC: |
|
return ErrClosed |
|
} |
|
|
|
// Set compaction read-only. |
|
select { |
|
case db.compErrSetC <- ErrReadOnly: |
|
case perr := <-db.compPerErrC: |
|
return perr |
|
case <-db.closeC: |
|
return ErrClosed |
|
} |
|
|
|
return nil |
|
}
|
|
|