You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
604 lines
17 KiB
604 lines
17 KiB
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
package badger |
|
|
|
import ( |
|
"bytes" |
|
"math" |
|
"sort" |
|
"strconv" |
|
"sync" |
|
"sync/atomic" |
|
"time" |
|
|
|
"github.com/dgraph-io/badger/y" |
|
farm "github.com/dgryski/go-farm" |
|
"github.com/pkg/errors" |
|
) |
|
|
|
type oracle struct { |
|
// curRead must be at the top for memory alignment. See issue #311. |
|
curRead uint64 // Managed by the mutex. |
|
refCount int64 |
|
isManaged bool // Does not change value, so no locking required. |
|
|
|
sync.Mutex |
|
writeLock sync.Mutex |
|
nextCommit uint64 |
|
|
|
// Either of these is used to determine which versions can be permanently |
|
// discarded during compaction. |
|
discardTs uint64 // Used by ManagedDB. |
|
readMark y.WaterMark // Used by DB. |
|
|
|
// commits stores a key fingerprint and latest commit counter for it. |
|
// refCount is used to clear out commits map to avoid a memory blowup. |
|
commits map[uint64]uint64 |
|
} |
|
|
|
func (o *oracle) addRef() { |
|
atomic.AddInt64(&o.refCount, 1) |
|
} |
|
|
|
func (o *oracle) decrRef() { |
|
if count := atomic.AddInt64(&o.refCount, -1); count == 0 { |
|
// Clear out commits maps to release memory. |
|
o.Lock() |
|
// Avoids the race where something new is added to commitsMap |
|
// after we check refCount and before we take Lock. |
|
if atomic.LoadInt64(&o.refCount) != 0 { |
|
o.Unlock() |
|
return |
|
} |
|
if len(o.commits) >= 1000 { // If the map is still small, let it slide. |
|
o.commits = make(map[uint64]uint64) |
|
} |
|
o.Unlock() |
|
} |
|
} |
|
|
|
func (o *oracle) readTs() uint64 { |
|
if o.isManaged { |
|
return math.MaxUint64 |
|
} |
|
return atomic.LoadUint64(&o.curRead) |
|
} |
|
|
|
func (o *oracle) commitTs() uint64 { |
|
o.Lock() |
|
defer o.Unlock() |
|
return o.nextCommit |
|
} |
|
|
|
// Any deleted or invalid versions at or below ts would be discarded during |
|
// compaction to reclaim disk space in LSM tree and thence value log. |
|
func (o *oracle) setDiscardTs(ts uint64) { |
|
o.Lock() |
|
defer o.Unlock() |
|
o.discardTs = ts |
|
} |
|
|
|
func (o *oracle) discardAtOrBelow() uint64 { |
|
if o.isManaged { |
|
o.Lock() |
|
defer o.Unlock() |
|
return o.discardTs |
|
} |
|
return o.readMark.MinReadTs() |
|
} |
|
|
|
// hasConflict must be called while having a lock. |
|
func (o *oracle) hasConflict(txn *Txn) bool { |
|
if len(txn.reads) == 0 { |
|
return false |
|
} |
|
for _, ro := range txn.reads { |
|
if ts, has := o.commits[ro]; has && ts > txn.readTs { |
|
return true |
|
} |
|
} |
|
return false |
|
} |
|
|
|
func (o *oracle) newCommitTs(txn *Txn) uint64 { |
|
o.Lock() |
|
defer o.Unlock() |
|
|
|
if o.hasConflict(txn) { |
|
return 0 |
|
} |
|
|
|
var ts uint64 |
|
if !o.isManaged { |
|
// This is the general case, when user doesn't specify the read and commit ts. |
|
ts = o.nextCommit |
|
o.nextCommit++ |
|
|
|
} else { |
|
// If commitTs is set, use it instead. |
|
ts = txn.commitTs |
|
} |
|
|
|
for _, w := range txn.writes { |
|
o.commits[w] = ts // Update the commitTs. |
|
} |
|
return ts |
|
} |
|
|
|
func (o *oracle) doneCommit(cts uint64) { |
|
if o.isManaged { |
|
// No need to update anything. |
|
return |
|
} |
|
|
|
for { |
|
curRead := atomic.LoadUint64(&o.curRead) |
|
if cts <= curRead { |
|
return |
|
} |
|
atomic.CompareAndSwapUint64(&o.curRead, curRead, cts) |
|
} |
|
} |
|
|
|
// Txn represents a Badger transaction. |
|
type Txn struct { |
|
readTs uint64 |
|
commitTs uint64 |
|
|
|
update bool // update is used to conditionally keep track of reads. |
|
reads []uint64 // contains fingerprints of keys read. |
|
writes []uint64 // contains fingerprints of keys written. |
|
|
|
pendingWrites map[string]*Entry // cache stores any writes done by txn. |
|
|
|
db *DB |
|
callbacks []func() |
|
discarded bool |
|
|
|
size int64 |
|
count int64 |
|
numIterators int32 |
|
} |
|
|
|
type pendingWritesIterator struct { |
|
entries []*Entry |
|
nextIdx int |
|
readTs uint64 |
|
reversed bool |
|
} |
|
|
|
func (pi *pendingWritesIterator) Next() { |
|
pi.nextIdx++ |
|
} |
|
|
|
func (pi *pendingWritesIterator) Rewind() { |
|
pi.nextIdx = 0 |
|
} |
|
|
|
func (pi *pendingWritesIterator) Seek(key []byte) { |
|
key = y.ParseKey(key) |
|
pi.nextIdx = sort.Search(len(pi.entries), func(idx int) bool { |
|
cmp := bytes.Compare(pi.entries[idx].Key, key) |
|
if !pi.reversed { |
|
return cmp >= 0 |
|
} |
|
return cmp <= 0 |
|
}) |
|
} |
|
|
|
func (pi *pendingWritesIterator) Key() []byte { |
|
y.AssertTrue(pi.Valid()) |
|
entry := pi.entries[pi.nextIdx] |
|
return y.KeyWithTs(entry.Key, pi.readTs) |
|
} |
|
|
|
func (pi *pendingWritesIterator) Value() y.ValueStruct { |
|
y.AssertTrue(pi.Valid()) |
|
entry := pi.entries[pi.nextIdx] |
|
return y.ValueStruct{ |
|
Value: entry.Value, |
|
Meta: entry.meta, |
|
UserMeta: entry.UserMeta, |
|
ExpiresAt: entry.ExpiresAt, |
|
Version: pi.readTs, |
|
} |
|
} |
|
|
|
func (pi *pendingWritesIterator) Valid() bool { |
|
return pi.nextIdx < len(pi.entries) |
|
} |
|
|
|
func (pi *pendingWritesIterator) Close() error { |
|
return nil |
|
} |
|
|
|
func (txn *Txn) newPendingWritesIterator(reversed bool) *pendingWritesIterator { |
|
if !txn.update || len(txn.pendingWrites) == 0 { |
|
return nil |
|
} |
|
entries := make([]*Entry, 0, len(txn.pendingWrites)) |
|
for _, e := range txn.pendingWrites { |
|
entries = append(entries, e) |
|
} |
|
// Number of pending writes per transaction shouldn't be too big in general. |
|
sort.Slice(entries, func(i, j int) bool { |
|
cmp := bytes.Compare(entries[i].Key, entries[j].Key) |
|
if !reversed { |
|
return cmp < 0 |
|
} |
|
return cmp > 0 |
|
}) |
|
return &pendingWritesIterator{ |
|
readTs: txn.readTs, |
|
entries: entries, |
|
reversed: reversed, |
|
} |
|
} |
|
|
|
func (txn *Txn) checkSize(e *Entry) error { |
|
count := txn.count + 1 |
|
// Extra bytes for version in key. |
|
size := txn.size + int64(e.estimateSize(txn.db.opt.ValueThreshold)) + 10 |
|
if count >= txn.db.opt.maxBatchCount || size >= txn.db.opt.maxBatchSize { |
|
return ErrTxnTooBig |
|
} |
|
txn.count, txn.size = count, size |
|
return nil |
|
} |
|
|
|
// Set adds a key-value pair to the database. |
|
// |
|
// It will return ErrReadOnlyTxn if update flag was set to false when creating the |
|
// transaction. |
|
// |
|
// The current transaction keeps a reference to the key and val byte slice |
|
// arguments. Users must not modify key and val until the end of the transaction. |
|
func (txn *Txn) Set(key, val []byte) error { |
|
e := &Entry{ |
|
Key: key, |
|
Value: val, |
|
} |
|
return txn.SetEntry(e) |
|
} |
|
|
|
// SetWithMeta adds a key-value pair to the database, along with a metadata |
|
// byte. |
|
// |
|
// This byte is stored alongside the key, and can be used as an aid to |
|
// interpret the value or store other contextual bits corresponding to the |
|
// key-value pair. |
|
// |
|
// The current transaction keeps a reference to the key and val byte slice |
|
// arguments. Users must not modify key and val until the end of the transaction. |
|
func (txn *Txn) SetWithMeta(key, val []byte, meta byte) error { |
|
e := &Entry{Key: key, Value: val, UserMeta: meta} |
|
return txn.SetEntry(e) |
|
} |
|
|
|
// SetWithDiscard acts like SetWithMeta, but adds a marker to discard earlier |
|
// versions of the key. |
|
// |
|
// This method is only useful if you have set a higher limit for |
|
// options.NumVersionsToKeep. The default setting is 1, in which case, this |
|
// function doesn't add any more benefit than just calling the normal |
|
// SetWithMeta (or Set) function. If however, you have a higher setting for |
|
// NumVersionsToKeep (in Dgraph, we set it to infinity), you can use this method |
|
// to indicate that all the older versions can be discarded and removed during |
|
// compactions. |
|
// |
|
// The current transaction keeps a reference to the key and val byte slice |
|
// arguments. Users must not modify key and val until the end of the |
|
// transaction. |
|
func (txn *Txn) SetWithDiscard(key, val []byte, meta byte) error { |
|
e := &Entry{ |
|
Key: key, |
|
Value: val, |
|
UserMeta: meta, |
|
meta: bitDiscardEarlierVersions, |
|
} |
|
return txn.SetEntry(e) |
|
} |
|
|
|
// SetWithTTL adds a key-value pair to the database, along with a time-to-live |
|
// (TTL) setting. A key stored with a TTL would automatically expire after the |
|
// time has elapsed , and be eligible for garbage collection. |
|
// |
|
// The current transaction keeps a reference to the key and val byte slice |
|
// arguments. Users must not modify key and val until the end of the |
|
// transaction. |
|
func (txn *Txn) SetWithTTL(key, val []byte, dur time.Duration) error { |
|
expire := time.Now().Add(dur).Unix() |
|
e := &Entry{Key: key, Value: val, ExpiresAt: uint64(expire)} |
|
return txn.SetEntry(e) |
|
} |
|
|
|
func (txn *Txn) modify(e *Entry) error { |
|
if !txn.update { |
|
return ErrReadOnlyTxn |
|
} else if txn.discarded { |
|
return ErrDiscardedTxn |
|
} else if len(e.Key) == 0 { |
|
return ErrEmptyKey |
|
} else if len(e.Key) > maxKeySize { |
|
return exceedsMaxKeySizeError(e.Key) |
|
} else if int64(len(e.Value)) > txn.db.opt.ValueLogFileSize { |
|
return exceedsMaxValueSizeError(e.Value, txn.db.opt.ValueLogFileSize) |
|
} |
|
if err := txn.checkSize(e); err != nil { |
|
return err |
|
} |
|
|
|
fp := farm.Fingerprint64(e.Key) // Avoid dealing with byte arrays. |
|
txn.writes = append(txn.writes, fp) |
|
txn.pendingWrites[string(e.Key)] = e |
|
return nil |
|
} |
|
|
|
// SetEntry takes an Entry struct and adds the key-value pair in the struct, |
|
// along with other metadata to the database. |
|
// |
|
// The current transaction keeps a reference to the entry passed in argument. |
|
// Users must not modify the entry until the end of the transaction. |
|
func (txn *Txn) SetEntry(e *Entry) error { |
|
return txn.modify(e) |
|
} |
|
|
|
// Delete deletes a key. |
|
// |
|
// This is done by adding a delete marker for the key at commit timestamp. Any |
|
// reads happening before this timestamp would be unaffected. Any reads after |
|
// this commit would see the deletion. |
|
// |
|
// The current transaction keeps a reference to the key byte slice argument. |
|
// Users must not modify the key until the end of the transaction. |
|
func (txn *Txn) Delete(key []byte) error { |
|
e := &Entry{ |
|
Key: key, |
|
meta: bitDelete, |
|
} |
|
return txn.modify(e) |
|
} |
|
|
|
// Get looks for key and returns corresponding Item. |
|
// If key is not found, ErrKeyNotFound is returned. |
|
func (txn *Txn) Get(key []byte) (item *Item, rerr error) { |
|
if len(key) == 0 { |
|
return nil, ErrEmptyKey |
|
} else if txn.discarded { |
|
return nil, ErrDiscardedTxn |
|
} |
|
|
|
item = new(Item) |
|
if txn.update { |
|
if e, has := txn.pendingWrites[string(key)]; has && bytes.Equal(key, e.Key) { |
|
if isDeletedOrExpired(e.meta, e.ExpiresAt) { |
|
return nil, ErrKeyNotFound |
|
} |
|
// Fulfill from cache. |
|
item.meta = e.meta |
|
item.val = e.Value |
|
item.userMeta = e.UserMeta |
|
item.key = key |
|
item.status = prefetched |
|
item.version = txn.readTs |
|
item.expiresAt = e.ExpiresAt |
|
// We probably don't need to set db on item here. |
|
return item, nil |
|
} |
|
// Only track reads if this is update txn. No need to track read if txn serviced it |
|
// internally. |
|
fp := farm.Fingerprint64(key) |
|
txn.reads = append(txn.reads, fp) |
|
} |
|
|
|
seek := y.KeyWithTs(key, txn.readTs) |
|
vs, err := txn.db.get(seek) |
|
if err != nil { |
|
return nil, errors.Wrapf(err, "DB::Get key: %q", key) |
|
} |
|
if vs.Value == nil && vs.Meta == 0 { |
|
return nil, ErrKeyNotFound |
|
} |
|
if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) { |
|
return nil, ErrKeyNotFound |
|
} |
|
|
|
item.key = key |
|
item.version = vs.Version |
|
item.meta = vs.Meta |
|
item.userMeta = vs.UserMeta |
|
item.db = txn.db |
|
item.vptr = vs.Value |
|
item.txn = txn |
|
item.expiresAt = vs.ExpiresAt |
|
return item, nil |
|
} |
|
|
|
func (txn *Txn) runCallbacks() { |
|
for _, cb := range txn.callbacks { |
|
cb() |
|
} |
|
txn.callbacks = txn.callbacks[:0] |
|
} |
|
|
|
// Discard discards a created transaction. This method is very important and must be called. Commit |
|
// method calls this internally, however, calling this multiple times doesn't cause any issues. So, |
|
// this can safely be called via a defer right when transaction is created. |
|
// |
|
// NOTE: If any operations are run on a discarded transaction, ErrDiscardedTxn is returned. |
|
func (txn *Txn) Discard() { |
|
if txn.discarded { // Avoid a re-run. |
|
return |
|
} |
|
if atomic.LoadInt32(&txn.numIterators) > 0 { |
|
panic("Unclosed iterator at time of Txn.Discard.") |
|
} |
|
txn.discarded = true |
|
txn.db.orc.readMark.Done(txn.readTs) |
|
txn.runCallbacks() |
|
if txn.update { |
|
txn.db.orc.decrRef() |
|
} |
|
} |
|
|
|
// Commit commits the transaction, following these steps: |
|
// |
|
// 1. If there are no writes, return immediately. |
|
// |
|
// 2. Check if read rows were updated since txn started. If so, return ErrConflict. |
|
// |
|
// 3. If no conflict, generate a commit timestamp and update written rows' commit ts. |
|
// |
|
// 4. Batch up all writes, write them to value log and LSM tree. |
|
// |
|
// 5. If callback is provided, Badger will return immediately after checking |
|
// for conflicts. Writes to the database will happen in the background. If |
|
// there is a conflict, an error will be returned and the callback will not |
|
// run. If there are no conflicts, the callback will be called in the |
|
// background upon successful completion of writes or any error during write. |
|
// |
|
// If error is nil, the transaction is successfully committed. In case of a non-nil error, the LSM |
|
// tree won't be updated, so there's no need for any rollback. |
|
func (txn *Txn) Commit(callback func(error)) error { |
|
if txn.commitTs == 0 && txn.db.opt.managedTxns { |
|
return ErrManagedTxn |
|
} |
|
if txn.discarded { |
|
return ErrDiscardedTxn |
|
} |
|
defer txn.Discard() |
|
if len(txn.writes) == 0 { |
|
return nil // Nothing to do. |
|
} |
|
|
|
state := txn.db.orc |
|
state.writeLock.Lock() |
|
commitTs := state.newCommitTs(txn) |
|
if commitTs == 0 { |
|
state.writeLock.Unlock() |
|
return ErrConflict |
|
} |
|
|
|
entries := make([]*Entry, 0, len(txn.pendingWrites)+1) |
|
for _, e := range txn.pendingWrites { |
|
// Suffix the keys with commit ts, so the key versions are sorted in |
|
// descending order of commit timestamp. |
|
e.Key = y.KeyWithTs(e.Key, commitTs) |
|
e.meta |= bitTxn |
|
entries = append(entries, e) |
|
} |
|
e := &Entry{ |
|
Key: y.KeyWithTs(txnKey, commitTs), |
|
Value: []byte(strconv.FormatUint(commitTs, 10)), |
|
meta: bitFinTxn, |
|
} |
|
entries = append(entries, e) |
|
|
|
req, err := txn.db.sendToWriteCh(entries) |
|
state.writeLock.Unlock() |
|
if err != nil { |
|
return err |
|
} |
|
|
|
// Need to release all locks or writes can get deadlocked. |
|
txn.runCallbacks() |
|
|
|
if callback == nil { |
|
// If batchSet failed, LSM would not have been updated. So, no need to rollback anything. |
|
|
|
// TODO: What if some of the txns successfully make it to value log, but others fail. |
|
// Nothing gets updated to LSM, until a restart happens. |
|
defer state.doneCommit(commitTs) |
|
return req.Wait() |
|
} |
|
go func() { |
|
err := req.Wait() |
|
// Write is complete. Let's call the callback function now. |
|
state.doneCommit(commitTs) |
|
callback(err) |
|
}() |
|
return nil |
|
} |
|
|
|
// NewTransaction creates a new transaction. Badger supports concurrent execution of transactions, |
|
// providing serializable snapshot isolation, avoiding write skews. Badger achieves this by tracking |
|
// the keys read and at Commit time, ensuring that these read keys weren't concurrently modified by |
|
// another transaction. |
|
// |
|
// For read-only transactions, set update to false. In this mode, we don't track the rows read for |
|
// any changes. Thus, any long running iterations done in this mode wouldn't pay this overhead. |
|
// |
|
// Running transactions concurrently is OK. However, a transaction itself isn't thread safe, and |
|
// should only be run serially. It doesn't matter if a transaction is created by one goroutine and |
|
// passed down to other, as long as the Txn APIs are called serially. |
|
// |
|
// When you create a new transaction, it is absolutely essential to call |
|
// Discard(). This should be done irrespective of what the update param is set |
|
// to. Commit API internally runs Discard, but running it twice wouldn't cause |
|
// any issues. |
|
// |
|
// txn := db.NewTransaction(false) |
|
// defer txn.Discard() |
|
// // Call various APIs. |
|
func (db *DB) NewTransaction(update bool) *Txn { |
|
if db.opt.ReadOnly && update { |
|
// DB is read-only, force read-only transaction. |
|
update = false |
|
} |
|
|
|
txn := &Txn{ |
|
update: update, |
|
db: db, |
|
readTs: db.orc.readTs(), |
|
count: 1, // One extra entry for BitFin. |
|
size: int64(len(txnKey) + 10), // Some buffer for the extra entry. |
|
} |
|
db.orc.readMark.Begin(txn.readTs) |
|
if update { |
|
txn.pendingWrites = make(map[string]*Entry) |
|
txn.db.orc.addRef() |
|
} |
|
return txn |
|
} |
|
|
|
// View executes a function creating and managing a read-only transaction for the user. Error |
|
// returned by the function is relayed by the View method. |
|
func (db *DB) View(fn func(txn *Txn) error) error { |
|
if db.opt.managedTxns { |
|
return ErrManagedTxn |
|
} |
|
txn := db.NewTransaction(false) |
|
defer txn.Discard() |
|
|
|
return fn(txn) |
|
} |
|
|
|
// Update executes a function, creating and managing a read-write transaction |
|
// for the user. Error returned by the function is relayed by the Update method. |
|
func (db *DB) Update(fn func(txn *Txn) error) error { |
|
if db.opt.managedTxns { |
|
return ErrManagedTxn |
|
} |
|
txn := db.NewTransaction(true) |
|
defer txn.Discard() |
|
|
|
if err := fn(txn); err != nil { |
|
return err |
|
} |
|
|
|
return txn.Commit(nil) |
|
}
|
|
|