|
|
@@ -7,8 +7,7 @@ import (
|
|
|
"log"
|
|
|
"os"
|
|
|
"runtime"
|
|
|
- "runtime/debug"
|
|
|
- "strings"
|
|
|
+ "sort"
|
|
|
"sync"
|
|
|
"time"
|
|
|
"unsafe"
|
|
|
@@ -23,6 +22,8 @@ const version = 2
|
|
|
// Represents a marker value to indicate that a file is a Bolt DB.
|
|
|
const magic uint32 = 0xED0CDAED
|
|
|
|
|
|
+const pgidNoFreelist pgid = 0xffffffffffffffff
|
|
|
+
|
|
|
// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
|
|
|
// syncing changes to a file. This is required as some operating systems,
|
|
|
// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
|
|
|
@@ -39,6 +40,19 @@ const (
|
|
|
// default page size for db is set to the OS page size.
|
|
|
var defaultPageSize = os.Getpagesize()
|
|
|
|
|
|
+// The time elapsed between consecutive file locking attempts.
|
|
|
+const flockRetryTimeout = 50 * time.Millisecond
|
|
|
+
|
|
|
+// FreelistType is the type of the freelist backend
|
|
|
+type FreelistType string
|
|
|
+
|
|
|
+const (
|
|
|
+ // FreelistArrayType indicates backend freelist type is array
|
|
|
+ FreelistArrayType = FreelistType("array")
|
|
|
+ // FreelistMapType indicates backend freelist type is hashmap
|
|
|
+ FreelistMapType = FreelistType("hashmap")
|
|
|
+)
|
|
|
+
|
|
|
// DB represents a collection of buckets persisted to a file on disk.
|
|
|
// All data access is performed through transactions which can be obtained through the DB.
|
|
|
// All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called.
|
|
|
@@ -61,6 +75,18 @@ type DB struct {
|
|
|
// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
|
|
|
NoSync bool
|
|
|
|
|
|
+ // When true, skips syncing freelist to disk. This improves the database
|
|
|
+ // write performance under normal operation, but requires a full database
|
|
|
+ // re-sync during recovery.
|
|
|
+ NoFreelistSync bool
|
|
|
+
|
|
|
+ // FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
|
|
|
+ // dramatic performance degradation if database is large and fragmentation in freelist is common.
|
|
|
+ // The alternative one is using hashmap, it is faster in almost all circumstances
|
|
|
+ // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
|
|
|
+ // The default type is array
|
|
|
+ FreelistType FreelistType
|
|
|
+
|
|
|
// When true, skips the truncate call when growing the database.
|
|
|
// Setting this to true is only safe on non-ext3/ext4 systems.
|
|
|
// Skipping truncation avoids preallocation of hard drive space and
|
|
|
@@ -95,9 +121,9 @@ type DB struct {
|
|
|
AllocSize int
|
|
|
|
|
|
path string
|
|
|
+ openFile func(string, int, os.FileMode) (*os.File, error)
|
|
|
file *os.File
|
|
|
- lockfile *os.File // windows only
|
|
|
- dataref []byte // mmap'ed readonly, write throws SEGV
|
|
|
+ dataref []byte // mmap'ed readonly, write throws SEGV
|
|
|
data *[maxMapSize]byte
|
|
|
datasz int
|
|
|
filesz int // current on disk file size
|
|
|
@@ -107,13 +133,15 @@ type DB struct {
|
|
|
opened bool
|
|
|
rwtx *Tx
|
|
|
txs []*Tx
|
|
|
- freelist *freelist
|
|
|
stats Stats
|
|
|
|
|
|
// [Psiphon]
|
|
|
// https://github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
|
|
|
mmapErr error // set on mmap failure; subsequently returned by all methods
|
|
|
|
|
|
+ freelist *freelist
|
|
|
+ freelistLoad sync.Once
|
|
|
+
|
|
|
pagePool sync.Pool
|
|
|
|
|
|
batchMu sync.Mutex
|
|
|
@@ -152,7 +180,9 @@ func (db *DB) String() string {
|
|
|
// If the file does not exist then it will be created automatically.
|
|
|
// Passing in nil options will cause Bolt to open the database with the default options.
|
|
|
func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
|
|
|
- var db = &DB{opened: true}
|
|
|
+ db := &DB{
|
|
|
+ opened: true,
|
|
|
+ }
|
|
|
|
|
|
// [Psiphon]
|
|
|
// Ensure cleanup on panic so recovery can reset a locked file.
|
|
|
@@ -167,8 +197,11 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
|
|
|
if options == nil {
|
|
|
options = DefaultOptions
|
|
|
}
|
|
|
+ db.NoSync = options.NoSync
|
|
|
db.NoGrowSync = options.NoGrowSync
|
|
|
db.MmapFlags = options.MmapFlags
|
|
|
+ db.NoFreelistSync = options.NoFreelistSync
|
|
|
+ db.FreelistType = options.FreelistType
|
|
|
|
|
|
// Set default values for later DB operations.
|
|
|
db.MaxBatchSize = DefaultMaxBatchSize
|
|
|
@@ -181,13 +214,18 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
|
|
|
db.readOnly = true
|
|
|
}
|
|
|
|
|
|
+ db.openFile = options.OpenFile
|
|
|
+ if db.openFile == nil {
|
|
|
+ db.openFile = os.OpenFile
|
|
|
+ }
|
|
|
+
|
|
|
// Open data file and separate sync handler for metadata writes.
|
|
|
- db.path = path
|
|
|
var err error
|
|
|
- if db.file, err = os.OpenFile(db.path, flag|os.O_CREATE, mode); err != nil {
|
|
|
+ if db.file, err = db.openFile(path, flag|os.O_CREATE, mode); err != nil {
|
|
|
_ = db.close()
|
|
|
return nil, err
|
|
|
}
|
|
|
+ db.path = db.file.Name()
|
|
|
|
|
|
// Lock file so that other processes using Bolt in read-write mode cannot
|
|
|
// use the database at the same time. This would cause corruption since
|
|
|
@@ -196,7 +234,7 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
|
|
|
// if !options.ReadOnly.
|
|
|
// The database file is locked using the shared lock (more than one process may
|
|
|
// hold a lock at the same time) otherwise (options.ReadOnly is set).
|
|
|
- if err := flock(db, mode, !db.readOnly, options.Timeout); err != nil {
|
|
|
+ if err := flock(db, !db.readOnly, options.Timeout); err != nil {
|
|
|
_ = db.close()
|
|
|
return nil, err
|
|
|
}
|
|
|
@@ -204,31 +242,41 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
|
|
|
// Default values for test hooks
|
|
|
db.ops.writeAt = db.file.WriteAt
|
|
|
|
|
|
+ if db.pageSize = options.PageSize; db.pageSize == 0 {
|
|
|
+ // Set the default page size to the OS page size.
|
|
|
+ db.pageSize = defaultPageSize
|
|
|
+ }
|
|
|
+
|
|
|
// Initialize the database if it doesn't exist.
|
|
|
if info, err := db.file.Stat(); err != nil {
|
|
|
+ _ = db.close()
|
|
|
return nil, err
|
|
|
} else if info.Size() == 0 {
|
|
|
// Initialize new files with meta pages.
|
|
|
if err := db.init(); err != nil {
|
|
|
+ // clean up file descriptor on initialization fail
|
|
|
+ _ = db.close()
|
|
|
return nil, err
|
|
|
}
|
|
|
} else {
|
|
|
// Read the first meta page to determine the page size.
|
|
|
var buf [0x1000]byte
|
|
|
- if _, err := db.file.ReadAt(buf[:], 0); err == nil {
|
|
|
- m := db.pageInBuffer(buf[:], 0).meta()
|
|
|
- if err := m.validate(); err != nil {
|
|
|
- // If we can't read the page size, we can assume it's the same
|
|
|
- // as the OS -- since that's how the page size was chosen in the
|
|
|
- // first place.
|
|
|
- //
|
|
|
- // If the first page is invalid and this OS uses a different
|
|
|
- // page size than what the database was created with then we
|
|
|
- // are out of luck and cannot access the database.
|
|
|
- db.pageSize = os.Getpagesize()
|
|
|
- } else {
|
|
|
+ // If we can't read the page size, but can read a page, assume
|
|
|
+ // it's the same as the OS or one given -- since that's how the
|
|
|
+ // page size was chosen in the first place.
|
|
|
+ //
|
|
|
+ // If the first page is invalid and this OS uses a different
|
|
|
+ // page size than what the database was created with then we
|
|
|
+ // are out of luck and cannot access the database.
|
|
|
+ //
|
|
|
+ // TODO: scan for next page
|
|
|
+ if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) {
|
|
|
+ if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil {
|
|
|
db.pageSize = int(m.pageSize)
|
|
|
}
|
|
|
+ } else {
|
|
|
+ _ = db.close()
|
|
|
+ return nil, ErrInvalid
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -245,14 +293,50 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
|
|
|
return nil, err
|
|
|
}
|
|
|
|
|
|
- // Read in the freelist.
|
|
|
- db.freelist = newFreelist()
|
|
|
- db.freelist.read(db.page(db.meta().freelist))
|
|
|
+ if db.readOnly {
|
|
|
+ return db, nil
|
|
|
+ }
|
|
|
+
|
|
|
+ db.loadFreelist()
|
|
|
+
|
|
|
+ // Flush freelist when transitioning from no sync to sync so
|
|
|
+ // NoFreelistSync unaware boltdb can open the db later.
|
|
|
+ if !db.NoFreelistSync && !db.hasSyncedFreelist() {
|
|
|
+ tx, err := db.Begin(true)
|
|
|
+ if tx != nil {
|
|
|
+ err = tx.Commit()
|
|
|
+ }
|
|
|
+ if err != nil {
|
|
|
+ _ = db.close()
|
|
|
+ return nil, err
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
// Mark the database as opened and return.
|
|
|
return db, nil
|
|
|
}
|
|
|
|
|
|
+// loadFreelist reads the freelist if it is synced, or reconstructs it
|
|
|
+// by scanning the DB if it is not synced. It assumes there are no
|
|
|
+// concurrent accesses being made to the freelist.
|
|
|
+func (db *DB) loadFreelist() {
|
|
|
+ db.freelistLoad.Do(func() {
|
|
|
+ db.freelist = newFreelist(db.FreelistType)
|
|
|
+ if !db.hasSyncedFreelist() {
|
|
|
+ // Reconstruct free list by scanning the DB.
|
|
|
+ db.freelist.readIDs(db.freepages())
|
|
|
+ } else {
|
|
|
+ // Read free list from freelist page.
|
|
|
+ db.freelist.read(db.page(db.meta().freelist))
|
|
|
+ }
|
|
|
+ db.stats.FreePageN = db.freelist.free_count()
|
|
|
+ })
|
|
|
+}
|
|
|
+
|
|
|
+func (db *DB) hasSyncedFreelist() bool {
|
|
|
+ return db.meta().freelist != pgidNoFreelist
|
|
|
+}
|
|
|
+
|
|
|
// mmap opens the underlying memory-mapped file and initializes the meta references.
|
|
|
// minsz is the minimum size that the new mmap can be.
|
|
|
func (db *DB) mmap(minsz int) error {
|
|
|
@@ -360,9 +444,6 @@ func (db *DB) mmapSize(size int) (int, error) {
|
|
|
|
|
|
// init creates a new database file and initializes its meta pages.
|
|
|
func (db *DB) init() error {
|
|
|
- // Set the page size to the OS page size.
|
|
|
- db.pageSize = os.Getpagesize()
|
|
|
-
|
|
|
// Create two meta pages on a buffer.
|
|
|
buf := make([]byte, db.pageSize*4)
|
|
|
for i := 0; i < 2; i++ {
|
|
|
@@ -406,7 +487,8 @@ func (db *DB) init() error {
|
|
|
}
|
|
|
|
|
|
// Close releases all database resources.
|
|
|
-// All transactions must be closed before closing the database.
|
|
|
+// It will block waiting for any open transactions to finish
|
|
|
+// before closing the database and returning.
|
|
|
func (db *DB) Close() error {
|
|
|
db.rwlock.Lock()
|
|
|
defer db.rwlock.Unlock()
|
|
|
@@ -414,9 +496,6 @@ func (db *DB) Close() error {
|
|
|
db.metalock.Lock()
|
|
|
defer db.metalock.Unlock()
|
|
|
|
|
|
- // [Psiphon]
|
|
|
- // https://github.com/etcd-io/bbolt/commit/e06ec0a754bc30c2e17ad871962e71635bf94d45
|
|
|
- // "Fix Close() to wait for view transactions by getting a full lock on mmaplock"
|
|
|
db.mmaplock.Lock()
|
|
|
defer db.mmaplock.Unlock()
|
|
|
|
|
|
@@ -565,21 +644,36 @@ func (db *DB) beginRWTx() (*Tx, error) {
|
|
|
t := &Tx{writable: true}
|
|
|
t.init(db)
|
|
|
db.rwtx = t
|
|
|
+ db.freePages()
|
|
|
+ return t, nil
|
|
|
+}
|
|
|
|
|
|
- // Free any pages associated with closed read-only transactions.
|
|
|
- var minid txid = 0xFFFFFFFFFFFFFFFF
|
|
|
- for _, t := range db.txs {
|
|
|
- if t.meta.txid < minid {
|
|
|
- minid = t.meta.txid
|
|
|
- }
|
|
|
+// freePages releases any pages associated with closed read-only transactions.
|
|
|
+func (db *DB) freePages() {
|
|
|
+ // Free all pending pages prior to earliest open transaction.
|
|
|
+ sort.Sort(txsById(db.txs))
|
|
|
+ minid := txid(0xFFFFFFFFFFFFFFFF)
|
|
|
+ if len(db.txs) > 0 {
|
|
|
+ minid = db.txs[0].meta.txid
|
|
|
}
|
|
|
if minid > 0 {
|
|
|
db.freelist.release(minid - 1)
|
|
|
}
|
|
|
-
|
|
|
- return t, nil
|
|
|
+ // Release unused txid extents.
|
|
|
+ for _, t := range db.txs {
|
|
|
+ db.freelist.releaseRange(minid, t.meta.txid-1)
|
|
|
+ minid = t.meta.txid + 1
|
|
|
+ }
|
|
|
+ db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF))
|
|
|
+ // Any page both allocated and freed in an extent is safe to release.
|
|
|
}
|
|
|
|
|
|
+type txsById []*Tx
|
|
|
+
|
|
|
+func (t txsById) Len() int { return len(t) }
|
|
|
+func (t txsById) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
|
|
|
+func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid }
|
|
|
+
|
|
|
// removeTx removes a transaction from the database.
|
|
|
func (db *DB) removeTx(tx *Tx) {
|
|
|
// Release the read lock on the mmap.
|
|
|
@@ -672,11 +766,7 @@ func (db *DB) View(fn func(*Tx) error) error {
|
|
|
return err
|
|
|
}
|
|
|
|
|
|
- if err := t.Rollback(); err != nil {
|
|
|
- return err
|
|
|
- }
|
|
|
-
|
|
|
- return nil
|
|
|
+ return t.Rollback()
|
|
|
}
|
|
|
|
|
|
// Batch calls fn as part of a batch. It behaves similar to Update,
|
|
|
@@ -776,9 +866,7 @@ retry:
|
|
|
|
|
|
// pass success, or bolt internal errors, to all callers
|
|
|
for _, c := range b.calls {
|
|
|
- if c.err != nil {
|
|
|
- c.err <- err
|
|
|
- }
|
|
|
+ c.err <- err
|
|
|
}
|
|
|
break retry
|
|
|
}
|
|
|
@@ -865,7 +953,7 @@ func (db *DB) meta() *meta {
|
|
|
}
|
|
|
|
|
|
// allocate returns a contiguous block of memory starting at a given page.
|
|
|
-func (db *DB) allocate(count int) (*page, error) {
|
|
|
+func (db *DB) allocate(txid txid, count int) (*page, error) {
|
|
|
// Allocate a temporary buffer for the page.
|
|
|
var buf []byte
|
|
|
if count == 1 {
|
|
|
@@ -877,7 +965,7 @@ func (db *DB) allocate(count int) (*page, error) {
|
|
|
p.overflow = uint32(count - 1)
|
|
|
|
|
|
// Use pages from the freelist if they are available.
|
|
|
- if p.id = db.freelist.allocate(count); p.id != 0 {
|
|
|
+ if p.id = db.freelist.allocate(txid, count); p.id != 0 {
|
|
|
return p, nil
|
|
|
}
|
|
|
|
|
|
@@ -932,6 +1020,37 @@ func (db *DB) IsReadOnly() bool {
|
|
|
return db.readOnly
|
|
|
}
|
|
|
|
|
|
+func (db *DB) freepages() []pgid {
|
|
|
+ tx, err := db.beginTx()
|
|
|
+ defer func() {
|
|
|
+ err = tx.Rollback()
|
|
|
+ if err != nil {
|
|
|
+ panic("freepages: failed to rollback tx")
|
|
|
+ }
|
|
|
+ }()
|
|
|
+ if err != nil {
|
|
|
+ panic("freepages: failed to open read only tx")
|
|
|
+ }
|
|
|
+
|
|
|
+ reachable := make(map[pgid]*page)
|
|
|
+ nofreed := make(map[pgid]bool)
|
|
|
+
|
|
|
+ // [Psiphon]
|
|
|
+ // Use single-error checkBucket.
|
|
|
+ err = tx.checkBucket(&tx.root, reachable, nofreed)
|
|
|
+ if err != nil {
|
|
|
+ panic(fmt.Sprintf("freepages: failed to get all reachable pages (%s)", err))
|
|
|
+ }
|
|
|
+
|
|
|
+ var fids []pgid
|
|
|
+ for i := pgid(2); i < db.meta().pgid; i++ {
|
|
|
+ if _, ok := reachable[i]; !ok {
|
|
|
+ fids = append(fids, i)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return fids
|
|
|
+}
|
|
|
+
|
|
|
// Options represents the options that can be set when opening a database.
|
|
|
type Options struct {
|
|
|
// Timeout is the amount of time to wait to obtain a file lock.
|
|
|
@@ -942,6 +1061,17 @@ type Options struct {
|
|
|
// Sets the DB.NoGrowSync flag before memory mapping the file.
|
|
|
NoGrowSync bool
|
|
|
|
|
|
+ // Do not sync freelist to disk. This improves the database write performance
|
|
|
+ // under normal operation, but requires a full database re-sync during recovery.
|
|
|
+ NoFreelistSync bool
|
|
|
+
|
|
|
+ // FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
|
|
|
+ // dramatic performance degradation if database is large and fragmentation in freelist is common.
|
|
|
+ // The alternative one is using hashmap, it is faster in almost all circumstances
|
|
|
+ // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
|
|
|
+ // The default type is array
|
|
|
+ FreelistType FreelistType
|
|
|
+
|
|
|
// Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to
|
|
|
// grab a shared lock (UNIX).
|
|
|
ReadOnly bool
|
|
|
@@ -958,13 +1088,26 @@ type Options struct {
|
|
|
// If initialMmapSize is smaller than the previous database size,
|
|
|
// it takes no effect.
|
|
|
InitialMmapSize int
|
|
|
+
|
|
|
+ // PageSize overrides the default OS page size.
|
|
|
+ PageSize int
|
|
|
+
|
|
|
+ // NoSync sets the initial value of DB.NoSync. Normally this can just be
|
|
|
+ // set directly on the DB itself when returned from Open(), but this option
|
|
|
+ // is useful in APIs which expose Options but not the underlying DB.
|
|
|
+ NoSync bool
|
|
|
+
|
|
|
+ // OpenFile is used to open files. It defaults to os.OpenFile. This option
|
|
|
+ // is useful for writing hermetic tests.
|
|
|
+ OpenFile func(string, int, os.FileMode) (*os.File, error)
|
|
|
}
|
|
|
|
|
|
// DefaultOptions represent the options used if nil options are passed into Open().
|
|
|
// No timeout is used which will cause Bolt to wait indefinitely for a lock.
|
|
|
var DefaultOptions = &Options{
|
|
|
- Timeout: 0,
|
|
|
- NoGrowSync: false,
|
|
|
+ Timeout: 0,
|
|
|
+ NoGrowSync: false,
|
|
|
+ FreelistType: FreelistArrayType,
|
|
|
}
|
|
|
|
|
|
// Stats represents statistics about the database.
|
|
|
@@ -999,10 +1142,6 @@ func (s *Stats) Sub(other *Stats) Stats {
|
|
|
return diff
|
|
|
}
|
|
|
|
|
|
-func (s *Stats) add(other *Stats) {
|
|
|
- s.TxStats.add(&other.TxStats)
|
|
|
-}
|
|
|
-
|
|
|
type Info struct {
|
|
|
Data uintptr
|
|
|
PageSize int
|
|
|
@@ -1041,7 +1180,8 @@ func (m *meta) copy(dest *meta) {
|
|
|
func (m *meta) write(p *page) {
|
|
|
if m.root.root >= m.pgid {
|
|
|
panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
|
|
|
- } else if m.freelist >= m.pgid {
|
|
|
+ } else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist {
|
|
|
+ // TODO: reject pgidNoFreelist if !NoFreelistSync
|
|
|
panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
|
|
|
}
|
|
|
|
|
|
@@ -1068,11 +1208,3 @@ func _assert(condition bool, msg string, v ...interface{}) {
|
|
|
panic(fmt.Sprintf("assertion failed: "+msg, v...))
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
-func warn(v ...interface{}) { fmt.Fprintln(os.Stderr, v...) }
|
|
|
-func warnf(msg string, v ...interface{}) { fmt.Fprintf(os.Stderr, msg+"\n", v...) }
|
|
|
-
|
|
|
-func printstack() {
|
|
|
- stack := strings.Join(strings.Split(string(debug.Stack()), "\n")[2:], "\n")
|
|
|
- fmt.Fprintln(os.Stderr, stack)
|
|
|
-}
|