db.go 33 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210
  1. package bolt
  2. import (
  3. "errors"
  4. "fmt"
  5. "hash/fnv"
  6. "log"
  7. "os"
  8. "runtime"
  9. "sort"
  10. "sync"
  11. "time"
  12. "unsafe"
  13. )
  14. // The largest step that can be taken when remapping the mmap.
  15. const maxMmapStep = 1 << 30 // 1GB
  16. // The data file format version.
  17. const version = 2
  18. // Represents a marker value to indicate that a file is a Bolt DB.
  19. const magic uint32 = 0xED0CDAED
  20. const pgidNoFreelist pgid = 0xffffffffffffffff
  21. // IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
  22. // syncing changes to a file. This is required as some operating systems,
  23. // such as OpenBSD, do not have a unified buffer cache (UBC) and writes
  24. // must be synchronized using the msync(2) syscall.
  25. const IgnoreNoSync = runtime.GOOS == "openbsd"
  26. // Default values if not set in a DB instance.
  27. const (
  28. DefaultMaxBatchSize int = 1000
  29. DefaultMaxBatchDelay = 10 * time.Millisecond
  30. DefaultAllocSize = 16 * 1024 * 1024
  31. )
  32. // default page size for db is set to the OS page size.
  33. var defaultPageSize = os.Getpagesize()
  34. // The time elapsed between consecutive file locking attempts.
  35. const flockRetryTimeout = 50 * time.Millisecond
  36. // FreelistType is the type of the freelist backend
  37. type FreelistType string
  38. const (
  39. // FreelistArrayType indicates backend freelist type is array
  40. FreelistArrayType = FreelistType("array")
  41. // FreelistMapType indicates backend freelist type is hashmap
  42. FreelistMapType = FreelistType("hashmap")
  43. )
  44. // DB represents a collection of buckets persisted to a file on disk.
  45. // All data access is performed through transactions which can be obtained through the DB.
  46. // All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called.
  47. type DB struct {
  48. // When enabled, the database will perform a Check() after every commit.
  49. // A panic is issued if the database is in an inconsistent state. This
  50. // flag has a large performance impact so it should only be used for
  51. // debugging purposes.
  52. StrictMode bool
  53. // Setting the NoSync flag will cause the database to skip fsync()
  54. // calls after each commit. This can be useful when bulk loading data
  55. // into a database and you can restart the bulk load in the event of
  56. // a system failure or database corruption. Do not set this flag for
  57. // normal use.
  58. //
  59. // If the package global IgnoreNoSync constant is true, this value is
  60. // ignored. See the comment on that constant for more details.
  61. //
  62. // THIS IS UNSAFE. PLEASE USE WITH CAUTION.
  63. NoSync bool
  64. // When true, skips syncing freelist to disk. This improves the database
  65. // write performance under normal operation, but requires a full database
  66. // re-sync during recovery.
  67. NoFreelistSync bool
  68. // FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
  69. // dramatic performance degradation if database is large and framentation in freelist is common.
  70. // The alternative one is using hashmap, it is faster in almost all circumstances
  71. // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
  72. // The default type is array
  73. FreelistType FreelistType
  74. // When true, skips the truncate call when growing the database.
  75. // Setting this to true is only safe on non-ext3/ext4 systems.
  76. // Skipping truncation avoids preallocation of hard drive space and
  77. // bypasses a truncate() and fsync() syscall on remapping.
  78. //
  79. // https://github.com/boltdb/bolt/issues/284
  80. NoGrowSync bool
  81. // If you want to read the entire database fast, you can set MmapFlag to
  82. // syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
  83. MmapFlags int
  84. // MaxBatchSize is the maximum size of a batch. Default value is
  85. // copied from DefaultMaxBatchSize in Open.
  86. //
  87. // If <=0, disables batching.
  88. //
  89. // Do not change concurrently with calls to Batch.
  90. MaxBatchSize int
  91. // MaxBatchDelay is the maximum delay before a batch starts.
  92. // Default value is copied from DefaultMaxBatchDelay in Open.
  93. //
  94. // If <=0, effectively disables batching.
  95. //
  96. // Do not change concurrently with calls to Batch.
  97. MaxBatchDelay time.Duration
  98. // AllocSize is the amount of space allocated when the database
  99. // needs to create new pages. This is done to amortize the cost
  100. // of truncate() and fsync() when growing the data file.
  101. AllocSize int
  102. path string
  103. openFile func(string, int, os.FileMode) (*os.File, error)
  104. file *os.File
  105. dataref []byte // mmap'ed readonly, write throws SEGV
  106. data *[maxMapSize]byte
  107. datasz int
  108. filesz int // current on disk file size
  109. meta0 *meta
  110. meta1 *meta
  111. pageSize int
  112. opened bool
  113. rwtx *Tx
  114. txs []*Tx
  115. stats Stats
  116. // [Psiphon]
  117. // https://github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
  118. mmapErr error // set on mmap failure; subsequently returned by all methods
  119. freelist *freelist
  120. freelistLoad sync.Once
  121. pagePool sync.Pool
  122. batchMu sync.Mutex
  123. batch *batch
  124. rwlock sync.Mutex // Allows only one writer at a time.
  125. metalock sync.Mutex // Protects meta page access.
  126. mmaplock sync.RWMutex // Protects mmap access during remapping.
  127. statlock sync.RWMutex // Protects stats access.
  128. ops struct {
  129. writeAt func(b []byte, off int64) (n int, err error)
  130. }
  131. // Read only mode.
  132. // When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
  133. readOnly bool
  134. }
  135. // Path returns the path to currently open database file.
  136. func (db *DB) Path() string {
  137. return db.path
  138. }
  139. // GoString returns the Go string representation of the database.
  140. func (db *DB) GoString() string {
  141. return fmt.Sprintf("bolt.DB{path:%q}", db.path)
  142. }
  143. // String returns the string representation of the database.
  144. func (db *DB) String() string {
  145. return fmt.Sprintf("DB<%q>", db.path)
  146. }
  147. // Open creates and opens a database at the given path.
  148. // If the file does not exist then it will be created automatically.
  149. // Passing in nil options will cause Bolt to open the database with the default options.
  150. func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
  151. db := &DB{
  152. opened: true,
  153. }
  154. // [Psiphon]
  155. // Ensure cleanup on panic so recovery can reset a locked file.
  156. defer func() {
  157. if r := recover(); r != nil {
  158. _ = db.close()
  159. panic(r)
  160. }
  161. }()
  162. // Set default options if no options are provided.
  163. if options == nil {
  164. options = DefaultOptions
  165. }
  166. db.NoSync = options.NoSync
  167. db.NoGrowSync = options.NoGrowSync
  168. db.MmapFlags = options.MmapFlags
  169. db.NoFreelistSync = options.NoFreelistSync
  170. db.FreelistType = options.FreelistType
  171. // Set default values for later DB operations.
  172. db.MaxBatchSize = DefaultMaxBatchSize
  173. db.MaxBatchDelay = DefaultMaxBatchDelay
  174. db.AllocSize = DefaultAllocSize
  175. flag := os.O_RDWR
  176. if options.ReadOnly {
  177. flag = os.O_RDONLY
  178. db.readOnly = true
  179. }
  180. db.openFile = options.OpenFile
  181. if db.openFile == nil {
  182. db.openFile = os.OpenFile
  183. }
  184. // Open data file and separate sync handler for metadata writes.
  185. var err error
  186. if db.file, err = db.openFile(path, flag|os.O_CREATE, mode); err != nil {
  187. _ = db.close()
  188. return nil, err
  189. }
  190. db.path = db.file.Name()
  191. // Lock file so that other processes using Bolt in read-write mode cannot
  192. // use the database at the same time. This would cause corruption since
  193. // the two processes would write meta pages and free pages separately.
  194. // The database file is locked exclusively (only one process can grab the lock)
  195. // if !options.ReadOnly.
  196. // The database file is locked using the shared lock (more than one process may
  197. // hold a lock at the same time) otherwise (options.ReadOnly is set).
  198. if err := flock(db, !db.readOnly, options.Timeout); err != nil {
  199. _ = db.close()
  200. return nil, err
  201. }
  202. // Default values for test hooks
  203. db.ops.writeAt = db.file.WriteAt
  204. if db.pageSize = options.PageSize; db.pageSize == 0 {
  205. // Set the default page size to the OS page size.
  206. db.pageSize = defaultPageSize
  207. }
  208. // Initialize the database if it doesn't exist.
  209. if info, err := db.file.Stat(); err != nil {
  210. _ = db.close()
  211. return nil, err
  212. } else if info.Size() == 0 {
  213. // Initialize new files with meta pages.
  214. if err := db.init(); err != nil {
  215. // clean up file descriptor on initialization fail
  216. _ = db.close()
  217. return nil, err
  218. }
  219. } else {
  220. // Read the first meta page to determine the page size.
  221. var buf [0x1000]byte
  222. // If we can't read the page size, but can read a page, assume
  223. // it's the same as the OS or one given -- since that's how the
  224. // page size was chosen in the first place.
  225. //
  226. // If the first page is invalid and this OS uses a different
  227. // page size than what the database was created with then we
  228. // are out of luck and cannot access the database.
  229. //
  230. // TODO: scan for next page
  231. if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) {
  232. if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil {
  233. db.pageSize = int(m.pageSize)
  234. }
  235. } else {
  236. _ = db.close()
  237. return nil, ErrInvalid
  238. }
  239. }
  240. // Initialize page pool.
  241. db.pagePool = sync.Pool{
  242. New: func() interface{} {
  243. return make([]byte, db.pageSize)
  244. },
  245. }
  246. // Memory map the data file.
  247. if err := db.mmap(options.InitialMmapSize); err != nil {
  248. _ = db.close()
  249. return nil, err
  250. }
  251. if db.readOnly {
  252. return db, nil
  253. }
  254. db.loadFreelist()
  255. // Flush freelist when transitioning from no sync to sync so
  256. // NoFreelistSync unaware boltdb can open the db later.
  257. if !db.NoFreelistSync && !db.hasSyncedFreelist() {
  258. tx, err := db.Begin(true)
  259. if tx != nil {
  260. err = tx.Commit()
  261. }
  262. if err != nil {
  263. _ = db.close()
  264. return nil, err
  265. }
  266. }
  267. // Mark the database as opened and return.
  268. return db, nil
  269. }
  270. // loadFreelist reads the freelist if it is synced, or reconstructs it
  271. // by scanning the DB if it is not synced. It assumes there are no
  272. // concurrent accesses being made to the freelist.
  273. func (db *DB) loadFreelist() {
  274. db.freelistLoad.Do(func() {
  275. db.freelist = newFreelist(db.FreelistType)
  276. if !db.hasSyncedFreelist() {
  277. // Reconstruct free list by scanning the DB.
  278. db.freelist.readIDs(db.freepages())
  279. } else {
  280. // Read free list from freelist page.
  281. db.freelist.read(db.page(db.meta().freelist))
  282. }
  283. db.stats.FreePageN = db.freelist.free_count()
  284. })
  285. }
  286. func (db *DB) hasSyncedFreelist() bool {
  287. return db.meta().freelist != pgidNoFreelist
  288. }
  289. // mmap opens the underlying memory-mapped file and initializes the meta references.
  290. // minsz is the minimum size that the new mmap can be.
  291. func (db *DB) mmap(minsz int) error {
  292. db.mmaplock.Lock()
  293. defer db.mmaplock.Unlock()
  294. info, err := db.file.Stat()
  295. if err != nil {
  296. return fmt.Errorf("mmap stat error: %s", err)
  297. } else if int(info.Size()) < db.pageSize*2 {
  298. return fmt.Errorf("file size too small")
  299. }
  300. // Ensure the size is at least the minimum size.
  301. var size = int(info.Size())
  302. if size < minsz {
  303. size = minsz
  304. }
  305. size, err = db.mmapSize(size)
  306. if err != nil {
  307. return err
  308. }
  309. // Dereference all mmap references before unmapping.
  310. if db.rwtx != nil {
  311. db.rwtx.root.dereference()
  312. }
  313. // Unmap existing data before continuing.
  314. if err := db.munmap(); err != nil {
  315. return err
  316. }
  317. // Memory-map the data file as a byte slice.
  318. if err := mmap(db, size); err != nil {
  319. // [Psiphon]
  320. // https://github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
  321. // If mmap fails, we cannot safely continue. Mark the db as unusable,
  322. // causing all future calls to return the mmap error.
  323. db.mmapErr = MmapError(err.Error())
  324. return db.mmapErr
  325. }
  326. // Save references to the meta pages.
  327. db.meta0 = db.page(0).meta()
  328. db.meta1 = db.page(1).meta()
  329. // Validate the meta pages. We only return an error if both meta pages fail
  330. // validation, since meta0 failing validation means that it wasn't saved
  331. // properly -- but we can recover using meta1. And vice-versa.
  332. err0 := db.meta0.validate()
  333. err1 := db.meta1.validate()
  334. if err0 != nil && err1 != nil {
  335. return err0
  336. }
  337. return nil
  338. }
  339. // munmap unmaps the data file from memory.
  340. func (db *DB) munmap() error {
  341. if err := munmap(db); err != nil {
  342. return fmt.Errorf("unmap error: " + err.Error())
  343. }
  344. return nil
  345. }
  346. // mmapSize determines the appropriate size for the mmap given the current size
  347. // of the database. The minimum size is 32KB and doubles until it reaches 1GB.
  348. // Returns an error if the new mmap size is greater than the max allowed.
  349. func (db *DB) mmapSize(size int) (int, error) {
  350. // Double the size from 32KB until 1GB.
  351. for i := uint(15); i <= 30; i++ {
  352. if size <= 1<<i {
  353. return 1 << i, nil
  354. }
  355. }
  356. // Verify the requested size is not above the maximum allowed.
  357. if size > maxMapSize {
  358. return 0, fmt.Errorf("mmap too large")
  359. }
  360. // If larger than 1GB then grow by 1GB at a time.
  361. sz := int64(size)
  362. if remainder := sz % int64(maxMmapStep); remainder > 0 {
  363. sz += int64(maxMmapStep) - remainder
  364. }
  365. // Ensure that the mmap size is a multiple of the page size.
  366. // This should always be true since we're incrementing in MBs.
  367. pageSize := int64(db.pageSize)
  368. if (sz % pageSize) != 0 {
  369. sz = ((sz / pageSize) + 1) * pageSize
  370. }
  371. // If we've exceeded the max size then only grow up to the max size.
  372. if sz > maxMapSize {
  373. sz = maxMapSize
  374. }
  375. return int(sz), nil
  376. }
  377. // init creates a new database file and initializes its meta pages.
  378. func (db *DB) init() error {
  379. // Create two meta pages on a buffer.
  380. buf := make([]byte, db.pageSize*4)
  381. for i := 0; i < 2; i++ {
  382. p := db.pageInBuffer(buf[:], pgid(i))
  383. p.id = pgid(i)
  384. p.flags = metaPageFlag
  385. // Initialize the meta page.
  386. m := p.meta()
  387. m.magic = magic
  388. m.version = version
  389. m.pageSize = uint32(db.pageSize)
  390. m.freelist = 2
  391. m.root = bucket{root: 3}
  392. m.pgid = 4
  393. m.txid = txid(i)
  394. m.checksum = m.sum64()
  395. }
  396. // Write an empty freelist at page 3.
  397. p := db.pageInBuffer(buf[:], pgid(2))
  398. p.id = pgid(2)
  399. p.flags = freelistPageFlag
  400. p.count = 0
  401. // Write an empty leaf page at page 4.
  402. p = db.pageInBuffer(buf[:], pgid(3))
  403. p.id = pgid(3)
  404. p.flags = leafPageFlag
  405. p.count = 0
  406. // Write the buffer to our data file.
  407. if _, err := db.ops.writeAt(buf, 0); err != nil {
  408. return err
  409. }
  410. if err := fdatasync(db); err != nil {
  411. return err
  412. }
  413. return nil
  414. }
  415. // Close releases all database resources.
  416. // It will block waiting for any open transactions to finish
  417. // before closing the database and returning.
  418. func (db *DB) Close() error {
  419. db.rwlock.Lock()
  420. defer db.rwlock.Unlock()
  421. db.metalock.Lock()
  422. defer db.metalock.Unlock()
  423. db.mmaplock.Lock()
  424. defer db.mmaplock.Unlock()
  425. return db.close()
  426. }
  427. func (db *DB) close() error {
  428. if !db.opened {
  429. return nil
  430. }
  431. db.opened = false
  432. db.freelist = nil
  433. // Clear ops.
  434. db.ops.writeAt = nil
  435. // Close the mmap.
  436. if err := db.munmap(); err != nil {
  437. return err
  438. }
  439. // Close file handles.
  440. if db.file != nil {
  441. // No need to unlock read-only file.
  442. if !db.readOnly {
  443. // Unlock the file.
  444. if err := funlock(db); err != nil {
  445. log.Printf("bolt.Close(): funlock error: %s", err)
  446. }
  447. }
  448. // Close the file descriptor.
  449. if err := db.file.Close(); err != nil {
  450. return fmt.Errorf("db file close: %s", err)
  451. }
  452. db.file = nil
  453. }
  454. db.path = ""
  455. return nil
  456. }
  457. // Begin starts a new transaction.
  458. // Multiple read-only transactions can be used concurrently but only one
  459. // write transaction can be used at a time. Starting multiple write transactions
  460. // will cause the calls to block and be serialized until the current write
  461. // transaction finishes.
  462. //
  463. // Transactions should not be dependent on one another. Opening a read
  464. // transaction and a write transaction in the same goroutine can cause the
  465. // writer to deadlock because the database periodically needs to re-mmap itself
  466. // as it grows and it cannot do that while a read transaction is open.
  467. //
  468. // If a long running read transaction (for example, a snapshot transaction) is
  469. // needed, you might want to set DB.InitialMmapSize to a large enough value
  470. // to avoid potential blocking of write transaction.
  471. //
  472. // IMPORTANT: You must close read-only transactions after you are finished or
  473. // else the database will not reclaim old pages.
  474. func (db *DB) Begin(writable bool) (*Tx, error) {
  475. if writable {
  476. return db.beginRWTx()
  477. }
  478. return db.beginTx()
  479. }
  480. func (db *DB) beginTx() (*Tx, error) {
  481. // Lock the meta pages while we initialize the transaction. We obtain
  482. // the meta lock before the mmap lock because that's the order that the
  483. // write transaction will obtain them.
  484. db.metalock.Lock()
  485. // Obtain a read-only lock on the mmap. When the mmap is remapped it will
  486. // obtain a write lock so all transactions must finish before it can be
  487. // remapped.
  488. db.mmaplock.RLock()
  489. // Exit if the database is not open yet.
  490. if !db.opened {
  491. db.mmaplock.RUnlock()
  492. db.metalock.Unlock()
  493. return nil, ErrDatabaseNotOpen
  494. }
  495. // [Psiphon]
  496. // https://github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
  497. // Return mmap error if a previous mmap failed.
  498. if db.mmapErr != nil {
  499. db.mmaplock.RUnlock()
  500. db.metalock.Unlock()
  501. return nil, db.mmapErr
  502. }
  503. // Create a transaction associated with the database.
  504. t := &Tx{}
  505. t.init(db)
  506. // Keep track of transaction until it closes.
  507. db.txs = append(db.txs, t)
  508. n := len(db.txs)
  509. // Unlock the meta pages.
  510. db.metalock.Unlock()
  511. // Update the transaction stats.
  512. db.statlock.Lock()
  513. db.stats.TxN++
  514. db.stats.OpenTxN = n
  515. db.statlock.Unlock()
  516. return t, nil
  517. }
  518. func (db *DB) beginRWTx() (*Tx, error) {
  519. // If the database was opened with Options.ReadOnly, return an error.
  520. if db.readOnly {
  521. return nil, ErrDatabaseReadOnly
  522. }
  523. // Obtain writer lock. This is released by the transaction when it closes.
  524. // This enforces only one writer transaction at a time.
  525. db.rwlock.Lock()
  526. // Once we have the writer lock then we can lock the meta pages so that
  527. // we can set up the transaction.
  528. db.metalock.Lock()
  529. defer db.metalock.Unlock()
  530. // Exit if the database is not open yet.
  531. if !db.opened {
  532. db.rwlock.Unlock()
  533. return nil, ErrDatabaseNotOpen
  534. }
  535. // [Psiphon]
  536. // https://github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
  537. // Return mmap error if a previous mmap failed.
  538. if db.mmapErr != nil {
  539. db.rwlock.Unlock()
  540. return nil, db.mmapErr
  541. }
  542. // Create a transaction associated with the database.
  543. t := &Tx{writable: true}
  544. t.init(db)
  545. db.rwtx = t
  546. db.freePages()
  547. return t, nil
  548. }
  549. // freePages releases any pages associated with closed read-only transactions.
  550. func (db *DB) freePages() {
  551. // Free all pending pages prior to earliest open transaction.
  552. sort.Sort(txsById(db.txs))
  553. minid := txid(0xFFFFFFFFFFFFFFFF)
  554. if len(db.txs) > 0 {
  555. minid = db.txs[0].meta.txid
  556. }
  557. if minid > 0 {
  558. db.freelist.release(minid - 1)
  559. }
  560. // Release unused txid extents.
  561. for _, t := range db.txs {
  562. db.freelist.releaseRange(minid, t.meta.txid-1)
  563. minid = t.meta.txid + 1
  564. }
  565. db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF))
  566. // Any page both allocated and freed in an extent is safe to release.
  567. }
  568. type txsById []*Tx
  569. func (t txsById) Len() int { return len(t) }
  570. func (t txsById) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
  571. func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid }
  572. // removeTx removes a transaction from the database.
  573. func (db *DB) removeTx(tx *Tx) {
  574. // Release the read lock on the mmap.
  575. db.mmaplock.RUnlock()
  576. // Use the meta lock to restrict access to the DB object.
  577. db.metalock.Lock()
  578. // Remove the transaction.
  579. for i, t := range db.txs {
  580. if t == tx {
  581. last := len(db.txs) - 1
  582. db.txs[i] = db.txs[last]
  583. db.txs[last] = nil
  584. db.txs = db.txs[:last]
  585. break
  586. }
  587. }
  588. n := len(db.txs)
  589. // Unlock the meta pages.
  590. db.metalock.Unlock()
  591. // Merge statistics.
  592. db.statlock.Lock()
  593. db.stats.OpenTxN = n
  594. db.stats.TxStats.add(&tx.stats)
  595. db.statlock.Unlock()
  596. }
  597. // Update executes a function within the context of a read-write managed transaction.
  598. // If no error is returned from the function then the transaction is committed.
  599. // If an error is returned then the entire transaction is rolled back.
  600. // Any error that is returned from the function or returned from the commit is
  601. // returned from the Update() method.
  602. //
  603. // Attempting to manually commit or rollback within the function will cause a panic.
  604. func (db *DB) Update(fn func(*Tx) error) error {
  605. t, err := db.Begin(true)
  606. if err != nil {
  607. return err
  608. }
  609. // Make sure the transaction rolls back in the event of a panic.
  610. defer func() {
  611. if t.db != nil {
  612. t.rollback()
  613. }
  614. }()
  615. // Mark as a managed tx so that the inner function cannot manually commit.
  616. t.managed = true
  617. // If an error is returned from the function then rollback and return error.
  618. err = fn(t)
  619. t.managed = false
  620. if err != nil {
  621. _ = t.Rollback()
  622. return err
  623. }
  624. return t.Commit()
  625. }
  626. // View executes a function within the context of a managed read-only transaction.
  627. // Any error that is returned from the function is returned from the View() method.
  628. //
  629. // Attempting to manually rollback within the function will cause a panic.
  630. func (db *DB) View(fn func(*Tx) error) error {
  631. t, err := db.Begin(false)
  632. if err != nil {
  633. return err
  634. }
  635. // Make sure the transaction rolls back in the event of a panic.
  636. defer func() {
  637. if t.db != nil {
  638. t.rollback()
  639. }
  640. }()
  641. // Mark as a managed tx so that the inner function cannot manually rollback.
  642. t.managed = true
  643. // If an error is returned from the function then pass it through.
  644. err = fn(t)
  645. t.managed = false
  646. if err != nil {
  647. _ = t.Rollback()
  648. return err
  649. }
  650. return t.Rollback()
  651. }
  652. // Batch calls fn as part of a batch. It behaves similar to Update,
  653. // except:
  654. //
  655. // 1. concurrent Batch calls can be combined into a single Bolt
  656. // transaction.
  657. //
  658. // 2. the function passed to Batch may be called multiple times,
  659. // regardless of whether it returns error or not.
  660. //
  661. // This means that Batch function side effects must be idempotent and
  662. // take permanent effect only after a successful return is seen in
  663. // caller.
  664. //
  665. // The maximum batch size and delay can be adjusted with DB.MaxBatchSize
  666. // and DB.MaxBatchDelay, respectively.
  667. //
  668. // Batch is only useful when there are multiple goroutines calling it.
  669. func (db *DB) Batch(fn func(*Tx) error) error {
  670. errCh := make(chan error, 1)
  671. db.batchMu.Lock()
  672. if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) {
  673. // There is no existing batch, or the existing batch is full; start a new one.
  674. db.batch = &batch{
  675. db: db,
  676. }
  677. db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger)
  678. }
  679. db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh})
  680. if len(db.batch.calls) >= db.MaxBatchSize {
  681. // wake up batch, it's ready to run
  682. go db.batch.trigger()
  683. }
  684. db.batchMu.Unlock()
  685. err := <-errCh
  686. if err == trySolo {
  687. err = db.Update(fn)
  688. }
  689. return err
  690. }
  691. type call struct {
  692. fn func(*Tx) error
  693. err chan<- error
  694. }
  695. type batch struct {
  696. db *DB
  697. timer *time.Timer
  698. start sync.Once
  699. calls []call
  700. }
  701. // trigger runs the batch if it hasn't already been run.
  702. func (b *batch) trigger() {
  703. b.start.Do(b.run)
  704. }
  705. // run performs the transactions in the batch and communicates results
  706. // back to DB.Batch.
  707. func (b *batch) run() {
  708. b.db.batchMu.Lock()
  709. b.timer.Stop()
  710. // Make sure no new work is added to this batch, but don't break
  711. // other batches.
  712. if b.db.batch == b {
  713. b.db.batch = nil
  714. }
  715. b.db.batchMu.Unlock()
  716. retry:
  717. for len(b.calls) > 0 {
  718. var failIdx = -1
  719. err := b.db.Update(func(tx *Tx) error {
  720. for i, c := range b.calls {
  721. if err := safelyCall(c.fn, tx); err != nil {
  722. failIdx = i
  723. return err
  724. }
  725. }
  726. return nil
  727. })
  728. if failIdx >= 0 {
  729. // take the failing transaction out of the batch. it's
  730. // safe to shorten b.calls here because db.batch no longer
  731. // points to us, and we hold the mutex anyway.
  732. c := b.calls[failIdx]
  733. b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1]
  734. // tell the submitter re-run it solo, continue with the rest of the batch
  735. c.err <- trySolo
  736. continue retry
  737. }
  738. // pass success, or bolt internal errors, to all callers
  739. for _, c := range b.calls {
  740. c.err <- err
  741. }
  742. break retry
  743. }
  744. }
  745. // trySolo is a special sentinel error value used for signaling that a
  746. // transaction function should be re-run. It should never be seen by
  747. // callers.
  748. var trySolo = errors.New("batch function returned an error and should be re-run solo")
  749. type panicked struct {
  750. reason interface{}
  751. }
  752. func (p panicked) Error() string {
  753. if err, ok := p.reason.(error); ok {
  754. return err.Error()
  755. }
  756. return fmt.Sprintf("panic: %v", p.reason)
  757. }
  758. func safelyCall(fn func(*Tx) error, tx *Tx) (err error) {
  759. defer func() {
  760. if p := recover(); p != nil {
  761. err = panicked{p}
  762. }
  763. }()
  764. return fn(tx)
  765. }
  766. // Sync executes fdatasync() against the database file handle.
  767. //
  768. // This is not necessary under normal operation, however, if you use NoSync
  769. // then it allows you to force the database file to sync against the disk.
  770. func (db *DB) Sync() error { return fdatasync(db) }
  771. // Stats retrieves ongoing performance stats for the database.
  772. // This is only updated when a transaction closes.
  773. func (db *DB) Stats() Stats {
  774. db.statlock.RLock()
  775. defer db.statlock.RUnlock()
  776. return db.stats
  777. }
  778. // This is for internal access to the raw data bytes from the C cursor, use
  779. // carefully, or not at all.
  780. func (db *DB) Info() *Info {
  781. return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
  782. }
  783. // page retrieves a page reference from the mmap based on the current page size.
  784. func (db *DB) page(id pgid) *page {
  785. pos := id * pgid(db.pageSize)
  786. return (*page)(unsafe.Pointer(&db.data[pos]))
  787. }
  788. // pageInBuffer retrieves a page reference from a given byte array based on the current page size.
  789. func (db *DB) pageInBuffer(b []byte, id pgid) *page {
  790. return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)]))
  791. }
  792. // meta retrieves the current meta page reference.
  793. func (db *DB) meta() *meta {
  794. // We have to return the meta with the highest txid which doesn't fail
  795. // validation. Otherwise, we can cause errors when in fact the database is
  796. // in a consistent state. metaA is the one with the higher txid.
  797. metaA := db.meta0
  798. metaB := db.meta1
  799. if db.meta1.txid > db.meta0.txid {
  800. metaA = db.meta1
  801. metaB = db.meta0
  802. }
  803. // Use higher meta page if valid. Otherwise fallback to previous, if valid.
  804. if err := metaA.validate(); err == nil {
  805. return metaA
  806. } else if err := metaB.validate(); err == nil {
  807. return metaB
  808. }
  809. // This should never be reached, because both meta1 and meta0 were validated
  810. // on mmap() and we do fsync() on every write.
  811. panic("bolt.DB.meta(): invalid meta pages")
  812. }
  813. // allocate returns a contiguous block of memory starting at a given page.
  814. func (db *DB) allocate(txid txid, count int) (*page, error) {
  815. // Allocate a temporary buffer for the page.
  816. var buf []byte
  817. if count == 1 {
  818. buf = db.pagePool.Get().([]byte)
  819. } else {
  820. buf = make([]byte, count*db.pageSize)
  821. }
  822. p := (*page)(unsafe.Pointer(&buf[0]))
  823. p.overflow = uint32(count - 1)
  824. // Use pages from the freelist if they are available.
  825. if p.id = db.freelist.allocate(txid, count); p.id != 0 {
  826. return p, nil
  827. }
  828. // Resize mmap() if we're at the end.
  829. p.id = db.rwtx.meta.pgid
  830. var minsz = int((p.id+pgid(count))+1) * db.pageSize
  831. if minsz >= db.datasz {
  832. if err := db.mmap(minsz); err != nil {
  833. return nil, fmt.Errorf("mmap allocate error: %s", err)
  834. }
  835. }
  836. // Move the page id high water mark.
  837. db.rwtx.meta.pgid += pgid(count)
  838. return p, nil
  839. }
  840. // grow grows the size of the database to the given sz.
  841. func (db *DB) grow(sz int) error {
  842. // Ignore if the new size is less than available file size.
  843. if sz <= db.filesz {
  844. return nil
  845. }
  846. // If the data is smaller than the alloc size then only allocate what's needed.
  847. // Once it goes over the allocation size then allocate in chunks.
  848. if db.datasz < db.AllocSize {
  849. sz = db.datasz
  850. } else {
  851. sz += db.AllocSize
  852. }
  853. // Truncate and fsync to ensure file size metadata is flushed.
  854. // https://github.com/boltdb/bolt/issues/284
  855. if !db.NoGrowSync && !db.readOnly {
  856. if runtime.GOOS != "windows" {
  857. if err := db.file.Truncate(int64(sz)); err != nil {
  858. return fmt.Errorf("file resize error: %s", err)
  859. }
  860. }
  861. if err := db.file.Sync(); err != nil {
  862. return fmt.Errorf("file sync error: %s", err)
  863. }
  864. }
  865. db.filesz = sz
  866. return nil
  867. }
  868. func (db *DB) IsReadOnly() bool {
  869. return db.readOnly
  870. }
  871. func (db *DB) freepages() []pgid {
  872. tx, err := db.beginTx()
  873. defer func() {
  874. err = tx.Rollback()
  875. if err != nil {
  876. panic("freepages: failed to rollback tx")
  877. }
  878. }()
  879. if err != nil {
  880. panic("freepages: failed to open read only tx")
  881. }
  882. reachable := make(map[pgid]*page)
  883. nofreed := make(map[pgid]bool)
  884. // [Psiphon]
  885. // Use single-error checkBucket.
  886. err = tx.checkBucket(&tx.root, reachable, nofreed)
  887. if err != nil {
  888. panic(fmt.Sprintf("freepages: failed to get all reachable pages (%s)", err))
  889. }
  890. var fids []pgid
  891. for i := pgid(2); i < db.meta().pgid; i++ {
  892. if _, ok := reachable[i]; !ok {
  893. fids = append(fids, i)
  894. }
  895. }
  896. return fids
  897. }
  898. // Options represents the options that can be set when opening a database.
  899. type Options struct {
  900. // Timeout is the amount of time to wait to obtain a file lock.
  901. // When set to zero it will wait indefinitely. This option is only
  902. // available on Darwin and Linux.
  903. Timeout time.Duration
  904. // Sets the DB.NoGrowSync flag before memory mapping the file.
  905. NoGrowSync bool
  906. // Do not sync freelist to disk. This improves the database write performance
  907. // under normal operation, but requires a full database re-sync during recovery.
  908. NoFreelistSync bool
  909. // FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
  910. // dramatic performance degradation if database is large and framentation in freelist is common.
  911. // The alternative one is using hashmap, it is faster in almost all circumstances
  912. // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
  913. // The default type is array
  914. FreelistType FreelistType
  915. // Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to
  916. // grab a shared lock (UNIX).
  917. ReadOnly bool
  918. // Sets the DB.MmapFlags flag before memory mapping the file.
  919. MmapFlags int
  920. // InitialMmapSize is the initial mmap size of the database
  921. // in bytes. Read transactions won't block write transaction
  922. // if the InitialMmapSize is large enough to hold database mmap
  923. // size. (See DB.Begin for more information)
  924. //
  925. // If <=0, the initial map size is 0.
  926. // If initialMmapSize is smaller than the previous database size,
  927. // it takes no effect.
  928. InitialMmapSize int
  929. // PageSize overrides the default OS page size.
  930. PageSize int
  931. // NoSync sets the initial value of DB.NoSync. Normally this can just be
  932. // set directly on the DB itself when returned from Open(), but this option
  933. // is useful in APIs which expose Options but not the underlying DB.
  934. NoSync bool
  935. // OpenFile is used to open files. It defaults to os.OpenFile. This option
  936. // is useful for writing hermetic tests.
  937. OpenFile func(string, int, os.FileMode) (*os.File, error)
  938. }
  939. // DefaultOptions represent the options used if nil options are passed into Open().
  940. // No timeout is used which will cause Bolt to wait indefinitely for a lock.
  941. var DefaultOptions = &Options{
  942. Timeout: 0,
  943. NoGrowSync: false,
  944. FreelistType: FreelistArrayType,
  945. }
  946. // Stats represents statistics about the database.
  947. type Stats struct {
  948. // Freelist stats
  949. FreePageN int // total number of free pages on the freelist
  950. PendingPageN int // total number of pending pages on the freelist
  951. FreeAlloc int // total bytes allocated in free pages
  952. FreelistInuse int // total bytes used by the freelist
  953. // Transaction stats
  954. TxN int // total number of started read transactions
  955. OpenTxN int // number of currently open read transactions
  956. TxStats TxStats // global, ongoing stats.
  957. }
  958. // Sub calculates and returns the difference between two sets of database stats.
  959. // This is useful when obtaining stats at two different points and time and
  960. // you need the performance counters that occurred within that time span.
  961. func (s *Stats) Sub(other *Stats) Stats {
  962. if other == nil {
  963. return *s
  964. }
  965. var diff Stats
  966. diff.FreePageN = s.FreePageN
  967. diff.PendingPageN = s.PendingPageN
  968. diff.FreeAlloc = s.FreeAlloc
  969. diff.FreelistInuse = s.FreelistInuse
  970. diff.TxN = s.TxN - other.TxN
  971. diff.TxStats = s.TxStats.Sub(&other.TxStats)
  972. return diff
  973. }
  974. type Info struct {
  975. Data uintptr
  976. PageSize int
  977. }
  978. type meta struct {
  979. magic uint32
  980. version uint32
  981. pageSize uint32
  982. flags uint32
  983. root bucket
  984. freelist pgid
  985. pgid pgid
  986. txid txid
  987. checksum uint64
  988. }
  989. // validate checks the marker bytes and version of the meta page to ensure it matches this binary.
  990. func (m *meta) validate() error {
  991. if m.magic != magic {
  992. return ErrInvalid
  993. } else if m.version != version {
  994. return ErrVersionMismatch
  995. } else if m.checksum != 0 && m.checksum != m.sum64() {
  996. return ErrChecksum
  997. }
  998. return nil
  999. }
  1000. // copy copies one meta object to another.
  1001. func (m *meta) copy(dest *meta) {
  1002. *dest = *m
  1003. }
  1004. // write writes the meta onto a page.
  1005. func (m *meta) write(p *page) {
  1006. if m.root.root >= m.pgid {
  1007. panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
  1008. } else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist {
  1009. // TODO: reject pgidNoFreeList if !NoFreelistSync
  1010. panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
  1011. }
  1012. // Page id is either going to be 0 or 1 which we can determine by the transaction ID.
  1013. p.id = pgid(m.txid % 2)
  1014. p.flags |= metaPageFlag
  1015. // Calculate the checksum.
  1016. m.checksum = m.sum64()
  1017. m.copy(p.meta())
  1018. }
  1019. // generates the checksum for the meta.
  1020. func (m *meta) sum64() uint64 {
  1021. var h = fnv.New64a()
  1022. _, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
  1023. return h.Sum64()
  1024. }
  1025. // _assert will panic with a given formatted message if the given condition is false.
  1026. func _assert(condition bool, msg string, v ...interface{}) {
  1027. if !condition {
  1028. panic(fmt.Sprintf("assertion failed: "+msg, v...))
  1029. }
  1030. }