dataStore_bolt.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582
  1. //go:build !PSIPHON_USE_BADGER_DB && !PSIPHON_USE_FILES_DB
  2. // +build !PSIPHON_USE_BADGER_DB,!PSIPHON_USE_FILES_DB
  3. /*
  4. * Copyright (c) 2018, Psiphon Inc.
  5. * All rights reserved.
  6. *
  7. * This program is free software: you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation, either version 3 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * This program is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  19. *
  20. */
  21. package psiphon
  22. import (
  23. std_errors "errors"
  24. "fmt"
  25. "os"
  26. "path/filepath"
  27. "runtime/debug"
  28. "sync/atomic"
  29. "time"
  30. "github.com/Psiphon-Labs/bolt"
  31. "github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common"
  32. "github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/errors"
  33. )
  const (
  	// OPEN_DB_RETRIES is the number of additional open attempts that
  	// datastoreOpenDB makes, after the first, when its retryAndReset
  	// parameter is set.
  	OPEN_DB_RETRIES = 2
  )
  // datastoreDB wraps a bolt database handle together with the failure
  // state used by the panic/fault recovery logic in this file.
  type datastoreDB struct {
  	// boltDB is the underlying bolt handle returned by bolt.Open.
  	boltDB *bolt.DB
  	// filename is the path of the datastore file; it is used for metrics
  	// and for deleting the file when a failed datastore is reset.
  	filename string
  	// isFailed is accessed atomically; it is set to 1 by setDatastoreFailed
  	// and read by isDatastoreFailed.
  	isFailed int32
  	// isReset is accessed atomically; resetFailedDatastore flips it from 0
  	// to 1 so that the close-and-delete sequence runs at most once.
  	isReset int32
  }
  // datastoreTx pairs a bolt transaction with its owning datastoreDB so that
  // transaction-level operations can consult and update the failed state.
  // Instances are created by datastoreDB.view and datastoreDB.update.
  type datastoreTx struct {
  	db     *datastoreDB
  	boltTx *bolt.Tx
  }
  47. type datastoreBucket struct {
  48. db *datastoreDB
  49. boltBucket *bolt.Bucket
  50. }
  // datastoreCursor pairs a bolt cursor with its owning datastoreDB.
  // boltCursor is nil when the cursor was requested after the datastore
  // entered the failed state; see datastoreBucket.cursor.
  type datastoreCursor struct {
  	db         *datastoreDB
  	boltCursor *bolt.Cursor
  }
  55. func datastoreOpenDB(
  56. rootDataDirectory string, retryAndReset bool) (*datastoreDB, error) {
  57. var db *datastoreDB
  58. var err error
  59. attempts := 1
  60. if retryAndReset {
  61. attempts += OPEN_DB_RETRIES
  62. }
  63. reset := false
  64. for attempt := 0; attempt < attempts; attempt++ {
  65. db, err = tryDatastoreOpenDB(rootDataDirectory, reset)
  66. if err == nil {
  67. break
  68. }
  69. NoticeWarning("tryDatastoreOpenDB failed: %s", err)
  70. // The datastore file may be corrupt, so, in subsequent iterations,
  71. // set the "reset" flag and attempt to delete the file and try again.
  72. //
  73. // Don't reset the datastore when open failed due to timeout obtaining
  74. // the file lock, as the datastore is simply locked by another
  75. // process and not corrupt. As the file lock is advisory, deleting
  76. // the file would succeed despite the lock. In this case, still retry
  77. // in case the the lock is released.
  78. reset = !std_errors.Is(err, bolt.ErrTimeout)
  79. }
  80. return db, err
  81. }
  82. func tryDatastoreOpenDB(
  83. rootDataDirectory string, reset bool) (retdb *datastoreDB, reterr error) {
  84. // Testing indicates that the bolt Check function can raise SIGSEGV due to
  85. // invalid mmap buffer accesses in cases such as opening a valid but
  86. // truncated datastore file.
  87. //
  88. // To handle this, we temporarily set SetPanicOnFault in order to treat the
  89. // fault as a panic, recover any panic, and return an error which will result
  90. // in a retry with reset.
  91. //
  92. // Limitation: another potential crash case is "fatal error: out of
  93. // memory" due to bolt.freelist.read attempting to allocate a slice using
  94. // a corrupted size value on disk. There is no way to recover from this
  95. // fatal.
  96. // Begin recovery preamble
  97. panicOnFault := debug.SetPanicOnFault(true)
  98. defer debug.SetPanicOnFault(panicOnFault)
  99. defer func() {
  100. if r := recover(); r != nil {
  101. retdb = nil
  102. reterr = errors.Tracef("panic: %v", r)
  103. }
  104. }()
  105. // End recovery preamble
  106. filename := filepath.Join(rootDataDirectory, "psiphon.boltdb")
  107. if reset {
  108. NoticeWarning("tryDatastoreOpenDB: reset")
  109. os.Remove(filename)
  110. }
  111. // A typical Psiphon datastore will not have a large, fragmented freelist.
  112. // For this reason, we're not setting FreelistType to FreelistMapType or
  113. // enabling NoFreelistSync. The latter option has a trade-off of slower
  114. // start up time.
  115. //
  116. // Monitor freelist stats in DataStoreMetrics in diagnostics and consider
  117. // setting these options if necessary.
  118. // To avoid excessive delays, we now omit the blocking SynchronousCheck
  119. // call here on startup. Datastore corruption is handled by panic/fault
  120. // recovery and setDatastoreFailed/resetFailedDatastore.
  121. //
  122. // As a tradeoff, corrupt pages will be detected only when visited and
  123. // there is no comprehensive datastore integrity guarantee at startup.
  124. // This is an improvement in some cases, such as when the client can
  125. // connect without visiting a corrupt page; whereas a reset will result
  126. // in loss of all discovered servers, replay dial parameters, OSL SLOKs,
  127. // etc.
  128. //
  129. // Also, any panic from bolt now results in a datastore reset, but
  130. // non-explicit/corruption panics in bolt are very unlikely, with none
  131. // observed in production with bolt code that's been stable for years.
  132. //
  133. // bolt.Open will still return ErrInvalid or ErrChecksum in datastore
  134. // corruption cases, so the reset-on-Open-failed logic remains active.
  135. // Note that ErrInvalid/ErrChecksum surface as panics in View/Update,
  136. // after Open.
  137. newDB, err := bolt.Open(filename, 0600, &bolt.Options{Timeout: 1 * time.Second})
  138. if err != nil {
  139. return nil, errors.Trace(err)
  140. }
  141. err = newDB.Update(func(tx *bolt.Tx) error {
  142. requiredBuckets := [][]byte{
  143. datastoreServerEntriesBucket,
  144. datastoreServerEntryTagsBucket,
  145. datastoreServerEntryTombstoneTagsBucket,
  146. datastoreUrlETagsBucket,
  147. datastoreKeyValueBucket,
  148. datastoreRemoteServerListStatsBucket,
  149. datastoreFailedTunnelStatsBucket,
  150. datastoreSLOKsBucket,
  151. datastoreTacticsBucket,
  152. datastoreSpeedTestSamplesBucket,
  153. datastoreDialParametersBucket,
  154. datastoreNetworkReplayParametersBucket,
  155. datastoreDSLOSLStatesBucket,
  156. }
  157. for _, bucket := range requiredBuckets {
  158. _, err := tx.CreateBucketIfNotExists(bucket)
  159. if err != nil {
  160. return err
  161. }
  162. }
  163. return nil
  164. })
  165. if err != nil {
  166. return nil, errors.Trace(err)
  167. }
  168. // Cleanup obsolete buckets
  169. err = newDB.Update(func(tx *bolt.Tx) error {
  170. obsoleteBuckets := [][]byte{
  171. []byte("tunnelStats"),
  172. []byte("rankedServerEntries"),
  173. []byte("splitTunnelRouteETags"),
  174. []byte("splitTunnelRouteData"),
  175. }
  176. for _, obsoleteBucket := range obsoleteBuckets {
  177. if tx.Bucket(obsoleteBucket) != nil {
  178. err := tx.DeleteBucket(obsoleteBucket)
  179. if err != nil {
  180. NoticeWarning("DeleteBucket %s error: %s", obsoleteBucket, err)
  181. // Continue, since this is not fatal
  182. }
  183. }
  184. }
  185. return nil
  186. })
  187. if err != nil {
  188. return nil, errors.Trace(err)
  189. }
  190. return &datastoreDB{
  191. boltDB: newDB,
  192. filename: filename,
  193. }, nil
  194. }
  // errDatastoreFailed is the sentinel error returned by datastore operations
  // once the datastoreDB has been put into the failed state by
  // setDatastoreFailed.
  var errDatastoreFailed = std_errors.New("datastore has failed")
  196. func (db *datastoreDB) isDatastoreFailed() bool {
  197. return atomic.LoadInt32(&db.isFailed) == 1
  198. }
  199. func (db *datastoreDB) setDatastoreFailed(r interface{}) {
  200. atomic.StoreInt32(&db.isFailed, 1)
  201. NoticeWarning("%s: %s", errDatastoreFailed.Error(), errors.Tracef("panic: %v", r))
  202. }
  203. func (db *datastoreDB) resetFailedDatastore() {
  204. if !db.isDatastoreFailed() {
  205. return
  206. }
  207. if !atomic.CompareAndSwapInt32(&db.isReset, 0, 1) {
  208. return
  209. }
  210. // Limitation: only this single attempt is made to close and delete the
  211. // datastore in this process run. It's unlikely to fail. A subsequent run
  212. // that hits the consistency check panic functions as a retry.
  213. err := db.close()
  214. if err != nil {
  215. NoticeWarning(errors.Trace(err).Error())
  216. return
  217. }
  218. err = os.Remove(db.filename)
  219. if err != nil {
  220. NoticeWarning(errors.Trace(err).Error())
  221. return
  222. }
  223. NoticeWarning("reset failed datastore")
  224. }
  225. func (db *datastoreDB) close() error {
  226. // Limitation: there is no panic recover in this case. We assume boltDB.Close
  227. // does not make mmap accesses and prefer to not continue with the datastore
  228. // file in a locked or open state. We also assume that any locks aquired by
  229. // boltDB.Close, held by transactions, will be released even if the
  230. // transaction panics and the database is in the failed state.
  231. return db.boltDB.Close()
  232. }
  233. func (db *datastoreDB) getDataStoreMetrics() string {
  234. fileSize := int64(0)
  235. fileInfo, err := os.Stat(db.filename)
  236. if err == nil {
  237. fileSize = fileInfo.Size()
  238. }
  239. stats := db.boltDB.Stats()
  240. return fmt.Sprintf("filesize %s | freepages %d | freealloc %s | txcount %d | writes %d | writetime %s",
  241. common.FormatByteCount(uint64(fileSize)),
  242. stats.FreePageN,
  243. common.FormatByteCount(uint64(stats.FreeAlloc)),
  244. stats.TxN,
  245. stats.TxStats.Write,
  246. stats.TxStats.WriteTime)
  247. }
  248. func (db *datastoreDB) view(fn func(tx *datastoreTx) error) (reterr error) {
  249. // If the datastore failed during this transaction, attempt to close and
  250. // then delete/reset the datastore so that it may be freshly recreated by
  251. // the next Open. This is invoked here in view (and update) and not in
  252. // setDatastoreFailed to avoid lock reentrancy when calling db.close.
  253. //
  254. // The higher level activeDatastoreDB and datastoreReferenceCount are not
  255. // modified, so from that perspective the datastore is still open and
  256. // operations will fail with the "datastore has failed" error.
  257. defer db.resetFailedDatastore()
  258. // In general, bolt code panics on failed datastore consistency checks.
  259. //
  260. // Any bolt function that performs mmap buffer accesses can raise SIGBUS due
  261. // to underlying storage changes, such as a truncation of the datastore file
  262. // or removal or network attached storage, etc.
  263. //
  264. // To handle this, we temporarily set SetPanicOnFault in order to treat the
  265. // fault as a panic, recover any panic to avoid crashing the process, and
  266. // putting this datastoreDB instance into a failed state. All subsequent
  267. // calls to this datastoreDBinstance or its related datastoreTx and
  268. // datastoreBucket instances will fail.
  269. // Begin recovery preamble
  270. if db.isDatastoreFailed() {
  271. return errDatastoreFailed
  272. }
  273. panicOnFault := debug.SetPanicOnFault(true)
  274. defer debug.SetPanicOnFault(panicOnFault)
  275. defer func() {
  276. if r := recover(); r != nil {
  277. db.setDatastoreFailed(r)
  278. reterr = errDatastoreFailed
  279. }
  280. }()
  281. // End recovery preamble
  282. return db.boltDB.View(
  283. func(tx *bolt.Tx) error {
  284. err := fn(&datastoreTx{db: db, boltTx: tx})
  285. if err != nil {
  286. return errors.Trace(err)
  287. }
  288. return nil
  289. })
  290. }
  291. func (db *datastoreDB) update(fn func(tx *datastoreTx) error) (reterr error) {
  292. // See resetFailedDatastore comment in datastoreDB.view.
  293. defer db.resetFailedDatastore()
  294. // Begin recovery preamble
  295. if db.isDatastoreFailed() {
  296. return errDatastoreFailed
  297. }
  298. panicOnFault := debug.SetPanicOnFault(true)
  299. defer debug.SetPanicOnFault(panicOnFault)
  300. defer func() {
  301. if r := recover(); r != nil {
  302. db.setDatastoreFailed(r)
  303. reterr = errDatastoreFailed
  304. }
  305. }()
  306. // End recovery preamble
  307. return db.boltDB.Update(
  308. func(tx *bolt.Tx) error {
  309. err := fn(&datastoreTx{db: db, boltTx: tx})
  310. if err != nil {
  311. return errors.Trace(err)
  312. }
  313. return nil
  314. })
  315. }
  316. func (tx *datastoreTx) bucket(name []byte) (retbucket *datastoreBucket) {
  317. // Begin recovery preamble
  318. if tx.db.isDatastoreFailed() {
  319. return &datastoreBucket{db: tx.db, boltBucket: nil}
  320. }
  321. panicOnFault := debug.SetPanicOnFault(true)
  322. defer debug.SetPanicOnFault(panicOnFault)
  323. defer func() {
  324. if r := recover(); r != nil {
  325. tx.db.setDatastoreFailed(r)
  326. retbucket = &datastoreBucket{db: tx.db, boltBucket: nil}
  327. }
  328. }()
  329. // End recovery preamble
  330. return &datastoreBucket{db: tx.db, boltBucket: tx.boltTx.Bucket(name)}
  331. }
  332. func (tx *datastoreTx) clearBucket(name []byte) (reterr error) {
  333. // Begin recovery preamble
  334. if tx.db.isDatastoreFailed() {
  335. return errDatastoreFailed
  336. }
  337. panicOnFault := debug.SetPanicOnFault(true)
  338. defer debug.SetPanicOnFault(panicOnFault)
  339. defer func() {
  340. if r := recover(); r != nil {
  341. tx.db.setDatastoreFailed(r)
  342. reterr = errDatastoreFailed
  343. }
  344. }()
  345. // End recovery preamble
  346. err := tx.boltTx.DeleteBucket(name)
  347. if err != nil {
  348. return errors.Trace(err)
  349. }
  350. _, err = tx.boltTx.CreateBucket(name)
  351. if err != nil {
  352. return errors.Trace(err)
  353. }
  354. return nil
  355. }
  356. func (b *datastoreBucket) get(key []byte) (retvalue []byte) {
  357. // Begin recovery preamble
  358. if b.db.isDatastoreFailed() {
  359. return nil
  360. }
  361. panicOnFault := debug.SetPanicOnFault(true)
  362. defer debug.SetPanicOnFault(panicOnFault)
  363. defer func() {
  364. if r := recover(); r != nil {
  365. b.db.setDatastoreFailed(r)
  366. retvalue = nil
  367. }
  368. }()
  369. // End recovery preamble
  370. return b.boltBucket.Get(key)
  371. }
  372. func (b *datastoreBucket) put(key, value []byte) (reterr error) {
  373. // Begin recovery preamble
  374. if b.db.isDatastoreFailed() {
  375. return errDatastoreFailed
  376. }
  377. panicOnFault := debug.SetPanicOnFault(true)
  378. defer debug.SetPanicOnFault(panicOnFault)
  379. defer func() {
  380. if r := recover(); r != nil {
  381. b.db.setDatastoreFailed(r)
  382. reterr = errDatastoreFailed
  383. }
  384. }()
  385. // End recovery preamble
  386. err := b.boltBucket.Put(key, value)
  387. if err != nil {
  388. return errors.Trace(err)
  389. }
  390. return nil
  391. }
  392. func (b *datastoreBucket) delete(key []byte) (reterr error) {
  393. // Begin recovery preamble
  394. if b.db.isDatastoreFailed() {
  395. return errDatastoreFailed
  396. }
  397. panicOnFault := debug.SetPanicOnFault(true)
  398. defer debug.SetPanicOnFault(panicOnFault)
  399. defer func() {
  400. if r := recover(); r != nil {
  401. b.db.setDatastoreFailed(r)
  402. reterr = errDatastoreFailed
  403. }
  404. }()
  405. // End recovery preamble
  406. err := b.boltBucket.Delete(key)
  407. if err != nil {
  408. return errors.Trace(err)
  409. }
  410. return nil
  411. }
  412. func (b *datastoreBucket) cursor() (retcursor datastoreCursor) {
  413. // Begin recovery preamble
  414. if b.db.isDatastoreFailed() {
  415. return datastoreCursor{db: b.db, boltCursor: nil}
  416. }
  417. panicOnFault := debug.SetPanicOnFault(true)
  418. defer debug.SetPanicOnFault(panicOnFault)
  419. defer func() {
  420. if r := recover(); r != nil {
  421. b.db.setDatastoreFailed(r)
  422. retcursor = datastoreCursor{db: b.db, boltCursor: nil}
  423. }
  424. }()
  425. // End recovery preamble
  426. return datastoreCursor{db: b.db, boltCursor: b.boltBucket.Cursor()}
  427. }
  428. func (c *datastoreCursor) firstKey() (retkey []byte) {
  429. // Begin recovery preamble
  430. if c.db.isDatastoreFailed() {
  431. return nil
  432. }
  433. panicOnFault := debug.SetPanicOnFault(true)
  434. defer debug.SetPanicOnFault(panicOnFault)
  435. defer func() {
  436. if r := recover(); r != nil {
  437. c.db.setDatastoreFailed(r)
  438. retkey = nil
  439. }
  440. }()
  441. // End recovery preamble
  442. key, _ := c.boltCursor.First()
  443. return key
  444. }
  445. func (c *datastoreCursor) nextKey() (retkey []byte) {
  446. // Begin recovery preamble
  447. if c.db.isDatastoreFailed() {
  448. return nil
  449. }
  450. panicOnFault := debug.SetPanicOnFault(true)
  451. defer debug.SetPanicOnFault(panicOnFault)
  452. defer func() {
  453. if r := recover(); r != nil {
  454. c.db.setDatastoreFailed(r)
  455. retkey = nil
  456. }
  457. }()
  458. // End recovery preamble
  459. key, _ := c.boltCursor.Next()
  460. return key
  461. }
  462. func (c *datastoreCursor) first() (retkey, retvalue []byte) {
  463. // Begin recovery preamble
  464. if c.db.isDatastoreFailed() {
  465. return nil, nil
  466. }
  467. panicOnFault := debug.SetPanicOnFault(true)
  468. defer debug.SetPanicOnFault(panicOnFault)
  469. defer func() {
  470. if r := recover(); r != nil {
  471. c.db.setDatastoreFailed(r)
  472. retkey = nil
  473. retvalue = nil
  474. }
  475. }()
  476. // End recovery preamble
  477. return c.boltCursor.First()
  478. }
  479. func (c *datastoreCursor) next() (retkey, retvalue []byte) {
  480. // Begin recovery preamble
  481. if c.db.isDatastoreFailed() {
  482. return nil, nil
  483. }
  484. panicOnFault := debug.SetPanicOnFault(true)
  485. defer debug.SetPanicOnFault(panicOnFault)
  486. defer func() {
  487. if r := recover(); r != nil {
  488. c.db.setDatastoreFailed(r)
  489. retkey = nil
  490. retvalue = nil
  491. }
  492. }()
  493. // End recovery preamble
  494. return c.boltCursor.Next()
  495. }
  // close is a no-op; it performs no work on the underlying bolt cursor.
  func (c *datastoreCursor) close() {
  	// BoltDB doesn't close cursors.
  }