Просмотр исходного кода

Mitigate datastore startup delay

- Remove potentially slow, blocking SynchronousCheck on start
- Add datastore reset on consistency check panics
- Datastore corruption will be detected when page is visited
- Still returns errDatastoreFailed for all operations after corruption
  detected
Rod Hynes 5 дней назад
Родитель
Commit
44d37537d1
2 измененных файлов с 100 добавлено и 22 удалено
  1. 37 12
      psiphon/dataStoreRecovery_test.go
  2. 63 10
      psiphon/dataStore_bolt.go

+ 37 - 12
psiphon/dataStoreRecovery_test.go

@@ -81,8 +81,9 @@ func TestBoltResiliency(t *testing.T) {
 
 	noticeCandidateServers := make(chan struct{}, 1)
 	noticeExiting := make(chan struct{}, 1)
-	noticeResetDatastore := make(chan struct{}, 1)
+	noticeOpenResetDatastore := make(chan struct{}, 1)
 	noticeDatastoreFailed := make(chan struct{}, 1)
+	noticeFailedResetDatastore := make(chan struct{}, 1)
 
 	err = SetNoticeWriter(NewNoticeReceiver(
 		func(notice []byte) {
@@ -113,9 +114,11 @@ func TestBoltResiliency(t *testing.T) {
 				message := payload["message"].(string)
 				var channel chan struct{}
 				if strings.Contains(message, "tryDatastoreOpenDB: reset") {
-					channel = noticeResetDatastore
+					channel = noticeOpenResetDatastore
 				} else if strings.Contains(message, "datastore has failed") {
 					channel = noticeDatastoreFailed
+				} else if strings.Contains(message, "reset failed datastore") {
+					channel = noticeFailedResetDatastore
 				}
 				if channel != nil {
 					select {
@@ -147,14 +150,15 @@ func TestBoltResiliency(t *testing.T) {
 	drainNoticeChannels := func() {
 		drainNoticeChannel(noticeCandidateServers)
 		drainNoticeChannel(noticeExiting)
-		drainNoticeChannel(noticeResetDatastore)
+		drainNoticeChannel(noticeOpenResetDatastore)
 		drainNoticeChannel(noticeDatastoreFailed)
+		drainNoticeChannel(noticeFailedResetDatastore)
 	}
 
 	// Paving sufficient server entries, then truncating the datastore file to
 	// remove some server entry data, then iterating over all server entries (to
 	// produce the CandidateServers output) triggers datastore corruption
-	// detection and, at start up, reset/recovery.
+	// detection and reset/recovery.
 
 	paveServerEntries := func() {
 		for i := 0; i < serverEntryCount; i++ {
@@ -199,6 +203,27 @@ func TestBoltResiliency(t *testing.T) {
 		}
 	}
 
+	corruptDataStore := func() {
+		filename := filepath.Join(testDataDirName, "ca.psiphon.PsiphonTunnel.tunnel-core", "datastore", "psiphon.boltdb")
+		file, err := os.OpenFile(filename, os.O_RDWR, 0666)
+		if err != nil {
+			t.Fatalf("OpenFile failed: %s", err)
+		}
+		defer file.Close()
+		fileInfo, err := file.Stat()
+		if err != nil {
+			t.Fatalf("Stat failed: %s", err)
+		}
+		_, err = file.WriteAt(prng.Bytes(int(fileInfo.Size())), 0)
+		if err != nil {
+			t.Fatalf("WriteAt failed: %s", err)
+		}
+		err = file.Sync()
+		if err != nil {
+			t.Fatalf("Sync failed: %s", err)
+		}
+	}
+
 	truncateDataStore := func() {
 		filename := filepath.Join(testDataDirName, "ca.psiphon.PsiphonTunnel.tunnel-core", "datastore", "psiphon.boltdb")
 		file, err := os.OpenFile(filename, os.O_RDWR, 0666)
@@ -239,19 +264,19 @@ func TestBoltResiliency(t *testing.T) {
 
 	drainNoticeChannels()
 
-	// Truncate datastore file before running controller; expect a datastore
+	// Corrupt datastore file before running controller; expect a datastore
 	// "reset" notice on OpenDataStore.
 
 	t.Logf("test: recover from datastore corrupted before opening")
 
-	truncateDataStore()
+	corruptDataStore()
 
 	err = OpenDataStore(clientConfig)
 	if err != nil {
 		t.Fatalf("OpenDataStore failed: %s", err)
 	}
 
-	<-noticeResetDatastore
+	<-noticeOpenResetDatastore
 
 	if !canTruncateOpenDataStore {
 		CloseDataStore()
@@ -262,7 +287,7 @@ func TestBoltResiliency(t *testing.T) {
 
 	// Truncate datastore while running the controller. First, complete one
 	// successful data scan (CandidateServers). The next scan should trigger a
-	// datastore "failed" notice.
+	// datastore "failed" notice and datastore reset.
 
 	t.Logf("test: detect corrupt datastore while running")
 
@@ -274,6 +299,8 @@ func TestBoltResiliency(t *testing.T) {
 
 	<-noticeDatastoreFailed
 
+	<-noticeFailedResetDatastore
+
 	<-noticeExiting
 
 	stopController()
@@ -282,17 +309,15 @@ func TestBoltResiliency(t *testing.T) {
 
 	drainNoticeChannels()
 
-	// Restart successfully after previous failure shutdown.
+	// Restart successfully after previous failure reset and shutdown.
 
-	t.Logf("test: after restart, recover from datastore corrupted while running")
+	t.Logf("test: after restart, recover from reset datastore")
 
 	err = OpenDataStore(clientConfig)
 	if err != nil {
 		t.Fatalf("OpenDataStore failed: %s", err)
 	}
 
-	<-noticeResetDatastore
-
 	paveServerEntries()
 
 	stopController = startController()

+ 63 - 10
psiphon/dataStore_bolt.go

@@ -44,6 +44,7 @@ type datastoreDB struct {
 	boltDB   *bolt.DB
 	filename string
 	isFailed int32
+	isReset  int32
 }
 
 type datastoreTx struct {
@@ -141,17 +142,27 @@ func tryDatastoreOpenDB(
 	// Monitor freelist stats in DataStoreMetrics in diagnostics and consider
 	// setting these options if necessary.
 
-	newDB, err := bolt.Open(filename, 0600, &bolt.Options{Timeout: 1 * time.Second})
-	if err != nil {
-		return nil, errors.Trace(err)
-	}
+	// To avoid excessive delays, we now omit the blocking SynchronousCheck
+	// call here on startup. Datastore corruption is handled by panic/fault
+	// recovery and setDatastoreFailed/resetFailedDatastore.
+	//
+	// As a tradeoff, corrupt pages will be detected only when visited and
+	// there is no comprehensive datastore integrity guarantee at startup.
+	// This is an improvement in some cases, such as when the client can
+	// connect without visiting a corrupt page; whereas a reset will result
+	// in loss of all discovered servers, replay dial parameters, OSL SLOKs,
+	// etc.
+	//
+	// Also, any panic from bolt now results in a datastore reset, but
+	// non-explicit/corruption panics in bolt are very unlikely, with none
+	// observed in production with bolt code that's been stable for years.
+	//
+	// bolt.Open will still return ErrInvalid or ErrChecksum in datastore
+	// corruption cases, so the reset-on-Open-failed logic remains active.
+	// Note that ErrInvalid/ErrChecksum surface as panics in View/Update,
+	// after Open.
 
-	// Run consistency checks on datastore and emit errors for diagnostics
-	// purposes. We assume this will complete quickly for typical size Psiphon
-	// datastores and wait for the check to complete before proceeding.
-	err = newDB.View(func(tx *bolt.Tx) error {
-		return tx.SynchronousCheck()
-	})
+	newDB, err := bolt.Open(filename, 0600, &bolt.Options{Timeout: 1 * time.Second})
 	if err != nil {
 		return nil, errors.Trace(err)
 	}
@@ -225,6 +236,33 @@ func (db *datastoreDB) setDatastoreFailed(r interface{}) {
 	NoticeWarning("%s: %s", errDatastoreFailed.Error(), errors.Tracef("panic: %v", r))
 }
 
+func (db *datastoreDB) resetFailedDatastore() {
+
+	if !db.isDatastoreFailed() {
+		return
+	}
+	if !atomic.CompareAndSwapInt32(&db.isReset, 0, 1) {
+		return
+	}
+
+	// Limitation: only this single attempt is made to close and delete the
+	// datastore in this process run. It's unlikely to fail. A subsequent run
+	// that hits the consistency check panic functions as a retry.
+
+	err := db.close()
+	if err != nil {
+		NoticeWarning(errors.Trace(err).Error())
+		return
+	}
+	err = os.Remove(db.filename)
+	if err != nil {
+		NoticeWarning(errors.Trace(err).Error())
+		return
+	}
+
+	NoticeWarning("reset failed datastore")
+}
+
 func (db *datastoreDB) close() error {
 
 	// Limitation: there is no panic recover in this case. We assume boltDB.Close
@@ -254,6 +292,18 @@ func (db *datastoreDB) getDataStoreMetrics() string {
 
 func (db *datastoreDB) view(fn func(tx *datastoreTx) error) (reterr error) {
 
+	// If the datastore failed during this transaction, attempt to close and
+	// then delete/reset the datastore so that it may be freshly recreated by
+	// the next Open. This is invoked here in view (and update) and not in
+	// setDatastoreFailed to avoid lock reentrancy when calling db.close.
+	//
+	// The higher level activeDatastoreDB and datastoreReferenceCount are not
+	// modified, so from that perspective the datastore is still open and
+	// operations will fail with the "datastore has failed" error.
+	defer db.resetFailedDatastore()
+
+	// In general, bolt code panics on failed datastore consistency checks.
+	//
 	// Any bolt function that performs mmap buffer accesses can raise SIGBUS due
 	// to underlying storage changes, such as a truncation of the datastore file
 	// or removal of network attached storage, etc.
@@ -290,6 +340,9 @@ func (db *datastoreDB) view(fn func(tx *datastoreTx) error) (reterr error) {
 
 func (db *datastoreDB) update(fn func(tx *datastoreTx) error) (reterr error) {
 
+	// See resetFailedDatastore comment in datastoreDB.view.
+	defer db.resetFailedDatastore()
+
 	// Begin recovery preamble
 	if db.isDatastoreFailed() {
 		return errDatastoreFailed