Просмотр исходного кода

Mitigate datastore startup delay

- Remove potentially slow, blocking SynchronousCheck on start
- Add datastore reset on consistency check panics
- Datastore corruption will be detected when page is visited
- Still returns errDatastoreFailed for all operations after corruption
  detected
Rod Hynes 5 дней назад
Родитель
Commit
44d37537d1
2 измененных файлов с 100 добавлено и 22 удалено
  1. 37 12
      psiphon/dataStoreRecovery_test.go
  2. 63 10
      psiphon/dataStore_bolt.go

+ 37 - 12
psiphon/dataStoreRecovery_test.go

@@ -81,8 +81,9 @@ func TestBoltResiliency(t *testing.T) {
 
 	noticeCandidateServers := make(chan struct{}, 1)
 	noticeExiting := make(chan struct{}, 1)
-	noticeResetDatastore := make(chan struct{}, 1)
+	noticeOpenResetDatastore := make(chan struct{}, 1)
 	noticeDatastoreFailed := make(chan struct{}, 1)
+	noticeFailedResetDatastore := make(chan struct{}, 1)
 
 	err = SetNoticeWriter(NewNoticeReceiver(
 		func(notice []byte) {
@@ -113,9 +114,11 @@ func TestBoltResiliency(t *testing.T) {
 				message := payload["message"].(string)
 				var channel chan struct{}
 				if strings.Contains(message, "tryDatastoreOpenDB: reset") {
-					channel = noticeResetDatastore
+					channel = noticeOpenResetDatastore
 				} else if strings.Contains(message, "datastore has failed") {
 					channel = noticeDatastoreFailed
+				} else if strings.Contains(message, "reset failed datastore") {
+					channel = noticeFailedResetDatastore
 				}
 				if channel != nil {
 					select {
@@ -147,14 +150,15 @@ func TestBoltResiliency(t *testing.T) {
 	drainNoticeChannels := func() {
 		drainNoticeChannel(noticeCandidateServers)
 		drainNoticeChannel(noticeExiting)
-		drainNoticeChannel(noticeResetDatastore)
+		drainNoticeChannel(noticeOpenResetDatastore)
 		drainNoticeChannel(noticeDatastoreFailed)
+		drainNoticeChannel(noticeFailedResetDatastore)
 	}
 
 	// Paving sufficient server entries, then truncating the datastore file to
 	// remove some server entry data, then iterating over all server entries (to
 	// produce the CandidateServers output) triggers datastore corruption
-	// detection and, at start up, reset/recovery.
+	// detection and reset/recovery.
 
 	paveServerEntries := func() {
 		for i := 0; i < serverEntryCount; i++ {
@@ -199,6 +203,27 @@ func TestBoltResiliency(t *testing.T) {
 		}
 	}
 
+	corruptDataStore := func() {
+		filename := filepath.Join(testDataDirName, "ca.psiphon.PsiphonTunnel.tunnel-core", "datastore", "psiphon.boltdb")
+		file, err := os.OpenFile(filename, os.O_RDWR, 0666)
+		if err != nil {
+			t.Fatalf("OpenFile failed: %s", err)
+		}
+		defer file.Close()
+		fileInfo, err := file.Stat()
+		if err != nil {
+			t.Fatalf("Stat failed: %s", err)
+		}
+		_, err = file.WriteAt(prng.Bytes(int(fileInfo.Size())), 0)
+		if err != nil {
+			t.Fatalf("WriteAt failed: %s", err)
+		}
+		err = file.Sync()
+		if err != nil {
+			t.Fatalf("Sync failed: %s", err)
+		}
+	}
+
 	truncateDataStore := func() {
 		filename := filepath.Join(testDataDirName, "ca.psiphon.PsiphonTunnel.tunnel-core", "datastore", "psiphon.boltdb")
 		file, err := os.OpenFile(filename, os.O_RDWR, 0666)
@@ -239,19 +264,19 @@ func TestBoltResiliency(t *testing.T) {
 
 	drainNoticeChannels()
 
-	// Truncate datastore file before running controller; expect a datastore
+	// Corrupt datastore file before running controller; expect a datastore
 	// "reset" notice on OpenDataStore.
 
 	t.Logf("test: recover from datastore corrupted before opening")
 
-	truncateDataStore()
+	corruptDataStore()
 
 	err = OpenDataStore(clientConfig)
 	if err != nil {
 		t.Fatalf("OpenDataStore failed: %s", err)
 	}
 
-	<-noticeResetDatastore
+	<-noticeOpenResetDatastore
 
 	if !canTruncateOpenDataStore {
 		CloseDataStore()
@@ -262,7 +287,7 @@ func TestBoltResiliency(t *testing.T) {
 
 	// Truncate datastore while running the controller. First, complete one
 	// successful data scan (CandidateServers). The next scan should trigger a
-	// datastore "failed" notice.
+	// datastore "failed" notice and datastore reset.
 
 	t.Logf("test: detect corrupt datastore while running")
 
@@ -274,6 +299,8 @@ func TestBoltResiliency(t *testing.T) {
 
 	<-noticeDatastoreFailed
 
+	<-noticeFailedResetDatastore
+
 	<-noticeExiting
 
 	stopController()
@@ -282,17 +309,15 @@ func TestBoltResiliency(t *testing.T) {
 
 	drainNoticeChannels()
 
-	// Restart successfully after previous failure shutdown.
+	// Restart successfully after previous failure reset and shutdown.
 
-	t.Logf("test: after restart, recover from datastore corrupted while running")
+	t.Logf("test: after restart, recover from reset datastore")
 
 	err = OpenDataStore(clientConfig)
 	if err != nil {
 		t.Fatalf("OpenDataStore failed: %s", err)
 	}
 
-	<-noticeResetDatastore
-
 	paveServerEntries()
 
 	stopController = startController()

+ 63 - 10
psiphon/dataStore_bolt.go

@@ -44,6 +44,7 @@ type datastoreDB struct {
 	boltDB   *bolt.DB
 	filename string
 	isFailed int32
+	isReset  int32
 }
 
 type datastoreTx struct {
@@ -141,17 +142,27 @@ func tryDatastoreOpenDB(
 	// Monitor freelist stats in DataStoreMetrics in diagnostics and consider
 	// setting these options if necessary.
 
-	newDB, err := bolt.Open(filename, 0600, &bolt.Options{Timeout: 1 * time.Second})
-	if err != nil {
-		return nil, errors.Trace(err)
-	}
+	// To avoid excessive delays, we now omit the blocking SynchronousCheck
+	// call here on startup. Datastore corruption is handled by panic/fault
+	// recovery and setDatastoreFailed/resetFailedDatastore.
+	//
+	// As a tradeoff, corrupt pages will be detected only when visited and
+	// there is no comprehensive datastore integrity guarantee at startup.
+	// This is an improvement in some cases, such as when the client can
+	// connect without visiting a corrupt page; whereas a reset will result
+	// in loss of all discovered servers, replay dial parameters, OSL SLOKs,
+	// etc.
+	//
+	// Also, any panic from bolt now results in a datastore reset, but
+	// non-explicit/corruption panics in bolt are very unlikely, with none
+	// observed in production with bolt code that's been stable for years.
+	//
+	// bolt.Open will still return ErrInvalid or ErrChecksum in datastore
+	// corruption cases, so the reset-on-Open-failed logic remains active.
+	// Note that ErrInvalid/ErrChecksum surface as panics in View/Update,
+	// after Open.
 
-	// Run consistency checks on datastore and emit errors for diagnostics
-	// purposes. We assume this will complete quickly for typical size Psiphon
-	// datastores and wait for the check to complete before proceeding.
-	err = newDB.View(func(tx *bolt.Tx) error {
-		return tx.SynchronousCheck()
-	})
+	newDB, err := bolt.Open(filename, 0600, &bolt.Options{Timeout: 1 * time.Second})
 	if err != nil {
 		return nil, errors.Trace(err)
 	}
@@ -225,6 +236,33 @@ func (db *datastoreDB) setDatastoreFailed(r interface{}) {
 	NoticeWarning("%s: %s", errDatastoreFailed.Error(), errors.Tracef("panic: %v", r))
 }
 
+func (db *datastoreDB) resetFailedDatastore() {
+
+	if !db.isDatastoreFailed() {
+		return
+	}
+	if !atomic.CompareAndSwapInt32(&db.isReset, 0, 1) {
+		return
+	}
+
+	// Limitation: only this single attempt is made to close and delete the
+	// datastore in this process run. It's unlikely to fail. A subsequent run
+	// that hits the consistency check panic functions as a retry.
+
+	err := db.close()
+	if err != nil {
+		NoticeWarning(errors.Trace(err).Error())
+		return
+	}
+	err = os.Remove(db.filename)
+	if err != nil {
+		NoticeWarning(errors.Trace(err).Error())
+		return
+	}
+
+	NoticeWarning("reset failed datastore")
+}
+
 func (db *datastoreDB) close() error {
 
 	// Limitation: there is no panic recover in this case. We assume boltDB.Close
@@ -254,6 +292,18 @@ func (db *datastoreDB) getDataStoreMetrics() string {
 
 func (db *datastoreDB) view(fn func(tx *datastoreTx) error) (reterr error) {
 
+	// If the datastore failed during this transaction, attempt to close and
+	// then delete/reset the datastore so that it may be freshly recreated by
+	// the next Open. This is invoked here in view (and update) and not in
+	// setDatastoreFailed to avoid lock reentrancy when calling db.close.
+	//
+	// The higher level activeDatastoreDB and datastoreReferenceCount are not
+	// modified, so from that perspective the datastore is still open and
+	// operations will fail with the "datastore has failed" error.
+	defer db.resetFailedDatastore()
+
+	// In general, bolt code panics on failed datastore consistency checks.
+	//
 	// Any bolt function that performs mmap buffer accesses can raise SIGBUS due
 	// to underlying storage changes, such as a truncation of the datastore file
 	// or removal of network attached storage, etc.
@@ -290,6 +340,9 @@ func (db *datastoreDB) view(fn func(tx *datastoreTx) error) (reterr error) {
 
 func (db *datastoreDB) update(fn func(tx *datastoreTx) error) (reterr error) {
 
+	// See resetFailedDatastore comment in datastoreDB.view.
+	defer db.resetFailedDatastore()
+
 	// Begin recovery preamble
 	if db.isDatastoreFailed() {
 		return errDatastoreFailed