Browse Source

Merge pull request #572 from rod-hynes/master

Fix: spurious datastore resets
Rod Hynes 5 years ago
parent
commit
eba564367e
4 changed files with 62 additions and 23 deletions
  1. 43 17
      psiphon/dataStore.go
  2. 2 1
      psiphon/dataStore_badger.go
  3. 15 4
      psiphon/dataStore_bolt.go
  4. 2 1
      psiphon/dataStore_files.go

+ 43 - 17
psiphon/dataStore.go

@@ -66,6 +66,17 @@ var (
 // must be paired with a corresponding call to CloseDataStore to ensure the
 // datastore is closed.
 func OpenDataStore(config *Config) error {
+	return openDataStore(config, true)
+}
+
+// OpenDataStoreWithoutReset performs an OpenDataStore but does not retry or
+// reset the datastore file in case of failures. Use OpenDataStoreWithoutReset
+// when the datastore may be locked by another process.
+func OpenDataStoreWithoutReset(config *Config) error {
+	return openDataStore(config, false)
+}
+
+func openDataStore(config *Config, retryAndReset bool) error {
 
 	datastoreMutex.Lock()
 
@@ -74,31 +85,37 @@ func OpenDataStore(config *Config) error {
 		return errors.Tracef(
 			"invalid datastore reference count: %d", datastoreReferenceCount)
 	}
-	datastoreReferenceCount += 1
-	if datastoreReferenceCount > 1 {
-		var err error
+
+	if datastoreReferenceCount > 0 {
+
 		if activeDatastoreDB == nil {
-			err = errors.TraceNew("datastore unexpectedly closed")
+			datastoreMutex.Unlock()
+			return errors.TraceNew("datastore unexpectedly closed")
 		}
+
+		// Add a reference to the open datastore.
+
+		datastoreReferenceCount += 1
 		datastoreMutex.Unlock()
-		return err
+		return nil
 	}
 
-	existingDB := activeDatastoreDB
-
-	if existingDB != nil {
+	if activeDatastoreDB != nil {
 		datastoreMutex.Unlock()
 		return errors.TraceNew("datastore unexpectedly open")
 	}
 
-	newDB, err := datastoreOpenDB(config.GetDataStoreDirectory())
+	// datastoreReferenceCount is 0, so open the datastore.
+
+	newDB, err := datastoreOpenDB(
+		config.GetDataStoreDirectory(), retryAndReset)
 	if err != nil {
 		datastoreMutex.Unlock()
 		return errors.Trace(err)
 	}
 
+	datastoreReferenceCount = 1
 	activeDatastoreDB = newDB
-
 	datastoreMutex.Unlock()
 
 	_ = resetAllPersistentStatsToUnreported()
@@ -603,7 +620,7 @@ func (iterator *ServerEntryIterator) reset(isInitialRound bool) error {
 	// Support stand-alone GetTactics operation. See TacticsStorer for more
 	// details.
 	if iterator.isTacticsServerEntryIterator {
-		err := OpenDataStore(iterator.config)
+		err := OpenDataStoreWithoutReset(iterator.config)
 		if err != nil {
 			return errors.Trace(err)
 		}
@@ -753,7 +770,7 @@ func (iterator *ServerEntryIterator) Next() (*protocol.ServerEntry, error) {
 	// Support stand-alone GetTactics operation. See TacticsStorer for more
 	// details.
 	if iterator.isTacticsServerEntryIterator {
-		err := OpenDataStore(iterator.config)
+		err := OpenDataStoreWithoutReset(iterator.config)
 		if err != nil {
 			return nil, errors.Trace(err)
 		}
@@ -1845,7 +1862,7 @@ func DeleteDialParameters(serverIPAddress, networkID string) error {
 // TacticsStorer implements tactics.Storer.
 //
 // Each TacticsStorer datastore operation is wrapped with
-// OpenDataStore/CloseDataStore, which enables a limited degree of
+// OpenDataStoreWithoutReset/CloseDataStore, which enables a limited degree of
 // multiprocess datastore synchronization:
 //
 // One process runs a Controller. Another process runs a stand-alone operation
@@ -1871,12 +1888,21 @@ func DeleteDialParameters(serverIPAddress, networkID string) error {
 // that is, if a different process writes tactics in between GetTactics calls
 // to GetTacticsRecord and then SetTacticsRecord. This is because all tactics
 // writes are considered fresh and valid.
+//
+//
+// Using OpenDataStoreWithoutReset ensures that the GetTactics attempt in the
+// non-Controller operation will immediately fail if the datastore is locked
+// and not reset (delete) the datastore file when open fails. The Controller
+// process will use OpenDataStore, which performs the reset on failure, to
+// recover from datastore corruption; when OpenDataStore is called while the
+// non-Controller operation holds a datastore lock, the OpenDataStore timeout,
+// 1s, should be sufficient to avoid an unnecessary reset.
 type TacticsStorer struct {
 	config *Config
 }
 
 func (t *TacticsStorer) SetTacticsRecord(networkID string, record []byte) error {
-	err := OpenDataStore(t.config)
+	err := OpenDataStoreWithoutReset(t.config)
 	if err != nil {
 		return errors.Trace(err)
 	}
@@ -1889,7 +1915,7 @@ func (t *TacticsStorer) SetTacticsRecord(networkID string, record []byte) error
 }
 
 func (t *TacticsStorer) GetTacticsRecord(networkID string) ([]byte, error) {
-	err := OpenDataStore(t.config)
+	err := OpenDataStoreWithoutReset(t.config)
 	if err != nil {
 		return nil, errors.Trace(err)
 	}
@@ -1902,7 +1928,7 @@ func (t *TacticsStorer) GetTacticsRecord(networkID string) ([]byte, error) {
 }
 
 func (t *TacticsStorer) SetSpeedTestSamplesRecord(networkID string, record []byte) error {
-	err := OpenDataStore(t.config)
+	err := OpenDataStoreWithoutReset(t.config)
 	if err != nil {
 		return errors.Trace(err)
 	}
@@ -1915,7 +1941,7 @@ func (t *TacticsStorer) SetSpeedTestSamplesRecord(networkID string, record []byt
 }
 
 func (t *TacticsStorer) GetSpeedTestSamplesRecord(networkID string) ([]byte, error) {
-	err := OpenDataStore(t.config)
+	err := OpenDataStoreWithoutReset(t.config)
 	if err != nil {
 		return nil, errors.Trace(err)
 	}

+ 2 - 1
psiphon/dataStore_badger.go

@@ -52,7 +52,8 @@ type datastoreCursor struct {
 	prefix         []byte
 }
 
-func datastoreOpenDB(rootDataDirectory string) (*datastoreDB, error) {
+func datastoreOpenDB(
+	rootDataDirectory string, _ bool) (*datastoreDB, error) {
 
 	dbDirectory := filepath.Join(rootDataDirectory, "psiphon.badgerdb")
 

+ 15 - 4
psiphon/dataStore_bolt.go

@@ -33,6 +33,10 @@ import (
 	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/errors"
 )
 
+const (
+	OPEN_DB_RETRIES = 2
+)
+
 type datastoreDB struct {
 	boltDB   *bolt.DB
 	isFailed int32
@@ -53,14 +57,20 @@ type datastoreCursor struct {
 	boltCursor *bolt.Cursor
 }
 
-func datastoreOpenDB(rootDataDirectory string) (*datastoreDB, error) {
+func datastoreOpenDB(
+	rootDataDirectory string, retryAndReset bool) (*datastoreDB, error) {
 
 	var db *datastoreDB
 	var err error
 
-	for retry := 0; retry < 3; retry++ {
+	attempts := 1
+	if retryAndReset {
+		attempts += OPEN_DB_RETRIES
+	}
+
+	for attempt := 0; attempt < attempts; attempt++ {
 
-		db, err = tryDatastoreOpenDB(rootDataDirectory, retry > 0)
+		db, err = tryDatastoreOpenDB(rootDataDirectory, attempt > 0)
 		if err == nil {
 			break
 		}
@@ -74,7 +84,8 @@ func datastoreOpenDB(rootDataDirectory string) (*datastoreDB, error) {
 	return db, err
 }
 
-func tryDatastoreOpenDB(rootDataDirectory string, reset bool) (retdb *datastoreDB, reterr error) {
+func tryDatastoreOpenDB(
+	rootDataDirectory string, reset bool) (retdb *datastoreDB, reterr error) {
 
 	// Testing indicates that the bolt Check function can raise SIGSEGV due to
 	// invalid mmap buffer accesses in cases such as opening a valid but

+ 2 - 1
psiphon/dataStore_files.go

@@ -75,7 +75,8 @@ type datastoreCursor struct {
 	lastBuffer *bytes.Buffer
 }
 
-func datastoreOpenDB(rootDataDirectory string) (*datastoreDB, error) {
+func datastoreOpenDB(
+	rootDataDirectory string, _ bool) (*datastoreDB, error) {
 
 	dataDirectory := filepath.Join(rootDataDirectory, "psiphon.filesdb")
 	err := os.MkdirAll(dataDirectory, 0700)