Close partitioned replicas
secwall committed Aug 19, 2024
1 parent 279deff commit 1e0e995
Showing 4 changed files with 77 additions and 6 deletions.
31 changes: 27 additions & 4 deletions internal/app/lost.go
@@ -22,24 +22,47 @@ func (app *App) stateLost() appState {
return stateLost
}
if offline {
app.logger.Info("Rdsync have lost connection to ZK. However HA cluster is live. Setting local node online")
app.logger.Info("Rdsync have lost connection to ZK. However HA cluster is alive. Setting local node online")
err = node.SetOnline(app.ctx)
if err != nil {
app.logger.Error("Unable to set local node online", "error", err)
}
return stateLost
}
app.logger.Info("Rdsync have lost connection to ZK. However HA cluster is live. Do nothing")
app.logger.Info("Rdsync have lost connection to ZK. However HA cluster is alive. Do nothing")
return stateLost
}
} else {
shardState, err := app.getShardStateFromDB()
if err != nil {
app.logger.Error("Failed to get shard state from DB", "error", err)
return stateLost
}

app.logger.Info(fmt.Sprintf("Shard state: %v", shardState))
master, err := app.getMasterHost(shardState)
if err != nil || master == "" {
app.logger.Error("Failed to get master from shard state", "error", err)
} else {
app.logger.Info(fmt.Sprintf("Shard state: %v", shardState))
local := app.shard.Local()
offline, err := local.IsOffline(app.ctx)
if err != nil {
app.logger.Error("Failed to get node offline state", "fqdn", local.FQDN(), "error", err)
return stateLost
}
if shardState[master].PingOk && shardState[master].PingStable && replicates(shardState[master], shardState[local.FQDN()].ReplicaState, local.FQDN(), app.shard.Get(master), false) {
if offline {
app.logger.Info("Rdsync have lost connection to ZK. However our replication connection is alive. Setting local node online")
err = node.SetOnline(app.ctx)
if err != nil {
app.logger.Error("Unable to set local node online", "error", err)
}
return stateLost
}
app.logger.Info("Rdsync have lost connection to ZK. However our replication connection is alive. Do nothing")
return stateLost
}
}
return stateLost
}

offline, err := node.IsOffline(app.ctx)
4 changes: 2 additions & 2 deletions internal/app/replication.go
@@ -10,8 +10,8 @@ func replicates(masterState *HostState, replicaState *ReplicaState, replicaFQDN
if replicaState == nil || !(replicaState.MasterLinkState || allowSync) {
return false
}
if slices.Contains(masterState.ConnectedReplicas, replicaFQDN) {
if masterState != nil && slices.Contains(masterState.ConnectedReplicas, replicaFQDN) {
return true
}
return masterNode.MatchHost(replicaState.MasterHost)
return masterNode != nil && masterNode.MatchHost(replicaState.MasterHost)
}
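
For context, here is a minimal, self-contained Go sketch of the behaviour these two hunks introduce, using simplified stand-in types (the real HostState, ReplicaState and node types are defined elsewhere in internal/app and carry more fields). Only the nil guards in replicates follow the diff closely; mayStayOnline is a hypothetical condensation of the check added to stateLost above, with extra defensive nil checks on the map lookups:

```go
package main

import (
	"fmt"
	"slices"
)

// Simplified stand-ins, for illustration only.
type ReplicaState struct {
	MasterLinkState bool
	MasterHost      string
}

type HostState struct {
	PingOk            bool
	PingStable        bool
	ConnectedReplicas []string
	ReplicaState      *ReplicaState
}

type node struct{ fqdn string }

func (n *node) MatchHost(host string) bool { return n.fqdn == host }

// replicates mirrors the nil-guarded version from this commit: a nil
// masterState or nil masterNode now short-circuits to false instead of
// dereferencing a nil pointer.
func replicates(masterState *HostState, replicaState *ReplicaState, replicaFQDN string, masterNode *node, allowSync bool) bool {
	if replicaState == nil || !(replicaState.MasterLinkState || allowSync) {
		return false
	}
	if masterState != nil && slices.Contains(masterState.ConnectedReplicas, replicaFQDN) {
		return true
	}
	return masterNode != nil && masterNode.MatchHost(replicaState.MasterHost)
}

// mayStayOnline is a hypothetical condensation of the new stateLost branch:
// with ZK unreachable, a replica is kept online only if the master reported
// by the local shard state still answers pings and the replica still
// replicates from it.
func mayStayOnline(shardState map[string]*HostState, master, local string, masterNode *node) bool {
	ms, ls := shardState[master], shardState[local]
	if ms == nil || ls == nil || !ms.PingOk || !ms.PingStable {
		return false
	}
	return replicates(ms, ls.ReplicaState, local, masterNode, false)
}

func main() {
	rs := &ReplicaState{MasterLinkState: true, MasterHost: "redis1"}
	state := map[string]*HostState{
		"redis1": {PingOk: true, PingStable: true, ConnectedReplicas: []string{"redis2"}},
		"redis2": {PingOk: true, PingStable: true, ReplicaState: rs},
	}
	// Healthy replication link to a reachable master: the replica may stay online.
	fmt.Println(mayStayOnline(state, "redis1", "redis2", &node{fqdn: "redis1"})) // true
	// Master unresolved (fully partitioned replica): previously a nil-pointer
	// risk inside replicates, now simply false, so the node is not set online.
	fmt.Println(mayStayOnline(state, "unknown", "redis2", nil)) // false
}
```

In the false case the replica is simply never brought back online while the DCS is unreachable, which is what lets rdsync close off a partitioned replica, as exercised by the two new feature scenarios below.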
24 changes: 24 additions & 0 deletions tests/features/06_cluster_lost.feature
@@ -52,6 +52,30 @@ Feature: Cluster mode survives dcs conn loss
And host "redis3" is attached to the network
Then redis host "redis1" should become available within "60" seconds

Scenario: Cluster mode partitioned replica goes offline
Given clustered shard is up and running
Then redis host "redis1" should be master
And redis host "redis2" should become replica of "redis1" within "15" seconds
And replication on redis host "redis2" should run fine within "15" seconds
And redis host "redis3" should become replica of "redis1" within "15" seconds
And replication on redis host "redis3" should run fine within "15" seconds
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
"""
["redis1","redis2","redis3"]
"""
When host "zoo3" is detached from the network
And host "zoo2" is detached from the network
And host "zoo1" is detached from the network
And host "redis1" is detached from the network
And host "redis3" is detached from the network
Then redis host "redis2" should become unavailable within "30" seconds
When host "zoo3" is attached to the network
And host "zoo2" is attached to the network
And host "zoo1" is attached to the network
And host "redis1" is attached to the network
And host "redis3" is attached to the network
Then redis host "redis2" should become available within "60" seconds

Scenario: Cluster mode partially partitioned manager gives up on manager role
Given clustered shard is up and running
Then redis host "redis1" should be master
24 changes: 24 additions & 0 deletions tests/features/06_sentinel_lost.feature
@@ -52,6 +52,30 @@ Feature: Sentinel mode survives dcs conn loss
And host "redis3" is attached to the network
Then redis host "redis1" should become available within "60" seconds

Scenario: Sentinel mode partitioned replica goes offline
Given sentinel shard is up and running
Then redis host "redis1" should be master
And redis host "redis2" should become replica of "redis1" within "15" seconds
And replication on redis host "redis2" should run fine within "15" seconds
And redis host "redis3" should become replica of "redis1" within "15" seconds
And replication on redis host "redis3" should run fine within "15" seconds
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
"""
["redis1","redis2","redis3"]
"""
When host "zoo3" is detached from the network
And host "zoo2" is detached from the network
And host "zoo1" is detached from the network
And host "redis1" is detached from the network
And host "redis3" is detached from the network
Then redis host "redis2" should become unavailable within "30" seconds
When host "zoo3" is attached to the network
And host "zoo2" is attached to the network
And host "zoo1" is attached to the network
And host "redis1" is attached to the network
And host "redis3" is attached to the network
Then redis host "redis2" should become available within "60" seconds

Scenario: Sentinel mode partially partitioned manager gives up on manager role
Given sentinel shard is up and running
Then redis host "redis1" should be master
