Close partitioned replicas
secwall committed Aug 19, 2024
1 parent 279deff commit 1e0e995
Showing 4 changed files with 77 additions and 6 deletions.
31 changes: 27 additions & 4 deletions internal/app/lost.go
@@ -22,24 +22,47 @@ func (app *App) stateLost() appState {
return stateLost
}
if offline {
app.logger.Info("Rdsync have lost connection to ZK. However HA cluster is live. Setting local node online")
app.logger.Info("Rdsync have lost connection to ZK. However HA cluster is alive. Setting local node online")
err = node.SetOnline(app.ctx)
if err != nil {
app.logger.Error("Unable to set local node online", "error", err)
}
return stateLost
}
app.logger.Info("Rdsync have lost connection to ZK. However HA cluster is live. Do nothing")
app.logger.Info("Rdsync have lost connection to ZK. However HA cluster is alive. Do nothing")
return stateLost
}
} else {
shardState, err := app.getShardStateFromDB()
if err != nil {
app.logger.Error("Failed to get shard state from DB", "error", err)
return stateLost
}

app.logger.Info(fmt.Sprintf("Shard state: %v", shardState))
master, err := app.getMasterHost(shardState)
if err != nil || master == "" {
app.logger.Error("Failed to get master from shard state", "error", err)
} else {
app.logger.Info(fmt.Sprintf("Shard state: %v", shardState))
local := app.shard.Local()
offline, err := local.IsOffline(app.ctx)
if err != nil {
app.logger.Error("Failed to get node offline state", "fqdn", local.FQDN(), "error", err)
return stateLost
}
if shardState[master].PingOk && shardState[master].PingStable && replicates(shardState[master], shardState[local.FQDN()].ReplicaState, local.FQDN(), app.shard.Get(master), false) {
if offline {
app.logger.Info("Rdsync have lost connection to ZK. However our replication connection is alive. Setting local node online")
err = node.SetOnline(app.ctx)
if err != nil {
app.logger.Error("Unable to set local node online", "error", err)
}
return stateLost
}
app.logger.Info("Rdsync have lost connection to ZK. However our replication connection is alive. Do nothing")
return stateLost
}
}
return stateLost
}

offline, err := node.IsOffline(app.ctx)
4 changes: 2 additions & 2 deletions internal/app/replication.go
@@ -10,8 +10,8 @@ func replicates(masterState *HostState, replicaState *ReplicaState, replicaFQDN
if replicaState == nil || !(replicaState.MasterLinkState || allowSync) {
return false
}
if slices.Contains(masterState.ConnectedReplicas, replicaFQDN) {
if masterState != nil && slices.Contains(masterState.ConnectedReplicas, replicaFQDN) {
return true
}
return masterNode.MatchHost(replicaState.MasterHost)
return masterNode != nil && masterNode.MatchHost(replicaState.MasterHost)
}
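
For context, here is a minimal, self-contained Go sketch of the behaviour these two hunks introduce, using simplified stand-in types (the real HostState, ReplicaState and node types are defined elsewhere in internal/app and carry more fields). Only the nil guards in replicates follow the diff closely; mayStayOnline is a hypothetical condensation of the check added to stateLost above, with extra defensive nil checks on the map lookups:

```go
package main

import (
	"fmt"
	"slices"
)

// Simplified stand-ins, for illustration only.
type ReplicaState struct {
	MasterLinkState bool
	MasterHost      string
}

type HostState struct {
	PingOk            bool
	PingStable        bool
	ConnectedReplicas []string
	ReplicaState      *ReplicaState
}

type node struct{ fqdn string }

func (n *node) MatchHost(host string) bool { return n.fqdn == host }

// replicates mirrors the nil-guarded version from this commit: a nil
// masterState or nil masterNode now short-circuits to false instead of
// dereferencing a nil pointer.
func replicates(masterState *HostState, replicaState *ReplicaState, replicaFQDN string, masterNode *node, allowSync bool) bool {
	if replicaState == nil || !(replicaState.MasterLinkState || allowSync) {
		return false
	}
	if masterState != nil && slices.Contains(masterState.ConnectedReplicas, replicaFQDN) {
		return true
	}
	return masterNode != nil && masterNode.MatchHost(replicaState.MasterHost)
}

// mayStayOnline is a hypothetical condensation of the new stateLost branch:
// with ZK unreachable, a replica is kept online only if the master reported
// by the local shard state still answers pings and the replica still
// replicates from it.
func mayStayOnline(shardState map[string]*HostState, master, local string, masterNode *node) bool {
	ms, ls := shardState[master], shardState[local]
	if ms == nil || ls == nil || !ms.PingOk || !ms.PingStable {
		return false
	}
	return replicates(ms, ls.ReplicaState, local, masterNode, false)
}

func main() {
	rs := &ReplicaState{MasterLinkState: true, MasterHost: "redis1"}
	state := map[string]*HostState{
		"redis1": {PingOk: true, PingStable: true, ConnectedReplicas: []string{"redis2"}},
		"redis2": {PingOk: true, PingStable: true, ReplicaState: rs},
	}
	// Healthy replication link to a reachable master: the replica may stay online.
	fmt.Println(mayStayOnline(state, "redis1", "redis2", &node{fqdn: "redis1"})) // true
	// Master unresolved (fully partitioned replica): previously a nil-pointer
	// risk inside replicates, now simply false, so the node is not set online.
	fmt.Println(mayStayOnline(state, "unknown", "redis2", nil)) // false
}
```

In the false case the replica is simply never brought back online while the DCS is unreachable, which is what lets rdsync close off a partitioned replica, as exercised by the two new feature scenarios below.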
24 changes: 24 additions & 0 deletions tests/features/06_cluster_lost.feature
@@ -52,6 +52,30 @@ Feature: Cluster mode survives dcs conn loss
And host "redis3" is attached to the network
Then redis host "redis1" should become available within "60" seconds

Scenario: Cluster mode partitioned replica goes offline
Given clustered shard is up and running
Then redis host "redis1" should be master
And redis host "redis2" should become replica of "redis1" within "15" seconds
And replication on redis host "redis2" should run fine within "15" seconds
And redis host "redis3" should become replica of "redis1" within "15" seconds
And replication on redis host "redis3" should run fine within "15" seconds
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
"""
["redis1","redis2","redis3"]
"""
When host "zoo3" is detached from the network
And host "zoo2" is detached from the network
And host "zoo1" is detached from the network
And host "redis1" is detached from the network
And host "redis3" is detached from the network
Then redis host "redis2" should become unavailable within "30" seconds
When host "zoo3" is attached to the network
And host "zoo2" is attached to the network
And host "zoo1" is attached to the network
And host "redis1" is attached to the network
And host "redis3" is attached to the network
Then redis host "redis2" should become available within "60" seconds

Scenario: Cluster mode partially partitioned manager gives up on manager role
Given clustered shard is up and running
Then redis host "redis1" should be master
24 changes: 24 additions & 0 deletions tests/features/06_sentinel_lost.feature
@@ -52,6 +52,30 @@ Feature: Sentinel mode survives dcs conn loss
And host "redis3" is attached to the network
Then redis host "redis1" should become available within "60" seconds

Scenario: Sentinel mode partitioned replica goes offline
Given sentinel shard is up and running
Then redis host "redis1" should be master
And redis host "redis2" should become replica of "redis1" within "15" seconds
And replication on redis host "redis2" should run fine within "15" seconds
And redis host "redis3" should become replica of "redis1" within "15" seconds
And replication on redis host "redis3" should run fine within "15" seconds
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
"""
["redis1","redis2","redis3"]
"""
When host "zoo3" is detached from the network
And host "zoo2" is detached from the network
And host "zoo1" is detached from the network
And host "redis1" is detached from the network
And host "redis3" is detached from the network
Then redis host "redis2" should become unavailable within "30" seconds
When host "zoo3" is attached to the network
And host "zoo2" is attached to the network
And host "zoo1" is attached to the network
And host "redis1" is attached to the network
And host "redis3" is attached to the network
Then redis host "redis2" should become available within "60" seconds

Scenario: Sentinel mode partially partitioned manager gives up on manager role
Given sentinel shard is up and running
Then redis host "redis1" should be master
