From e7fee16236a41958decb14c2557f64687c386090 Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Tue, 30 Aug 2022 16:56:56 +0530 Subject: [PATCH 1/7] refactor: refactor vtorc tests to run as a single test with sub-tests (#11108) Signed-off-by: Manan Gupta Signed-off-by: Manan Gupta --- go/test/endtoend/vtorc/general/main_test.go | 4 +- go/test/endtoend/vtorc/general/vtorc_test.go | 222 +++++++------------ 2 files changed, 81 insertions(+), 145 deletions(-) diff --git a/go/test/endtoend/vtorc/general/main_test.go b/go/test/endtoend/vtorc/general/main_test.go index 018e6da21fa..c52502d7c9b 100644 --- a/go/test/endtoend/vtorc/general/main_test.go +++ b/go/test/endtoend/vtorc/general/main_test.go @@ -32,8 +32,8 @@ func TestMain(m *testing.M) { var cellInfos []*utils.CellInfo cellInfos = append(cellInfos, &utils.CellInfo{ CellName: utils.Cell1, - NumReplicas: 6, - NumRdonly: 2, + NumReplicas: 4, + NumRdonly: 1, UIDBase: 100, }) diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 560e351fb31..f2509713208 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -90,94 +90,13 @@ func TestKeyspaceShard(t *testing.T) { utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) } -// 3. make primary readonly, let orc repair -func TestPrimaryReadOnly(t *testing.T) { - defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - }, 1, "") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - - // find primary from topo - curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) - assert.NotNil(t, curPrimary, "should have elected a primary") - - // Make the current primary database read-only. - _, err := utils.RunSQL(t, "set global read_only=ON", curPrimary, "") - require.NoError(t, err) - - // wait for repair - match := utils.WaitForReadOnlyValue(t, curPrimary, 0) - require.True(t, match) -} - -// 4. make replica ReadWrite, let orc repair -func TestReplicaReadWrite(t *testing.T) { - defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - }, 1, "") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - - // find primary from topo - curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) - assert.NotNil(t, curPrimary, "should have elected a primary") - - var replica *cluster.Vttablet - for _, tablet := range shard0.Vttablets { - // we know we have only two tablets, so the "other" one must be the new primary - if tablet.Alias != curPrimary.Alias { - replica = tablet - break - } - } - // Make the replica database read-write. - _, err := utils.RunSQL(t, "set global read_only=OFF", replica, "") - require.NoError(t, err) - - // wait for repair - match := utils.WaitForReadOnlyValue(t, replica, 1) - require.True(t, match) -} - -// 5. 
stop replication, let orc repair -func TestStopReplication(t *testing.T) { - defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - }, 1, "") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - - // find primary from topo - curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) - assert.NotNil(t, curPrimary, "should have elected a primary") - - // TODO(deepthi): we should not need to do this, the DB should be created automatically - _, err := curPrimary.VttabletProcess.QueryTablet(fmt.Sprintf("create database IF NOT EXISTS vt_%s", keyspace.Name), keyspace.Name, false) - require.NoError(t, err) - - var replica *cluster.Vttablet - for _, tablet := range shard0.Vttablets { - // we know we have only two tablets, so the "other" one must be the new primary - if tablet.Alias != curPrimary.Alias { - replica = tablet - break - } - } - require.NotNil(t, replica, "should be able to find a replica") - // use vtctlclient to stop replication - _, err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) - require.NoError(t, err) - - // check replication is setup correctly - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 15*time.Second) -} - -// 6. setup replication from non-primary, let orc repair -func TestReplicationFromOtherReplica(t *testing.T) { +// Cases to test: +// 1. make primary readonly, let vtorc repair +// 2. make replica ReadWrite, let vtorc repair +// 3. stop replication, let vtorc repair +// 4. setup replication from non-primary, let vtorc repair +// 5. make instance A replicates from B and B from A, wait for repair +func TestVTOrcRepairs(t *testing.T) { defer cluster.PanicHandler(t) utils.SetupVttabletsAndVtorc(t, clusterInfo, 3, 0, nil, cluster.VtorcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -189,10 +108,6 @@ func TestReplicationFromOtherReplica(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") - // TODO(deepthi): we should not need to do this, the DB should be created automatically - _, err := curPrimary.VttabletProcess.QueryTablet(fmt.Sprintf("create database IF NOT EXISTS vt_%s", keyspace.Name), keyspace.Name, false) - require.NoError(t, err) - var replica, otherReplica *cluster.Vttablet for _, tablet := range shard0.Vttablets { // we know we have only two tablets, so the "other" one must be the new primary @@ -210,17 +125,80 @@ func TestReplicationFromOtherReplica(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - // point replica at otherReplica - changeReplicationSourceCommand := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ - "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1; START SLAVE", utils.Hostname, otherReplica.MySQLPort) - _, err = utils.RunSQL(t, changeReplicationSourceCommand, replica, "") - require.NoError(t, err) + t.Run("PrimaryReadOnly", func(t *testing.T) { + // Make the current primary database read-only. 
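		// VTOrc is expected to notice the read-only primary and make it writable again;
		// WaitForReadOnlyValue below waits until the primary's read_only value is back to 0 (writable).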
+ _, err := utils.RunSQL(t, "set global read_only=ON", curPrimary, "") + require.NoError(t, err) + + // wait for repair + match := utils.WaitForReadOnlyValue(t, curPrimary, 0) + require.True(t, match) + }) + + t.Run("ReplicaReadWrite", func(t *testing.T) { + // Make the replica database read-write. + _, err := utils.RunSQL(t, "set global read_only=OFF", replica, "") + require.NoError(t, err) + + // wait for repair + match := utils.WaitForReadOnlyValue(t, replica, 1) + require.True(t, match) + }) + + t.Run("StopReplication", func(t *testing.T) { + // use vtctlclient to stop replication + _, err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) + require.NoError(t, err) + + // check replication is setup correctly + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) + + // Stop just the IO thread on the replica + _, err = utils.RunSQL(t, "STOP SLAVE IO_THREAD", replica, "") + require.NoError(t, err) + + // check replication is setup correctly + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) + + // Stop just the SQL thread on the replica + _, err = utils.RunSQL(t, "STOP SLAVE SQL_THREAD", replica, "") + require.NoError(t, err) + + // check replication is setup correctly + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) + }) + + t.Run("ReplicationFromOtherReplica", func(t *testing.T) { + // point replica at otherReplica + changeReplicationSourceCommand := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ + "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1; START SLAVE", utils.Hostname, otherReplica.MySQLPort) + _, err := utils.RunSQL(t, changeReplicationSourceCommand, replica, "") + require.NoError(t, err) + + // wait until the source port is set back correctly by vtorc + utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second) + + // check that writes succeed + utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) + }) - // wait until the source port is set back correctly by vtorc - utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second) + t.Run("CircularReplication", func(t *testing.T) { + // change the replication source on the primary + changeReplicationSourceCommands := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ + "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ + "START SLAVE;", replica.VttabletProcess.TabletHostname, replica.MySQLPort) + _, err := utils.RunSQL(t, changeReplicationSourceCommands, curPrimary, "") + require.NoError(t, err) + + // wait for curPrimary to reach stable state + time.Sleep(1 * time.Second) - // check that writes succeed - utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) + // wait for repair + err = utils.WaitForReplicationToStop(t, curPrimary) + require.NoError(t, err) + // check that the writes still succeed + utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 10*time.Second) + }) } func TestRepairAfterTER(t *testing.T) { @@ -257,48 +235,6 @@ func TestRepairAfterTER(t *testing.T) { utils.CheckReplication(t, clusterInfo, newPrimary, []*cluster.Vttablet{curPrimary}, 15*time.Second) } -// 7. 
make instance A replicates from B and B from A, wait for repair -func TestCircularReplication(t *testing.T) { - defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - }, 1, "") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - - // find primary from topo - primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) - assert.NotNil(t, primary, "should have elected a primary") - - var replica *cluster.Vttablet - for _, tablet := range shard0.Vttablets { - // we know we have only two tablets, so the "other" one must be the new primary - if tablet.Alias != primary.Alias { - replica = tablet - break - } - } - - // check replication is setup correctly - utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{replica}, 15*time.Second) - - // change the replication source on the primary - changeReplicationSourceCommands := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ - "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ - "START SLAVE;", replica.VttabletProcess.TabletHostname, replica.MySQLPort) - _, err := utils.RunSQL(t, changeReplicationSourceCommands, primary, "") - require.NoError(t, err) - - // wait for primary to reach stable state - time.Sleep(1 * time.Second) - - // wait for repair - err = utils.WaitForReplicationToStop(t, primary) - require.NoError(t, err) - // check that the writes still succeed - utils.VerifyWritesSucceed(t, clusterInfo, primary, []*cluster.Vttablet{replica}, 10*time.Second) -} - // TestSemiSync tests that semi-sync is setup correctly by vtorc if it is incorrectly set func TestSemiSync(t *testing.T) { // stop any vtorc instance running due to a previous test. 
From 377eb2051d130b0227edd1edff5cc31e79d17d98 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 28 Sep 2023 11:13:27 +0200 Subject: [PATCH 2/7] `vtctld`/`vtorc`: improve reparenting stats (#13723) Signed-off-by: Tim Vaillancourt Signed-off-by: Manan Gupta Co-authored-by: Manan Gupta --- go/stats/timings.go | 4 +- go/test/endtoend/vtorc/general/vtorc_test.go | 20 ++ .../primaryfailure/primary_failure_test.go | 230 +++++++++++++- go/test/endtoend/vtorc/utils/utils.go | 141 +++++++++ .../reparentutil/emergency_reparenter.go | 24 +- .../reparentutil/emergency_reparenter_test.go | 30 +- .../vtctl/reparentutil/planned_reparenter.go | 16 + .../planned_reparenter_flaky_test.go | 291 ++++++++++++++++++ go/vt/vtctl/reparentutil/util.go | 7 + 9 files changed, 744 insertions(+), 19 deletions(-) diff --git a/go/stats/timings.go b/go/stats/timings.go index 9048bedee11..cd51ec77a66 100644 --- a/go/stats/timings.go +++ b/go/stats/timings.go @@ -62,10 +62,12 @@ func NewTimings(name, help, label string, categories ...string) *Timings { return t } -// Reset will clear histograms: used during testing +// Reset will clear histograms and counters: used during testing func (t *Timings) Reset() { t.mu.RLock() t.histograms = make(map[string]*Histogram) + t.totalCount.Set(0) + t.totalTime.Set(0) t.mu.RUnlock() } diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index f2509713208..2f2ca2d16ba 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -28,6 +28,7 @@ import ( "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/test/endtoend/vtorc/utils" "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/vtorc/logic" ) // Cases to test: @@ -72,6 +73,8 @@ func TestSingleKeyspace(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], keyspace.Name, shard0.Name, 1) } // Cases to test: @@ -88,6 +91,8 @@ func TestKeyspaceShard(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], keyspace.Name, shard0.Name, 1) } // Cases to test: @@ -107,6 +112,9 @@ func TestVTOrcRepairs(t *testing.T) { // find primary from topo curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") + vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) var replica, otherReplica *cluster.Vttablet for _, tablet := range shard0.Vttablets { @@ -314,6 +322,9 @@ func TestVtorcWithPrs(t *testing.T) { // find primary from topo curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") + vtOrcProcess := 
clusterInfo.ClusterInstance.VtorcProcesses[0] + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find any replica tablet other than the current primary var replica *cluster.Vttablet @@ -339,6 +350,15 @@ func TestVtorcWithPrs(t *testing.T) { // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + + // Verify that VTOrc didn't run any other recovery + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0) utils.VerifyWritesSucceed(t, clusterInfo, replica, shard0.Vttablets, 10*time.Second) } diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index a922db9f010..32c7babe33b 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -17,6 +17,8 @@ limitations under the License. package primaryfailure import ( + "fmt" + "path" "testing" "time" @@ -40,6 +42,70 @@ func TestDownPrimary(t *testing.T) { // find primary from topo curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") + vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) + + // find the replica and rdonly tablets + var replica, rdonly *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two replcia tablets, so the one not the primary must be the other replica + if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { + replica = tablet + } + if tablet.Type == "rdonly" { + rdonly = tablet + } + } + assert.NotNil(t, replica, "could not find replica tablet") + assert.NotNil(t, rdonly, "could not find rdonly tablet") + + // Start a cross-cell replica + crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) + + // check that the replication is setup correctly before we failover + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second) + // since all tablets are up and running, InstancePollSecondsExceeded should have `0` zero value + utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 0, true) + // Make the rdonly vttablet unavailable + err := rdonly.VttabletProcess.TearDown() + require.NoError(t, err) + err = rdonly.MysqlctlProcess.Stop() + require.NoError(t, err) + // We have bunch of Vttablets down. 
Therefore we expect at least 1 occurrence of InstancePollSecondsExceeded + utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 1, false) + // Make the current primary vttablet unavailable. + err = curPrimary.VttabletProcess.TearDown() + require.NoError(t, err) + err = curPrimary.MysqlctlProcess.Stop() + require.NoError(t, err) + defer func() { + // we remove the tablet from our global list + utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) + utils.PermanentlyRemoveVttablet(clusterInfo, rdonly) + }() + + // check that the replica gets promoted + utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + + // also check that the replication is working correctly after failover + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) +} + +// bring down primary before VTOrc has started, let vtorc repair. +func TestDownPrimaryBeforeVTOrc(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{}, 0, "none") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + curPrimary := shard0.Vttablets[0] + + // Promote the first tablet as the primary + err := clusterInfo.ClusterInstance.VtctlclientProcess.InitializeShard(keyspace.Name, shard0.Name, clusterInfo.ClusterInstance.Cell, curPrimary.TabletUID) + require.NoError(t, err) // find the replica and rdonly tablets var replica, rdonly *cluster.Vttablet @@ -58,7 +124,140 @@ func TestDownPrimary(t *testing.T) { // check that the replication is setup correctly before we failover utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica}, 10*time.Second) - // Make the current primary database unavailable. + // Make the current primary vttablet unavailable. + _ = curPrimary.VttabletProcess.TearDown() + err = curPrimary.MysqlctlProcess.Stop() + require.NoError(t, err) + + // Start a VTOrc instance + utils.StartVTOrcs(t, clusterInfo, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1) + + vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] + + defer func() { + // we remove the tablet from our global list + utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) + }() + + // check that the replica gets promoted + utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + + // also check that the replication is working correctly after failover + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) +} + +// delete the primary record and let vtorc repair. 
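// The replica is deliberately left one write behind (replication is stopped on it while a
// write lands on the primary and the rdonly) before the primary's tablet record is deleted,
// so the recovery must also bring the promoted replica fully up to date; CheckTabletUptoDate
// below verifies exactly that.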
+func TestDeletedPrimaryTablet(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{}, 1, "none") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) + + // find the replica and rdonly tablets + var replica, rdonly *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two replcia tablets, so the one not the primary must be the other replica + if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { + replica = tablet + } + if tablet.Type == "rdonly" { + rdonly = tablet + } + } + assert.NotNil(t, replica, "could not find replica tablet") + assert.NotNil(t, rdonly, "could not find rdonly tablet") + + // check that the replication is setup correctly before we failover + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second) + + // Disable VTOrc recoveries + vtOrcProcess.DisableGlobalRecoveries(t) + // use vtctlclient to stop replication on the replica + _, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) + require.NoError(t, err) + // insert a write that is not available on the replica. + utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly}, 10*time.Second) + + // Make the current primary vttablet unavailable and delete its tablet record. + _ = curPrimary.VttabletProcess.TearDown() + err = curPrimary.MysqlctlProcess.Stop() + require.NoError(t, err) + // use vtctlclient to start replication on the replica back + _, err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StartReplication", replica.Alias) + require.NoError(t, err) + err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommand("DeleteTablets", "--allow-primary", curPrimary.Alias) + require.NoError(t, err) + // Enable VTOrc recoveries now + vtOrcProcess.EnableGlobalRecoveries(t) + + defer func() { + // we remove the tablet from our global list + utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) + }() + + // check that the replica gets promoted. Also verify that it has all the writes. + utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + utils.CheckTabletUptoDate(t, clusterInfo, replica) + + // also check that the replication is working correctly after failover + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryTabletDeletedRecoveryName, 1) +} + +// TestDeadPrimaryRecoversImmediately test Vtorc ability to recover immediately if primary is dead. +// Reason is, unlike other recoveries, in DeadPrimary we don't call DiscoverInstance since we know +// that primary is unreachable. This help us save few seconds depending on value of `RemoteOperationTimeout` flag. 
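The timing assertions in this test rely on an extractTimeFromLog helper that is not shown in
this patch. A minimal sketch of what such a helper could look like, assuming glog-style log
lines (for example "I1012 18:26:26.123456 ...") where the second whitespace-separated field
is the time of day, and assuming the os and strings packages are imported:

// extractTimeFromLog returns the time-of-day field of the first log line that contains the
// given substring. Hypothetical sketch; the real helper lives elsewhere in this test package.
func extractTimeFromLog(t *testing.T, logFile string, substring string) string {
	content, err := os.ReadFile(logFile)
	require.NoError(t, err)
	for _, line := range strings.Split(string(content), "\n") {
		if strings.Contains(line, substring) {
			fields := strings.Fields(line)
			if len(fields) >= 2 {
				return fields[1]
			}
		}
	}
	t.Fatalf("could not find %q in %s", substring, logFile)
	return ""
}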
+func TestDeadPrimaryRecoversImmediately(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) + defer cluster.PanicHandler(t) + // We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test. + // If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source. + // If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test. + utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1, "semi_sync") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) + + // find the replica and rdonly tablets + var replica, rdonly *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two replcia tablets, so the one not the primary must be the other replica + if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { + replica = tablet + } + if tablet.Type == "rdonly" { + rdonly = tablet + } + } + assert.NotNil(t, replica, "could not find replica tablet") + assert.NotNil(t, rdonly, "could not find rdonly tablet") + + // Start a cross-cell replica + crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) + + // check that the replication is setup correctly before we failover + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second) + + // Make the current primary vttablet unavailable. + curPrimary.VttabletProcess.Kill() err := curPrimary.MysqlctlProcess.Stop() require.NoError(t, err) defer func() { @@ -69,7 +268,34 @@ func TestDownPrimary(t *testing.T) { // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) // also check that the replication is working correctly after failover - utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) + + // Parse log file and find out how much time it took for DeadPrimary to recover. 
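	// Note: only the time of day is extracted from each matching log line, so the current
	// date is prepended before both stamps are parsed with the "2006-01-02 15:04:05.000000" layout.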
+ logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName) + // log prefix printed at the end of analysis where we conclude we have DeadPrimary + t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery") + // log prefix printed at the end of recovery + t2 := extractTimeFromLog(t, logFile, "auditType:RecoverDeadPrimary") + curr := time.Now().Format("2006-01-02") + timeLayout := "2006-01-02 15:04:05.000000" + timeStr1 := fmt.Sprintf("%s %s", curr, t1) + timeStr2 := fmt.Sprintf("%s %s", curr, t2) + time1, err := time.Parse(timeLayout, timeStr1) + if err != nil { + t.Errorf("unable to parse time %s", err.Error()) + } + time2, err := time.Parse(timeLayout, timeStr2) + if err != nil { + t.Errorf("unable to parse time %s", err.Error()) + } + diff := time2.Sub(time1) + fmt.Printf("The difference between %s and %s is %v seconds.\n", t1, t2, diff.Seconds()) + // assert that it takes less than `remote_operation_timeout` to recover from `DeadPrimary` + // use the value provided in `remote_operation_timeout` flag to compare with. + // We are testing against 9.5 seconds to be safe and prevent flakiness. + assert.Less(t, diff.Seconds(), 9.5) } // Failover should not be cross data centers, according to the configuration file diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 96075aef3c0..713f651203f 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -18,12 +18,16 @@ package utils import ( "context" + "encoding/json" "fmt" "io" + "math" "net/http" "os" "os/exec" "path" + "reflect" + "strconv" "strings" "testing" "time" @@ -915,3 +919,140 @@ func WaitForReadOnlyValue(t *testing.T, curPrimary *cluster.Vttablet, expectValu } return false } + +// WaitForSuccessfulRecoveryCount waits until the given recovery name's count of successful runs matches the count expected +func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, recoveryName string, countExpected int) { + t.Helper() + timeout := 15 * time.Second + startTime := time.Now() + for time.Since(startTime) < timeout { + vars := vtorcInstance.GetVars() + successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) + successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) + if successCount == countExpected { + return + } + time.Sleep(time.Second) + } + vars := vtorcInstance.GetVars() + successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) + successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) + assert.EqualValues(t, countExpected, successCount) +} + +// WaitForSuccessfulPRSCount waits until the given keyspace-shard's count of successful prs runs matches the count expected. 
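// The key polled here mirrors how the planned_reparent_counts multi-label counter added in
// this patch flattens its Keyspace/Shard/Result labels, e.g. a successful PRS for keyspace
// "ks" and shard "-80" is counted under the key "ks.-80.success".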
+func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, keyspace, shard string, countExpected int) { + t.Helper() + timeout := 15 * time.Second + startTime := time.Now() + mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard) + for time.Since(startTime) < timeout { + vars := vtorcInstance.GetVars() + prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{}) + successCount := getIntFromValue(prsCountsMap[mapKey]) + if successCount == countExpected { + return + } + time.Sleep(time.Second) + } + vars := vtorcInstance.GetVars() + prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{}) + successCount := getIntFromValue(prsCountsMap[mapKey]) + assert.EqualValues(t, countExpected, successCount) +} + +// WaitForSuccessfulERSCount waits until the given keyspace-shard's count of successful ers runs matches the count expected. +func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, keyspace, shard string, countExpected int) { + t.Helper() + timeout := 15 * time.Second + startTime := time.Now() + mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard) + for time.Since(startTime) < timeout { + vars := vtorcInstance.GetVars() + ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{}) + successCount := getIntFromValue(ersCountsMap[mapKey]) + if successCount == countExpected { + return + } + time.Sleep(time.Second) + } + vars := vtorcInstance.GetVars() + ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{}) + successCount := getIntFromValue(ersCountsMap[mapKey]) + assert.EqualValues(t, countExpected, successCount) +} + +// getIntFromValue is a helper function to get an integer from the given value. +// If it is convertible to a float, then we round the number to the nearest integer. +// If the value is not numeric at all, we return 0. +func getIntFromValue(val any) int { + value := reflect.ValueOf(val) + if value.CanFloat() { + return int(math.Round(value.Float())) + } + if value.CanInt() { + return int(value.Int()) + } + return 0 +} + +// WaitForTabletType waits for the tablet to reach a certain type. +func WaitForTabletType(t *testing.T, tablet *cluster.Vttablet, expectedTabletType string) { + t.Helper() + err := tablet.VttabletProcess.WaitForTabletTypes([]string{expectedTabletType}) + require.NoError(t, err) +} + +// WaitForInstancePollSecondsExceededCount waits for 30 seconds and then queries api/aggregated-discovery-metrics. +// It expects to find minimum occurrence or exact count of `keyName` provided. 
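// Like getIntFromValue above, this helper has to deal with JSON numbers: json.Unmarshal into
// interface{} decodes every number as a float64, which is why the count below is asserted as
// a float64 rather than converted to an int.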
+func WaitForInstancePollSecondsExceededCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, keyName string, minCountExpected float64, enforceEquality bool) { + t.Helper() + var sinceInSeconds = 30 + duration := time.Duration(sinceInSeconds) + time.Sleep(duration * time.Second) + + statusCode, res, err := vtorcInstance.MakeAPICall("api/aggregated-discovery-metrics?seconds=" + strconv.Itoa(sinceInSeconds)) + if err != nil { + assert.Fail(t, "Not able to call api/aggregated-discovery-metrics") + } + if statusCode == 200 { + resultMap := make(map[string]any) + err := json.Unmarshal([]byte(res), &resultMap) + if err != nil { + assert.Fail(t, "invalid response from api/aggregated-discovery-metrics") + } + successCount := resultMap[keyName] + if iSuccessCount, ok := successCount.(float64); ok { + if enforceEquality { + assert.Equal(t, iSuccessCount, minCountExpected) + } else { + assert.GreaterOrEqual(t, iSuccessCount, minCountExpected) + } + return + } + } + assert.Fail(t, "invalid response from api/aggregated-discovery-metrics") +} + +// PrintVTOrcLogsOnFailure prints the VTOrc logs on failure of the test. +// This function is supposed to be called as the first defer command from the vtorc tests. +func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcessCluster) { + // If the test has not failed, then we don't need to print anything. + if !t.Failed() { + return + } + + log.Errorf("Printing VTOrc logs") + for _, vtorc := range clusterInstance.VtorcProcesses { + if vtorc == nil || vtorc.LogFileName == "" { + continue + } + filePath := path.Join(vtorc.LogDir, vtorc.LogFileName) + log.Errorf("Printing file - %s", filePath) + content, err := os.ReadFile(filePath) + if err != nil { + log.Errorf("Error while reading the file - %v", err) + } + log.Errorf("%s", string(content)) + } +} diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go index ba846ebc147..f3b2ce8d2b4 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go @@ -68,9 +68,14 @@ type EmergencyReparentOptions struct { // counters for Emergency Reparent Shard var ( - ersCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run") - ersSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded") - ersFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed") + // TODO(timvaillancourt): remove legacyERS* gauges in v19+. + legacyERSCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run") + legacyERSSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded") + legacyERSFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed") + + ersCounter = stats.NewCountersWithMultiLabels("emergency_reparent_counts", "Number of times Emergency Reparent Shard has been run", + []string{"Keyspace", "Shard", "Result"}, + ) ) // NewEmergencyReparenter returns a new EmergencyReparenter object, ready to @@ -98,26 +103,33 @@ func NewEmergencyReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, l // keyspace and shard. 
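// Every call records a success or failure sample in the emergency_reparent_counts counter,
// keyed by "<keyspace>.<shard>.<result>", plus an "EmergencyReparentShard" entry in
// reparent_shard_operation_timings; the legacy ers_* gauges are still bumped alongside them
// until they are removed (see the TODO above).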
func (erp *EmergencyReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts EmergencyReparentOptions) (*events.Reparent, error) { var err error + statsLabels := []string{keyspace, shard} + // First step is to lock the shard for the given operation, if not already locked if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil { var unlock func(*error) opts.lockAction = erp.getLockAction(opts.NewPrimaryAlias) ctx, unlock, err = erp.ts.LockShard(ctx, keyspace, shard, opts.lockAction) if err != nil { + ersCounter.Add(append(statsLabels, failureResult), 1) return nil, err } defer unlock(&err) } // dispatch success or failure of ERS + startTime := time.Now() ev := &events.Reparent{} defer func() { + reparentShardOpTimings.Add("EmergencyReparentShard", time.Since(startTime)) switch err { case nil: - ersSuccessCounter.Add(1) + legacyERSSuccessCounter.Add(1) + ersCounter.Add(append(statsLabels, successResult), 1) event.DispatchUpdate(ev, "finished EmergencyReparentShard") default: - ersFailureCounter.Add(1) + legacyERSFailureCounter.Add(1) + ersCounter.Add(append(statsLabels, failureResult), 1) event.DispatchUpdate(ev, "failed EmergencyReparentShard: "+err.Error()) } }() @@ -141,7 +153,7 @@ func (erp *EmergencyReparenter) getLockAction(newPrimaryAlias *topodatapb.Tablet func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, opts EmergencyReparentOptions) (err error) { // log the starting of the operation and increment the counter erp.logger.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard) - ersCounter.Add(1) + legacyERSCounter.Add(1) var ( stoppedReplicationSnapshot *replicationSnapshot diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go index af7b33a3151..40c3c295f30 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go @@ -2747,10 +2747,12 @@ func TestEmergencyReparenter_waitForAllRelayLogsToApply(t *testing.T) { } } -func TestEmergencyReparenterCounters(t *testing.T) { - ersCounter.Set(0) - ersSuccessCounter.Set(0) - ersFailureCounter.Set(0) +func TestEmergencyReparenterStats(t *testing.T) { + ersCounter.ResetAll() + legacyERSCounter.Reset() + legacyERSSuccessCounter.Reset() + legacyERSFailureCounter.Reset() + reparentShardOpTimings.Reset() emergencyReparentOps := EmergencyReparentOptions{} tmc := &testutil.TabletManagerClient{ @@ -2879,9 +2881,13 @@ func TestEmergencyReparenterCounters(t *testing.T) { require.NoError(t, err) // check the counter values - require.EqualValues(t, 1, ersCounter.Get()) - require.EqualValues(t, 1, ersSuccessCounter.Get()) - require.EqualValues(t, 0, ersFailureCounter.Get()) + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, ersCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 1, "EmergencyReparentShard": 1}, reparentShardOpTimings.Counts()) + + // check the legacy counter values + require.EqualValues(t, 1, legacyERSCounter.Get()) + require.EqualValues(t, 1, legacyERSSuccessCounter.Get()) + require.EqualValues(t, 0, legacyERSFailureCounter.Get()) // set emergencyReparentOps to request a non existent tablet emergencyReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{ @@ -2894,9 +2900,13 @@ func TestEmergencyReparenterCounters(t *testing.T) { require.Error(t, err) // check the counter values - require.EqualValues(t, 2, ersCounter.Get()) - 
require.EqualValues(t, 1, ersSuccessCounter.Get()) - require.EqualValues(t, 1, ersFailureCounter.Get()) + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, ersCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 2, "EmergencyReparentShard": 2}, reparentShardOpTimings.Counts()) + + // check the legacy counter values + require.EqualValues(t, 2, legacyERSCounter.Get()) + require.EqualValues(t, 1, legacyERSSuccessCounter.Get()) + require.EqualValues(t, 1, legacyERSFailureCounter.Get()) } func TestEmergencyReparenter_findMostAdvanced(t *testing.T) { diff --git a/go/vt/vtctl/reparentutil/planned_reparenter.go b/go/vt/vtctl/reparentutil/planned_reparenter.go index 4f46d234e25..31b222f9df5 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter.go @@ -26,6 +26,7 @@ import ( "vitess.io/vitess/go/event" "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/concurrency" "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/topo" @@ -39,6 +40,13 @@ import ( "vitess.io/vitess/go/vt/proto/vtrpc" ) +// counters for Planned Reparent Shard +var ( + prsCounter = stats.NewCountersWithMultiLabels("planned_reparent_counts", "Number of times Planned Reparent Shard has been run", + []string{"Keyspace", "Shard", "Result"}, + ) +) + // PlannedReparenter performs PlannedReparentShard operations. type PlannedReparenter struct { ts *topo.Server @@ -88,11 +96,14 @@ func NewPlannedReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, log // both the current and desired primary are reachable and in a good state. func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts PlannedReparentOptions) (*events.Reparent, error) { var err error + statsLabels := []string{keyspace, shard} + if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil { var unlock func(*error) opts.lockAction = pr.getLockAction(opts) ctx, unlock, err = pr.ts.LockShard(ctx, keyspace, shard, opts.lockAction) if err != nil { + prsCounter.Add(append(statsLabels, failureResult), 1) return nil, err } defer unlock(&err) @@ -101,18 +112,23 @@ func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, if opts.NewPrimaryAlias == nil && opts.AvoidPrimaryAlias == nil { shardInfo, err := pr.ts.GetShard(ctx, keyspace, shard) if err != nil { + prsCounter.Add(append(statsLabels, failureResult), 1) return nil, err } opts.AvoidPrimaryAlias = shardInfo.PrimaryAlias } + startTime := time.Now() ev := &events.Reparent{} defer func() { + reparentShardOpTimings.Add("PlannedReparentShard", time.Since(startTime)) switch err { case nil: + prsCounter.Add(append(statsLabels, successResult), 1) event.DispatchUpdate(ev, "finished PlannedReparentShard") default: + prsCounter.Add(append(statsLabels, failureResult), 1) event.DispatchUpdate(ev, "failed PlannedReparentShard: "+err.Error()) } }() diff --git a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go index 09900c4a624..b186f68b0f2 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go @@ -3719,3 +3719,294 @@ func AssertReparentEventsEqual(t *testing.T, expected *events.Reparent, actual * AssertReparentEventsEqualWithMessage(t, expected, actual, "") } + +// TestPlannedReparenter_verifyAllTabletsReachable tests the functionality of verifyAllTabletsReachable. 
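// Reachability is exercised entirely through the fake tablet manager client: each tablet's
// PrimaryStatus call either succeeds, fails, or is delayed past topo.RemoteOperationTimeout,
// which is what produces the "context deadline exceeded" case below.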
+func TestPlannedReparenter_verifyAllTabletsReachable(t *testing.T) { + tests := []struct { + name string + tmc tmclient.TabletManagerClient + tabletMap map[string]*topo.TabletInfo + remoteOpTime time.Duration + wantErr string + }{ + { + name: "Success", + tmc: &testutil.TabletManagerClient{ + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000201": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + }, + tabletMap: map[string]*topo.TabletInfo{ + "zone1-0000000100": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + }, + }, + "zone1-0000000200": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 200, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + "zone1-0000000201": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 201, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + }, + }, { + name: "Failure", + tmc: &testutil.TabletManagerClient{ + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Error: fmt.Errorf("primary status failed"), + }, + "zone1-0000000201": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + }, + tabletMap: map[string]*topo.TabletInfo{ + "zone1-0000000100": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + }, + }, + "zone1-0000000200": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 200, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + "zone1-0000000201": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 201, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + }, + wantErr: "primary status failed", + }, { + name: "Timeout", + tmc: &testutil.TabletManagerClient{ + PrimaryStatusDelays: map[string]time.Duration{ + "zone1-0000000100": 20 * time.Second, + }, + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000201": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + }, + remoteOpTime: 100 * time.Millisecond, + tabletMap: map[string]*topo.TabletInfo{ + "zone1-0000000100": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + }, + }, + "zone1-0000000200": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 200, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + "zone1-0000000201": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 201, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + }, + wantErr: "context deadline exceeded", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ts := memorytopo.NewServer(ctx, "zone1") + defer ts.Close() + + pr := 
&PlannedReparenter{ + ts: ts, + tmc: tt.tmc, + } + if tt.remoteOpTime != 0 { + oldTime := topo.RemoteOperationTimeout + topo.RemoteOperationTimeout = tt.remoteOpTime + defer func() { + topo.RemoteOperationTimeout = oldTime + }() + } + err := pr.verifyAllTabletsReachable(context.Background(), tt.tabletMap) + if tt.wantErr == "" { + require.NoError(t, err) + return + } + require.ErrorContains(t, err, tt.wantErr) + }) + } +} + +func TestPlannedReparenterStats(t *testing.T) { + prsCounter.ResetAll() + reparentShardOpTimings.Reset() + + tmc := &testutil.TabletManagerClient{ + PrimaryPositionResults: map[string]struct { + Position string + Error error + }{ + "zone1-0000000100": { + Position: "position1", + Error: nil, + }, + }, + PopulateReparentJournalResults: map[string]error{ + "zone1-0000000100": nil, + }, + SetReplicationSourceResults: map[string]error{ + "zone1-0000000101": nil, + }, + SetReadWriteResults: map[string]error{ + "zone1-0000000100": nil, + }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000101": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + } + shards := []*vtctldatapb.Shard{ + { + Keyspace: "testkeyspace", + Name: "-", + }, + } + tablets := []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + Keyspace: "testkeyspace", + Shard: "-", + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + Type: topodatapb.TabletType_REPLICA, + Keyspace: "testkeyspace", + Shard: "-", + }, + } + plannedReparentOps := PlannedReparentOptions{ + NewPrimaryAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + } + keyspace := "testkeyspace" + shard := "-" + ts := memorytopo.NewServer(context.Background(), "zone1") + + ctx := context.Background() + logger := logutil.NewMemoryLogger() + + testutil.AddShards(ctx, t, ts, shards...) + testutil.AddTablets(ctx, t, ts, &testutil.AddTabletOptions{ + AlsoSetShardPrimary: true, + SkipShardCreation: false, + }, tablets...) 
+ + prp := NewPlannedReparenter(ts, tmc, logger) + // run a successful prs + _, err := prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps) + require.NoError(t, err) + + // check the counter values + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, prsCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 1, "PlannedReparentShard": 1}, reparentShardOpTimings.Counts()) + + // set plannedReparentOps to request a non existent tablet + plannedReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{ + Cell: "bogus", + Uid: 100, + } + + // run a failing prs + _, err = prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps) + require.Error(t, err) + + // check the counter values + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, prsCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 2, "PlannedReparentShard": 2}, reparentShardOpTimings.Counts()) +} diff --git a/go/vt/vtctl/reparentutil/util.go b/go/vt/vtctl/reparentutil/util.go index f4cebc3dd7d..ac51a45c65f 100644 --- a/go/vt/vtctl/reparentutil/util.go +++ b/go/vt/vtctl/reparentutil/util.go @@ -23,6 +23,7 @@ import ( "time" "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/concurrency" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" @@ -38,6 +39,12 @@ import ( "vitess.io/vitess/go/vt/proto/vtrpc" ) +var ( + reparentShardOpTimings = stats.NewTimings("reparent_shard_operation_timings", "Timings of reparent shard operations", "Operation") + failureResult = "failure" + successResult = "success" +) + // ChooseNewPrimary finds a tablet that should become a primary after reparent. // The criteria for the new primary-elect are (preferably) to be in the same // cell as the current primary, and to be different from avoidPrimaryAlias. 
The From f973f540d1f54da4fde456530474b3dbebb2169e Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 12 Oct 2023 18:26:26 +0200 Subject: [PATCH 3/7] revert changes to go/test/endtoend Signed-off-by: Tim Vaillancourt --- go/test/endtoend/vtorc/general/main_test.go | 4 +- go/test/endtoend/vtorc/general/vtorc_test.go | 242 +++++++++++------- .../primaryfailure/primary_failure_test.go | 230 +---------------- go/test/endtoend/vtorc/utils/utils.go | 141 ---------- 4 files changed, 147 insertions(+), 470 deletions(-) diff --git a/go/test/endtoend/vtorc/general/main_test.go b/go/test/endtoend/vtorc/general/main_test.go index c52502d7c9b..018e6da21fa 100644 --- a/go/test/endtoend/vtorc/general/main_test.go +++ b/go/test/endtoend/vtorc/general/main_test.go @@ -32,8 +32,8 @@ func TestMain(m *testing.M) { var cellInfos []*utils.CellInfo cellInfos = append(cellInfos, &utils.CellInfo{ CellName: utils.Cell1, - NumReplicas: 4, - NumRdonly: 1, + NumReplicas: 6, + NumRdonly: 2, UIDBase: 100, }) diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 2f2ca2d16ba..560e351fb31 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -28,7 +28,6 @@ import ( "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/test/endtoend/vtorc/utils" "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/vtorc/logic" ) // Cases to test: @@ -73,8 +72,6 @@ func TestSingleKeyspace(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], keyspace.Name, shard0.Name, 1) } // Cases to test: @@ -91,17 +88,96 @@ func TestKeyspaceShard(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], keyspace.Name, shard0.Name, 1) } -// Cases to test: -// 1. make primary readonly, let vtorc repair -// 2. make replica ReadWrite, let vtorc repair -// 3. stop replication, let vtorc repair -// 4. setup replication from non-primary, let vtorc repair -// 5. make instance A replicates from B and B from A, wait for repair -func TestVTOrcRepairs(t *testing.T) { +// 3. make primary readonly, let orc repair +func TestPrimaryReadOnly(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1, "") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + + // Make the current primary database read-only. + _, err := utils.RunSQL(t, "set global read_only=ON", curPrimary, "") + require.NoError(t, err) + + // wait for repair + match := utils.WaitForReadOnlyValue(t, curPrimary, 0) + require.True(t, match) +} + +// 4. 
make replica ReadWrite, let orc repair +func TestReplicaReadWrite(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1, "") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + + var replica *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two tablets, so the "other" one must be the new primary + if tablet.Alias != curPrimary.Alias { + replica = tablet + break + } + } + // Make the replica database read-write. + _, err := utils.RunSQL(t, "set global read_only=OFF", replica, "") + require.NoError(t, err) + + // wait for repair + match := utils.WaitForReadOnlyValue(t, replica, 1) + require.True(t, match) +} + +// 5. stop replication, let orc repair +func TestStopReplication(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1, "") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + + // TODO(deepthi): we should not need to do this, the DB should be created automatically + _, err := curPrimary.VttabletProcess.QueryTablet(fmt.Sprintf("create database IF NOT EXISTS vt_%s", keyspace.Name), keyspace.Name, false) + require.NoError(t, err) + + var replica *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two tablets, so the "other" one must be the new primary + if tablet.Alias != curPrimary.Alias { + replica = tablet + break + } + } + require.NotNil(t, replica, "should be able to find a replica") + // use vtctlclient to stop replication + _, err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) + require.NoError(t, err) + + // check replication is setup correctly + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 15*time.Second) +} + +// 6. 
setup replication from non-primary, let orc repair +func TestReplicationFromOtherReplica(t *testing.T) { defer cluster.PanicHandler(t) utils.SetupVttabletsAndVtorc(t, clusterInfo, 3, 0, nil, cluster.VtorcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -112,9 +188,10 @@ func TestVTOrcRepairs(t *testing.T) { // find primary from topo curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") - vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) + + // TODO(deepthi): we should not need to do this, the DB should be created automatically + _, err := curPrimary.VttabletProcess.QueryTablet(fmt.Sprintf("create database IF NOT EXISTS vt_%s", keyspace.Name), keyspace.Name, false) + require.NoError(t, err) var replica, otherReplica *cluster.Vttablet for _, tablet := range shard0.Vttablets { @@ -133,80 +210,17 @@ func TestVTOrcRepairs(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - t.Run("PrimaryReadOnly", func(t *testing.T) { - // Make the current primary database read-only. - _, err := utils.RunSQL(t, "set global read_only=ON", curPrimary, "") - require.NoError(t, err) - - // wait for repair - match := utils.WaitForReadOnlyValue(t, curPrimary, 0) - require.True(t, match) - }) - - t.Run("ReplicaReadWrite", func(t *testing.T) { - // Make the replica database read-write. - _, err := utils.RunSQL(t, "set global read_only=OFF", replica, "") - require.NoError(t, err) - - // wait for repair - match := utils.WaitForReadOnlyValue(t, replica, 1) - require.True(t, match) - }) - - t.Run("StopReplication", func(t *testing.T) { - // use vtctlclient to stop replication - _, err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) - require.NoError(t, err) - - // check replication is setup correctly - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - - // Stop just the IO thread on the replica - _, err = utils.RunSQL(t, "STOP SLAVE IO_THREAD", replica, "") - require.NoError(t, err) - - // check replication is setup correctly - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - - // Stop just the SQL thread on the replica - _, err = utils.RunSQL(t, "STOP SLAVE SQL_THREAD", replica, "") - require.NoError(t, err) - - // check replication is setup correctly - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - }) - - t.Run("ReplicationFromOtherReplica", func(t *testing.T) { - // point replica at otherReplica - changeReplicationSourceCommand := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ - "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1; START SLAVE", utils.Hostname, otherReplica.MySQLPort) - _, err := utils.RunSQL(t, changeReplicationSourceCommand, replica, "") - require.NoError(t, err) - - // wait until the source port is set back correctly by vtorc - utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second) - - // check that writes succeed - utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, 
otherReplica}, 15*time.Second) - }) - - t.Run("CircularReplication", func(t *testing.T) { - // change the replication source on the primary - changeReplicationSourceCommands := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ - "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ - "START SLAVE;", replica.VttabletProcess.TabletHostname, replica.MySQLPort) - _, err := utils.RunSQL(t, changeReplicationSourceCommands, curPrimary, "") - require.NoError(t, err) + // point replica at otherReplica + changeReplicationSourceCommand := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ + "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1; START SLAVE", utils.Hostname, otherReplica.MySQLPort) + _, err = utils.RunSQL(t, changeReplicationSourceCommand, replica, "") + require.NoError(t, err) - // wait for curPrimary to reach stable state - time.Sleep(1 * time.Second) + // wait until the source port is set back correctly by vtorc + utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second) - // wait for repair - err = utils.WaitForReplicationToStop(t, curPrimary) - require.NoError(t, err) - // check that the writes still succeed - utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 10*time.Second) - }) + // check that writes succeed + utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) } func TestRepairAfterTER(t *testing.T) { @@ -243,6 +257,48 @@ func TestRepairAfterTER(t *testing.T) { utils.CheckReplication(t, clusterInfo, newPrimary, []*cluster.Vttablet{curPrimary}, 15*time.Second) } +// 7. make instance A replicates from B and B from A, wait for repair +func TestCircularReplication(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1, "") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, primary, "should have elected a primary") + + var replica *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two tablets, so the "other" one must be the new primary + if tablet.Alias != primary.Alias { + replica = tablet + break + } + } + + // check replication is setup correctly + utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{replica}, 15*time.Second) + + // change the replication source on the primary + changeReplicationSourceCommands := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ + "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ + "START SLAVE;", replica.VttabletProcess.TabletHostname, replica.MySQLPort) + _, err := utils.RunSQL(t, changeReplicationSourceCommands, primary, "") + require.NoError(t, err) + + // wait for primary to reach stable state + time.Sleep(1 * time.Second) + + // wait for repair + err = utils.WaitForReplicationToStop(t, primary) + require.NoError(t, err) + // check that the writes still succeed + utils.VerifyWritesSucceed(t, clusterInfo, primary, []*cluster.Vttablet{replica}, 10*time.Second) +} + // TestSemiSync tests that semi-sync is setup correctly by vtorc if it is incorrectly set func TestSemiSync(t *testing.T) { // stop any vtorc instance running due to a previous test. 
@@ -322,9 +378,6 @@ func TestVtorcWithPrs(t *testing.T) { // find primary from topo curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") - vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find any replica tablet other than the current primary var replica *cluster.Vttablet @@ -350,15 +403,6 @@ func TestVtorcWithPrs(t *testing.T) { // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) - - // Verify that VTOrc didn't run any other recovery - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0) - utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0) utils.VerifyWritesSucceed(t, clusterInfo, replica, shard0.Vttablets, 10*time.Second) } diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index 32c7babe33b..a922db9f010 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -17,8 +17,6 @@ limitations under the License. 
package primaryfailure import ( - "fmt" - "path" "testing" "time" @@ -42,70 +40,6 @@ func TestDownPrimary(t *testing.T) { // find primary from topo curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") - vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - - // find the replica and rdonly tablets - var replica, rdonly *cluster.Vttablet - for _, tablet := range shard0.Vttablets { - // we know we have only two replcia tablets, so the one not the primary must be the other replica - if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { - replica = tablet - } - if tablet.Type == "rdonly" { - rdonly = tablet - } - } - assert.NotNil(t, replica, "could not find replica tablet") - assert.NotNil(t, rdonly, "could not find rdonly tablet") - - // Start a cross-cell replica - crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) - - // check that the replication is setup correctly before we failover - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second) - // since all tablets are up and running, InstancePollSecondsExceeded should have `0` zero value - utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 0, true) - // Make the rdonly vttablet unavailable - err := rdonly.VttabletProcess.TearDown() - require.NoError(t, err) - err = rdonly.MysqlctlProcess.Stop() - require.NoError(t, err) - // We have bunch of Vttablets down. Therefore we expect at least 1 occurrence of InstancePollSecondsExceeded - utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 1, false) - // Make the current primary vttablet unavailable. - err = curPrimary.VttabletProcess.TearDown() - require.NoError(t, err) - err = curPrimary.MysqlctlProcess.Stop() - require.NoError(t, err) - defer func() { - // we remove the tablet from our global list - utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) - utils.PermanentlyRemoveVttablet(clusterInfo, rdonly) - }() - - // check that the replica gets promoted - utils.CheckPrimaryTablet(t, clusterInfo, replica, true) - - // also check that the replication is working correctly after failover - utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) - utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) -} - -// bring down primary before VTOrc has started, let vtorc repair. 
-func TestDownPrimaryBeforeVTOrc(t *testing.T) { - defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) - defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{}, 0, "none") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - curPrimary := shard0.Vttablets[0] - - // Promote the first tablet as the primary - err := clusterInfo.ClusterInstance.VtctlclientProcess.InitializeShard(keyspace.Name, shard0.Name, clusterInfo.ClusterInstance.Cell, curPrimary.TabletUID) - require.NoError(t, err) // find the replica and rdonly tablets var replica, rdonly *cluster.Vttablet @@ -124,140 +58,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) { // check that the replication is setup correctly before we failover utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica}, 10*time.Second) - // Make the current primary vttablet unavailable. - _ = curPrimary.VttabletProcess.TearDown() - err = curPrimary.MysqlctlProcess.Stop() - require.NoError(t, err) - - // Start a VTOrc instance - utils.StartVTOrcs(t, clusterInfo, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - }, 1) - - vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] - - defer func() { - // we remove the tablet from our global list - utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) - }() - - // check that the replica gets promoted - utils.CheckPrimaryTablet(t, clusterInfo, replica, true) - - // also check that the replication is working correctly after failover - utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) - utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) -} - -// delete the primary record and let vtorc repair. 
-func TestDeletedPrimaryTablet(t *testing.T) { - defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) - defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{}, 1, "none") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - // find primary from topo - curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) - assert.NotNil(t, curPrimary, "should have elected a primary") - vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - - // find the replica and rdonly tablets - var replica, rdonly *cluster.Vttablet - for _, tablet := range shard0.Vttablets { - // we know we have only two replcia tablets, so the one not the primary must be the other replica - if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { - replica = tablet - } - if tablet.Type == "rdonly" { - rdonly = tablet - } - } - assert.NotNil(t, replica, "could not find replica tablet") - assert.NotNil(t, rdonly, "could not find rdonly tablet") - - // check that the replication is setup correctly before we failover - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second) - - // Disable VTOrc recoveries - vtOrcProcess.DisableGlobalRecoveries(t) - // use vtctlclient to stop replication on the replica - _, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) - require.NoError(t, err) - // insert a write that is not available on the replica. - utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly}, 10*time.Second) - - // Make the current primary vttablet unavailable and delete its tablet record. - _ = curPrimary.VttabletProcess.TearDown() - err = curPrimary.MysqlctlProcess.Stop() - require.NoError(t, err) - // use vtctlclient to start replication on the replica back - _, err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StartReplication", replica.Alias) - require.NoError(t, err) - err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommand("DeleteTablets", "--allow-primary", curPrimary.Alias) - require.NoError(t, err) - // Enable VTOrc recoveries now - vtOrcProcess.EnableGlobalRecoveries(t) - - defer func() { - // we remove the tablet from our global list - utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) - }() - - // check that the replica gets promoted. Also verify that it has all the writes. - utils.CheckPrimaryTablet(t, clusterInfo, replica, true) - utils.CheckTabletUptoDate(t, clusterInfo, replica) - - // also check that the replication is working correctly after failover - utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryTabletDeletedRecoveryName, 1) -} - -// TestDeadPrimaryRecoversImmediately test Vtorc ability to recover immediately if primary is dead. -// Reason is, unlike other recoveries, in DeadPrimary we don't call DiscoverInstance since we know -// that primary is unreachable. This help us save few seconds depending on value of `RemoteOperationTimeout` flag. 
-func TestDeadPrimaryRecoversImmediately(t *testing.T) { - defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) - defer cluster.PanicHandler(t) - // We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test. - // If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source. - // If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test. - utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - }, 1, "semi_sync") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - // find primary from topo - curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) - assert.NotNil(t, curPrimary, "should have elected a primary") - vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - - // find the replica and rdonly tablets - var replica, rdonly *cluster.Vttablet - for _, tablet := range shard0.Vttablets { - // we know we have only two replcia tablets, so the one not the primary must be the other replica - if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { - replica = tablet - } - if tablet.Type == "rdonly" { - rdonly = tablet - } - } - assert.NotNil(t, replica, "could not find replica tablet") - assert.NotNil(t, rdonly, "could not find rdonly tablet") - - // Start a cross-cell replica - crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) - - // check that the replication is setup correctly before we failover - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second) - - // Make the current primary vttablet unavailable. - curPrimary.VttabletProcess.Kill() + // Make the current primary database unavailable. err := curPrimary.MysqlctlProcess.Stop() require.NoError(t, err) defer func() { @@ -268,34 +69,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) // also check that the replication is working correctly after failover - utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) - utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - - // Parse log file and find out how much time it took for DeadPrimary to recover. 
- logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName) - // log prefix printed at the end of analysis where we conclude we have DeadPrimary - t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery") - // log prefix printed at the end of recovery - t2 := extractTimeFromLog(t, logFile, "auditType:RecoverDeadPrimary") - curr := time.Now().Format("2006-01-02") - timeLayout := "2006-01-02 15:04:05.000000" - timeStr1 := fmt.Sprintf("%s %s", curr, t1) - timeStr2 := fmt.Sprintf("%s %s", curr, t2) - time1, err := time.Parse(timeLayout, timeStr1) - if err != nil { - t.Errorf("unable to parse time %s", err.Error()) - } - time2, err := time.Parse(timeLayout, timeStr2) - if err != nil { - t.Errorf("unable to parse time %s", err.Error()) - } - diff := time2.Sub(time1) - fmt.Printf("The difference between %s and %s is %v seconds.\n", t1, t2, diff.Seconds()) - // assert that it takes less than `remote_operation_timeout` to recover from `DeadPrimary` - // use the value provided in `remote_operation_timeout` flag to compare with. - // We are testing against 9.5 seconds to be safe and prevent flakiness. - assert.Less(t, diff.Seconds(), 9.5) + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) } // Failover should not be cross data centers, according to the configuration file diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 713f651203f..96075aef3c0 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -18,16 +18,12 @@ package utils import ( "context" - "encoding/json" "fmt" "io" - "math" "net/http" "os" "os/exec" "path" - "reflect" - "strconv" "strings" "testing" "time" @@ -919,140 +915,3 @@ func WaitForReadOnlyValue(t *testing.T, curPrimary *cluster.Vttablet, expectValu } return false } - -// WaitForSuccessfulRecoveryCount waits until the given recovery name's count of successful runs matches the count expected -func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, recoveryName string, countExpected int) { - t.Helper() - timeout := 15 * time.Second - startTime := time.Now() - for time.Since(startTime) < timeout { - vars := vtorcInstance.GetVars() - successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) - if successCount == countExpected { - return - } - time.Sleep(time.Second) - } - vars := vtorcInstance.GetVars() - successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) - assert.EqualValues(t, countExpected, successCount) -} - -// WaitForSuccessfulPRSCount waits until the given keyspace-shard's count of successful prs runs matches the count expected. 
-func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, keyspace, shard string, countExpected int) { - t.Helper() - timeout := 15 * time.Second - startTime := time.Now() - mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard) - for time.Since(startTime) < timeout { - vars := vtorcInstance.GetVars() - prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{}) - successCount := getIntFromValue(prsCountsMap[mapKey]) - if successCount == countExpected { - return - } - time.Sleep(time.Second) - } - vars := vtorcInstance.GetVars() - prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{}) - successCount := getIntFromValue(prsCountsMap[mapKey]) - assert.EqualValues(t, countExpected, successCount) -} - -// WaitForSuccessfulERSCount waits until the given keyspace-shard's count of successful ers runs matches the count expected. -func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, keyspace, shard string, countExpected int) { - t.Helper() - timeout := 15 * time.Second - startTime := time.Now() - mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard) - for time.Since(startTime) < timeout { - vars := vtorcInstance.GetVars() - ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{}) - successCount := getIntFromValue(ersCountsMap[mapKey]) - if successCount == countExpected { - return - } - time.Sleep(time.Second) - } - vars := vtorcInstance.GetVars() - ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{}) - successCount := getIntFromValue(ersCountsMap[mapKey]) - assert.EqualValues(t, countExpected, successCount) -} - -// getIntFromValue is a helper function to get an integer from the given value. -// If it is convertible to a float, then we round the number to the nearest integer. -// If the value is not numeric at all, we return 0. -func getIntFromValue(val any) int { - value := reflect.ValueOf(val) - if value.CanFloat() { - return int(math.Round(value.Float())) - } - if value.CanInt() { - return int(value.Int()) - } - return 0 -} - -// WaitForTabletType waits for the tablet to reach a certain type. -func WaitForTabletType(t *testing.T, tablet *cluster.Vttablet, expectedTabletType string) { - t.Helper() - err := tablet.VttabletProcess.WaitForTabletTypes([]string{expectedTabletType}) - require.NoError(t, err) -} - -// WaitForInstancePollSecondsExceededCount waits for 30 seconds and then queries api/aggregated-discovery-metrics. -// It expects to find minimum occurrence or exact count of `keyName` provided. 
-func WaitForInstancePollSecondsExceededCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, keyName string, minCountExpected float64, enforceEquality bool) { - t.Helper() - var sinceInSeconds = 30 - duration := time.Duration(sinceInSeconds) - time.Sleep(duration * time.Second) - - statusCode, res, err := vtorcInstance.MakeAPICall("api/aggregated-discovery-metrics?seconds=" + strconv.Itoa(sinceInSeconds)) - if err != nil { - assert.Fail(t, "Not able to call api/aggregated-discovery-metrics") - } - if statusCode == 200 { - resultMap := make(map[string]any) - err := json.Unmarshal([]byte(res), &resultMap) - if err != nil { - assert.Fail(t, "invalid response from api/aggregated-discovery-metrics") - } - successCount := resultMap[keyName] - if iSuccessCount, ok := successCount.(float64); ok { - if enforceEquality { - assert.Equal(t, iSuccessCount, minCountExpected) - } else { - assert.GreaterOrEqual(t, iSuccessCount, minCountExpected) - } - return - } - } - assert.Fail(t, "invalid response from api/aggregated-discovery-metrics") -} - -// PrintVTOrcLogsOnFailure prints the VTOrc logs on failure of the test. -// This function is supposed to be called as the first defer command from the vtorc tests. -func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcessCluster) { - // If the test has not failed, then we don't need to print anything. - if !t.Failed() { - return - } - - log.Errorf("Printing VTOrc logs") - for _, vtorc := range clusterInstance.VtorcProcesses { - if vtorc == nil || vtorc.LogFileName == "" { - continue - } - filePath := path.Join(vtorc.LogDir, vtorc.LogFileName) - log.Errorf("Printing file - %s", filePath) - content, err := os.ReadFile(filePath) - if err != nil { - log.Errorf("Error while reading the file - %v", err) - } - log.Errorf("%s", string(content)) - } -} From 76e23159767b8bd2f007fb0a94a578a31aa943b9 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 12 Oct 2023 19:06:37 +0200 Subject: [PATCH 4/7] remove unrelated+incompatible test Signed-off-by: Tim Vaillancourt --- .../planned_reparenter_flaky_test.go | 186 ------------------ 1 file changed, 186 deletions(-) diff --git a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go index b186f68b0f2..ca158f8c124 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go @@ -3720,192 +3720,6 @@ func AssertReparentEventsEqual(t *testing.T, expected *events.Reparent, actual * AssertReparentEventsEqualWithMessage(t, expected, actual, "") } -// TestPlannedReparenter_verifyAllTabletsReachable tests the functionality of verifyAllTabletsReachable. 
-func TestPlannedReparenter_verifyAllTabletsReachable(t *testing.T) { - tests := []struct { - name string - tmc tmclient.TabletManagerClient - tabletMap map[string]*topo.TabletInfo - remoteOpTime time.Duration - wantErr string - }{ - { - name: "Success", - tmc: &testutil.TabletManagerClient{ - PrimaryStatusResults: map[string]struct { - Status *replicationdatapb.PrimaryStatus - Error error - }{ - "zone1-0000000200": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - "zone1-0000000201": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - "zone1-0000000100": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - }, - }, - tabletMap: map[string]*topo.TabletInfo{ - "zone1-0000000100": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 100, - }, - Type: topodatapb.TabletType_PRIMARY, - }, - }, - "zone1-0000000200": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 200, - }, - Type: topodatapb.TabletType_REPLICA, - }, - }, - "zone1-0000000201": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 201, - }, - Type: topodatapb.TabletType_REPLICA, - }, - }, - }, - }, { - name: "Failure", - tmc: &testutil.TabletManagerClient{ - PrimaryStatusResults: map[string]struct { - Status *replicationdatapb.PrimaryStatus - Error error - }{ - "zone1-0000000200": { - Error: fmt.Errorf("primary status failed"), - }, - "zone1-0000000201": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - "zone1-0000000100": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - }, - }, - tabletMap: map[string]*topo.TabletInfo{ - "zone1-0000000100": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 100, - }, - Type: topodatapb.TabletType_PRIMARY, - }, - }, - "zone1-0000000200": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 200, - }, - Type: topodatapb.TabletType_REPLICA, - }, - }, - "zone1-0000000201": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 201, - }, - Type: topodatapb.TabletType_REPLICA, - }, - }, - }, - wantErr: "primary status failed", - }, { - name: "Timeout", - tmc: &testutil.TabletManagerClient{ - PrimaryStatusDelays: map[string]time.Duration{ - "zone1-0000000100": 20 * time.Second, - }, - PrimaryStatusResults: map[string]struct { - Status *replicationdatapb.PrimaryStatus - Error error - }{ - "zone1-0000000200": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - "zone1-0000000201": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - "zone1-0000000100": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - }, - }, - remoteOpTime: 100 * time.Millisecond, - tabletMap: map[string]*topo.TabletInfo{ - "zone1-0000000100": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 100, - }, - Type: topodatapb.TabletType_PRIMARY, - }, - }, - "zone1-0000000200": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 200, - }, - Type: topodatapb.TabletType_REPLICA, - }, - }, - "zone1-0000000201": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 201, - }, - Type: topodatapb.TabletType_REPLICA, - }, - }, - }, - wantErr: "context deadline exceeded", - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ts := memorytopo.NewServer(ctx, "zone1") - defer ts.Close() - - pr := 
&PlannedReparenter{ - ts: ts, - tmc: tt.tmc, - } - if tt.remoteOpTime != 0 { - oldTime := topo.RemoteOperationTimeout - topo.RemoteOperationTimeout = tt.remoteOpTime - defer func() { - topo.RemoteOperationTimeout = oldTime - }() - } - err := pr.verifyAllTabletsReachable(context.Background(), tt.tabletMap) - if tt.wantErr == "" { - require.NoError(t, err) - return - } - require.ErrorContains(t, err, tt.wantErr) - }) - } -} - func TestPlannedReparenterStats(t *testing.T) { prsCounter.ResetAll() reparentShardOpTimings.Reset() From 7b9419f0569a47fc29d54230bb0c8f05370fbe53 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 12 Oct 2023 19:08:32 +0200 Subject: [PATCH 5/7] Fix memorytopo.NewServer(...) for v14 Signed-off-by: Tim Vaillancourt --- go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go index ca158f8c124..fb0087e545c 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go @@ -3790,7 +3790,7 @@ func TestPlannedReparenterStats(t *testing.T) { } keyspace := "testkeyspace" shard := "-" - ts := memorytopo.NewServer(context.Background(), "zone1") + ts := memorytopo.NewServer("zone1") ctx := context.Background() logger := logutil.NewMemoryLogger() From 0e3466bf954b409ad41f8ae07184d69f4a29475f Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 12 Oct 2023 20:00:38 +0200 Subject: [PATCH 6/7] remove non-existant "PrimaryStatusResults" field Signed-off-by: Tim Vaillancourt --- .../reparentutil/planned_reparenter_flaky_test.go | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go index fb0087e545c..ec20eb19352 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go @@ -3743,18 +3743,6 @@ func TestPlannedReparenterStats(t *testing.T) { SetReadWriteResults: map[string]error{ "zone1-0000000100": nil, }, - // This is only needed to verify reachability, so empty results are fine. 
- PrimaryStatusResults: map[string]struct { - Status *replicationdatapb.PrimaryStatus - Error error - }{ - "zone1-0000000101": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - "zone1-0000000100": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - }, } shards := []*vtctldatapb.Shard{ { From 9299a03d4a32901d5d5194a03f2b89de3f51f7ff Mon Sep 17 00:00:00 2001 From: Rohit Nayak <57520317+rohit-nayak-ps@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:23:13 +0200 Subject: [PATCH 7/7] Remove FOSSA Test from CI until we can do it in a secure way (#14119) Signed-off-by: Rohit Nayak --- .github/workflows/static_checks_etc.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/static_checks_etc.yml b/.github/workflows/static_checks_etc.yml index 287f72686fc..2e7e6f585db 100644 --- a/.github/workflows/static_checks_etc.yml +++ b/.github/workflows/static_checks_etc.yml @@ -25,12 +25,6 @@ jobs: if: steps.skip-workflow.outputs.skip-workflow == 'false' uses: actions/checkout@v3 - - name: Run FOSSA scan and upload build data - if: steps.skip-workflow.outputs.skip-workflow == 'false' - uses: fossa-contrib/fossa-action@v1 - with: - fossa-api-key: 76d7483ea206d530d9452e44bffe7ba8 - - name: Check for changes in Go files if: steps.skip-workflow.outputs.skip-workflow == 'false' uses: frouioui/paths-filter@main @@ -211,4 +205,4 @@ jobs: echo 'We wish to maintain a consistent changelog directory, please run `go run ./go/tools/releases/releases.go`, commit and push again.' echo 'Running `go run ./go/tools/releases/releases.go` on CI yields the following changes:' echo "$output" - echo "" \ No newline at end of file + echo ""
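
Note on the final patch: the FOSSA step is removed above because the workflow hard-coded the fossa-api-key value. If the scan is later reinstated "in a secure way", one possible shape — illustrative only, not part of this patch series — is to keep the same fossa-contrib/fossa-action@v1 step but read the key from an encrypted Actions secret. The secret name FOSSA_API_KEY below is an assumption; any configured repository secret would do.

      - name: Run FOSSA scan and upload build data
        if: steps.skip-workflow.outputs.skip-workflow == 'false'
        uses: fossa-contrib/fossa-action@v1
        with:
          # Hypothetical secret name: the key lives in the repository's encrypted
          # Actions secrets instead of being committed to the workflow file.
          fossa-api-key: ${{ secrets.FOSSA_API_KEY }}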