From e7fee16236a41958decb14c2557f64687c386090 Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Tue, 30 Aug 2022 16:56:56 +0530 Subject: [PATCH 1/7] refactor: refactor vtorc tests to run as a single test with sub-tests (#11108) Signed-off-by: Manan Gupta Signed-off-by: Manan Gupta --- go/test/endtoend/vtorc/general/main_test.go | 4 +- go/test/endtoend/vtorc/general/vtorc_test.go | 222 +++++++------------ 2 files changed, 81 insertions(+), 145 deletions(-) diff --git a/go/test/endtoend/vtorc/general/main_test.go b/go/test/endtoend/vtorc/general/main_test.go index 018e6da21fa..c52502d7c9b 100644 --- a/go/test/endtoend/vtorc/general/main_test.go +++ b/go/test/endtoend/vtorc/general/main_test.go @@ -32,8 +32,8 @@ func TestMain(m *testing.M) { var cellInfos []*utils.CellInfo cellInfos = append(cellInfos, &utils.CellInfo{ CellName: utils.Cell1, - NumReplicas: 6, - NumRdonly: 2, + NumReplicas: 4, + NumRdonly: 1, UIDBase: 100, }) diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 560e351fb31..f2509713208 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -90,94 +90,13 @@ func TestKeyspaceShard(t *testing.T) { utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) } -// 3. make primary readonly, let orc repair -func TestPrimaryReadOnly(t *testing.T) { - defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - }, 1, "") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - - // find primary from topo - curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) - assert.NotNil(t, curPrimary, "should have elected a primary") - - // Make the current primary database read-only. - _, err := utils.RunSQL(t, "set global read_only=ON", curPrimary, "") - require.NoError(t, err) - - // wait for repair - match := utils.WaitForReadOnlyValue(t, curPrimary, 0) - require.True(t, match) -} - -// 4. make replica ReadWrite, let orc repair -func TestReplicaReadWrite(t *testing.T) { - defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - }, 1, "") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - - // find primary from topo - curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) - assert.NotNil(t, curPrimary, "should have elected a primary") - - var replica *cluster.Vttablet - for _, tablet := range shard0.Vttablets { - // we know we have only two tablets, so the "other" one must be the new primary - if tablet.Alias != curPrimary.Alias { - replica = tablet - break - } - } - // Make the replica database read-write. - _, err := utils.RunSQL(t, "set global read_only=OFF", replica, "") - require.NoError(t, err) - - // wait for repair - match := utils.WaitForReadOnlyValue(t, replica, 1) - require.True(t, match) -} - -// 5. 
stop replication, let orc repair -func TestStopReplication(t *testing.T) { - defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - }, 1, "") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - - // find primary from topo - curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) - assert.NotNil(t, curPrimary, "should have elected a primary") - - // TODO(deepthi): we should not need to do this, the DB should be created automatically - _, err := curPrimary.VttabletProcess.QueryTablet(fmt.Sprintf("create database IF NOT EXISTS vt_%s", keyspace.Name), keyspace.Name, false) - require.NoError(t, err) - - var replica *cluster.Vttablet - for _, tablet := range shard0.Vttablets { - // we know we have only two tablets, so the "other" one must be the new primary - if tablet.Alias != curPrimary.Alias { - replica = tablet - break - } - } - require.NotNil(t, replica, "should be able to find a replica") - // use vtctlclient to stop replication - _, err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) - require.NoError(t, err) - - // check replication is setup correctly - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 15*time.Second) -} - -// 6. setup replication from non-primary, let orc repair -func TestReplicationFromOtherReplica(t *testing.T) { +// Cases to test: +// 1. make primary readonly, let vtorc repair +// 2. make replica ReadWrite, let vtorc repair +// 3. stop replication, let vtorc repair +// 4. setup replication from non-primary, let vtorc repair +// 5. make instance A replicates from B and B from A, wait for repair +func TestVTOrcRepairs(t *testing.T) { defer cluster.PanicHandler(t) utils.SetupVttabletsAndVtorc(t, clusterInfo, 3, 0, nil, cluster.VtorcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -189,10 +108,6 @@ func TestReplicationFromOtherReplica(t *testing.T) { curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") - // TODO(deepthi): we should not need to do this, the DB should be created automatically - _, err := curPrimary.VttabletProcess.QueryTablet(fmt.Sprintf("create database IF NOT EXISTS vt_%s", keyspace.Name), keyspace.Name, false) - require.NoError(t, err) - var replica, otherReplica *cluster.Vttablet for _, tablet := range shard0.Vttablets { // we know we have only two tablets, so the "other" one must be the new primary @@ -210,17 +125,80 @@ func TestReplicationFromOtherReplica(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - // point replica at otherReplica - changeReplicationSourceCommand := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ - "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1; START SLAVE", utils.Hostname, otherReplica.MySQLPort) - _, err = utils.RunSQL(t, changeReplicationSourceCommand, replica, "") - require.NoError(t, err) + t.Run("PrimaryReadOnly", func(t *testing.T) { + // Make the current primary database read-only. 
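		// VTOrc is expected to notice the read-only primary and make it writable again;
		// WaitForReadOnlyValue below waits until the primary's read_only value is back to 0 (writable).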
+ _, err := utils.RunSQL(t, "set global read_only=ON", curPrimary, "") + require.NoError(t, err) + + // wait for repair + match := utils.WaitForReadOnlyValue(t, curPrimary, 0) + require.True(t, match) + }) + + t.Run("ReplicaReadWrite", func(t *testing.T) { + // Make the replica database read-write. + _, err := utils.RunSQL(t, "set global read_only=OFF", replica, "") + require.NoError(t, err) + + // wait for repair + match := utils.WaitForReadOnlyValue(t, replica, 1) + require.True(t, match) + }) + + t.Run("StopReplication", func(t *testing.T) { + // use vtctlclient to stop replication + _, err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) + require.NoError(t, err) + + // check replication is setup correctly + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) + + // Stop just the IO thread on the replica + _, err = utils.RunSQL(t, "STOP SLAVE IO_THREAD", replica, "") + require.NoError(t, err) + + // check replication is setup correctly + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) + + // Stop just the SQL thread on the replica + _, err = utils.RunSQL(t, "STOP SLAVE SQL_THREAD", replica, "") + require.NoError(t, err) + + // check replication is setup correctly + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) + }) + + t.Run("ReplicationFromOtherReplica", func(t *testing.T) { + // point replica at otherReplica + changeReplicationSourceCommand := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ + "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1; START SLAVE", utils.Hostname, otherReplica.MySQLPort) + _, err := utils.RunSQL(t, changeReplicationSourceCommand, replica, "") + require.NoError(t, err) + + // wait until the source port is set back correctly by vtorc + utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second) + + // check that writes succeed + utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) + }) - // wait until the source port is set back correctly by vtorc - utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second) + t.Run("CircularReplication", func(t *testing.T) { + // change the replication source on the primary + changeReplicationSourceCommands := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ + "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ + "START SLAVE;", replica.VttabletProcess.TabletHostname, replica.MySQLPort) + _, err := utils.RunSQL(t, changeReplicationSourceCommands, curPrimary, "") + require.NoError(t, err) + + // wait for curPrimary to reach stable state + time.Sleep(1 * time.Second) - // check that writes succeed - utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) + // wait for repair + err = utils.WaitForReplicationToStop(t, curPrimary) + require.NoError(t, err) + // check that the writes still succeed + utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 10*time.Second) + }) } func TestRepairAfterTER(t *testing.T) { @@ -257,48 +235,6 @@ func TestRepairAfterTER(t *testing.T) { utils.CheckReplication(t, clusterInfo, newPrimary, []*cluster.Vttablet{curPrimary}, 15*time.Second) } -// 7. 
make instance A replicates from B and B from A, wait for repair -func TestCircularReplication(t *testing.T) { - defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - }, 1, "") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - - // find primary from topo - primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) - assert.NotNil(t, primary, "should have elected a primary") - - var replica *cluster.Vttablet - for _, tablet := range shard0.Vttablets { - // we know we have only two tablets, so the "other" one must be the new primary - if tablet.Alias != primary.Alias { - replica = tablet - break - } - } - - // check replication is setup correctly - utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{replica}, 15*time.Second) - - // change the replication source on the primary - changeReplicationSourceCommands := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ - "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ - "START SLAVE;", replica.VttabletProcess.TabletHostname, replica.MySQLPort) - _, err := utils.RunSQL(t, changeReplicationSourceCommands, primary, "") - require.NoError(t, err) - - // wait for primary to reach stable state - time.Sleep(1 * time.Second) - - // wait for repair - err = utils.WaitForReplicationToStop(t, primary) - require.NoError(t, err) - // check that the writes still succeed - utils.VerifyWritesSucceed(t, clusterInfo, primary, []*cluster.Vttablet{replica}, 10*time.Second) -} - // TestSemiSync tests that semi-sync is setup correctly by vtorc if it is incorrectly set func TestSemiSync(t *testing.T) { // stop any vtorc instance running due to a previous test. 
From 377eb2051d130b0227edd1edff5cc31e79d17d98 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 28 Sep 2023 11:13:27 +0200 Subject: [PATCH 2/7] `vtctld`/`vtorc`: improve reparenting stats (#13723) Signed-off-by: Tim Vaillancourt Signed-off-by: Manan Gupta Co-authored-by: Manan Gupta --- go/stats/timings.go | 4 +- go/test/endtoend/vtorc/general/vtorc_test.go | 20 ++ .../primaryfailure/primary_failure_test.go | 230 +++++++++++++- go/test/endtoend/vtorc/utils/utils.go | 141 +++++++++ .../reparentutil/emergency_reparenter.go | 24 +- .../reparentutil/emergency_reparenter_test.go | 30 +- .../vtctl/reparentutil/planned_reparenter.go | 16 + .../planned_reparenter_flaky_test.go | 291 ++++++++++++++++++ go/vt/vtctl/reparentutil/util.go | 7 + 9 files changed, 744 insertions(+), 19 deletions(-) diff --git a/go/stats/timings.go b/go/stats/timings.go index 9048bedee11..cd51ec77a66 100644 --- a/go/stats/timings.go +++ b/go/stats/timings.go @@ -62,10 +62,12 @@ func NewTimings(name, help, label string, categories ...string) *Timings { return t } -// Reset will clear histograms: used during testing +// Reset will clear histograms and counters: used during testing func (t *Timings) Reset() { t.mu.RLock() t.histograms = make(map[string]*Histogram) + t.totalCount.Set(0) + t.totalTime.Set(0) t.mu.RUnlock() } diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index f2509713208..2f2ca2d16ba 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -28,6 +28,7 @@ import ( "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/test/endtoend/vtorc/utils" "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/vtorc/logic" ) // Cases to test: @@ -72,6 +73,8 @@ func TestSingleKeyspace(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], keyspace.Name, shard0.Name, 1) } // Cases to test: @@ -88,6 +91,8 @@ func TestKeyspaceShard(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], keyspace.Name, shard0.Name, 1) } // Cases to test: @@ -107,6 +112,9 @@ func TestVTOrcRepairs(t *testing.T) { // find primary from topo curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") + vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) var replica, otherReplica *cluster.Vttablet for _, tablet := range shard0.Vttablets { @@ -314,6 +322,9 @@ func TestVtorcWithPrs(t *testing.T) { // find primary from topo curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") + vtOrcProcess := 
clusterInfo.ClusterInstance.VtorcProcesses[0] + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find any replica tablet other than the current primary var replica *cluster.Vttablet @@ -339,6 +350,15 @@ func TestVtorcWithPrs(t *testing.T) { // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + + // Verify that VTOrc didn't run any other recovery + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0) utils.VerifyWritesSucceed(t, clusterInfo, replica, shard0.Vttablets, 10*time.Second) } diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index a922db9f010..32c7babe33b 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -17,6 +17,8 @@ limitations under the License. package primaryfailure import ( + "fmt" + "path" "testing" "time" @@ -40,6 +42,70 @@ func TestDownPrimary(t *testing.T) { // find primary from topo curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") + vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) + + // find the replica and rdonly tablets + var replica, rdonly *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two replcia tablets, so the one not the primary must be the other replica + if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { + replica = tablet + } + if tablet.Type == "rdonly" { + rdonly = tablet + } + } + assert.NotNil(t, replica, "could not find replica tablet") + assert.NotNil(t, rdonly, "could not find rdonly tablet") + + // Start a cross-cell replica + crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) + + // check that the replication is setup correctly before we failover + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second) + // since all tablets are up and running, InstancePollSecondsExceeded should have `0` zero value + utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 0, true) + // Make the rdonly vttablet unavailable + err := rdonly.VttabletProcess.TearDown() + require.NoError(t, err) + err = rdonly.MysqlctlProcess.Stop() + require.NoError(t, err) + // We have bunch of Vttablets down. 
Therefore we expect at least 1 occurrence of InstancePollSecondsExceeded + utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 1, false) + // Make the current primary vttablet unavailable. + err = curPrimary.VttabletProcess.TearDown() + require.NoError(t, err) + err = curPrimary.MysqlctlProcess.Stop() + require.NoError(t, err) + defer func() { + // we remove the tablet from our global list + utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) + utils.PermanentlyRemoveVttablet(clusterInfo, rdonly) + }() + + // check that the replica gets promoted + utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + + // also check that the replication is working correctly after failover + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) +} + +// bring down primary before VTOrc has started, let vtorc repair. +func TestDownPrimaryBeforeVTOrc(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{}, 0, "none") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + curPrimary := shard0.Vttablets[0] + + // Promote the first tablet as the primary + err := clusterInfo.ClusterInstance.VtctlclientProcess.InitializeShard(keyspace.Name, shard0.Name, clusterInfo.ClusterInstance.Cell, curPrimary.TabletUID) + require.NoError(t, err) // find the replica and rdonly tablets var replica, rdonly *cluster.Vttablet @@ -58,7 +124,140 @@ func TestDownPrimary(t *testing.T) { // check that the replication is setup correctly before we failover utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica}, 10*time.Second) - // Make the current primary database unavailable. + // Make the current primary vttablet unavailable. + _ = curPrimary.VttabletProcess.TearDown() + err = curPrimary.MysqlctlProcess.Stop() + require.NoError(t, err) + + // Start a VTOrc instance + utils.StartVTOrcs(t, clusterInfo, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1) + + vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] + + defer func() { + // we remove the tablet from our global list + utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) + }() + + // check that the replica gets promoted + utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + + // also check that the replication is working correctly after failover + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) +} + +// delete the primary record and let vtorc repair. 
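// The replica is deliberately left one write behind (replication is stopped on it while a
// write lands on the primary and the rdonly) before the primary's tablet record is deleted,
// so the recovery must also bring the promoted replica fully up to date; CheckTabletUptoDate
// below verifies exactly that.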
+func TestDeletedPrimaryTablet(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{}, 1, "none") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) + + // find the replica and rdonly tablets + var replica, rdonly *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two replcia tablets, so the one not the primary must be the other replica + if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { + replica = tablet + } + if tablet.Type == "rdonly" { + rdonly = tablet + } + } + assert.NotNil(t, replica, "could not find replica tablet") + assert.NotNil(t, rdonly, "could not find rdonly tablet") + + // check that the replication is setup correctly before we failover + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second) + + // Disable VTOrc recoveries + vtOrcProcess.DisableGlobalRecoveries(t) + // use vtctlclient to stop replication on the replica + _, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) + require.NoError(t, err) + // insert a write that is not available on the replica. + utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly}, 10*time.Second) + + // Make the current primary vttablet unavailable and delete its tablet record. + _ = curPrimary.VttabletProcess.TearDown() + err = curPrimary.MysqlctlProcess.Stop() + require.NoError(t, err) + // use vtctlclient to start replication on the replica back + _, err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StartReplication", replica.Alias) + require.NoError(t, err) + err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommand("DeleteTablets", "--allow-primary", curPrimary.Alias) + require.NoError(t, err) + // Enable VTOrc recoveries now + vtOrcProcess.EnableGlobalRecoveries(t) + + defer func() { + // we remove the tablet from our global list + utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) + }() + + // check that the replica gets promoted. Also verify that it has all the writes. + utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + utils.CheckTabletUptoDate(t, clusterInfo, replica) + + // also check that the replication is working correctly after failover + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryTabletDeletedRecoveryName, 1) +} + +// TestDeadPrimaryRecoversImmediately test Vtorc ability to recover immediately if primary is dead. +// Reason is, unlike other recoveries, in DeadPrimary we don't call DiscoverInstance since we know +// that primary is unreachable. This help us save few seconds depending on value of `RemoteOperationTimeout` flag. 
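The timing assertions in this test rely on an extractTimeFromLog helper that is not shown in
this patch. A minimal sketch of what such a helper could look like, assuming glog-style log
lines (for example "I1012 18:26:26.123456 ...") where the second whitespace-separated field
is the time of day, and assuming the os and strings packages are imported:

// extractTimeFromLog returns the time-of-day field of the first log line that contains the
// given substring. Hypothetical sketch; the real helper lives elsewhere in this test package.
func extractTimeFromLog(t *testing.T, logFile string, substring string) string {
	content, err := os.ReadFile(logFile)
	require.NoError(t, err)
	for _, line := range strings.Split(string(content), "\n") {
		if strings.Contains(line, substring) {
			fields := strings.Fields(line)
			if len(fields) >= 2 {
				return fields[1]
			}
		}
	}
	t.Fatalf("could not find %q in %s", substring, logFile)
	return ""
}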
+func TestDeadPrimaryRecoversImmediately(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) + defer cluster.PanicHandler(t) + // We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test. + // If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source. + // If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test. + utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1, "semi_sync") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) + + // find the replica and rdonly tablets + var replica, rdonly *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two replcia tablets, so the one not the primary must be the other replica + if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { + replica = tablet + } + if tablet.Type == "rdonly" { + rdonly = tablet + } + } + assert.NotNil(t, replica, "could not find replica tablet") + assert.NotNil(t, rdonly, "could not find rdonly tablet") + + // Start a cross-cell replica + crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) + + // check that the replication is setup correctly before we failover + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second) + + // Make the current primary vttablet unavailable. + curPrimary.VttabletProcess.Kill() err := curPrimary.MysqlctlProcess.Stop() require.NoError(t, err) defer func() { @@ -69,7 +268,34 @@ func TestDownPrimary(t *testing.T) { // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) // also check that the replication is working correctly after failover - utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) + + // Parse log file and find out how much time it took for DeadPrimary to recover. 
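	// Note: only the time of day is extracted from each matching log line, so the current
	// date is prepended before both stamps are parsed with the "2006-01-02 15:04:05.000000" layout.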
+ logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName) + // log prefix printed at the end of analysis where we conclude we have DeadPrimary + t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery") + // log prefix printed at the end of recovery + t2 := extractTimeFromLog(t, logFile, "auditType:RecoverDeadPrimary") + curr := time.Now().Format("2006-01-02") + timeLayout := "2006-01-02 15:04:05.000000" + timeStr1 := fmt.Sprintf("%s %s", curr, t1) + timeStr2 := fmt.Sprintf("%s %s", curr, t2) + time1, err := time.Parse(timeLayout, timeStr1) + if err != nil { + t.Errorf("unable to parse time %s", err.Error()) + } + time2, err := time.Parse(timeLayout, timeStr2) + if err != nil { + t.Errorf("unable to parse time %s", err.Error()) + } + diff := time2.Sub(time1) + fmt.Printf("The difference between %s and %s is %v seconds.\n", t1, t2, diff.Seconds()) + // assert that it takes less than `remote_operation_timeout` to recover from `DeadPrimary` + // use the value provided in `remote_operation_timeout` flag to compare with. + // We are testing against 9.5 seconds to be safe and prevent flakiness. + assert.Less(t, diff.Seconds(), 9.5) } // Failover should not be cross data centers, according to the configuration file diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 96075aef3c0..713f651203f 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -18,12 +18,16 @@ package utils import ( "context" + "encoding/json" "fmt" "io" + "math" "net/http" "os" "os/exec" "path" + "reflect" + "strconv" "strings" "testing" "time" @@ -915,3 +919,140 @@ func WaitForReadOnlyValue(t *testing.T, curPrimary *cluster.Vttablet, expectValu } return false } + +// WaitForSuccessfulRecoveryCount waits until the given recovery name's count of successful runs matches the count expected +func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, recoveryName string, countExpected int) { + t.Helper() + timeout := 15 * time.Second + startTime := time.Now() + for time.Since(startTime) < timeout { + vars := vtorcInstance.GetVars() + successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) + successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) + if successCount == countExpected { + return + } + time.Sleep(time.Second) + } + vars := vtorcInstance.GetVars() + successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) + successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) + assert.EqualValues(t, countExpected, successCount) +} + +// WaitForSuccessfulPRSCount waits until the given keyspace-shard's count of successful prs runs matches the count expected. 
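// The key polled here mirrors how the planned_reparent_counts multi-label counter added in
// this patch flattens its Keyspace/Shard/Result labels, e.g. a successful PRS for keyspace
// "ks" and shard "-80" is counted under the key "ks.-80.success".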
+func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, keyspace, shard string, countExpected int) { + t.Helper() + timeout := 15 * time.Second + startTime := time.Now() + mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard) + for time.Since(startTime) < timeout { + vars := vtorcInstance.GetVars() + prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{}) + successCount := getIntFromValue(prsCountsMap[mapKey]) + if successCount == countExpected { + return + } + time.Sleep(time.Second) + } + vars := vtorcInstance.GetVars() + prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{}) + successCount := getIntFromValue(prsCountsMap[mapKey]) + assert.EqualValues(t, countExpected, successCount) +} + +// WaitForSuccessfulERSCount waits until the given keyspace-shard's count of successful ers runs matches the count expected. +func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, keyspace, shard string, countExpected int) { + t.Helper() + timeout := 15 * time.Second + startTime := time.Now() + mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard) + for time.Since(startTime) < timeout { + vars := vtorcInstance.GetVars() + ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{}) + successCount := getIntFromValue(ersCountsMap[mapKey]) + if successCount == countExpected { + return + } + time.Sleep(time.Second) + } + vars := vtorcInstance.GetVars() + ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{}) + successCount := getIntFromValue(ersCountsMap[mapKey]) + assert.EqualValues(t, countExpected, successCount) +} + +// getIntFromValue is a helper function to get an integer from the given value. +// If it is convertible to a float, then we round the number to the nearest integer. +// If the value is not numeric at all, we return 0. +func getIntFromValue(val any) int { + value := reflect.ValueOf(val) + if value.CanFloat() { + return int(math.Round(value.Float())) + } + if value.CanInt() { + return int(value.Int()) + } + return 0 +} + +// WaitForTabletType waits for the tablet to reach a certain type. +func WaitForTabletType(t *testing.T, tablet *cluster.Vttablet, expectedTabletType string) { + t.Helper() + err := tablet.VttabletProcess.WaitForTabletTypes([]string{expectedTabletType}) + require.NoError(t, err) +} + +// WaitForInstancePollSecondsExceededCount waits for 30 seconds and then queries api/aggregated-discovery-metrics. +// It expects to find minimum occurrence or exact count of `keyName` provided. 
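// Like getIntFromValue above, this helper has to deal with JSON numbers: json.Unmarshal into
// interface{} decodes every number as a float64, which is why the count below is asserted as
// a float64 rather than converted to an int.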
+func WaitForInstancePollSecondsExceededCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, keyName string, minCountExpected float64, enforceEquality bool) { + t.Helper() + var sinceInSeconds = 30 + duration := time.Duration(sinceInSeconds) + time.Sleep(duration * time.Second) + + statusCode, res, err := vtorcInstance.MakeAPICall("api/aggregated-discovery-metrics?seconds=" + strconv.Itoa(sinceInSeconds)) + if err != nil { + assert.Fail(t, "Not able to call api/aggregated-discovery-metrics") + } + if statusCode == 200 { + resultMap := make(map[string]any) + err := json.Unmarshal([]byte(res), &resultMap) + if err != nil { + assert.Fail(t, "invalid response from api/aggregated-discovery-metrics") + } + successCount := resultMap[keyName] + if iSuccessCount, ok := successCount.(float64); ok { + if enforceEquality { + assert.Equal(t, iSuccessCount, minCountExpected) + } else { + assert.GreaterOrEqual(t, iSuccessCount, minCountExpected) + } + return + } + } + assert.Fail(t, "invalid response from api/aggregated-discovery-metrics") +} + +// PrintVTOrcLogsOnFailure prints the VTOrc logs on failure of the test. +// This function is supposed to be called as the first defer command from the vtorc tests. +func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcessCluster) { + // If the test has not failed, then we don't need to print anything. + if !t.Failed() { + return + } + + log.Errorf("Printing VTOrc logs") + for _, vtorc := range clusterInstance.VtorcProcesses { + if vtorc == nil || vtorc.LogFileName == "" { + continue + } + filePath := path.Join(vtorc.LogDir, vtorc.LogFileName) + log.Errorf("Printing file - %s", filePath) + content, err := os.ReadFile(filePath) + if err != nil { + log.Errorf("Error while reading the file - %v", err) + } + log.Errorf("%s", string(content)) + } +} diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go index ba846ebc147..f3b2ce8d2b4 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go @@ -68,9 +68,14 @@ type EmergencyReparentOptions struct { // counters for Emergency Reparent Shard var ( - ersCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run") - ersSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded") - ersFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed") + // TODO(timvaillancourt): remove legacyERS* gauges in v19+. + legacyERSCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run") + legacyERSSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded") + legacyERSFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed") + + ersCounter = stats.NewCountersWithMultiLabels("emergency_reparent_counts", "Number of times Emergency Reparent Shard has been run", + []string{"Keyspace", "Shard", "Result"}, + ) ) // NewEmergencyReparenter returns a new EmergencyReparenter object, ready to @@ -98,26 +103,33 @@ func NewEmergencyReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, l // keyspace and shard. 
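// Every call records a success or failure sample in the emergency_reparent_counts counter,
// keyed by "<keyspace>.<shard>.<result>", plus an "EmergencyReparentShard" entry in
// reparent_shard_operation_timings; the legacy ers_* gauges are still bumped alongside them
// until they are removed (see the TODO above).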
func (erp *EmergencyReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts EmergencyReparentOptions) (*events.Reparent, error) { var err error + statsLabels := []string{keyspace, shard} + // First step is to lock the shard for the given operation, if not already locked if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil { var unlock func(*error) opts.lockAction = erp.getLockAction(opts.NewPrimaryAlias) ctx, unlock, err = erp.ts.LockShard(ctx, keyspace, shard, opts.lockAction) if err != nil { + ersCounter.Add(append(statsLabels, failureResult), 1) return nil, err } defer unlock(&err) } // dispatch success or failure of ERS + startTime := time.Now() ev := &events.Reparent{} defer func() { + reparentShardOpTimings.Add("EmergencyReparentShard", time.Since(startTime)) switch err { case nil: - ersSuccessCounter.Add(1) + legacyERSSuccessCounter.Add(1) + ersCounter.Add(append(statsLabels, successResult), 1) event.DispatchUpdate(ev, "finished EmergencyReparentShard") default: - ersFailureCounter.Add(1) + legacyERSFailureCounter.Add(1) + ersCounter.Add(append(statsLabels, failureResult), 1) event.DispatchUpdate(ev, "failed EmergencyReparentShard: "+err.Error()) } }() @@ -141,7 +153,7 @@ func (erp *EmergencyReparenter) getLockAction(newPrimaryAlias *topodatapb.Tablet func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, opts EmergencyReparentOptions) (err error) { // log the starting of the operation and increment the counter erp.logger.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard) - ersCounter.Add(1) + legacyERSCounter.Add(1) var ( stoppedReplicationSnapshot *replicationSnapshot diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go index af7b33a3151..40c3c295f30 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go @@ -2747,10 +2747,12 @@ func TestEmergencyReparenter_waitForAllRelayLogsToApply(t *testing.T) { } } -func TestEmergencyReparenterCounters(t *testing.T) { - ersCounter.Set(0) - ersSuccessCounter.Set(0) - ersFailureCounter.Set(0) +func TestEmergencyReparenterStats(t *testing.T) { + ersCounter.ResetAll() + legacyERSCounter.Reset() + legacyERSSuccessCounter.Reset() + legacyERSFailureCounter.Reset() + reparentShardOpTimings.Reset() emergencyReparentOps := EmergencyReparentOptions{} tmc := &testutil.TabletManagerClient{ @@ -2879,9 +2881,13 @@ func TestEmergencyReparenterCounters(t *testing.T) { require.NoError(t, err) // check the counter values - require.EqualValues(t, 1, ersCounter.Get()) - require.EqualValues(t, 1, ersSuccessCounter.Get()) - require.EqualValues(t, 0, ersFailureCounter.Get()) + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, ersCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 1, "EmergencyReparentShard": 1}, reparentShardOpTimings.Counts()) + + // check the legacy counter values + require.EqualValues(t, 1, legacyERSCounter.Get()) + require.EqualValues(t, 1, legacyERSSuccessCounter.Get()) + require.EqualValues(t, 0, legacyERSFailureCounter.Get()) // set emergencyReparentOps to request a non existent tablet emergencyReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{ @@ -2894,9 +2900,13 @@ func TestEmergencyReparenterCounters(t *testing.T) { require.Error(t, err) // check the counter values - require.EqualValues(t, 2, ersCounter.Get()) - 
require.EqualValues(t, 1, ersSuccessCounter.Get()) - require.EqualValues(t, 1, ersFailureCounter.Get()) + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, ersCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 2, "EmergencyReparentShard": 2}, reparentShardOpTimings.Counts()) + + // check the legacy counter values + require.EqualValues(t, 2, legacyERSCounter.Get()) + require.EqualValues(t, 1, legacyERSSuccessCounter.Get()) + require.EqualValues(t, 1, legacyERSFailureCounter.Get()) } func TestEmergencyReparenter_findMostAdvanced(t *testing.T) { diff --git a/go/vt/vtctl/reparentutil/planned_reparenter.go b/go/vt/vtctl/reparentutil/planned_reparenter.go index 4f46d234e25..31b222f9df5 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter.go @@ -26,6 +26,7 @@ import ( "vitess.io/vitess/go/event" "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/concurrency" "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/topo" @@ -39,6 +40,13 @@ import ( "vitess.io/vitess/go/vt/proto/vtrpc" ) +// counters for Planned Reparent Shard +var ( + prsCounter = stats.NewCountersWithMultiLabels("planned_reparent_counts", "Number of times Planned Reparent Shard has been run", + []string{"Keyspace", "Shard", "Result"}, + ) +) + // PlannedReparenter performs PlannedReparentShard operations. type PlannedReparenter struct { ts *topo.Server @@ -88,11 +96,14 @@ func NewPlannedReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, log // both the current and desired primary are reachable and in a good state. func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts PlannedReparentOptions) (*events.Reparent, error) { var err error + statsLabels := []string{keyspace, shard} + if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil { var unlock func(*error) opts.lockAction = pr.getLockAction(opts) ctx, unlock, err = pr.ts.LockShard(ctx, keyspace, shard, opts.lockAction) if err != nil { + prsCounter.Add(append(statsLabels, failureResult), 1) return nil, err } defer unlock(&err) @@ -101,18 +112,23 @@ func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, if opts.NewPrimaryAlias == nil && opts.AvoidPrimaryAlias == nil { shardInfo, err := pr.ts.GetShard(ctx, keyspace, shard) if err != nil { + prsCounter.Add(append(statsLabels, failureResult), 1) return nil, err } opts.AvoidPrimaryAlias = shardInfo.PrimaryAlias } + startTime := time.Now() ev := &events.Reparent{} defer func() { + reparentShardOpTimings.Add("PlannedReparentShard", time.Since(startTime)) switch err { case nil: + prsCounter.Add(append(statsLabels, successResult), 1) event.DispatchUpdate(ev, "finished PlannedReparentShard") default: + prsCounter.Add(append(statsLabels, failureResult), 1) event.DispatchUpdate(ev, "failed PlannedReparentShard: "+err.Error()) } }() diff --git a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go index 09900c4a624..b186f68b0f2 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go @@ -3719,3 +3719,294 @@ func AssertReparentEventsEqual(t *testing.T, expected *events.Reparent, actual * AssertReparentEventsEqualWithMessage(t, expected, actual, "") } + +// TestPlannedReparenter_verifyAllTabletsReachable tests the functionality of verifyAllTabletsReachable. 
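// Reachability is exercised entirely through the fake tablet manager client: each tablet's
// PrimaryStatus call either succeeds, fails, or is delayed past topo.RemoteOperationTimeout,
// which is what produces the "context deadline exceeded" case below.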
+func TestPlannedReparenter_verifyAllTabletsReachable(t *testing.T) { + tests := []struct { + name string + tmc tmclient.TabletManagerClient + tabletMap map[string]*topo.TabletInfo + remoteOpTime time.Duration + wantErr string + }{ + { + name: "Success", + tmc: &testutil.TabletManagerClient{ + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000201": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + }, + tabletMap: map[string]*topo.TabletInfo{ + "zone1-0000000100": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + }, + }, + "zone1-0000000200": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 200, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + "zone1-0000000201": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 201, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + }, + }, { + name: "Failure", + tmc: &testutil.TabletManagerClient{ + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Error: fmt.Errorf("primary status failed"), + }, + "zone1-0000000201": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + }, + tabletMap: map[string]*topo.TabletInfo{ + "zone1-0000000100": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + }, + }, + "zone1-0000000200": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 200, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + "zone1-0000000201": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 201, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + }, + wantErr: "primary status failed", + }, { + name: "Timeout", + tmc: &testutil.TabletManagerClient{ + PrimaryStatusDelays: map[string]time.Duration{ + "zone1-0000000100": 20 * time.Second, + }, + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000201": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + }, + remoteOpTime: 100 * time.Millisecond, + tabletMap: map[string]*topo.TabletInfo{ + "zone1-0000000100": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + }, + }, + "zone1-0000000200": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 200, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + "zone1-0000000201": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 201, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + }, + wantErr: "context deadline exceeded", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ts := memorytopo.NewServer(ctx, "zone1") + defer ts.Close() + + pr := 
&PlannedReparenter{ + ts: ts, + tmc: tt.tmc, + } + if tt.remoteOpTime != 0 { + oldTime := topo.RemoteOperationTimeout + topo.RemoteOperationTimeout = tt.remoteOpTime + defer func() { + topo.RemoteOperationTimeout = oldTime + }() + } + err := pr.verifyAllTabletsReachable(context.Background(), tt.tabletMap) + if tt.wantErr == "" { + require.NoError(t, err) + return + } + require.ErrorContains(t, err, tt.wantErr) + }) + } +} + +func TestPlannedReparenterStats(t *testing.T) { + prsCounter.ResetAll() + reparentShardOpTimings.Reset() + + tmc := &testutil.TabletManagerClient{ + PrimaryPositionResults: map[string]struct { + Position string + Error error + }{ + "zone1-0000000100": { + Position: "position1", + Error: nil, + }, + }, + PopulateReparentJournalResults: map[string]error{ + "zone1-0000000100": nil, + }, + SetReplicationSourceResults: map[string]error{ + "zone1-0000000101": nil, + }, + SetReadWriteResults: map[string]error{ + "zone1-0000000100": nil, + }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000101": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + } + shards := []*vtctldatapb.Shard{ + { + Keyspace: "testkeyspace", + Name: "-", + }, + } + tablets := []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + Keyspace: "testkeyspace", + Shard: "-", + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + Type: topodatapb.TabletType_REPLICA, + Keyspace: "testkeyspace", + Shard: "-", + }, + } + plannedReparentOps := PlannedReparentOptions{ + NewPrimaryAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + } + keyspace := "testkeyspace" + shard := "-" + ts := memorytopo.NewServer(context.Background(), "zone1") + + ctx := context.Background() + logger := logutil.NewMemoryLogger() + + testutil.AddShards(ctx, t, ts, shards...) + testutil.AddTablets(ctx, t, ts, &testutil.AddTabletOptions{ + AlsoSetShardPrimary: true, + SkipShardCreation: false, + }, tablets...) 
+ + prp := NewPlannedReparenter(ts, tmc, logger) + // run a successful prs + _, err := prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps) + require.NoError(t, err) + + // check the counter values + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, prsCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 1, "PlannedReparentShard": 1}, reparentShardOpTimings.Counts()) + + // set plannedReparentOps to request a non existent tablet + plannedReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{ + Cell: "bogus", + Uid: 100, + } + + // run a failing prs + _, err = prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps) + require.Error(t, err) + + // check the counter values + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, prsCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 2, "PlannedReparentShard": 2}, reparentShardOpTimings.Counts()) +} diff --git a/go/vt/vtctl/reparentutil/util.go b/go/vt/vtctl/reparentutil/util.go index f4cebc3dd7d..ac51a45c65f 100644 --- a/go/vt/vtctl/reparentutil/util.go +++ b/go/vt/vtctl/reparentutil/util.go @@ -23,6 +23,7 @@ import ( "time" "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/concurrency" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" @@ -38,6 +39,12 @@ import ( "vitess.io/vitess/go/vt/proto/vtrpc" ) +var ( + reparentShardOpTimings = stats.NewTimings("reparent_shard_operation_timings", "Timings of reparent shard operations", "Operation") + failureResult = "failure" + successResult = "success" +) + // ChooseNewPrimary finds a tablet that should become a primary after reparent. // The criteria for the new primary-elect are (preferably) to be in the same // cell as the current primary, and to be different from avoidPrimaryAlias. 
The From f973f540d1f54da4fde456530474b3dbebb2169e Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 12 Oct 2023 18:26:26 +0200 Subject: [PATCH 3/7] revert changes to go/test/endtoend Signed-off-by: Tim Vaillancourt --- go/test/endtoend/vtorc/general/main_test.go | 4 +- go/test/endtoend/vtorc/general/vtorc_test.go | 242 +++++++++++------- .../primaryfailure/primary_failure_test.go | 230 +---------------- go/test/endtoend/vtorc/utils/utils.go | 141 ---------- 4 files changed, 147 insertions(+), 470 deletions(-) diff --git a/go/test/endtoend/vtorc/general/main_test.go b/go/test/endtoend/vtorc/general/main_test.go index c52502d7c9b..018e6da21fa 100644 --- a/go/test/endtoend/vtorc/general/main_test.go +++ b/go/test/endtoend/vtorc/general/main_test.go @@ -32,8 +32,8 @@ func TestMain(m *testing.M) { var cellInfos []*utils.CellInfo cellInfos = append(cellInfos, &utils.CellInfo{ CellName: utils.Cell1, - NumReplicas: 4, - NumRdonly: 1, + NumReplicas: 6, + NumRdonly: 2, UIDBase: 100, }) diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 2f2ca2d16ba..560e351fb31 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -28,7 +28,6 @@ import ( "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/test/endtoend/vtorc/utils" "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/vtorc/logic" ) // Cases to test: @@ -73,8 +72,6 @@ func TestSingleKeyspace(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], keyspace.Name, shard0.Name, 1) } // Cases to test: @@ -91,17 +88,96 @@ func TestKeyspaceShard(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VtorcProcesses[0], keyspace.Name, shard0.Name, 1) } -// Cases to test: -// 1. make primary readonly, let vtorc repair -// 2. make replica ReadWrite, let vtorc repair -// 3. stop replication, let vtorc repair -// 4. setup replication from non-primary, let vtorc repair -// 5. make instance A replicates from B and B from A, wait for repair -func TestVTOrcRepairs(t *testing.T) { +// 3. make primary readonly, let orc repair +func TestPrimaryReadOnly(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1, "") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + + // Make the current primary database read-only. + _, err := utils.RunSQL(t, "set global read_only=ON", curPrimary, "") + require.NoError(t, err) + + // wait for repair + match := utils.WaitForReadOnlyValue(t, curPrimary, 0) + require.True(t, match) +} + +// 4. 
make replica ReadWrite, let orc repair +func TestReplicaReadWrite(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1, "") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + + var replica *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two tablets, so the "other" one must be the new primary + if tablet.Alias != curPrimary.Alias { + replica = tablet + break + } + } + // Make the replica database read-write. + _, err := utils.RunSQL(t, "set global read_only=OFF", replica, "") + require.NoError(t, err) + + // wait for repair + match := utils.WaitForReadOnlyValue(t, replica, 1) + require.True(t, match) +} + +// 5. stop replication, let orc repair +func TestStopReplication(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1, "") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + + // TODO(deepthi): we should not need to do this, the DB should be created automatically + _, err := curPrimary.VttabletProcess.QueryTablet(fmt.Sprintf("create database IF NOT EXISTS vt_%s", keyspace.Name), keyspace.Name, false) + require.NoError(t, err) + + var replica *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two tablets, so the "other" one must be the new primary + if tablet.Alias != curPrimary.Alias { + replica = tablet + break + } + } + require.NotNil(t, replica, "should be able to find a replica") + // use vtctlclient to stop replication + _, err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) + require.NoError(t, err) + + // check replication is setup correctly + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 15*time.Second) +} + +// 6. 
setup replication from non-primary, let orc repair +func TestReplicationFromOtherReplica(t *testing.T) { defer cluster.PanicHandler(t) utils.SetupVttabletsAndVtorc(t, clusterInfo, 3, 0, nil, cluster.VtorcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -112,9 +188,10 @@ func TestVTOrcRepairs(t *testing.T) { // find primary from topo curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") - vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) + + // TODO(deepthi): we should not need to do this, the DB should be created automatically + _, err := curPrimary.VttabletProcess.QueryTablet(fmt.Sprintf("create database IF NOT EXISTS vt_%s", keyspace.Name), keyspace.Name, false) + require.NoError(t, err) var replica, otherReplica *cluster.Vttablet for _, tablet := range shard0.Vttablets { @@ -133,80 +210,17 @@ func TestVTOrcRepairs(t *testing.T) { // check replication is setup correctly utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - t.Run("PrimaryReadOnly", func(t *testing.T) { - // Make the current primary database read-only. - _, err := utils.RunSQL(t, "set global read_only=ON", curPrimary, "") - require.NoError(t, err) - - // wait for repair - match := utils.WaitForReadOnlyValue(t, curPrimary, 0) - require.True(t, match) - }) - - t.Run("ReplicaReadWrite", func(t *testing.T) { - // Make the replica database read-write. - _, err := utils.RunSQL(t, "set global read_only=OFF", replica, "") - require.NoError(t, err) - - // wait for repair - match := utils.WaitForReadOnlyValue(t, replica, 1) - require.True(t, match) - }) - - t.Run("StopReplication", func(t *testing.T) { - // use vtctlclient to stop replication - _, err := clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) - require.NoError(t, err) - - // check replication is setup correctly - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - - // Stop just the IO thread on the replica - _, err = utils.RunSQL(t, "STOP SLAVE IO_THREAD", replica, "") - require.NoError(t, err) - - // check replication is setup correctly - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - - // Stop just the SQL thread on the replica - _, err = utils.RunSQL(t, "STOP SLAVE SQL_THREAD", replica, "") - require.NoError(t, err) - - // check replication is setup correctly - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) - }) - - t.Run("ReplicationFromOtherReplica", func(t *testing.T) { - // point replica at otherReplica - changeReplicationSourceCommand := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ - "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1; START SLAVE", utils.Hostname, otherReplica.MySQLPort) - _, err := utils.RunSQL(t, changeReplicationSourceCommand, replica, "") - require.NoError(t, err) - - // wait until the source port is set back correctly by vtorc - utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second) - - // check that writes succeed - utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, 
otherReplica}, 15*time.Second) - }) - - t.Run("CircularReplication", func(t *testing.T) { - // change the replication source on the primary - changeReplicationSourceCommands := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ - "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ - "START SLAVE;", replica.VttabletProcess.TabletHostname, replica.MySQLPort) - _, err := utils.RunSQL(t, changeReplicationSourceCommands, curPrimary, "") - require.NoError(t, err) + // point replica at otherReplica + changeReplicationSourceCommand := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ + "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1; START SLAVE", utils.Hostname, otherReplica.MySQLPort) + _, err = utils.RunSQL(t, changeReplicationSourceCommand, replica, "") + require.NoError(t, err) - // wait for curPrimary to reach stable state - time.Sleep(1 * time.Second) + // wait until the source port is set back correctly by vtorc + utils.CheckSourcePort(t, replica, curPrimary, 15*time.Second) - // wait for repair - err = utils.WaitForReplicationToStop(t, curPrimary) - require.NoError(t, err) - // check that the writes still succeed - utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 10*time.Second) - }) + // check that writes succeed + utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second) } func TestRepairAfterTER(t *testing.T) { @@ -243,6 +257,48 @@ func TestRepairAfterTER(t *testing.T) { utils.CheckReplication(t, clusterInfo, newPrimary, []*cluster.Vttablet{curPrimary}, 15*time.Second) } +// 7. make instance A replicates from B and B from A, wait for repair +func TestCircularReplication(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, cluster.VtorcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1, "") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, primary, "should have elected a primary") + + var replica *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two tablets, so the "other" one must be the new primary + if tablet.Alias != primary.Alias { + replica = tablet + break + } + } + + // check replication is setup correctly + utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{replica}, 15*time.Second) + + // change the replication source on the primary + changeReplicationSourceCommands := fmt.Sprintf("STOP SLAVE; RESET SLAVE ALL;"+ + "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ + "START SLAVE;", replica.VttabletProcess.TabletHostname, replica.MySQLPort) + _, err := utils.RunSQL(t, changeReplicationSourceCommands, primary, "") + require.NoError(t, err) + + // wait for primary to reach stable state + time.Sleep(1 * time.Second) + + // wait for repair + err = utils.WaitForReplicationToStop(t, primary) + require.NoError(t, err) + // check that the writes still succeed + utils.VerifyWritesSucceed(t, clusterInfo, primary, []*cluster.Vttablet{replica}, 10*time.Second) +} + // TestSemiSync tests that semi-sync is setup correctly by vtorc if it is incorrectly set func TestSemiSync(t *testing.T) { // stop any vtorc instance running due to a previous test. 
@@ -322,9 +378,6 @@ func TestVtorcWithPrs(t *testing.T) { // find primary from topo curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") - vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find any replica tablet other than the current primary var replica *cluster.Vttablet @@ -350,15 +403,6 @@ func TestVtorcWithPrs(t *testing.T) { // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) - - // Verify that VTOrc didn't run any other recovery - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0) - utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0) utils.VerifyWritesSucceed(t, clusterInfo, replica, shard0.Vttablets, 10*time.Second) } diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index 32c7babe33b..a922db9f010 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -17,8 +17,6 @@ limitations under the License. 
package primaryfailure import ( - "fmt" - "path" "testing" "time" @@ -42,70 +40,6 @@ func TestDownPrimary(t *testing.T) { // find primary from topo curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) assert.NotNil(t, curPrimary, "should have elected a primary") - vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - - // find the replica and rdonly tablets - var replica, rdonly *cluster.Vttablet - for _, tablet := range shard0.Vttablets { - // we know we have only two replcia tablets, so the one not the primary must be the other replica - if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { - replica = tablet - } - if tablet.Type == "rdonly" { - rdonly = tablet - } - } - assert.NotNil(t, replica, "could not find replica tablet") - assert.NotNil(t, rdonly, "could not find rdonly tablet") - - // Start a cross-cell replica - crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) - - // check that the replication is setup correctly before we failover - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second) - // since all tablets are up and running, InstancePollSecondsExceeded should have `0` zero value - utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 0, true) - // Make the rdonly vttablet unavailable - err := rdonly.VttabletProcess.TearDown() - require.NoError(t, err) - err = rdonly.MysqlctlProcess.Stop() - require.NoError(t, err) - // We have bunch of Vttablets down. Therefore we expect at least 1 occurrence of InstancePollSecondsExceeded - utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 1, false) - // Make the current primary vttablet unavailable. - err = curPrimary.VttabletProcess.TearDown() - require.NoError(t, err) - err = curPrimary.MysqlctlProcess.Stop() - require.NoError(t, err) - defer func() { - // we remove the tablet from our global list - utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) - utils.PermanentlyRemoveVttablet(clusterInfo, rdonly) - }() - - // check that the replica gets promoted - utils.CheckPrimaryTablet(t, clusterInfo, replica, true) - - // also check that the replication is working correctly after failover - utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) - utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) -} - -// bring down primary before VTOrc has started, let vtorc repair. 
-func TestDownPrimaryBeforeVTOrc(t *testing.T) { - defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) - defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{}, 0, "none") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - curPrimary := shard0.Vttablets[0] - - // Promote the first tablet as the primary - err := clusterInfo.ClusterInstance.VtctlclientProcess.InitializeShard(keyspace.Name, shard0.Name, clusterInfo.ClusterInstance.Cell, curPrimary.TabletUID) - require.NoError(t, err) // find the replica and rdonly tablets var replica, rdonly *cluster.Vttablet @@ -124,140 +58,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) { // check that the replication is setup correctly before we failover utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica}, 10*time.Second) - // Make the current primary vttablet unavailable. - _ = curPrimary.VttabletProcess.TearDown() - err = curPrimary.MysqlctlProcess.Stop() - require.NoError(t, err) - - // Start a VTOrc instance - utils.StartVTOrcs(t, clusterInfo, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - }, 1) - - vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] - - defer func() { - // we remove the tablet from our global list - utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) - }() - - // check that the replica gets promoted - utils.CheckPrimaryTablet(t, clusterInfo, replica, true) - - // also check that the replication is working correctly after failover - utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) - utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) -} - -// delete the primary record and let vtorc repair. 
-func TestDeletedPrimaryTablet(t *testing.T) { - defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) - defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{}, 1, "none") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - // find primary from topo - curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) - assert.NotNil(t, curPrimary, "should have elected a primary") - vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - - // find the replica and rdonly tablets - var replica, rdonly *cluster.Vttablet - for _, tablet := range shard0.Vttablets { - // we know we have only two replcia tablets, so the one not the primary must be the other replica - if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { - replica = tablet - } - if tablet.Type == "rdonly" { - rdonly = tablet - } - } - assert.NotNil(t, replica, "could not find replica tablet") - assert.NotNil(t, rdonly, "could not find rdonly tablet") - - // check that the replication is setup correctly before we failover - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second) - - // Disable VTOrc recoveries - vtOrcProcess.DisableGlobalRecoveries(t) - // use vtctlclient to stop replication on the replica - _, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias) - require.NoError(t, err) - // insert a write that is not available on the replica. - utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly}, 10*time.Second) - - // Make the current primary vttablet unavailable and delete its tablet record. - _ = curPrimary.VttabletProcess.TearDown() - err = curPrimary.MysqlctlProcess.Stop() - require.NoError(t, err) - // use vtctlclient to start replication on the replica back - _, err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StartReplication", replica.Alias) - require.NoError(t, err) - err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommand("DeleteTablets", "--allow-primary", curPrimary.Alias) - require.NoError(t, err) - // Enable VTOrc recoveries now - vtOrcProcess.EnableGlobalRecoveries(t) - - defer func() { - // we remove the tablet from our global list - utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) - }() - - // check that the replica gets promoted. Also verify that it has all the writes. - utils.CheckPrimaryTablet(t, clusterInfo, replica, true) - utils.CheckTabletUptoDate(t, clusterInfo, replica) - - // also check that the replication is working correctly after failover - utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryTabletDeletedRecoveryName, 1) -} - -// TestDeadPrimaryRecoversImmediately test Vtorc ability to recover immediately if primary is dead. -// Reason is, unlike other recoveries, in DeadPrimary we don't call DiscoverInstance since we know -// that primary is unreachable. This help us save few seconds depending on value of `RemoteOperationTimeout` flag. 
-func TestDeadPrimaryRecoversImmediately(t *testing.T) { - defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) - defer cluster.PanicHandler(t) - // We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test. - // If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source. - // If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test. - utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - }, 1, "semi_sync") - keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - // find primary from topo - curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) - assert.NotNil(t, curPrimary, "should have elected a primary") - vtOrcProcess := clusterInfo.ClusterInstance.VtorcProcesses[0] - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) - utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - - // find the replica and rdonly tablets - var replica, rdonly *cluster.Vttablet - for _, tablet := range shard0.Vttablets { - // we know we have only two replcia tablets, so the one not the primary must be the other replica - if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { - replica = tablet - } - if tablet.Type == "rdonly" { - rdonly = tablet - } - } - assert.NotNil(t, replica, "could not find replica tablet") - assert.NotNil(t, rdonly, "could not find rdonly tablet") - - // Start a cross-cell replica - crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) - - // check that the replication is setup correctly before we failover - utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second) - - // Make the current primary vttablet unavailable. - curPrimary.VttabletProcess.Kill() + // Make the current primary database unavailable. err := curPrimary.MysqlctlProcess.Stop() require.NoError(t, err) defer func() { @@ -268,34 +69,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) // also check that the replication is working correctly after failover - utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) - utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) - utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) - - // Parse log file and find out how much time it took for DeadPrimary to recover. 
- logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName) - // log prefix printed at the end of analysis where we conclude we have DeadPrimary - t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery") - // log prefix printed at the end of recovery - t2 := extractTimeFromLog(t, logFile, "auditType:RecoverDeadPrimary") - curr := time.Now().Format("2006-01-02") - timeLayout := "2006-01-02 15:04:05.000000" - timeStr1 := fmt.Sprintf("%s %s", curr, t1) - timeStr2 := fmt.Sprintf("%s %s", curr, t2) - time1, err := time.Parse(timeLayout, timeStr1) - if err != nil { - t.Errorf("unable to parse time %s", err.Error()) - } - time2, err := time.Parse(timeLayout, timeStr2) - if err != nil { - t.Errorf("unable to parse time %s", err.Error()) - } - diff := time2.Sub(time1) - fmt.Printf("The difference between %s and %s is %v seconds.\n", t1, t2, diff.Seconds()) - // assert that it takes less than `remote_operation_timeout` to recover from `DeadPrimary` - // use the value provided in `remote_operation_timeout` flag to compare with. - // We are testing against 9.5 seconds to be safe and prevent flakiness. - assert.Less(t, diff.Seconds(), 9.5) + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) } // Failover should not be cross data centers, according to the configuration file diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 713f651203f..96075aef3c0 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -18,16 +18,12 @@ package utils import ( "context" - "encoding/json" "fmt" "io" - "math" "net/http" "os" "os/exec" "path" - "reflect" - "strconv" "strings" "testing" "time" @@ -919,140 +915,3 @@ func WaitForReadOnlyValue(t *testing.T, curPrimary *cluster.Vttablet, expectValu } return false } - -// WaitForSuccessfulRecoveryCount waits until the given recovery name's count of successful runs matches the count expected -func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, recoveryName string, countExpected int) { - t.Helper() - timeout := 15 * time.Second - startTime := time.Now() - for time.Since(startTime) < timeout { - vars := vtorcInstance.GetVars() - successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) - if successCount == countExpected { - return - } - time.Sleep(time.Second) - } - vars := vtorcInstance.GetVars() - successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) - assert.EqualValues(t, countExpected, successCount) -} - -// WaitForSuccessfulPRSCount waits until the given keyspace-shard's count of successful prs runs matches the count expected. 
-func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, keyspace, shard string, countExpected int) { - t.Helper() - timeout := 15 * time.Second - startTime := time.Now() - mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard) - for time.Since(startTime) < timeout { - vars := vtorcInstance.GetVars() - prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{}) - successCount := getIntFromValue(prsCountsMap[mapKey]) - if successCount == countExpected { - return - } - time.Sleep(time.Second) - } - vars := vtorcInstance.GetVars() - prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{}) - successCount := getIntFromValue(prsCountsMap[mapKey]) - assert.EqualValues(t, countExpected, successCount) -} - -// WaitForSuccessfulERSCount waits until the given keyspace-shard's count of successful ers runs matches the count expected. -func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, keyspace, shard string, countExpected int) { - t.Helper() - timeout := 15 * time.Second - startTime := time.Now() - mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard) - for time.Since(startTime) < timeout { - vars := vtorcInstance.GetVars() - ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{}) - successCount := getIntFromValue(ersCountsMap[mapKey]) - if successCount == countExpected { - return - } - time.Sleep(time.Second) - } - vars := vtorcInstance.GetVars() - ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{}) - successCount := getIntFromValue(ersCountsMap[mapKey]) - assert.EqualValues(t, countExpected, successCount) -} - -// getIntFromValue is a helper function to get an integer from the given value. -// If it is convertible to a float, then we round the number to the nearest integer. -// If the value is not numeric at all, we return 0. -func getIntFromValue(val any) int { - value := reflect.ValueOf(val) - if value.CanFloat() { - return int(math.Round(value.Float())) - } - if value.CanInt() { - return int(value.Int()) - } - return 0 -} - -// WaitForTabletType waits for the tablet to reach a certain type. -func WaitForTabletType(t *testing.T, tablet *cluster.Vttablet, expectedTabletType string) { - t.Helper() - err := tablet.VttabletProcess.WaitForTabletTypes([]string{expectedTabletType}) - require.NoError(t, err) -} - -// WaitForInstancePollSecondsExceededCount waits for 30 seconds and then queries api/aggregated-discovery-metrics. -// It expects to find minimum occurrence or exact count of `keyName` provided. 
-func WaitForInstancePollSecondsExceededCount(t *testing.T, vtorcInstance *cluster.VtorcProcess, keyName string, minCountExpected float64, enforceEquality bool) { - t.Helper() - var sinceInSeconds = 30 - duration := time.Duration(sinceInSeconds) - time.Sleep(duration * time.Second) - - statusCode, res, err := vtorcInstance.MakeAPICall("api/aggregated-discovery-metrics?seconds=" + strconv.Itoa(sinceInSeconds)) - if err != nil { - assert.Fail(t, "Not able to call api/aggregated-discovery-metrics") - } - if statusCode == 200 { - resultMap := make(map[string]any) - err := json.Unmarshal([]byte(res), &resultMap) - if err != nil { - assert.Fail(t, "invalid response from api/aggregated-discovery-metrics") - } - successCount := resultMap[keyName] - if iSuccessCount, ok := successCount.(float64); ok { - if enforceEquality { - assert.Equal(t, iSuccessCount, minCountExpected) - } else { - assert.GreaterOrEqual(t, iSuccessCount, minCountExpected) - } - return - } - } - assert.Fail(t, "invalid response from api/aggregated-discovery-metrics") -} - -// PrintVTOrcLogsOnFailure prints the VTOrc logs on failure of the test. -// This function is supposed to be called as the first defer command from the vtorc tests. -func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcessCluster) { - // If the test has not failed, then we don't need to print anything. - if !t.Failed() { - return - } - - log.Errorf("Printing VTOrc logs") - for _, vtorc := range clusterInstance.VtorcProcesses { - if vtorc == nil || vtorc.LogFileName == "" { - continue - } - filePath := path.Join(vtorc.LogDir, vtorc.LogFileName) - log.Errorf("Printing file - %s", filePath) - content, err := os.ReadFile(filePath) - if err != nil { - log.Errorf("Error while reading the file - %v", err) - } - log.Errorf("%s", string(content)) - } -} From 76e23159767b8bd2f007fb0a94a578a31aa943b9 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 12 Oct 2023 19:06:37 +0200 Subject: [PATCH 4/7] remove unrelated+incompatible test Signed-off-by: Tim Vaillancourt --- .../planned_reparenter_flaky_test.go | 186 ------------------ 1 file changed, 186 deletions(-) diff --git a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go index b186f68b0f2..ca158f8c124 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go @@ -3720,192 +3720,6 @@ func AssertReparentEventsEqual(t *testing.T, expected *events.Reparent, actual * AssertReparentEventsEqualWithMessage(t, expected, actual, "") } -// TestPlannedReparenter_verifyAllTabletsReachable tests the functionality of verifyAllTabletsReachable. 
-func TestPlannedReparenter_verifyAllTabletsReachable(t *testing.T) { - tests := []struct { - name string - tmc tmclient.TabletManagerClient - tabletMap map[string]*topo.TabletInfo - remoteOpTime time.Duration - wantErr string - }{ - { - name: "Success", - tmc: &testutil.TabletManagerClient{ - PrimaryStatusResults: map[string]struct { - Status *replicationdatapb.PrimaryStatus - Error error - }{ - "zone1-0000000200": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - "zone1-0000000201": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - "zone1-0000000100": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - }, - }, - tabletMap: map[string]*topo.TabletInfo{ - "zone1-0000000100": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 100, - }, - Type: topodatapb.TabletType_PRIMARY, - }, - }, - "zone1-0000000200": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 200, - }, - Type: topodatapb.TabletType_REPLICA, - }, - }, - "zone1-0000000201": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 201, - }, - Type: topodatapb.TabletType_REPLICA, - }, - }, - }, - }, { - name: "Failure", - tmc: &testutil.TabletManagerClient{ - PrimaryStatusResults: map[string]struct { - Status *replicationdatapb.PrimaryStatus - Error error - }{ - "zone1-0000000200": { - Error: fmt.Errorf("primary status failed"), - }, - "zone1-0000000201": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - "zone1-0000000100": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - }, - }, - tabletMap: map[string]*topo.TabletInfo{ - "zone1-0000000100": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 100, - }, - Type: topodatapb.TabletType_PRIMARY, - }, - }, - "zone1-0000000200": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 200, - }, - Type: topodatapb.TabletType_REPLICA, - }, - }, - "zone1-0000000201": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 201, - }, - Type: topodatapb.TabletType_REPLICA, - }, - }, - }, - wantErr: "primary status failed", - }, { - name: "Timeout", - tmc: &testutil.TabletManagerClient{ - PrimaryStatusDelays: map[string]time.Duration{ - "zone1-0000000100": 20 * time.Second, - }, - PrimaryStatusResults: map[string]struct { - Status *replicationdatapb.PrimaryStatus - Error error - }{ - "zone1-0000000200": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - "zone1-0000000201": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - "zone1-0000000100": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - }, - }, - remoteOpTime: 100 * time.Millisecond, - tabletMap: map[string]*topo.TabletInfo{ - "zone1-0000000100": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 100, - }, - Type: topodatapb.TabletType_PRIMARY, - }, - }, - "zone1-0000000200": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 200, - }, - Type: topodatapb.TabletType_REPLICA, - }, - }, - "zone1-0000000201": { - Tablet: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone1", - Uid: 201, - }, - Type: topodatapb.TabletType_REPLICA, - }, - }, - }, - wantErr: "context deadline exceeded", - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ts := memorytopo.NewServer(ctx, "zone1") - defer ts.Close() - - pr := 
&PlannedReparenter{ - ts: ts, - tmc: tt.tmc, - } - if tt.remoteOpTime != 0 { - oldTime := topo.RemoteOperationTimeout - topo.RemoteOperationTimeout = tt.remoteOpTime - defer func() { - topo.RemoteOperationTimeout = oldTime - }() - } - err := pr.verifyAllTabletsReachable(context.Background(), tt.tabletMap) - if tt.wantErr == "" { - require.NoError(t, err) - return - } - require.ErrorContains(t, err, tt.wantErr) - }) - } -} - func TestPlannedReparenterStats(t *testing.T) { prsCounter.ResetAll() reparentShardOpTimings.Reset() From 7b9419f0569a47fc29d54230bb0c8f05370fbe53 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 12 Oct 2023 19:08:32 +0200 Subject: [PATCH 5/7] Fix memorytopo.NewServer(...) for v14 Signed-off-by: Tim Vaillancourt --- go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go index ca158f8c124..fb0087e545c 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go @@ -3790,7 +3790,7 @@ func TestPlannedReparenterStats(t *testing.T) { } keyspace := "testkeyspace" shard := "-" - ts := memorytopo.NewServer(context.Background(), "zone1") + ts := memorytopo.NewServer("zone1") ctx := context.Background() logger := logutil.NewMemoryLogger() From 0e3466bf954b409ad41f8ae07184d69f4a29475f Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 12 Oct 2023 20:00:38 +0200 Subject: [PATCH 6/7] remove non-existant "PrimaryStatusResults" field Signed-off-by: Tim Vaillancourt --- .../reparentutil/planned_reparenter_flaky_test.go | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go index fb0087e545c..ec20eb19352 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go @@ -3743,18 +3743,6 @@ func TestPlannedReparenterStats(t *testing.T) { SetReadWriteResults: map[string]error{ "zone1-0000000100": nil, }, - // This is only needed to verify reachability, so empty results are fine. 
- PrimaryStatusResults: map[string]struct { - Status *replicationdatapb.PrimaryStatus - Error error - }{ - "zone1-0000000101": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - "zone1-0000000100": { - Status: &replicationdatapb.PrimaryStatus{}, - }, - }, } shards := []*vtctldatapb.Shard{ { From 9299a03d4a32901d5d5194a03f2b89de3f51f7ff Mon Sep 17 00:00:00 2001 From: Rohit Nayak <57520317+rohit-nayak-ps@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:23:13 +0200 Subject: [PATCH 7/7] Remove FOSSA Test from CI until we can do it in a secure way (#14119) Signed-off-by: Rohit Nayak --- .github/workflows/static_checks_etc.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/static_checks_etc.yml b/.github/workflows/static_checks_etc.yml index 287f72686fc..2e7e6f585db 100644 --- a/.github/workflows/static_checks_etc.yml +++ b/.github/workflows/static_checks_etc.yml @@ -25,12 +25,6 @@ jobs: if: steps.skip-workflow.outputs.skip-workflow == 'false' uses: actions/checkout@v3 - - name: Run FOSSA scan and upload build data - if: steps.skip-workflow.outputs.skip-workflow == 'false' - uses: fossa-contrib/fossa-action@v1 - with: - fossa-api-key: 76d7483ea206d530d9452e44bffe7ba8 - - name: Check for changes in Go files if: steps.skip-workflow.outputs.skip-workflow == 'false' uses: frouioui/paths-filter@main @@ -211,4 +205,4 @@ jobs: echo 'We wish to maintain a consistent changelog directory, please run `go run ./go/tools/releases/releases.go`, commit and push again.' echo 'Running `go run ./go/tools/releases/releases.go` on CI yields the following changes:' echo "$output" - echo "" \ No newline at end of file + echo ""
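
Note on the final patch: the FOSSA step is removed above because the workflow hard-coded the fossa-api-key value. If the scan is later reinstated "in a secure way", one possible shape — illustrative only, not part of this patch series — is to keep the same fossa-contrib/fossa-action@v1 step but read the key from an encrypted Actions secret. The secret name FOSSA_API_KEY below is an assumption; any configured repository secret would do.

      - name: Run FOSSA scan and upload build data
        if: steps.skip-workflow.outputs.skip-workflow == 'false'
        uses: fossa-contrib/fossa-action@v1
        with:
          # Hypothetical secret name: the key lives in the repository's encrypted
          # Actions secrets instead of being committed to the workflow file.
          fossa-api-key: ${{ secrets.FOSSA_API_KEY }}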