diff --git a/go.mod b/go.mod index fcea7fa65f9..40f2b19e449 100644 --- a/go.mod +++ b/go.mod @@ -102,7 +102,7 @@ require ( github.com/kr/text v0.2.0 github.com/mitchellh/mapstructure v1.5.0 github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249 - github.com/slackhq/vitess-addons v0.19.0 + github.com/slackhq/vitess-addons v0.19.1 github.com/slok/noglog v0.2.0 github.com/spf13/afero v1.11.0 github.com/spf13/jwalterweatherman v1.1.0 diff --git a/go.sum b/go.sum index e461d672eaa..16bed616cd0 100644 --- a/go.sum +++ b/go.sum @@ -450,8 +450,8 @@ github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6Mwd github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sjmudd/stopwatch v0.1.1 h1:x45OvxFB5OtCkjvYtzRF5fWB857Jzjjk84Oyd5C5ebw= github.com/sjmudd/stopwatch v0.1.1/go.mod h1:BLw0oIQJ1YLXBO/q9ufK/SgnKBVIkC2qrm6uy78Zw6U= -github.com/slackhq/vitess-addons v0.19.0 h1:+dWkQENsu8YYgsKesOKWqb3+vj66OY1WMvYOn9lmZ+I= -github.com/slackhq/vitess-addons v0.19.0/go.mod h1:E7i+cxyIY+I4An/JAvalQ9Ze2MjKlEx0u2nFXE4fgR0= +github.com/slackhq/vitess-addons v0.19.1 h1:k8f8pAJ2zqtetN+dnehAs7DFcZnI9IQRSL18ZMwNRCw= +github.com/slackhq/vitess-addons v0.19.1/go.mod h1:ZMzBBtadSA1MEuNIfZerztxLMhRFO+tmBZxv5HuV4lE= github.com/slok/noglog v0.2.0 h1:1czu4l2EoJ8L92UwdSXXa1Y+c5TIjFAFm2P+mjej95E= github.com/slok/noglog v0.2.0/go.mod h1:TfKxwpEZPT+UA83bQ6RME146k0MM4e8mwHLf6bhcGDI= github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index c1fc2c8f9fb..6b45721540e 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -21,9 +21,11 @@ import ( "encoding/json" "fmt" "math/rand" + "os" "time" "github.com/patrickmn/go-cache" + "github.com/slackhq/vitess-addons/go/external" "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/log" @@ -81,6 +83,9 @@ var ( // recoveriesFailureCounter counts the number of failed recoveries that VTOrc has performed recoveriesFailureCounter = stats.NewCountersWithSingleLabel("FailedRecoveries", "Count of the different failed recoveries performed", "RecoveryType", actionableRecoveriesNames...) + + vtopsExec = external.NewExecVTOps(os.Getenv("VTOPS_PATH"), os.Getenv("VTOPS_HTTP_PROXY"), "vtorc", os.Getenv("HOSTNAME")) + vtopsSlackChannel = os.Getenv("SLACK_CHANNEL") ) // recoveryFunction is the code of the recovery function to be used @@ -297,6 +302,7 @@ func postErsCompletion(topologyRecovery *TopologyRecovery, analysisEntry *inst.R _ = AuditTopologyRecovery(topologyRecovery, message) _ = inst.AuditOperation(recoveryName, analysisEntry.AnalyzedInstanceAlias, message) _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%v: successfully promoted %+v", recoveryName, promotedReplica.InstanceAlias)) + vtopsExec.RaiseProblem(analysisEntry.AnalyzedInstanceHostname, "orc-dead-tablet", true) } } @@ -590,7 +596,6 @@ func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) { func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (err error) { countPendingRecoveries.Add(1) defer countPendingRecoveries.Add(-1) - checkAndRecoverFunctionCode := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) isActionableRecovery := hasActionableRecovery(checkAndRecoverFunctionCode) analysisEntry.IsActionableRecovery = isActionableRecovery @@ -607,6 +612,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er return nil } + // we have a recovery function; its execution still depends on filters if not disabled. if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: detection", analysisEntry.AnalyzedInstanceAlias) { log.Infof("executeCheckAndRecoverFunction: proceeding with %+v detection on %+v; isActionable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery) @@ -707,15 +713,23 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceAlias) { log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery) } + recoveryAttempted, topologyRecovery, err := getCheckAndRecoverFunction(checkAndRecoverFunctionCode)(ctx, analysisEntry) if !recoveryAttempted { + log.Infof("No recovery attempted on %s for problem %s.", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis) return err } recoveryName := getRecoverFunctionName(checkAndRecoverFunctionCode) recoveriesCounter.Add(recoveryName, 1) if err != nil { + message := fmt.Sprintf("Recovery failed on %s for problem %s. Error: %s", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis, err.Error()) + log.Info(message) + vtopsExec.SendSlackMessage(message, vtopsSlackChannel, true) recoveriesFailureCounter.Add(recoveryName, 1) } else { + message := fmt.Sprintf("Recovery succeeded on %s for problem %s.", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis) + log.Info(message) + vtopsExec.SendSlackMessage(message, vtopsSlackChannel, true) recoveriesSuccessfulCounter.Add(recoveryName, 1) } if topologyRecovery == nil { @@ -813,6 +827,7 @@ func postPrsCompletion(topologyRecovery *TopologyRecovery, analysisEntry *inst.R _ = AuditTopologyRecovery(topologyRecovery, message) _ = inst.AuditOperation(string(analysisEntry.Analysis), analysisEntry.AnalyzedInstanceAlias, message) _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%+v: successfully promoted %+v", analysisEntry.Analysis, promotedReplica.InstanceAlias)) + vtopsExec.RaiseProblem(analysisEntry.AnalyzedInstanceHostname, "orc-dead-tablet", true) } }