Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix gprestore/gpbackup hanging in case the helper goes down #88

Merged
merged 30 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
ce355b2
Fix gprestore hanging on copy in case the helper goes down at start
whitehawk Jun 5, 2024
d8a2d22
Fix spaces in shell cmd for COPY query
whitehawk Jun 5, 2024
6384cd0
Fix format issues
whitehawk Jun 5, 2024
acbc6c8
Fix unit test
whitehawk Jun 5, 2024
e1d993b
Fix end-2-end test
whitehawk Jun 7, 2024
db9a617
Fix style
whitehawk Jun 7, 2024
f992b46
Fix case for single data file
whitehawk Jun 7, 2024
05eb55f
Add test
whitehawk Jun 9, 2024
066a01b
Rework solution
whitehawk Jun 9, 2024
3213203
Revert "Fix unit test"
whitehawk Jun 9, 2024
3598ed8
Remove unrelated style change done by goimports
whitehawk Jun 9, 2024
99d717d
Fix test
whitehawk Jun 9, 2024
d305ae9
Improve and simplify solution
whitehawk Jun 10, 2024
473216c
Fix the backup helper
whitehawk Jun 11, 2024
1c4337f
Update solution to avoid partial restore
whitehawk Jun 18, 2024
21f1984
Fix code style in unit test
whitehawk Jun 20, 2024
c0fd30a
Update changes in unit test
whitehawk Jun 20, 2024
f6c13e6
Fix hanging in case the restore helper fails to get oid list from a file
whitehawk Jun 24, 2024
bac9ce9
Add more logging
whitehawk Jun 27, 2024
6a7e453
Merge branch 'master' into ADBDEV-5693
Stolb27 Jun 28, 2024
ecb4eaf
Fix test after merge
whitehawk Jul 1, 2024
7dd829e
Update delta in the example plugin
whitehawk Jul 21, 2024
1256289
Merge branch 'master' into ADBDEV-5693
whitehawk Aug 5, 2024
9384554
Fix comments and spaces
whitehawk Aug 5, 2024
7102051
Add execution context into backup
whitehawk Aug 7, 2024
be1b0c6
Update test
whitehawk Aug 7, 2024
30d89e2
Update the test
whitehawk Aug 9, 2024
465ce05
Simplify test and plugin code
whitehawk Aug 9, 2024
405a815
Add test with resize cluster
whitehawk Aug 13, 2024
c188c7c
Merge branch 'master' into ADBDEV-5693
Stolb27 Aug 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion end_to_end/end_to_end_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2302,7 +2302,7 @@ LANGUAGE plpgsql NO SQL;`)
output, err := gprestoreCmd.CombinedOutput()
Expect(err).To(HaveOccurred())
Expect(string(output)).To(ContainSubstring(`Error loading data into table public.t1`))
Expect(string(output)).To(ContainSubstring(`Error loading data into table public.t2`))
Expect(string(output)).To(SatisfyAny(ContainSubstring(`Error loading data into table public.t2`), ContainSubstring(`Expected to restore 1000000 rows to table public.t2, but restored 0 instead`)))
RekGRpth marked this conversation as resolved.
Show resolved Hide resolved
assertArtifactsCleaned(restoreConn, "20240502095933")
testhelper.AssertQueryRuns(restoreConn, "DROP TABLE t0; DROP TABLE t1; DROP TABLE t2; DROP TABLE t3; DROP TABLE t4;")
})
Expand Down
38 changes: 38 additions & 0 deletions end_to_end/plugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"os"
"os/exec"
path "path/filepath"
"time"

"github.com/greenplum-db/gp-common-go-libs/cluster"
"github.com/greenplum-db/gp-common-go-libs/dbconn"
Expand All @@ -14,6 +15,7 @@ import (
"github.com/greenplum-db/gpbackup/testutils"
"github.com/greenplum-db/gpbackup/utils"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

func copyPluginToAllHosts(conn *dbconn.DBConn, pluginPath string) {
Expand Down Expand Up @@ -306,6 +308,42 @@ var _ = Describe("End to End plugin tests", func() {
Skip("This test is only needed for the most recent backup versions")
}
})
It("Will not hang if gpbackup and gprestore runs with single-data-file and plugin fails at init for restore", func(ctx SpecContext) {
pluginExecutablePath := fmt.Sprintf("%s/src/github.com/greenplum-db/gpbackup/plugins/example_plugin.bash", os.Getenv("GOPATH"))
copyPluginToAllHosts(backupConn, pluginExecutablePath)

testhelper.AssertQueryRuns(backupConn, "CREATE TABLE t0(a int);")
testhelper.AssertQueryRuns(backupConn, "INSERT INTO t0 SELECT i FROM generate_series(1, 10)i;")
defer testhelper.AssertQueryRuns(backupConn, "DROP TABLE t0;")

timestamp := gpbackup(gpbackupPath, backupHelperPath,
"--single-data-file",
"--plugin-config", pluginConfigPath)

backupCluster.GenerateAndExecuteCommand(
"Instruct plugin to fail",
cluster.ON_HOSTS,
func(contentID int) string {
return fmt.Sprintf("touch /tmp/GPBACKUP_PLUGIN_FAIL")
})

gprestoreCmd := exec.Command(gprestorePath,
"--timestamp", timestamp,
"--redirect-db", "restoredb",
"--plugin-config", pluginConfigPath)

_, err := gprestoreCmd.CombinedOutput()
Expect(err).To(HaveOccurred())
RekGRpth marked this conversation as resolved.
Show resolved Hide resolved

backupCluster.GenerateAndExecuteCommand(
"Unset plugin failure",
cluster.ON_HOSTS,
func(contentID int) string {
return fmt.Sprintf("rm /tmp/GPBACKUP_PLUGIN_FAIL")
})

assertArtifactsCleaned(restoreConn, timestamp)
}, SpecTimeout(time.Second*30))
It("runs gpbackup and gprestore with plugin, single-data-file, and no-compression", func() {
pluginExecutablePath := fmt.Sprintf("%s/src/github.com/greenplum-db/gpbackup/plugins/example_plugin.bash", os.Getenv("GOPATH"))
copyPluginToAllHosts(backupConn, pluginExecutablePath)
Expand Down
27 changes: 27 additions & 0 deletions helper/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,33 @@ func DoCleanup() {

for pipeName, _ := range pipesMap {
log("Removing pipe %s", pipeName)

/*
* In case restore helper encountered an error, it is possible that the pipe file has been already opened for
* reading by another process, but nothing has been written to the pipe yet. In this case, the reader process
* may hang forever after the pipe is removed. To fix this problem, the pipe file is opened and closed
* to generate an EOF before it is deleted.
*
* Also, it is possible, that after EOF is issued (releasing the current reader process), but before the pipe is
* removed, a new reader process can start reading the pipe. To avoid such situation, we close the file handle
* after we have removed the file.
*
* Similar problem can happen with backup helper. If it finishes its work right after start, before opening the
* pipe for reading, the writing side will stay hanging on the other pipe's end. So, we also open and close the
* pipe to release the writing side.
*/
if *restoreAgent {
fileHandlePipe, err := os.OpenFile(pipeName, os.O_WRONLY|unix.O_NONBLOCK, os.ModeNamedPipe)
if err == nil {
defer fileHandlePipe.Close()
dkovalev1 marked this conversation as resolved.
Show resolved Hide resolved
}
} else if *backupAgent {
fileHandlePipe, err := os.OpenFile(pipeName, os.O_RDONLY|unix.O_NONBLOCK, os.ModeNamedPipe)
if err == nil {
defer fileHandlePipe.Close()
}
}

err = deletePipe(pipeName)
if err != nil {
log("Encountered error removing pipe %s: %v", pipeName, err)
Expand Down
4 changes: 2 additions & 2 deletions helper/restore_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ func doRestoreAgent() error {
}
}

preloadCreatedPipes(oidList, *copyQueue)

if *singleDataFile {
contentToRestore := *content
segmentTOC = make(map[int]*toc.SegmentTOC)
Expand Down Expand Up @@ -173,8 +175,6 @@ func doRestoreAgent() error {
}
}

preloadCreatedPipes(oidList, *copyQueue)

var currentPipe string
for i, oid := range oidList {
if wasTerminated {
Expand Down
4 changes: 4 additions & 0 deletions plugins/example_plugin.bash
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ restore_data() {
echo "restore_data $1 $2" >> /tmp/plugin_out.txt
filename=`basename "$2"`
timestamp_dir=`basename $(dirname "$2")`
if [ -e "/tmp/GPBACKUP_PLUGIN_FAIL" ] ; then
sleep 3
echo 'error' >&2
fi
timestamp_day_dir=${timestamp_dir%??????}
cat /tmp/plugin_dest/$timestamp_day_dir/$timestamp_dir/$filename
}
Expand Down
Loading