(bugfix) Track rsync process state for running backups
Record rsync process pid and execution phase while normal backup runs are active so the worker can reconcile stale running rows when rsync has already disappeared. Keep finalizing runs out of the missing-process path to avoid marking slow post-rsync stats collection as a failed transfer. Closes #54
This commit is contained in:
@@ -160,16 +160,19 @@ class BackupWorkerTests(TestCase):
|
||||
kwargs["state_callback"](
|
||||
{
|
||||
"status": "running",
|
||||
"phase": "rsync",
|
||||
"snapshot": str(snapshot_dir),
|
||||
"log": str(log_path),
|
||||
"rsync": {"command": ["rsync"], "exit_code": None},
|
||||
"rsync": {"command": ["rsync"], "exit_code": None, "pid": 1234, "pgid": 1234},
|
||||
}
|
||||
)
|
||||
run.refresh_from_db()
|
||||
self.assertEqual(run.snapshot_path, str(snapshot_dir))
|
||||
self.assertEqual(run.result["execution"]["phase"], "rsync")
|
||||
self.assertEqual(run.result["execution"]["log"], str(log_path))
|
||||
self.assertEqual(run.result["execution"]["snapshot"], str(snapshot_dir))
|
||||
self.assertEqual(run.result["rsync"]["command"], ["rsync"])
|
||||
self.assertEqual(run.result["rsync"]["pid"], 1234)
|
||||
return {
|
||||
"ok": True,
|
||||
"dry_run": False,
|
||||
@@ -228,6 +231,49 @@ class BackupWorkerTests(TestCase):
|
||||
self.assertEqual(run.result["failure"]["category"], "transport")
|
||||
self.assertIn("Broken pipe", "\n".join(run.result["rsync"]["log_tail"]))
|
||||
|
||||
def test_worker_reconciles_real_run_when_rsync_process_disappears(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = queue_backup_run(host=host)
|
||||
log_path = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH" / "meta" / "rsync.log"
|
||||
log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
log_path.write_text("sending incremental file list\n", encoding="utf-8")
|
||||
run.status = BackupRun.Status.RUNNING
|
||||
run.started_at = timezone.now() - timedelta(minutes=10)
|
||||
run.result["execution"] = {
|
||||
"phase": "rsync",
|
||||
"log": str(log_path),
|
||||
"heartbeat_at": (timezone.now() - timedelta(minutes=10)).isoformat(),
|
||||
}
|
||||
run.result["rsync"] = {"pid": 999999, "pgid": 999999, "command": ["rsync"]}
|
||||
run.save(update_fields=["status", "started_at", "result"])
|
||||
|
||||
reconciled = reconcile_running_runs(grace_seconds=300, stale_worker_seconds=24 * 60 * 60)
|
||||
|
||||
self.assertEqual(reconciled, 1)
|
||||
run.refresh_from_db()
|
||||
self.assertEqual(run.status, BackupRun.Status.FAILED)
|
||||
self.assertEqual(run.result["failure"]["category"], "rsync_process")
|
||||
self.assertEqual(run.rsync_exit_code, 255)
|
||||
|
||||
def test_worker_does_not_reconcile_missing_rsync_process_during_finalizing_phase(self) -> None:
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = queue_backup_run(host=host)
|
||||
run.status = BackupRun.Status.RUNNING
|
||||
run.started_at = timezone.now() - timedelta(minutes=10)
|
||||
run.result["execution"] = {
|
||||
"phase": "finalizing",
|
||||
"heartbeat_at": (timezone.now() - timedelta(minutes=10)).isoformat(),
|
||||
}
|
||||
run.result["rsync"] = {"pid": 999999, "pgid": 999999, "command": ["rsync"], "exit_code": 0}
|
||||
run.save(update_fields=["status", "started_at", "result"])
|
||||
|
||||
reconciled = reconcile_running_runs(grace_seconds=300, stale_worker_seconds=24 * 60 * 60)
|
||||
|
||||
self.assertEqual(reconciled, 0)
|
||||
run.refresh_from_db()
|
||||
self.assertEqual(run.status, BackupRun.Status.RUNNING)
|
||||
|
||||
def test_worker_does_not_fail_real_run_for_vanished_file_warning_log(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
|
||||
Reference in New Issue
Block a user