(bugfix) Reconcile real rsync failures from worker logs
Record live rsync log paths for normal backup runs so the worker can recover stale running state after terminal rsync errors. Treat rsync vanished-file exit code 24 as a warning and keep the completed snapshot instead of failing the run into incomplete state. Closes #54
This commit is contained in:
@@ -85,6 +85,37 @@ class BackupWorkerTests(TestCase):
|
||||
self.assertEqual(SnapshotRecord.objects.count(), 1)
|
||||
self.assertEqual(run.snapshot, SnapshotRecord.objects.get())
|
||||
|
||||
def test_worker_records_warning_status_from_completed_run(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
backup_root = Path(tmp) / "backups"
|
||||
GlobalConfig.objects.create(name="default", backup_root=str(backup_root))
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
snapshot_dir = backup_root / host.host / "scheduled" / "20260519-021500Z__ABCDEFGH"
|
||||
meta_dir = snapshot_dir / "meta"
|
||||
meta_dir.mkdir(parents=True)
|
||||
write_yaml_atomic(meta_dir / "meta.yaml", {"status": "warning", "started_at": "2026-05-19T02:15:00Z"})
|
||||
run = queue_backup_run(host=host)
|
||||
|
||||
with patch("pobsync_backend.backup_runner.run_scheduled") as run_scheduled:
|
||||
run_scheduled.return_value = {
|
||||
"ok": True,
|
||||
"status": "warning",
|
||||
"dry_run": False,
|
||||
"host": host.host,
|
||||
"snapshot": str(snapshot_dir),
|
||||
"base": None,
|
||||
"warning": {"category": "vanished"},
|
||||
"rsync": {"exit_code": 24},
|
||||
}
|
||||
|
||||
count = Command()._run_once(prefix=Path(tmp) / "home")
|
||||
|
||||
self.assertEqual(count, 1)
|
||||
run.refresh_from_db()
|
||||
self.assertEqual(run.status, BackupRun.Status.WARNING)
|
||||
self.assertEqual(run.rsync_exit_code, 24)
|
||||
self.assertEqual(run.result["warning"]["category"], "vanished")
|
||||
|
||||
def test_worker_refreshes_heartbeat_while_run_is_active(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
|
||||
@@ -116,6 +147,41 @@ class BackupWorkerTests(TestCase):
|
||||
run_scheduled.side_effect = fake_run_scheduled
|
||||
Command()._run_once(prefix=Path(tmp) / "home")
|
||||
|
||||
def test_worker_records_real_run_log_path_while_running(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = queue_backup_run(host=host)
|
||||
snapshot_dir = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH"
|
||||
log_path = snapshot_dir / "meta" / "rsync.log"
|
||||
|
||||
with patch("pobsync_backend.backup_runner.run_scheduled") as run_scheduled:
|
||||
def fake_run_scheduled(**kwargs):
|
||||
kwargs["state_callback"](
|
||||
{
|
||||
"status": "running",
|
||||
"snapshot": str(snapshot_dir),
|
||||
"log": str(log_path),
|
||||
"rsync": {"command": ["rsync"], "exit_code": None},
|
||||
}
|
||||
)
|
||||
run.refresh_from_db()
|
||||
self.assertEqual(run.snapshot_path, str(snapshot_dir))
|
||||
self.assertEqual(run.result["execution"]["log"], str(log_path))
|
||||
self.assertEqual(run.result["execution"]["snapshot"], str(snapshot_dir))
|
||||
self.assertEqual(run.result["rsync"]["command"], ["rsync"])
|
||||
return {
|
||||
"ok": True,
|
||||
"dry_run": False,
|
||||
"host": host.host,
|
||||
"snapshot": "",
|
||||
"base": None,
|
||||
"rsync": {"exit_code": 0},
|
||||
}
|
||||
|
||||
run_scheduled.side_effect = fake_run_scheduled
|
||||
Command()._run_once(prefix=Path(tmp) / "home")
|
||||
|
||||
def test_worker_reconciles_stale_real_run_after_heartbeat_timeout(self) -> None:
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = queue_backup_run(host=host)
|
||||
@@ -136,6 +202,54 @@ class BackupWorkerTests(TestCase):
|
||||
self.assertEqual(run.result["failure"]["category"], "worker")
|
||||
self.assertIn("heartbeat stopped", run.result["failure"]["message"])
|
||||
|
||||
def test_worker_reconciles_real_run_with_terminal_broken_pipe_log(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = queue_backup_run(host=host)
|
||||
log_path = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH" / "meta" / "rsync.log"
|
||||
log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
log_path.write_text(
|
||||
"rsync error: unexplained error (code 255) at rsync.c(716) [generator=3.4.1]\n"
|
||||
"rsync error: received SIGUSR1 (code 19) at main.c(1600) [receiver=3.4.1]\n"
|
||||
"rsync: [generator] write error: Broken pipe (32)\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
run.status = BackupRun.Status.RUNNING
|
||||
run.started_at = timezone.now()
|
||||
run.result["execution"] = {"log": str(log_path)}
|
||||
run.save(update_fields=["status", "started_at", "result"])
|
||||
|
||||
reconciled = reconcile_running_runs()
|
||||
|
||||
self.assertEqual(reconciled, 1)
|
||||
run.refresh_from_db()
|
||||
self.assertEqual(run.status, BackupRun.Status.FAILED)
|
||||
self.assertEqual(run.rsync_exit_code, 255)
|
||||
self.assertEqual(run.result["failure"]["category"], "transport")
|
||||
self.assertIn("Broken pipe", "\n".join(run.result["rsync"]["log_tail"]))
|
||||
|
||||
def test_worker_does_not_fail_real_run_for_vanished_file_warning_log(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = queue_backup_run(host=host)
|
||||
log_path = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH" / "meta" / "rsync.log"
|
||||
log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
log_path.write_text(
|
||||
"file has vanished: \"/var/lib/app/session\"\n"
|
||||
"rsync warning: some files vanished before they could be transferred (code 24) at main.c(1338) [sender=3.4.1]\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
run.status = BackupRun.Status.RUNNING
|
||||
run.started_at = timezone.now()
|
||||
run.result["execution"] = {"log": str(log_path)}
|
||||
run.save(update_fields=["status", "started_at", "result"])
|
||||
|
||||
reconciled = reconcile_running_runs()
|
||||
|
||||
self.assertEqual(reconciled, 0)
|
||||
run.refresh_from_db()
|
||||
self.assertEqual(run.status, BackupRun.Status.RUNNING)
|
||||
|
||||
def test_worker_records_dry_run_log_path_while_running(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
|
||||
|
||||
Reference in New Issue
Block a user