(bugfix) Reconcile real rsync failures from worker logs

Record live rsync log paths for normal backup runs so the worker can
recover stale running state after terminal rsync errors.

Treat rsync vanished-file exit code 24 as a warning and keep the
completed snapshot instead of failing the run into incomplete state.

Closes #54
This commit is contained in:
2026-05-23 00:23:14 +02:00
parent 8633cbea26
commit 6eb1b4add3
4 changed files with 285 additions and 9 deletions

View File

@@ -85,6 +85,37 @@ class BackupWorkerTests(TestCase):
self.assertEqual(SnapshotRecord.objects.count(), 1)
self.assertEqual(run.snapshot, SnapshotRecord.objects.get())
def test_worker_records_warning_status_from_completed_run(self) -> None:
    """A runner result with status "warning" is stored as a WARNING run.

    The patched ``run_scheduled`` reports a completed snapshot whose rsync
    exit code is 24 with a "vanished" warning category. After one worker
    pass the queued run must carry that status, exit code and warning.
    """
    with TemporaryDirectory() as workdir:
        root = Path(workdir)
        backup_root = root / "backups"
        GlobalConfig.objects.create(name="default", backup_root=str(backup_root))
        host = HostConfig.objects.create(host="web-01", address="web-01.example.test")

        # Lay down the snapshot metadata file for the snapshot the runner reports.
        snapshot_dir = backup_root / host.host / "scheduled" / "20260519-021500Z__ABCDEFGH"
        meta_dir = snapshot_dir / "meta"
        meta_dir.mkdir(parents=True)
        write_yaml_atomic(
            meta_dir / "meta.yaml",
            {"status": "warning", "started_at": "2026-05-19T02:15:00Z"},
        )

        run = queue_backup_run(host=host)
        runner_result = {
            "ok": True,
            "status": "warning",
            "dry_run": False,
            "host": host.host,
            "snapshot": str(snapshot_dir),
            "base": None,
            "warning": {"category": "vanished"},
            "rsync": {"exit_code": 24},
        }
        with patch(
            "pobsync_backend.backup_runner.run_scheduled",
            return_value=runner_result,
        ):
            processed = Command()._run_once(prefix=root / "home")

        self.assertEqual(processed, 1)
        run.refresh_from_db()
        self.assertEqual(run.status, BackupRun.Status.WARNING)
        self.assertEqual(run.rsync_exit_code, 24)
        self.assertEqual(run.result["warning"]["category"], "vanished")
def test_worker_refreshes_heartbeat_while_run_is_active(self) -> None:
with TemporaryDirectory() as tmp:
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
@@ -116,6 +147,41 @@ class BackupWorkerTests(TestCase):
run_scheduled.side_effect = fake_run_scheduled
Command()._run_once(prefix=Path(tmp) / "home")
def test_worker_records_real_run_log_path_while_running(self) -> None:
    """A "running" state update persists the live snapshot and log paths.

    The patched ``run_scheduled`` calls the worker-supplied
    ``state_callback`` with a running-state payload, then captures what the
    database holds for the run at that moment. The assertions are made
    after ``_run_once`` returns, in the test's own frame: that way they
    cannot be lost should the worker handle exceptions raised inside the
    runner, and the test fails loudly if the runner is never invoked.
    """
    with TemporaryDirectory() as tmp:
        GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
        host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
        run = queue_backup_run(host=host)
        snapshot_dir = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH"
        log_path = snapshot_dir / "meta" / "rsync.log"

        # Filled in by the fake runner while the run is still "running".
        observed = {}

        def fake_run_scheduled(**kwargs):
            kwargs["state_callback"](
                {
                    "status": "running",
                    "snapshot": str(snapshot_dir),
                    "log": str(log_path),
                    "rsync": {"command": ["rsync"], "exit_code": None},
                }
            )
            # Capture the mid-run database state; it is checked after the
            # worker pass instead of asserting inside the worker's call stack.
            run.refresh_from_db()
            observed["snapshot_path"] = run.snapshot_path
            observed["result"] = run.result
            return {
                "ok": True,
                "dry_run": False,
                "host": host.host,
                "snapshot": "",
                "base": None,
                "rsync": {"exit_code": 0},
            }

        with patch("pobsync_backend.backup_runner.run_scheduled") as run_scheduled:
            run_scheduled.side_effect = fake_run_scheduled
            Command()._run_once(prefix=Path(tmp) / "home")

        # Guard against a vacuous pass: the runner must have run to the
        # point where the mid-run state was captured.
        run_scheduled.assert_called()
        self.assertIn("result", observed, "fake runner never captured the running state")

        self.assertEqual(observed["snapshot_path"], str(snapshot_dir))
        self.assertEqual(observed["result"]["execution"]["log"], str(log_path))
        self.assertEqual(observed["result"]["execution"]["snapshot"], str(snapshot_dir))
        self.assertEqual(observed["result"]["rsync"]["command"], ["rsync"])
def test_worker_reconciles_stale_real_run_after_heartbeat_timeout(self) -> None:
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = queue_backup_run(host=host)
@@ -136,6 +202,54 @@ class BackupWorkerTests(TestCase):
self.assertEqual(run.result["failure"]["category"], "worker")
self.assertIn("heartbeat stopped", run.result["failure"]["message"])
def test_worker_reconciles_real_run_with_terminal_broken_pipe_log(self) -> None:
    """A RUNNING run whose rsync log ends in terminal errors is reconciled.

    The run's ``result["execution"]["log"]`` points at a log file holding
    rsync error lines (code 255, SIGUSR1, broken pipe). One call to
    ``reconcile_running_runs()`` is expected to reconcile that run as
    FAILED with exit code 255, failure category "transport", and the
    broken-pipe line present in the recorded log tail.
    """
    log_text = (
        "rsync error: unexplained error (code 255) at rsync.c(716) [generator=3.4.1]\n"
        "rsync error: received SIGUSR1 (code 19) at main.c(1600) [receiver=3.4.1]\n"
        "rsync: [generator] write error: Broken pipe (32)\n"
    )
    with TemporaryDirectory() as workdir:
        host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
        run = queue_backup_run(host=host)

        meta_dir = (
            Path(workdir) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH" / "meta"
        )
        log_path = meta_dir / "rsync.log"
        meta_dir.mkdir(parents=True, exist_ok=True)
        log_path.write_text(log_text, encoding="utf-8")

        # Put the queued run into a running state that references the log.
        run.status = BackupRun.Status.RUNNING
        run.started_at = timezone.now()
        run.result["execution"] = {"log": str(log_path)}
        run.save(update_fields=["status", "started_at", "result"])

        reconciled_count = reconcile_running_runs()

        self.assertEqual(reconciled_count, 1)
        run.refresh_from_db()
        self.assertEqual(run.status, BackupRun.Status.FAILED)
        self.assertEqual(run.rsync_exit_code, 255)
        self.assertEqual(run.result["failure"]["category"], "transport")
        recorded_tail = "\n".join(run.result["rsync"]["log_tail"])
        self.assertIn("Broken pipe", recorded_tail)
def test_worker_does_not_fail_real_run_for_vanished_file_warning_log(self) -> None:
    """A vanished-file warning in the log must not reconcile a RUNNING run.

    The log contains only a "file has vanished" line and the rsync code 24
    warning. ``reconcile_running_runs()`` is expected to report zero
    reconciled runs and leave the run in RUNNING state.
    """
    log_text = (
        "file has vanished: \"/var/lib/app/session\"\n"
        "rsync warning: some files vanished before they could be transferred (code 24) at main.c(1338) [sender=3.4.1]\n"
    )
    with TemporaryDirectory() as workdir:
        host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
        run = queue_backup_run(host=host)

        meta_dir = (
            Path(workdir) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH" / "meta"
        )
        log_path = meta_dir / "rsync.log"
        meta_dir.mkdir(parents=True, exist_ok=True)
        log_path.write_text(log_text, encoding="utf-8")

        # Put the queued run into a running state that references the log.
        run.status = BackupRun.Status.RUNNING
        run.started_at = timezone.now()
        run.result["execution"] = {"log": str(log_path)}
        run.save(update_fields=["status", "started_at", "result"])

        reconciled_count = reconcile_running_runs()

        self.assertEqual(reconciled_count, 0)
        run.refresh_from_db()
        self.assertEqual(run.status, BackupRun.Status.RUNNING)
def test_worker_records_dry_run_log_path_while_running(self) -> None:
with TemporaryDirectory() as tmp:
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))