(bugfix) Reconcile real rsync failures from worker logs

Record live rsync log paths for normal backup runs so the worker can
recover stale running state after terminal rsync errors.

Treat rsync vanished-file exit code 24 as a warning and keep the
completed snapshot instead of failing the run into incomplete state.

Closes #54
This commit is contained in:
2026-05-23 00:23:14 +02:00
parent 8633cbea26
commit 6eb1b4add3
4 changed files with 285 additions and 9 deletions

View File

@@ -85,6 +85,37 @@ class BackupWorkerTests(TestCase):
self.assertEqual(SnapshotRecord.objects.count(), 1)
self.assertEqual(run.snapshot, SnapshotRecord.objects.get())
def test_worker_records_warning_status_from_completed_run(self) -> None:
with TemporaryDirectory() as tmp:
backup_root = Path(tmp) / "backups"
GlobalConfig.objects.create(name="default", backup_root=str(backup_root))
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
snapshot_dir = backup_root / host.host / "scheduled" / "20260519-021500Z__ABCDEFGH"
meta_dir = snapshot_dir / "meta"
meta_dir.mkdir(parents=True)
write_yaml_atomic(meta_dir / "meta.yaml", {"status": "warning", "started_at": "2026-05-19T02:15:00Z"})
run = queue_backup_run(host=host)
with patch("pobsync_backend.backup_runner.run_scheduled") as run_scheduled:
run_scheduled.return_value = {
"ok": True,
"status": "warning",
"dry_run": False,
"host": host.host,
"snapshot": str(snapshot_dir),
"base": None,
"warning": {"category": "vanished"},
"rsync": {"exit_code": 24},
}
count = Command()._run_once(prefix=Path(tmp) / "home")
self.assertEqual(count, 1)
run.refresh_from_db()
self.assertEqual(run.status, BackupRun.Status.WARNING)
self.assertEqual(run.rsync_exit_code, 24)
self.assertEqual(run.result["warning"]["category"], "vanished")
def test_worker_refreshes_heartbeat_while_run_is_active(self) -> None:
with TemporaryDirectory() as tmp:
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
@@ -116,6 +147,41 @@ class BackupWorkerTests(TestCase):
run_scheduled.side_effect = fake_run_scheduled
Command()._run_once(prefix=Path(tmp) / "home")
def test_worker_records_real_run_log_path_while_running(self) -> None:
with TemporaryDirectory() as tmp:
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = queue_backup_run(host=host)
snapshot_dir = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH"
log_path = snapshot_dir / "meta" / "rsync.log"
with patch("pobsync_backend.backup_runner.run_scheduled") as run_scheduled:
def fake_run_scheduled(**kwargs):
kwargs["state_callback"](
{
"status": "running",
"snapshot": str(snapshot_dir),
"log": str(log_path),
"rsync": {"command": ["rsync"], "exit_code": None},
}
)
run.refresh_from_db()
self.assertEqual(run.snapshot_path, str(snapshot_dir))
self.assertEqual(run.result["execution"]["log"], str(log_path))
self.assertEqual(run.result["execution"]["snapshot"], str(snapshot_dir))
self.assertEqual(run.result["rsync"]["command"], ["rsync"])
return {
"ok": True,
"dry_run": False,
"host": host.host,
"snapshot": "",
"base": None,
"rsync": {"exit_code": 0},
}
run_scheduled.side_effect = fake_run_scheduled
Command()._run_once(prefix=Path(tmp) / "home")
def test_worker_reconciles_stale_real_run_after_heartbeat_timeout(self) -> None:
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = queue_backup_run(host=host)
@@ -136,6 +202,54 @@ class BackupWorkerTests(TestCase):
self.assertEqual(run.result["failure"]["category"], "worker")
self.assertIn("heartbeat stopped", run.result["failure"]["message"])
def test_worker_reconciles_real_run_with_terminal_broken_pipe_log(self) -> None:
with TemporaryDirectory() as tmp:
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = queue_backup_run(host=host)
log_path = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH" / "meta" / "rsync.log"
log_path.parent.mkdir(parents=True, exist_ok=True)
log_path.write_text(
"rsync error: unexplained error (code 255) at rsync.c(716) [generator=3.4.1]\n"
"rsync error: received SIGUSR1 (code 19) at main.c(1600) [receiver=3.4.1]\n"
"rsync: [generator] write error: Broken pipe (32)\n",
encoding="utf-8",
)
run.status = BackupRun.Status.RUNNING
run.started_at = timezone.now()
run.result["execution"] = {"log": str(log_path)}
run.save(update_fields=["status", "started_at", "result"])
reconciled = reconcile_running_runs()
self.assertEqual(reconciled, 1)
run.refresh_from_db()
self.assertEqual(run.status, BackupRun.Status.FAILED)
self.assertEqual(run.rsync_exit_code, 255)
self.assertEqual(run.result["failure"]["category"], "transport")
self.assertIn("Broken pipe", "\n".join(run.result["rsync"]["log_tail"]))
def test_worker_does_not_fail_real_run_for_vanished_file_warning_log(self) -> None:
with TemporaryDirectory() as tmp:
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = queue_backup_run(host=host)
log_path = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH" / "meta" / "rsync.log"
log_path.parent.mkdir(parents=True, exist_ok=True)
log_path.write_text(
"file has vanished: \"/var/lib/app/session\"\n"
"rsync warning: some files vanished before they could be transferred (code 24) at main.c(1338) [sender=3.4.1]\n",
encoding="utf-8",
)
run.status = BackupRun.Status.RUNNING
run.started_at = timezone.now()
run.result["execution"] = {"log": str(log_path)}
run.save(update_fields=["status", "started_at", "result"])
reconciled = reconcile_running_runs()
self.assertEqual(reconciled, 0)
run.refresh_from_db()
self.assertEqual(run.status, BackupRun.Status.RUNNING)
def test_worker_records_dry_run_log_path_while_running(self) -> None:
with TemporaryDirectory() as tmp:
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))

View File

@@ -256,6 +256,62 @@ class RunScheduledConfigSourceTests(SimpleTestCase):
self.assertIn("stats:", meta_text)
self.assertIn("files_total: 10", meta_text)
def test_real_run_reports_running_state_callback_before_rsync_returns(self) -> None:
states = []
def fake_run_rsync(command, log_path, timeout_seconds, cancel_check=None):
self.assertEqual(len(states), 1)
self.assertEqual(states[0]["status"], "running")
self.assertEqual(states[0]["log"], str(log_path))
self.assertEqual(states[0]["rsync"]["command"], command)
log_path.write_text("Number of files: 1\n", encoding="utf-8")
return RsyncResult(exit_code=0, command=command)
with TemporaryDirectory() as tmp:
with patch("pobsync.commands.run_scheduled.run_rsync", side_effect=fake_run_rsync):
run_scheduled(
prefix=Path(tmp) / "home",
host="web-01",
dry_run=False,
config_source=FakeConfigSource(backup_root=str(Path(tmp) / "backups")),
state_callback=states.append,
)
self.assertEqual(len(states), 1)
self.assertIn("/.incomplete/", states[0]["snapshot"])
def test_real_run_keeps_snapshot_with_warning_for_vanished_files(self) -> None:
def fake_run_rsync(command, log_path, timeout_seconds, cancel_check=None):
log_path.write_text(
"file has vanished: \"/var/lib/app/session\"\n"
"rsync warning: some files vanished before they could be transferred (code 24) at main.c(1338) [sender=3.4.1]\n",
encoding="utf-8",
)
data_dir = log_path.parent.parent / "data"
data_dir.mkdir(parents=True, exist_ok=True)
(data_dir / "payload.txt").write_text("payload", encoding="utf-8")
return RsyncResult(exit_code=24, command=command)
with TemporaryDirectory() as tmp:
backup_root = Path(tmp) / "backups"
with patch("pobsync.commands.run_scheduled.run_rsync", side_effect=fake_run_rsync):
result = run_scheduled(
prefix=Path(tmp) / "home",
host="web-01",
dry_run=False,
config_source=FakeConfigSource(backup_root=str(backup_root)),
)
snapshot = Path(result["snapshot"])
self.assertTrue((snapshot / "data" / "payload.txt").exists())
self.assertTrue(result["ok"])
self.assertEqual(result["status"], "warning")
self.assertEqual(result["rsync"]["exit_code"], 24)
self.assertEqual(result["warning"]["category"], "vanished")
self.assertEqual(snapshot.parent.name, "scheduled")
self.assertIn("file has vanished", "\n".join(result["rsync"]["log_tail"]))
def test_dry_run_reports_cancelled_rsync(self) -> None:
def fake_run_rsync(command, log_path, timeout_seconds, cancel_check=None):
self.assertTrue(cancel_check())