(bugfix) Reconcile real rsync failures from worker logs
Record live rsync log paths for normal backup runs so the worker can recover stale running state after terminal rsync errors. Treat rsync vanished-file exit code 24 as a warning and keep the completed snapshot instead of failing the run into incomplete state. Closes #54
This commit is contained in:
@@ -85,6 +85,37 @@ class BackupWorkerTests(TestCase):
|
||||
self.assertEqual(SnapshotRecord.objects.count(), 1)
|
||||
self.assertEqual(run.snapshot, SnapshotRecord.objects.get())
|
||||
|
||||
def test_worker_records_warning_status_from_completed_run(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
backup_root = Path(tmp) / "backups"
|
||||
GlobalConfig.objects.create(name="default", backup_root=str(backup_root))
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
snapshot_dir = backup_root / host.host / "scheduled" / "20260519-021500Z__ABCDEFGH"
|
||||
meta_dir = snapshot_dir / "meta"
|
||||
meta_dir.mkdir(parents=True)
|
||||
write_yaml_atomic(meta_dir / "meta.yaml", {"status": "warning", "started_at": "2026-05-19T02:15:00Z"})
|
||||
run = queue_backup_run(host=host)
|
||||
|
||||
with patch("pobsync_backend.backup_runner.run_scheduled") as run_scheduled:
|
||||
run_scheduled.return_value = {
|
||||
"ok": True,
|
||||
"status": "warning",
|
||||
"dry_run": False,
|
||||
"host": host.host,
|
||||
"snapshot": str(snapshot_dir),
|
||||
"base": None,
|
||||
"warning": {"category": "vanished"},
|
||||
"rsync": {"exit_code": 24},
|
||||
}
|
||||
|
||||
count = Command()._run_once(prefix=Path(tmp) / "home")
|
||||
|
||||
self.assertEqual(count, 1)
|
||||
run.refresh_from_db()
|
||||
self.assertEqual(run.status, BackupRun.Status.WARNING)
|
||||
self.assertEqual(run.rsync_exit_code, 24)
|
||||
self.assertEqual(run.result["warning"]["category"], "vanished")
|
||||
|
||||
def test_worker_refreshes_heartbeat_while_run_is_active(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
|
||||
@@ -116,6 +147,41 @@ class BackupWorkerTests(TestCase):
|
||||
run_scheduled.side_effect = fake_run_scheduled
|
||||
Command()._run_once(prefix=Path(tmp) / "home")
|
||||
|
||||
def test_worker_records_real_run_log_path_while_running(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = queue_backup_run(host=host)
|
||||
snapshot_dir = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH"
|
||||
log_path = snapshot_dir / "meta" / "rsync.log"
|
||||
|
||||
with patch("pobsync_backend.backup_runner.run_scheduled") as run_scheduled:
|
||||
def fake_run_scheduled(**kwargs):
|
||||
kwargs["state_callback"](
|
||||
{
|
||||
"status": "running",
|
||||
"snapshot": str(snapshot_dir),
|
||||
"log": str(log_path),
|
||||
"rsync": {"command": ["rsync"], "exit_code": None},
|
||||
}
|
||||
)
|
||||
run.refresh_from_db()
|
||||
self.assertEqual(run.snapshot_path, str(snapshot_dir))
|
||||
self.assertEqual(run.result["execution"]["log"], str(log_path))
|
||||
self.assertEqual(run.result["execution"]["snapshot"], str(snapshot_dir))
|
||||
self.assertEqual(run.result["rsync"]["command"], ["rsync"])
|
||||
return {
|
||||
"ok": True,
|
||||
"dry_run": False,
|
||||
"host": host.host,
|
||||
"snapshot": "",
|
||||
"base": None,
|
||||
"rsync": {"exit_code": 0},
|
||||
}
|
||||
|
||||
run_scheduled.side_effect = fake_run_scheduled
|
||||
Command()._run_once(prefix=Path(tmp) / "home")
|
||||
|
||||
def test_worker_reconciles_stale_real_run_after_heartbeat_timeout(self) -> None:
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = queue_backup_run(host=host)
|
||||
@@ -136,6 +202,54 @@ class BackupWorkerTests(TestCase):
|
||||
self.assertEqual(run.result["failure"]["category"], "worker")
|
||||
self.assertIn("heartbeat stopped", run.result["failure"]["message"])
|
||||
|
||||
def test_worker_reconciles_real_run_with_terminal_broken_pipe_log(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = queue_backup_run(host=host)
|
||||
log_path = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH" / "meta" / "rsync.log"
|
||||
log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
log_path.write_text(
|
||||
"rsync error: unexplained error (code 255) at rsync.c(716) [generator=3.4.1]\n"
|
||||
"rsync error: received SIGUSR1 (code 19) at main.c(1600) [receiver=3.4.1]\n"
|
||||
"rsync: [generator] write error: Broken pipe (32)\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
run.status = BackupRun.Status.RUNNING
|
||||
run.started_at = timezone.now()
|
||||
run.result["execution"] = {"log": str(log_path)}
|
||||
run.save(update_fields=["status", "started_at", "result"])
|
||||
|
||||
reconciled = reconcile_running_runs()
|
||||
|
||||
self.assertEqual(reconciled, 1)
|
||||
run.refresh_from_db()
|
||||
self.assertEqual(run.status, BackupRun.Status.FAILED)
|
||||
self.assertEqual(run.rsync_exit_code, 255)
|
||||
self.assertEqual(run.result["failure"]["category"], "transport")
|
||||
self.assertIn("Broken pipe", "\n".join(run.result["rsync"]["log_tail"]))
|
||||
|
||||
def test_worker_does_not_fail_real_run_for_vanished_file_warning_log(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = queue_backup_run(host=host)
|
||||
log_path = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-021500Z__ABCDEFGH" / "meta" / "rsync.log"
|
||||
log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
log_path.write_text(
|
||||
"file has vanished: \"/var/lib/app/session\"\n"
|
||||
"rsync warning: some files vanished before they could be transferred (code 24) at main.c(1338) [sender=3.4.1]\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
run.status = BackupRun.Status.RUNNING
|
||||
run.started_at = timezone.now()
|
||||
run.result["execution"] = {"log": str(log_path)}
|
||||
run.save(update_fields=["status", "started_at", "result"])
|
||||
|
||||
reconciled = reconcile_running_runs()
|
||||
|
||||
self.assertEqual(reconciled, 0)
|
||||
run.refresh_from_db()
|
||||
self.assertEqual(run.status, BackupRun.Status.RUNNING)
|
||||
|
||||
def test_worker_records_dry_run_log_path_while_running(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
|
||||
|
||||
@@ -256,6 +256,62 @@ class RunScheduledConfigSourceTests(SimpleTestCase):
|
||||
self.assertIn("stats:", meta_text)
|
||||
self.assertIn("files_total: 10", meta_text)
|
||||
|
||||
def test_real_run_reports_running_state_callback_before_rsync_returns(self) -> None:
|
||||
states = []
|
||||
|
||||
def fake_run_rsync(command, log_path, timeout_seconds, cancel_check=None):
|
||||
self.assertEqual(len(states), 1)
|
||||
self.assertEqual(states[0]["status"], "running")
|
||||
self.assertEqual(states[0]["log"], str(log_path))
|
||||
self.assertEqual(states[0]["rsync"]["command"], command)
|
||||
log_path.write_text("Number of files: 1\n", encoding="utf-8")
|
||||
return RsyncResult(exit_code=0, command=command)
|
||||
|
||||
with TemporaryDirectory() as tmp:
|
||||
with patch("pobsync.commands.run_scheduled.run_rsync", side_effect=fake_run_rsync):
|
||||
run_scheduled(
|
||||
prefix=Path(tmp) / "home",
|
||||
host="web-01",
|
||||
dry_run=False,
|
||||
config_source=FakeConfigSource(backup_root=str(Path(tmp) / "backups")),
|
||||
state_callback=states.append,
|
||||
)
|
||||
|
||||
self.assertEqual(len(states), 1)
|
||||
self.assertIn("/.incomplete/", states[0]["snapshot"])
|
||||
|
||||
def test_real_run_keeps_snapshot_with_warning_for_vanished_files(self) -> None:
|
||||
def fake_run_rsync(command, log_path, timeout_seconds, cancel_check=None):
|
||||
log_path.write_text(
|
||||
"file has vanished: \"/var/lib/app/session\"\n"
|
||||
"rsync warning: some files vanished before they could be transferred (code 24) at main.c(1338) [sender=3.4.1]\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
data_dir = log_path.parent.parent / "data"
|
||||
data_dir.mkdir(parents=True, exist_ok=True)
|
||||
(data_dir / "payload.txt").write_text("payload", encoding="utf-8")
|
||||
return RsyncResult(exit_code=24, command=command)
|
||||
|
||||
with TemporaryDirectory() as tmp:
|
||||
backup_root = Path(tmp) / "backups"
|
||||
with patch("pobsync.commands.run_scheduled.run_rsync", side_effect=fake_run_rsync):
|
||||
result = run_scheduled(
|
||||
prefix=Path(tmp) / "home",
|
||||
host="web-01",
|
||||
dry_run=False,
|
||||
config_source=FakeConfigSource(backup_root=str(backup_root)),
|
||||
)
|
||||
|
||||
snapshot = Path(result["snapshot"])
|
||||
self.assertTrue((snapshot / "data" / "payload.txt").exists())
|
||||
|
||||
self.assertTrue(result["ok"])
|
||||
self.assertEqual(result["status"], "warning")
|
||||
self.assertEqual(result["rsync"]["exit_code"], 24)
|
||||
self.assertEqual(result["warning"]["category"], "vanished")
|
||||
self.assertEqual(snapshot.parent.name, "scheduled")
|
||||
self.assertIn("file has vanished", "\n".join(result["rsync"]["log_tail"]))
|
||||
|
||||
def test_dry_run_reports_cancelled_rsync(self) -> None:
|
||||
def fake_run_rsync(command, log_path, timeout_seconds, cancel_check=None):
|
||||
self.assertTrue(cancel_check())
|
||||
|
||||
Reference in New Issue
Block a user