(release) Track worker heartbeat for running jobs

Record worker pid, host, claim time, and heartbeat metadata on running
backup jobs so operators can see which worker owns a run.

Refresh the heartbeat while rsync is active and reconcile stale running
runs when the worker heartbeat stops. Add a worker option to tune or
disable stale-run reconciliation.

Refs #11
This commit is contained in:
2026-05-21 03:16:38 +02:00
parent 404b7f7500
commit 4c8ed24561
6 changed files with 184 additions and 10 deletions

View File

@@ -61,6 +61,9 @@ class BackupWorkerTests(TestCase):
def fake_run_scheduled(**kwargs):
run.refresh_from_db()
self.assertIn("execution", run.result)
self.assertIn("worker_pid", run.result["execution"])
self.assertIn("worker_host", run.result["execution"])
self.assertIn("heartbeat_at", run.result["execution"])
return {
"ok": True,
"dry_run": False,
@@ -82,6 +85,57 @@ class BackupWorkerTests(TestCase):
self.assertEqual(SnapshotRecord.objects.count(), 1)
self.assertEqual(run.snapshot, SnapshotRecord.objects.get())
def test_worker_refreshes_heartbeat_while_run_is_active(self) -> None:
    """Check that the heartbeat moves forward while a run is in progress.

    ``run_scheduled`` is replaced by a fake that backdates the stored
    ``heartbeat_at`` by 120 seconds, invokes the ``cancel_check`` callback it
    was given, and then asserts that the persisted heartbeat is newer than
    the backdated value.
    """
    # Imported locally so parsing does not rely on ``datetime`` happening to
    # be reachable as an attribute of ``django.utils.timezone``.
    from datetime import datetime

    with TemporaryDirectory() as tmp:
        GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
        host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
        run = queue_backup_run(host=host)
        with patch("pobsync_backend.backup_runner.run_scheduled") as run_scheduled:

            def fake_run_scheduled(**kwargs):
                # Make the stored heartbeat look stale before polling.
                run.refresh_from_db()
                old_heartbeat = timezone.now() - timedelta(seconds=120)
                run.result["execution"]["heartbeat_at"] = old_heartbeat.isoformat()
                run.save(update_fields=["result"])

                # The run was not cancelled, so the check must report False.
                self.assertFalse(kwargs["cancel_check"]())

                # After the poll the persisted heartbeat must have advanced.
                run.refresh_from_db()
                self.assertGreater(
                    datetime.fromisoformat(run.result["execution"]["heartbeat_at"]),
                    old_heartbeat,
                )
                return {
                    "ok": True,
                    "dry_run": False,
                    "host": host.host,
                    "snapshot": "",
                    "base": None,
                    "rsync": {"exit_code": 0},
                }

            run_scheduled.side_effect = fake_run_scheduled
            Command()._run_once(prefix=Path(tmp) / "home")

            # All assertions above live inside the fake; without this the
            # test would pass vacuously if the worker never invoked it.
            run_scheduled.assert_called()
def test_worker_reconciles_stale_real_run_after_heartbeat_timeout(self) -> None:
    """Check that a running run with an expired heartbeat is marked failed.

    The run is put into RUNNING state with a heartbeat 90 seconds old, then
    reconciled with a 30 second limit. Exactly one run must be reconciled,
    ending up FAILED with a ``worker`` failure category.
    """
    backup_host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
    stale_run = queue_backup_run(host=backup_host)

    # Simulate a claimed run: started 120s ago, last heartbeat 90s ago.
    stale_run.status = BackupRun.Status.RUNNING
    stale_run.started_at = timezone.now() - timedelta(seconds=120)
    last_heartbeat = timezone.now() - timedelta(seconds=90)
    stale_run.result["execution"] = {
        "worker_pid": 123,
        "worker_host": "backup",
        "heartbeat_at": last_heartbeat.isoformat(),
    }
    stale_run.save(update_fields=["status", "started_at", "result"])

    reconciled_count = reconcile_running_runs(stale_worker_seconds=30)
    self.assertEqual(reconciled_count, 1)

    stale_run.refresh_from_db()
    self.assertEqual(stale_run.status, BackupRun.Status.FAILED)
    failure = stale_run.result["failure"]
    self.assertEqual(failure["category"], "worker")
    self.assertIn("heartbeat stopped", failure["message"])
def test_worker_records_dry_run_log_path_while_running(self) -> None:
with TemporaryDirectory() as tmp:
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))

View File

@@ -1373,6 +1373,29 @@ class ViewTests(TestCase):
self.assertContains(response, "Cancel run")
self.assertContains(response, reverse("cancel_run", args=[run.id]))
def test_run_detail_renders_worker_execution_metadata(self) -> None:
    """Check that the run detail page shows the worker host, pid and heartbeat.

    A RUNNING run is created with ``execution`` metadata in its result, and
    the detail page is fetched as a staff user.
    """
    self.client.force_login(self.staff_user)
    backup_host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
    execution_metadata = {
        "worker_host": "backup-01",
        "worker_pid": 4242,
        "heartbeat_at": "2026-05-21T10:30:00+00:00",
    }
    running_run = BackupRun.objects.create(
        host=backup_host,
        status=BackupRun.Status.RUNNING,
        result={"execution": execution_metadata},
    )

    detail_url = reverse("run_detail", args=[running_run.id])
    response = self.client.get(detail_url)

    self.assertEqual(response.status_code, 200)
    # Checked in the same order as the labels are expected on the page.
    for expected_text in ("Worker:", "backup-01", "pid 4242", "Worker heartbeat:"):
        self.assertContains(response, expected_text)
def test_cancel_run_marks_queued_run_cancelled(self) -> None:
self.client.force_login(self.staff_user)
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")