(release) Track worker heartbeat for running jobs
Record worker pid, host, claim time, and heartbeat metadata on running backup jobs so operators can see which worker owns a run. Refresh the heartbeat while rsync is active and reconcile stale running runs when the worker heartbeat stops. Add a worker option to tune or disable stale-run reconciliation. Refs #11
This commit is contained in:
@@ -61,6 +61,9 @@ class BackupWorkerTests(TestCase):
|
||||
def fake_run_scheduled(**kwargs):
|
||||
run.refresh_from_db()
|
||||
self.assertIn("execution", run.result)
|
||||
self.assertIn("worker_pid", run.result["execution"])
|
||||
self.assertIn("worker_host", run.result["execution"])
|
||||
self.assertIn("heartbeat_at", run.result["execution"])
|
||||
return {
|
||||
"ok": True,
|
||||
"dry_run": False,
|
||||
@@ -82,6 +85,57 @@ class BackupWorkerTests(TestCase):
|
||||
self.assertEqual(SnapshotRecord.objects.count(), 1)
|
||||
self.assertEqual(run.snapshot, SnapshotRecord.objects.get())
|
||||
|
||||
def test_worker_refreshes_heartbeat_while_run_is_active(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = queue_backup_run(host=host)
|
||||
|
||||
with patch("pobsync_backend.backup_runner.run_scheduled") as run_scheduled:
|
||||
def fake_run_scheduled(**kwargs):
|
||||
run.refresh_from_db()
|
||||
old_heartbeat = timezone.now() - timedelta(seconds=120)
|
||||
run.result["execution"]["heartbeat_at"] = old_heartbeat.isoformat()
|
||||
run.save(update_fields=["result"])
|
||||
|
||||
self.assertFalse(kwargs["cancel_check"]())
|
||||
run.refresh_from_db()
|
||||
self.assertGreater(
|
||||
timezone.datetime.fromisoformat(run.result["execution"]["heartbeat_at"]),
|
||||
old_heartbeat,
|
||||
)
|
||||
return {
|
||||
"ok": True,
|
||||
"dry_run": False,
|
||||
"host": host.host,
|
||||
"snapshot": "",
|
||||
"base": None,
|
||||
"rsync": {"exit_code": 0},
|
||||
}
|
||||
|
||||
run_scheduled.side_effect = fake_run_scheduled
|
||||
Command()._run_once(prefix=Path(tmp) / "home")
|
||||
|
||||
def test_worker_reconciles_stale_real_run_after_heartbeat_timeout(self) -> None:
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = queue_backup_run(host=host)
|
||||
run.status = BackupRun.Status.RUNNING
|
||||
run.started_at = timezone.now() - timedelta(seconds=120)
|
||||
run.result["execution"] = {
|
||||
"worker_pid": 123,
|
||||
"worker_host": "backup",
|
||||
"heartbeat_at": (timezone.now() - timedelta(seconds=90)).isoformat(),
|
||||
}
|
||||
run.save(update_fields=["status", "started_at", "result"])
|
||||
|
||||
reconciled = reconcile_running_runs(stale_worker_seconds=30)
|
||||
|
||||
self.assertEqual(reconciled, 1)
|
||||
run.refresh_from_db()
|
||||
self.assertEqual(run.status, BackupRun.Status.FAILED)
|
||||
self.assertEqual(run.result["failure"]["category"], "worker")
|
||||
self.assertIn("heartbeat stopped", run.result["failure"]["message"])
|
||||
|
||||
def test_worker_records_dry_run_log_path_while_running(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
|
||||
|
||||
@@ -1373,6 +1373,29 @@ class ViewTests(TestCase):
|
||||
self.assertContains(response, "Cancel run")
|
||||
self.assertContains(response, reverse("cancel_run", args=[run.id]))
|
||||
|
||||
def test_run_detail_renders_worker_execution_metadata(self) -> None:
|
||||
self.client.force_login(self.staff_user)
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
run = BackupRun.objects.create(
|
||||
host=host,
|
||||
status=BackupRun.Status.RUNNING,
|
||||
result={
|
||||
"execution": {
|
||||
"worker_host": "backup-01",
|
||||
"worker_pid": 4242,
|
||||
"heartbeat_at": "2026-05-21T10:30:00+00:00",
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
response = self.client.get(reverse("run_detail", args=[run.id]))
|
||||
|
||||
self.assertEqual(response.status_code, 200)
|
||||
self.assertContains(response, "Worker:")
|
||||
self.assertContains(response, "backup-01")
|
||||
self.assertContains(response, "pid 4242")
|
||||
self.assertContains(response, "Worker heartbeat:")
|
||||
|
||||
def test_cancel_run_marks_queued_run_cancelled(self) -> None:
|
||||
self.client.force_login(self.staff_user)
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
|
||||
Reference in New Issue
Block a user