(release) Track worker heartbeat for running jobs

Record worker pid, host, claim time, and heartbeat metadata on running
backup jobs so operators can see which worker owns a run.

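Refresh the heartbeat while rsync is active and reconcile stale running
runs (mark them failed) once the worker heartbeat has been silent for
too long. Add a --stale-running-seconds worker option (default 24 hours)
to tune that threshold; pass 0 to disable stale-run reconciliation.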

Refs #11
This commit is contained in:
2026-05-21 03:16:38 +02:00
parent 404b7f7500
commit 4c8ed24561
6 changed files with 184 additions and 10 deletions

View File

@@ -19,6 +19,12 @@ class Command(BaseCommand):
parser.add_argument("--once", action="store_true", help="Process one queued run and exit")
parser.add_argument("--loop", action="store_true", help="Keep checking for queued runs")
parser.add_argument("--interval", type=int, default=15, help="Loop interval in seconds")
parser.add_argument(
"--stale-running-seconds",
type=int,
default=24 * 60 * 60,
help="Mark running runs failed after this many seconds without a worker heartbeat; use 0 to disable",
)
def handle(self, *args: Any, **options: Any) -> None:
if not options["once"] and not options["loop"]:
@@ -26,14 +32,14 @@ class Command(BaseCommand):
paths = PobsyncPaths(home=Path(options["prefix"]))
while True:
count = self._run_once(prefix=paths.home)
count = self._run_once(prefix=paths.home, stale_running_seconds=int(options["stale_running_seconds"]))
self.stdout.write(f"Ran {count} queued backup run(s).")
if options["once"]:
return
time.sleep(max(1, int(options["interval"])))
def _run_once(self, *, prefix: Path) -> int:
reconciled = reconcile_running_runs()
def _run_once(self, *, prefix: Path, stale_running_seconds: int = 24 * 60 * 60) -> int:
reconciled = reconcile_running_runs(stale_worker_seconds=stale_running_seconds)
run = claim_next_queued_run()
if run is None:
return reconciled