(release) Track worker heartbeat for running jobs
Record worker pid, host, claim time, and heartbeat metadata on running backup jobs so operators can see which worker owns a run. Refresh the heartbeat while rsync is active and reconcile stale running runs when the worker heartbeat stops. Add a worker option to tune or disable stale-run reconciliation. Refs #11
This commit is contained in:
@@ -19,6 +19,12 @@ class Command(BaseCommand):
|
||||
parser.add_argument("--once", action="store_true", help="Process one queued run and exit")
|
||||
parser.add_argument("--loop", action="store_true", help="Keep checking for queued runs")
|
||||
parser.add_argument("--interval", type=int, default=15, help="Loop interval in seconds")
|
||||
parser.add_argument(
|
||||
"--stale-running-seconds",
|
||||
type=int,
|
||||
default=24 * 60 * 60,
|
||||
help="Mark running runs failed after this many seconds without a worker heartbeat; use 0 to disable",
|
||||
)
|
||||
|
||||
def handle(self, *args: Any, **options: Any) -> None:
|
||||
if not options["once"] and not options["loop"]:
|
||||
@@ -26,14 +32,14 @@ class Command(BaseCommand):
|
||||
|
||||
paths = PobsyncPaths(home=Path(options["prefix"]))
|
||||
while True:
|
||||
count = self._run_once(prefix=paths.home)
|
||||
count = self._run_once(prefix=paths.home, stale_running_seconds=int(options["stale_running_seconds"]))
|
||||
self.stdout.write(f"Ran {count} queued backup run(s).")
|
||||
if options["once"]:
|
||||
return
|
||||
time.sleep(max(1, int(options["interval"])))
|
||||
|
||||
def _run_once(self, *, prefix: Path) -> int:
|
||||
reconciled = reconcile_running_runs()
|
||||
def _run_once(self, *, prefix: Path, stale_running_seconds: int = 24 * 60 * 60) -> int:
|
||||
reconciled = reconcile_running_runs(stale_worker_seconds=stale_running_seconds)
|
||||
run = claim_next_queued_run()
|
||||
if run is None:
|
||||
return reconciled
|
||||
|
||||
Reference in New Issue
Block a user