(bugfix) Track rsync process state for running backups

Record rsync process pid and execution phase while normal backup runs are
active so the worker can reconcile stale running rows when rsync has
already disappeared.

Keep finalizing runs out of the missing-process path to avoid marking
slow post-rsync stats collection as a failed transfer.

Closes #54
This commit is contained in:
2026-05-23 00:26:22 +02:00
parent 6eb1b4add3
commit 28da9c4096
5 changed files with 146 additions and 4 deletions

View File

@@ -225,6 +225,9 @@ def _record_running_state(run_id: int, state: dict[str, object]) -> None:
log_path = state.get("log")
snapshot_path = state.get("snapshot")
phase = state.get("phase")
if isinstance(phase, str) and phase:
execution["phase"] = phase
if isinstance(log_path, str) and log_path:
execution["log"] = log_path
if isinstance(snapshot_path, str) and snapshot_path:
@@ -275,6 +278,30 @@ def _reconcile_running_run(*, run: BackupRun, grace_seconds: int, stale_worker_s
run.result = result
run.save(update_fields=["status", "ended_at", "rsync_exit_code", "result"])
return True
if _running_rsync_process_missing(run=run, grace_seconds=grace_seconds):
result.update(
{
"ok": False,
"host": run.host.host,
"log": str(log_path) if log_path else "",
"failure": {
"category": "rsync_process",
"message": "The rsync process is no longer running while the backup is still marked running.",
"hint": "Check the rsync log and pobsync-worker.service logs before retrying the backup.",
},
"rsync": {
**(result.get("rsync") if isinstance(result.get("rsync"), dict) else {}),
"exit_code": exit_code or 255,
"log_tail": log_tail,
},
}
)
run.status = BackupRun.Status.FAILED
run.ended_at = timezone.now()
run.rsync_exit_code = exit_code or 255
run.result = result
run.save(update_fields=["status", "ended_at", "rsync_exit_code", "result"])
return True
if stale_worker:
result.update(
{
@@ -413,6 +440,33 @@ def _running_worker_timed_out(*, run: BackupRun, stale_worker_seconds: int) -> b
return timezone.now() >= heartbeat_at + timedelta(seconds=stale_worker_seconds)
def _running_rsync_process_missing(*, run: BackupRun, grace_seconds: int) -> bool:
if grace_seconds <= 0:
return False
result = run.result if isinstance(run.result, dict) else {}
execution = result.get("execution") if isinstance(result.get("execution"), dict) else {}
if execution.get("phase") != "rsync":
return False
rsync = result.get("rsync") if isinstance(result.get("rsync"), dict) else {}
pid = rsync.get("pid")
if not isinstance(pid, int) or pid <= 0:
return False
heartbeat_at = _parse_iso_datetime(execution.get("heartbeat_at")) or run.started_at
if heartbeat_at is None or timezone.now() < heartbeat_at + timedelta(seconds=grace_seconds):
return False
return not _process_exists(pid)
def _process_exists(pid: int) -> bool:
try:
os.kill(pid, 0)
except ProcessLookupError:
return False
except PermissionError:
return True
return True
def _parse_iso_datetime(value: object):
if not isinstance(value, str) or not value:
return None