(bugfix) Track rsync process state for running backups
Record rsync process pid and execution phase while normal backup runs are active so the worker can reconcile stale running rows when rsync has already disappeared. Keep finalizing runs out of the missing-process path to avoid marking slow post-rsync stats collection as a failed transfer. Closes #54
This commit is contained in:
@@ -225,6 +225,9 @@ def _record_running_state(run_id: int, state: dict[str, object]) -> None:
|
||||
|
||||
log_path = state.get("log")
|
||||
snapshot_path = state.get("snapshot")
|
||||
phase = state.get("phase")
|
||||
if isinstance(phase, str) and phase:
|
||||
execution["phase"] = phase
|
||||
if isinstance(log_path, str) and log_path:
|
||||
execution["log"] = log_path
|
||||
if isinstance(snapshot_path, str) and snapshot_path:
|
||||
@@ -275,6 +278,30 @@ def _reconcile_running_run(*, run: BackupRun, grace_seconds: int, stale_worker_s
|
||||
run.result = result
|
||||
run.save(update_fields=["status", "ended_at", "rsync_exit_code", "result"])
|
||||
return True
|
||||
if _running_rsync_process_missing(run=run, grace_seconds=grace_seconds):
|
||||
result.update(
|
||||
{
|
||||
"ok": False,
|
||||
"host": run.host.host,
|
||||
"log": str(log_path) if log_path else "",
|
||||
"failure": {
|
||||
"category": "rsync_process",
|
||||
"message": "The rsync process is no longer running while the backup is still marked running.",
|
||||
"hint": "Check the rsync log and pobsync-worker.service logs before retrying the backup.",
|
||||
},
|
||||
"rsync": {
|
||||
**(result.get("rsync") if isinstance(result.get("rsync"), dict) else {}),
|
||||
"exit_code": exit_code or 255,
|
||||
"log_tail": log_tail,
|
||||
},
|
||||
}
|
||||
)
|
||||
run.status = BackupRun.Status.FAILED
|
||||
run.ended_at = timezone.now()
|
||||
run.rsync_exit_code = exit_code or 255
|
||||
run.result = result
|
||||
run.save(update_fields=["status", "ended_at", "rsync_exit_code", "result"])
|
||||
return True
|
||||
if stale_worker:
|
||||
result.update(
|
||||
{
|
||||
@@ -413,6 +440,33 @@ def _running_worker_timed_out(*, run: BackupRun, stale_worker_seconds: int) -> b
|
||||
return timezone.now() >= heartbeat_at + timedelta(seconds=stale_worker_seconds)
|
||||
|
||||
|
||||
def _running_rsync_process_missing(*, run: BackupRun, grace_seconds: int) -> bool:
|
||||
if grace_seconds <= 0:
|
||||
return False
|
||||
result = run.result if isinstance(run.result, dict) else {}
|
||||
execution = result.get("execution") if isinstance(result.get("execution"), dict) else {}
|
||||
if execution.get("phase") != "rsync":
|
||||
return False
|
||||
rsync = result.get("rsync") if isinstance(result.get("rsync"), dict) else {}
|
||||
pid = rsync.get("pid")
|
||||
if not isinstance(pid, int) or pid <= 0:
|
||||
return False
|
||||
heartbeat_at = _parse_iso_datetime(execution.get("heartbeat_at")) or run.started_at
|
||||
if heartbeat_at is None or timezone.now() < heartbeat_at + timedelta(seconds=grace_seconds):
|
||||
return False
|
||||
return not _process_exists(pid)
|
||||
|
||||
|
||||
def _process_exists(pid: int) -> bool:
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except PermissionError:
|
||||
return True
|
||||
return True
|
||||
|
||||
|
||||
def _parse_iso_datetime(value: object):
|
||||
if not isinstance(value, str) or not value:
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user