(bugfix) Reconcile real rsync failures from worker logs
Record live rsync log paths for normal backup runs so the worker can recover stale running state after terminal rsync errors. Treat rsync vanished-file exit code 24 as a warning and keep the completed snapshot instead of failing the run into incomplete state. Closes #54
This commit is contained in:
@@ -8,7 +8,13 @@ from pathlib import Path
|
||||
from django.db import transaction
|
||||
from django.utils import timezone
|
||||
|
||||
from pobsync.commands.run_scheduled import DEFAULT_DRY_RUN_TIMEOUT_SECONDS, classify_rsync_failure, dry_run_log_path, run_scheduled
|
||||
from pobsync.commands.run_scheduled import (
|
||||
DEFAULT_DRY_RUN_TIMEOUT_SECONDS,
|
||||
classify_rsync_failure,
|
||||
classify_rsync_warning,
|
||||
dry_run_log_path,
|
||||
run_scheduled,
|
||||
)
|
||||
from pobsync_backend.config_source import DjangoConfigSource
|
||||
from pobsync_backend.models import BackupRun, HostConfig
|
||||
from pobsync_backend.retention import run_sql_retention_apply
|
||||
@@ -66,6 +72,7 @@ def execute_backup_run(
|
||||
run_id=run.id,
|
||||
cancel_check=lambda: _run_cancel_requested(run.id),
|
||||
verbose_output=bool(dry_run or verbose_output),
|
||||
state_callback=lambda state: _record_running_state(run.id, state),
|
||||
)
|
||||
except Exception as exc:
|
||||
run.refresh_from_db()
|
||||
@@ -83,6 +90,8 @@ def execute_backup_run(
|
||||
run.refresh_from_db()
|
||||
if result.get("cancelled") or run.status == BackupRun.Status.CANCELLED:
|
||||
run.status = BackupRun.Status.CANCELLED
|
||||
elif result.get("status") == BackupRun.Status.WARNING:
|
||||
run.status = BackupRun.Status.WARNING
|
||||
else:
|
||||
run.status = BackupRun.Status.SUCCESS if result.get("ok") else BackupRun.Status.FAILED
|
||||
run.ended_at = timezone.now()
|
||||
@@ -201,11 +210,71 @@ def _run_cancel_requested(run_id: int) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def _record_running_state(run_id: int, state: dict[str, object]) -> None:
    """Persist a progress report from the worker onto a running ``BackupRun``.

    The run is looked up by ``run_id``. Nothing is written when the row does
    not exist or when its status is anything other than ``RUNNING``.

    Recognised keys of ``state``:

    * ``"log"`` -- non-empty string, copied to ``result["execution"]["log"]``.
    * ``"snapshot"`` -- non-empty string, copied to
      ``result["execution"]["snapshot"]`` and to ``run.snapshot_path``.
    * ``"rsync"`` -- non-empty dict, merged over any stored ``result["rsync"]``
      dict; an integer ``"exit_code"`` inside it is also copied to
      ``run.rsync_exit_code``.

    Every call additionally stamps ``worker_pid``, ``worker_host`` and
    ``heartbeat_at`` into ``result["execution"]``.

    Args:
        run_id: Primary key of the ``BackupRun`` to update.
        state: Progress payload supplied by the caller.
    """

    def _dict_or_empty(value: object) -> dict:
        # Anything that is not a dict is treated as "no data".
        return value if isinstance(value, dict) else {}

    queryset = BackupRun.objects.only("id", "status", "result", "snapshot_path", "rsync_exit_code")
    try:
        run = queryset.get(id=run_id)
    except BackupRun.DoesNotExist:
        return
    if run.status != BackupRun.Status.RUNNING:
        return

    payload = _dict_or_empty(run.result)
    execution_info = dict(_dict_or_empty(payload.get("execution")))
    reported_rsync = _dict_or_empty(state.get("rsync"))

    reported_log = state.get("log")
    if isinstance(reported_log, str) and reported_log:
        execution_info["log"] = reported_log

    reported_snapshot = state.get("snapshot")
    if isinstance(reported_snapshot, str) and reported_snapshot:
        execution_info["snapshot"] = reported_snapshot
        run.snapshot_path = reported_snapshot

    if reported_rsync:
        # Incoming values win over whatever rsync details were stored before.
        payload["rsync"] = {**_dict_or_empty(payload.get("rsync")), **reported_rsync}
        reported_code = reported_rsync.get("exit_code")
        if isinstance(reported_code, int):
            run.rsync_exit_code = reported_code

    # Worker identity and heartbeat always override previously stored values.
    execution_info.update(
        worker_pid=os.getpid(),
        worker_host=socket.gethostname(),
        heartbeat_at=timezone.now().isoformat(),
    )
    payload["execution"] = execution_info

    run.result = payload
    # Only these columns are written; "status" is deliberately not in the list.
    run.save(update_fields=["snapshot_path", "rsync_exit_code", "result"])
|
||||
|
||||
|
||||
def _reconcile_running_run(*, run: BackupRun, grace_seconds: int, stale_worker_seconds: int) -> bool:
|
||||
result = run.result if isinstance(run.result, dict) else {}
|
||||
requested = result.get("requested") if isinstance(result.get("requested"), dict) else {}
|
||||
log_path = _execution_log_path(result)
|
||||
log_tail = _read_log_tail(log_path) if log_path is not None else []
|
||||
terminal_log = _terminal_rsync_log(log_tail)
|
||||
exit_code = _exit_code_from_log(log_tail)
|
||||
stale_worker = _running_worker_timed_out(run=run, stale_worker_seconds=stale_worker_seconds)
|
||||
if not requested.get("dry_run"):
|
||||
if terminal_log:
|
||||
failure = classify_rsync_failure(exit_code or 255, log_tail)
|
||||
result.update(
|
||||
{
|
||||
"ok": False,
|
||||
"host": run.host.host,
|
||||
"log": str(log_path) if log_path else "",
|
||||
"failure": failure,
|
||||
"rsync": {
|
||||
**(result.get("rsync") if isinstance(result.get("rsync"), dict) else {}),
|
||||
"exit_code": exit_code or 255,
|
||||
"log_tail": log_tail,
|
||||
},
|
||||
}
|
||||
)
|
||||
run.status = BackupRun.Status.FAILED
|
||||
run.ended_at = timezone.now()
|
||||
run.rsync_exit_code = exit_code or 255
|
||||
run.result = result
|
||||
run.save(update_fields=["status", "ended_at", "rsync_exit_code", "result"])
|
||||
return True
|
||||
if stale_worker:
|
||||
result.update(
|
||||
{
|
||||
@@ -225,14 +294,11 @@ def _reconcile_running_run(*, run: BackupRun, grace_seconds: int, stale_worker_s
|
||||
return True
|
||||
return False
|
||||
|
||||
log_path = _execution_log_path(result)
|
||||
log_tail = _read_log_tail(log_path) if log_path is not None else []
|
||||
terminal_log = _terminal_rsync_log(log_tail)
|
||||
timed_out = _running_dry_run_timed_out(run=run, grace_seconds=grace_seconds)
|
||||
if not terminal_log and not timed_out and not stale_worker:
|
||||
return False
|
||||
|
||||
exit_code = _exit_code_from_log(log_tail) or (124 if timed_out or stale_worker else 255)
|
||||
exit_code = exit_code or (124 if timed_out or stale_worker else 255)
|
||||
failure = classify_rsync_failure(exit_code, log_tail)
|
||||
if stale_worker and not terminal_log:
|
||||
failure = {
|
||||
@@ -305,6 +371,9 @@ def _read_log_tail(log_path: Path | None, *, max_lines: int = 40) -> list[str]:
|
||||
|
||||
|
||||
def _terminal_rsync_log(log_tail: list[str]) -> bool:
    """Tell whether the log tail records an rsync error that is not a warning.

    Args:
        log_tail: Trailing lines of an rsync log.

    Returns:
        ``False`` when ``classify_rsync_warning`` returns anything other than
        ``None`` for the exit code parsed from the tail. Otherwise ``True`` if
        at least one line starts with ``"rsync error:"``, else ``False``.
    """
    parsed_code = _exit_code_from_log(log_tail)
    if classify_rsync_warning(parsed_code, log_tail) is not None:
        # A recognised warning is never treated as a terminal failure.
        return False

    for entry in log_tail:
        if entry.startswith("rsync error:"):
            return True
    return False
|
||||
|
||||
|
||||
@@ -312,6 +381,8 @@ def _exit_code_from_log(log_tail: list[str]) -> int | None:
|
||||
for line in reversed(log_tail):
|
||||
if "code 255" in line:
|
||||
return 255
|
||||
if "code 24" in line:
|
||||
return 24
|
||||
if "code 124" in line:
|
||||
return 124
|
||||
if "code 12" in line:
|
||||
|
||||
Reference in New Issue
Block a user