(bugfix) Reconcile failed dry-runs from rsync terminal logs

Classify rsync failures in run results so transport issues such as exit
255 and broken pipes show clearer diagnostic hints.

Teach the worker to reconcile running dry-runs when their log already
contains a terminal rsync error, and to fail stale dry-runs after their
timeout window. This prevents failed rsync processes from leaving runs
stuck in the running state indefinitely.
This commit is contained in:
2026-05-19 21:10:08 +02:00
parent d67ba9cada
commit d52a9167d1
5 changed files with 201 additions and 7 deletions

View File

@@ -1,13 +1,15 @@
from __future__ import annotations
from datetime import timedelta
from pathlib import Path
from tempfile import TemporaryDirectory
from unittest.mock import patch
from django.test import TestCase
from django.utils import timezone
from pobsync.util import write_yaml_atomic
from pobsync_backend.backup_runner import queue_backup_run
from pobsync_backend.backup_runner import queue_backup_run, reconcile_running_runs
from pobsync_backend.management.commands.run_pobsync_worker import Command
from pobsync_backend.models import BackupRun, GlobalConfig, HostConfig, SnapshotRecord
@@ -133,3 +135,43 @@ class BackupWorkerTests(TestCase):
count = Command()._run_once(prefix=Path("/opt/pobsync"))
self.assertEqual(count, 0)
def test_worker_reconciles_running_dry_run_with_terminal_broken_pipe_log(self) -> None:
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = queue_backup_run(host=host, dry_run=True)
log_path = Path("/tmp/pobsync-dryrun/web-01/run-test-broken-pipe.log")
log_path.parent.mkdir(parents=True, exist_ok=True)
log_path.write_text(
"rsync error: unexplained error (code 255) at rsync.c(716) [generator=3.4.1]\n"
"rsync: [generator] write error: Broken pipe (32)\n",
encoding="utf-8",
)
run.status = BackupRun.Status.RUNNING
run.started_at = timezone.now()
run.result["execution"] = {"log": str(log_path)}
run.save(update_fields=["status", "started_at", "result"])
reconciled = reconcile_running_runs()
self.assertEqual(reconciled, 1)
run.refresh_from_db()
self.assertEqual(run.status, BackupRun.Status.FAILED)
self.assertEqual(run.rsync_exit_code, 255)
self.assertEqual(run.result["failure"]["category"], "transport")
self.assertIn("Broken pipe", "\n".join(run.result["rsync"]["log_tail"]))
def test_worker_reconciles_stale_running_dry_run_after_timeout(self) -> None:
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = queue_backup_run(host=host, dry_run=True)
run.status = BackupRun.Status.RUNNING
run.started_at = timezone.now() - timedelta(seconds=1300)
run.result["timeout_seconds"] = 900
run.save(update_fields=["status", "started_at", "result"])
reconciled = reconcile_running_runs(grace_seconds=300)
self.assertEqual(reconciled, 1)
run.refresh_from_db()
self.assertEqual(run.status, BackupRun.Status.FAILED)
self.assertEqual(run.rsync_exit_code, 124)
self.assertEqual(run.result["failure"]["category"], "timeout")