(feature) Add cancellable backup runs and clearer dry-run logs

Add a cancel action for queued and running backup runs. Queued runs are
cancelled immediately, while running runs are marked for cancellation and
the worker terminates the active rsync process group.

Make dry-run log paths run-specific and add a defensive default dry-run
timeout so stuck dry-runs do not remain running indefinitely.

Remove rsync exit codes from run overview tables while keeping detailed
diagnostics available on the run detail payload.
This commit is contained in:
2026-05-19 20:46:10 +02:00
parent 088f43279e
commit bbb0f652f3
12 changed files with 291 additions and 47 deletions

View File

@@ -5,7 +5,7 @@ from pathlib import Path
from django.db import transaction
from django.utils import timezone
from pobsync.commands.run_scheduled import run_scheduled
from pobsync.commands.run_scheduled import dry_run_log_path, run_scheduled
from pobsync_backend.config_source import DjangoConfigSource
from pobsync_backend.models import BackupRun, HostConfig
from pobsync_backend.retention import run_sql_retention_apply
@@ -47,7 +47,8 @@ def execute_backup_run(
) -> BackupRun:
run.status = BackupRun.Status.RUNNING
run.started_at = run.started_at or timezone.now()
run.save(update_fields=["status", "started_at"])
run.result = _running_result(run=run, dry_run=bool(dry_run))
run.save(update_fields=["status", "started_at", "result"])
try:
result = run_scheduled(
@@ -56,15 +57,27 @@ def execute_backup_run(
dry_run=bool(dry_run),
prune=False,
config_source=DjangoConfigSource(),
run_id=run.id,
cancel_check=lambda: _run_cancel_requested(run.id),
)
except Exception as exc:
run.status = BackupRun.Status.FAILED
run.refresh_from_db()
run.status = BackupRun.Status.CANCELLED if run.status == BackupRun.Status.CANCELLED else BackupRun.Status.FAILED
run.ended_at = timezone.now()
run.result = {"ok": False, "error": str(exc), "type": type(exc).__name__}
run.result = {
**(run.result if isinstance(run.result, dict) else {}),
"ok": False,
"error": str(exc),
"type": type(exc).__name__,
}
run.save(update_fields=["status", "ended_at", "result"])
raise
run.status = BackupRun.Status.SUCCESS if result.get("ok") else BackupRun.Status.FAILED
run.refresh_from_db()
if result.get("cancelled") or run.status == BackupRun.Status.CANCELLED:
run.status = BackupRun.Status.CANCELLED
else:
run.status = BackupRun.Status.SUCCESS if result.get("ok") else BackupRun.Status.FAILED
run.ended_at = timezone.now()
run.snapshot_path = str(result.get("snapshot") or "")
run.base_path = str(result.get("base") or "")
@@ -146,3 +159,18 @@ def requested_options(run: BackupRun) -> dict[str, object]:
if not isinstance(requested, dict):
return {}
return requested
def _running_result(*, run: BackupRun, dry_run: bool) -> dict[str, object]:
result = dict(run.result) if isinstance(run.result, dict) else {}
execution = {
"started_at": (run.started_at or timezone.now()).isoformat(),
}
if dry_run:
execution["log"] = str(dry_run_log_path(run.host.host, run_id=run.id))
result["execution"] = execution
return result
def _run_cancel_requested(run_id: int) -> bool:
return BackupRun.objects.filter(id=run_id, status=BackupRun.Status.CANCELLED).exists()

View File

@@ -86,7 +86,6 @@
<th>Started</th>
<th>Ended</th>
<th>Snapshot</th>
<th>Rsync</th>
</tr>
</thead>
<tbody>
@@ -97,10 +96,9 @@
<td>{{ run.started_at|default:"" }}</td>
<td>{{ run.ended_at|default:"" }}</td>
<td>{% if run.snapshot %}<a href="{% url 'snapshot_detail' run.snapshot.id %}">{{ run.snapshot.dirname }}</a>{% else %}<span class="muted">{{ run.snapshot_path }}</span>{% endif %}</td>
<td>{{ run.rsync_exit_code|default:"" }}</td>
</tr>
{% empty %}
<tr><td colspan="6" class="muted">No backup runs recorded yet.</td></tr>
<tr><td colspan="5" class="muted">No backup runs recorded yet.</td></tr>
{% endfor %}
</tbody>
</table>

View File

@@ -175,7 +175,6 @@
<th>Ended</th>
<th>Snapshot</th>
<th>Base</th>
<th>Rsync</th>
</tr>
</thead>
<tbody>
@@ -186,10 +185,9 @@
<td>{{ run.ended_at|default:"" }}</td>
<td>{% if run.snapshot %}<a href="{% url 'snapshot_detail' run.snapshot.id %}">{{ run.snapshot.dirname }}</a>{% else %}<span class="muted">{{ run.snapshot_path }}</span>{% endif %}</td>
<td>{{ run.base_path|default:"" }}</td>
<td>{{ run.rsync_exit_code|default:"" }}</td>
</tr>
{% empty %}
<tr><td colspan="6" class="muted">No backup runs recorded for this host.</td></tr>
<tr><td colspan="5" class="muted">No backup runs recorded for this host.</td></tr>
{% endfor %}
</tbody>
</table>

View File

@@ -7,6 +7,12 @@
<section class="actions" aria-label="Run actions">
<a class="button-link" href="{% url 'host_detail' run.host.host %}">Back to host</a>
{% if can_cancel %}
<form method="post" action="{% url 'cancel_run' run.id %}">
{% csrf_token %}
<button type="submit" class="secondary">Cancel run</button>
</form>
{% endif %}
</section>
<section class="grid" aria-label="Run summary">

View File

@@ -47,7 +47,6 @@
<th>Status</th>
<th>Started</th>
<th>Ended</th>
<th>Rsync</th>
</tr>
</thead>
<tbody>
@@ -57,10 +56,9 @@
<td><span class="status {{ run.status }}">{{ run.status }}</span></td>
<td>{{ run.started_at|default:"" }}</td>
<td>{{ run.ended_at|default:"" }}</td>
<td>{{ run.rsync_exit_code|default:"" }}</td>
</tr>
{% empty %}
<tr><td colspan="5" class="muted">No backup runs linked to this snapshot.</td></tr>
<tr><td colspan="4" class="muted">No backup runs linked to this snapshot.</td></tr>
{% endfor %}
</tbody>
</table>

View File

@@ -48,22 +48,87 @@ class BackupWorkerTests(TestCase):
run = queue_backup_run(host=host)
with patch("pobsync_backend.backup_runner.run_scheduled") as run_scheduled:
run_scheduled.return_value = {
"ok": True,
"dry_run": False,
"host": host.host,
"snapshot": str(snapshot_dir),
"base": None,
"rsync": {"exit_code": 0},
}
def fake_run_scheduled(**kwargs):
run.refresh_from_db()
self.assertIn("execution", run.result)
return {
"ok": True,
"dry_run": False,
"host": host.host,
"snapshot": str(snapshot_dir),
"base": None,
"rsync": {"exit_code": 0},
}
run_scheduled.side_effect = fake_run_scheduled
count = Command()._run_once(prefix=Path(tmp) / "home")
run_scheduled.assert_called_once()
self.assertEqual(count, 1)
self.assertEqual(run_scheduled.call_args.kwargs["run_id"], run.id)
run.refresh_from_db()
self.assertEqual(run.status, BackupRun.Status.SUCCESS)
self.assertEqual(SnapshotRecord.objects.count(), 1)
self.assertEqual(run.snapshot, SnapshotRecord.objects.get())
def test_worker_records_dry_run_log_path_while_running(self) -> None:
with TemporaryDirectory() as tmp:
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = queue_backup_run(host=host, dry_run=True)
with patch("pobsync_backend.backup_runner.run_scheduled") as run_scheduled:
def fake_run_scheduled(**kwargs):
run.refresh_from_db()
self.assertEqual(
run.result["execution"]["log"],
f"/tmp/pobsync-dryrun/{host.host}/run-{run.id}/rsync.log",
)
return {
"ok": True,
"dry_run": True,
"host": host.host,
"base": None,
"log": run.result["execution"]["log"],
"rsync": {"exit_code": 0},
}
run_scheduled.side_effect = fake_run_scheduled
count = Command()._run_once(prefix=Path(tmp) / "home")
self.assertEqual(count, 1)
run.refresh_from_db()
self.assertEqual(run.status, BackupRun.Status.SUCCESS)
self.assertEqual(run.result["log"], f"/tmp/pobsync-dryrun/{host.host}/run-{run.id}/rsync.log")
def test_worker_preserves_cancelled_status_from_running_run(self) -> None:
with TemporaryDirectory() as tmp:
GlobalConfig.objects.create(name="default", backup_root=str(Path(tmp) / "backups"))
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = queue_backup_run(host=host, dry_run=True)
with patch("pobsync_backend.backup_runner.run_scheduled") as run_scheduled:
def fake_run_scheduled(**kwargs):
BackupRun.objects.filter(id=run.id).update(status=BackupRun.Status.CANCELLED)
self.assertTrue(kwargs["cancel_check"]())
return {
"ok": False,
"dry_run": True,
"cancelled": True,
"host": host.host,
"base": None,
"log": f"/tmp/pobsync-dryrun/{host.host}/run-{run.id}/rsync.log",
"rsync": {"exit_code": 130},
}
run_scheduled.side_effect = fake_run_scheduled
count = Command()._run_once(prefix=Path(tmp) / "home")
self.assertEqual(count, 1)
run.refresh_from_db()
self.assertEqual(run.status, BackupRun.Status.CANCELLED)
self.assertEqual(run.rsync_exit_code, 130)
def test_worker_returns_zero_without_queued_runs(self) -> None:
count = Command()._run_once(prefix=Path("/opt/pobsync"))

View File

@@ -50,7 +50,7 @@ class RunScheduledConfigSourceTests(SimpleTestCase):
run_rsync.assert_called_once()
def test_failed_dry_run_includes_log_tail(self) -> None:
def fake_run_rsync(command, log_path, timeout_seconds):
def fake_run_rsync(command, log_path, timeout_seconds, cancel_check=None):
log_path.parent.mkdir(parents=True, exist_ok=True)
log_path.write_text("Permission denied (publickey).\nrsync error\n", encoding="utf-8")
return RsyncResult(exit_code=12, command=command)
@@ -68,13 +68,13 @@ class RunScheduledConfigSourceTests(SimpleTestCase):
self.assertEqual(result["rsync"]["log_tail"], ["Permission denied (publickey).", "rsync error"])
def test_dry_run_clears_previous_log_before_running(self) -> None:
def fake_run_rsync(command, log_path, timeout_seconds):
def fake_run_rsync(command, log_path, timeout_seconds, cancel_check=None):
self.assertFalse(log_path.exists())
log_path.parent.mkdir(parents=True, exist_ok=True)
log_path.write_text("current run only\n", encoding="utf-8")
return RsyncResult(exit_code=0, command=command)
old_log = Path("/tmp/pobsync-dryrun/web-01/rsync.log")
old_log = Path("/tmp/pobsync-dryrun/web-01/adhoc/rsync.log")
old_log.parent.mkdir(parents=True, exist_ok=True)
old_log.write_text("old failure\n", encoding="utf-8")
@@ -89,6 +89,47 @@ class RunScheduledConfigSourceTests(SimpleTestCase):
self.assertTrue(result["ok"])
self.assertEqual(result["rsync"]["log_tail"], ["current run only"])
def test_dry_run_uses_run_specific_log_path_and_default_timeout(self) -> None:
def fake_run_rsync(command, log_path, timeout_seconds, cancel_check=None):
self.assertEqual(log_path, Path("/tmp/pobsync-dryrun/web-01/run-42/rsync.log"))
self.assertEqual(timeout_seconds, 900)
log_path.parent.mkdir(parents=True, exist_ok=True)
log_path.write_text("run 42\n", encoding="utf-8")
return RsyncResult(exit_code=0, command=command)
with patch("pobsync.commands.run_scheduled.run_rsync", side_effect=fake_run_rsync):
result = run_scheduled(
prefix=Path("/missing-prefix"),
host="web-01",
dry_run=True,
config_source=FakeConfigSource(),
run_id=42,
)
self.assertTrue(result["ok"])
self.assertEqual(result["log"], "/tmp/pobsync-dryrun/web-01/run-42/rsync.log")
self.assertEqual(result["timeout_seconds"], 900)
def test_dry_run_reports_cancelled_rsync(self) -> None:
def fake_run_rsync(command, log_path, timeout_seconds, cancel_check=None):
self.assertTrue(cancel_check())
log_path.parent.mkdir(parents=True, exist_ok=True)
log_path.write_text("[pobsync] rsync cancelled\n", encoding="utf-8")
return RsyncResult(exit_code=130, command=command, cancelled=True)
with patch("pobsync.commands.run_scheduled.run_rsync", side_effect=fake_run_rsync):
result = run_scheduled(
prefix=Path("/missing-prefix"),
host="web-01",
dry_run=True,
config_source=FakeConfigSource(),
cancel_check=lambda: True,
)
self.assertFalse(result["ok"])
self.assertTrue(result["cancelled"])
self.assertEqual(result["rsync"]["exit_code"], 130)
def test_successful_real_run_applies_prune_when_requested(self) -> None:
with TemporaryDirectory() as tmp:
prefix = Path(tmp) / "home"

View File

@@ -764,6 +764,44 @@ class ViewTests(TestCase):
self.assertContains(response, "&quot;ok&quot;: true")
self.assertContains(response, reverse("snapshot_detail", args=[snapshot.id]))
def test_run_detail_offers_cancel_for_running_run(self) -> None:
self.client.force_login(self.staff_user)
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = BackupRun.objects.create(host=host, status=BackupRun.Status.RUNNING)
response = self.client.get(reverse("run_detail", args=[run.id]))
self.assertEqual(response.status_code, 200)
self.assertContains(response, "Cancel run")
self.assertContains(response, reverse("cancel_run", args=[run.id]))
def test_cancel_run_marks_queued_run_cancelled(self) -> None:
self.client.force_login(self.staff_user)
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = BackupRun.objects.create(host=host, status=BackupRun.Status.QUEUED)
response = self.client.post(reverse("cancel_run", args=[run.id]), follow=True)
self.assertRedirects(response, reverse("run_detail", args=[run.id]))
self.assertContains(response, "Cancellation requested")
run.refresh_from_db()
self.assertEqual(run.status, BackupRun.Status.CANCELLED)
self.assertIsNotNone(run.ended_at)
self.assertEqual(run.result["cancellation"]["previous_status"], BackupRun.Status.QUEUED)
def test_cancel_run_requests_running_run_cancellation(self) -> None:
self.client.force_login(self.staff_user)
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = BackupRun.objects.create(host=host, status=BackupRun.Status.RUNNING)
response = self.client.post(reverse("cancel_run", args=[run.id]), follow=True)
self.assertRedirects(response, reverse("run_detail", args=[run.id]))
run.refresh_from_db()
self.assertEqual(run.status, BackupRun.Status.CANCELLED)
self.assertIsNone(run.ended_at)
self.assertEqual(run.result["cancellation"]["previous_status"], BackupRun.Status.RUNNING)
def test_snapshot_detail_renders_metadata_runs_and_children(self) -> None:
self.client.force_login(self.staff_user)
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")

View File

@@ -10,6 +10,7 @@ from django.contrib.admin.views.decorators import staff_member_required
from django.conf import settings
from django.db.models import Count
from django.shortcuts import get_object_or_404, redirect, render
from django.utils import timezone
from django.views.decorators.http import require_POST
from pobsync.errors import PobsyncError
@@ -348,12 +349,37 @@ def run_detail(request, run_id: int):
run = get_object_or_404(BackupRun.objects.select_related("host", "snapshot"), id=run_id)
context = {
"run": run,
"can_cancel": run.status in {BackupRun.Status.QUEUED, BackupRun.Status.RUNNING},
"requested": run.result.get("requested") if isinstance(run.result, dict) else {},
"result_json": _pretty_json(run.result),
}
return render(request, "pobsync_backend/run_detail.html", context)
@staff_member_required
@require_POST
def cancel_run(request, run_id: int):
run = get_object_or_404(BackupRun.objects.select_related("host"), id=run_id)
if run.status not in {BackupRun.Status.QUEUED, BackupRun.Status.RUNNING}:
messages.warning(request, f"Run {run.id} is already {run.status}.")
return redirect("run_detail", run_id=run.id)
result = dict(run.result) if isinstance(run.result, dict) else {}
result["cancellation"] = {
"requested_at": timezone.now().isoformat(),
"previous_status": run.status,
}
update_fields = ["status", "result"]
run.status = BackupRun.Status.CANCELLED
run.result = result
if result["cancellation"]["previous_status"] == BackupRun.Status.QUEUED:
run.ended_at = timezone.now()
update_fields.append("ended_at")
run.save(update_fields=update_fields)
messages.success(request, f"Cancellation requested for run {run.id}.")
return redirect("run_detail", run_id=run.id)
@staff_member_required
def snapshot_detail(request, snapshot_id: int):
snapshot = get_object_or_404(