(feature) Add backup safety and preflight validation #13

Merged
parkel merged 3 commits from issue-3-backup-safety-preflight-validation into master 2026-05-21 00:58:23 +02:00
5 changed files with 262 additions and 1 deletion
Showing only changes of commit 3045093dcf - Show all commits

View File

@@ -1,11 +1,15 @@
from __future__ import annotations
import shlex
import subprocess
from dataclasses import dataclass
from typing import Any
from pobsync.config.merge import build_effective_config
from pobsync.rsync import build_ssh_command
from .config_repository import global_config_object_data, host_config_object_data
from .config_source import DjangoConfigSource
from .host_ops import collect_host_checks
from .models import GlobalConfig, HostConfig
from .self_check import SelfCheck
@@ -43,6 +47,9 @@ class BackupGate:
def collect_backup_gate(host: HostConfig, global_config: GlobalConfig | None = None) -> BackupGate:
checks = collect_host_checks(host, global_config)
remote_preflight_check = _remote_preflight_self_check(host)
if remote_preflight_check is not None:
checks.append(remote_preflight_check)
real_blockers = [check for check in checks if check.status == "failed"]
dry_run_blockers = [check for check in real_blockers if check.name in DRY_RUN_BLOCKING_CHECKS]
warnings = [check for check in checks if check.status == "warning"]
@@ -67,6 +74,61 @@ def collect_backup_gate(host: HostConfig, global_config: GlobalConfig | None = N
)
def _remote_shell_argument(script: str) -> str:
    """Return *script* wrapped as a single ``sh -lc`` remote-command argument.

    ssh joins every argument after the destination with spaces and hands the
    resulting string to the remote user's shell, which parses it again. The
    script therefore has to be quoted as one word; otherwise ``sh -lc`` only
    receives the script's first token and the rest become positional
    parameters.
    """
    return f"sh -lc {shlex.quote(script)}"


def run_remote_preflight(host: HostConfig, *, timeout_seconds: int = 20) -> dict[str, Any]:
    """Run the remote connection preflight for *host* and persist the outcome.

    Three commands are executed over ssh with ``-oBatchMode=yes`` against
    ``user@address``: ``true`` (reachability), ``command -v <rsync binary>``
    and an existence-plus-readability test of the source root. The result is
    stored under ``host.config["last_preflight"]`` and the host is saved.

    Args:
        host: Host whose effective configuration supplies the ssh, rsync,
            address and source-root settings.
        timeout_seconds: Timeout handed to each individual remote check.

    Returns:
        A dict with ``ok`` (True only if every check passed), ``checks``,
        ``target``, ``source_root``, ``rsync_binary`` and ``timeout_seconds``.
    """
    config = DjangoConfigSource().effective_config_for_host(host.host)
    ssh_cfg = config.get("ssh", {}) or {}
    rsync_cfg = config.get("rsync", {}) or {}
    address = str(config.get("address") or host.address)
    user = str(ssh_cfg.get("user") or "root")
    source_root = str(config.get("source_root") or (config.get("defaults", {}) or {}).get("source_root") or "/")
    rsync_binary = str(rsync_cfg.get("binary") or "rsync")
    target = f"{user}@{address}"

    # Shared ssh invocation prefix; each check appends exactly one remote command word.
    remote_prefix = [*build_ssh_command(ssh_cfg), "-oBatchMode=yes", target]
    quoted_root = shlex.quote(source_root)

    checks = [
        _run_remote_check(
            name="SSH reachability",
            command=[*remote_prefix, "true"],
            timeout_seconds=timeout_seconds,
        ),
        _run_remote_check(
            name="Remote rsync",
            command=[
                *remote_prefix,
                _remote_shell_argument(f"command -v {shlex.quote(rsync_binary)} >/dev/null"),
            ],
            timeout_seconds=timeout_seconds,
        ),
        _run_remote_check(
            name="Remote source root",
            command=[
                *remote_prefix,
                _remote_shell_argument(f"test -e {quoted_root} && test -r {quoted_root}"),
            ],
            timeout_seconds=timeout_seconds,
        ),
    ]
    result = {
        "ok": all(check["ok"] for check in checks),
        "checks": checks,
        "target": target,
        "source_root": source_root,
        "rsync_binary": rsync_binary,
        "timeout_seconds": timeout_seconds,
    }
    # Keep any other keys already present in host.config; only last_preflight is replaced.
    host.config = {**(host.config or {}), "last_preflight": result}
    host.save(update_fields=["config", "updated_at"])
    return result
def effective_host_config_preview(host: HostConfig, global_config: GlobalConfig) -> dict[str, Any]:
config = build_effective_config(global_config_object_data(global_config), host_config_object_data(host))
credential = host.ssh_credential or global_config.default_ssh_credential
@@ -98,3 +160,73 @@ def effective_host_config_preview(host: HostConfig, global_config: GlobalConfig)
"yearly": retention.get("yearly", 0),
},
}
def _run_remote_check(*, name: str, command: list[str], timeout_seconds: int) -> dict[str, Any]:
    """Run one preflight command and summarise its outcome as a plain dict.

    Args:
        name: Human-readable check name, used in the result and its message.
        command: Argument vector handed to ``subprocess.run``.
        timeout_seconds: Maximum time the command may run.

    Returns:
        A dict with ``name``, ``ok``, ``exit_code``, ``message`` and
        ``detail``. A timeout is reported with exit code 124; a command that
        cannot be started is reported with ``exit_code`` ``None``.
    """
    try:
        result = subprocess.run(
            command,
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout_seconds,
        )
    except subprocess.TimeoutExpired as exc:
        # Output captured before a timeout is exposed as raw bytes even when
        # text=True was requested; decode it so ``detail`` is always a str,
        # matching every other return path.
        captured = exc.stderr or exc.stdout or ""
        if isinstance(captured, bytes):
            captured = captured.decode("utf-8", errors="replace")
        return {
            "name": name,
            "ok": False,
            "exit_code": 124,
            "message": f"{name} timed out after {timeout_seconds}s.",
            "detail": _clip_output(captured.strip()),
        }
    except OSError as exc:
        return {
            "name": name,
            "ok": False,
            "exit_code": None,
            "message": f"{name} could not start.",
            "detail": str(exc),
        }
    succeeded = result.returncode == 0
    return {
        "name": name,
        "ok": succeeded,
        "exit_code": result.returncode,
        "message": f"{name} passed." if succeeded else f"{name} failed.",
        # Prefer stderr; fall back to stdout when stderr is empty.
        "detail": _clip_output((result.stderr or result.stdout or "").strip()),
    }
def _remote_preflight_self_check(host: HostConfig) -> SelfCheck | None:
    """Translate the stored ``last_preflight`` result of *host* into a SelfCheck.

    Returns:
        A "warning" check when no preflight result is stored, a "failed"
        check when the stored result is malformed or lists failing checks,
        and an "ok" check when every recorded check passed.
    """
    # Only a dict config can carry a preflight result; anything else is
    # treated the same as "never run" instead of raising on ``.get``.
    stored_config = host.config if isinstance(host.config, dict) else {}
    preflight = stored_config.get("last_preflight")
    if not isinstance(preflight, dict):
        return SelfCheck(
            "Remote preflight",
            "warning",
            "No remote connection preflight has been run yet.",
            "Run connection preflight before the first real backup.",
        )
    checks = preflight.get("checks")
    # Fail closed: an empty list or non-dict entries mean nothing was actually
    # verified, so such a result must not be reported as a pass.
    well_formed = (
        isinstance(checks, list)
        and bool(checks)
        and all(isinstance(check, dict) for check in checks)
    )
    if not well_formed:
        return SelfCheck("Remote preflight", "failed", "Stored remote preflight result is invalid.")
    failed = [str(check.get("name", "unknown")) for check in checks if not check.get("ok")]
    if failed:
        return SelfCheck(
            "Remote preflight",
            "failed",
            "Remote connection preflight failed.",
            ", ".join(failed),
        )
    return SelfCheck(
        "Remote preflight",
        "ok",
        "Remote connection preflight passed.",
        f"{preflight.get('target', '')} {preflight.get('source_root', '')}".strip(),
    )
def _clip_output(value: str, *, max_chars: int = 800) -> str:
    """Return *value* as-is when it fits in *max_chars*, else its prefix followed by ``...``."""
    too_long = len(value) > max_chars
    return f"{value[:max_chars]}..." if too_long else value

View File

@@ -21,6 +21,10 @@
{% csrf_token %}
<button type="submit" class="secondary">Scan SSH host key</button>
</form>
<form method="post" action="{% url 'run_host_preflight' host.host %}">
{% csrf_token %}
<button type="submit" class="secondary">Run connection preflight</button>
</form>
</section>
<section class="grid" aria-label="Host summary">
@@ -199,6 +203,38 @@
</table>
</section>
{# Connection preflight panel: rendered only when a last_preflight value is present in the context. #}
{% if last_preflight %}
<section class="panel">
<h2>Connection Preflight</h2>
{# Overall status plus the target, source root and rsync binary recorded with the result. #}
<div class="stack spaced">
<div><strong>Status:</strong> <span class="status {% if last_preflight.ok %}ok{% else %}failed{% endif %}">{% if last_preflight.ok %}ok{% else %}failed{% endif %}</span></div>
<div><strong>Target:</strong> {{ last_preflight.target }}</div>
<div><strong>Source root:</strong> {{ last_preflight.source_root }}</div>
<div><strong>Remote rsync:</strong> {{ last_preflight.rsync_binary }}</div>
</div>
<table>
<thead>
<tr>
<th>Status</th>
<th>Check</th>
<th>Message</th>
<th>Detail</th>
</tr>
</thead>
<tbody>
{# One table row per entry in last_preflight.checks. #}
{% for check in last_preflight.checks %}
<tr>
<td><span class="status {% if check.ok %}ok{% else %}failed{% endif %}">{% if check.ok %}ok{% else %}failed{% endif %}</span></td>
<td>{{ check.name }}</td>
<td>{{ check.message }}</td>
<td class="muted">{{ check.detail }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
{% endif %}
<section class="panel">
<h2>Backup Control</h2>
<div class="operator-state">

View File

@@ -586,6 +586,7 @@ class ViewTests(TestCase):
self.assertContains(response, "Discover snapshots")
self.assertContains(response, "Edit schedule")
self.assertContains(response, "Edit config")
self.assertContains(response, "Run connection preflight")
self.assertContains(response, "Backup Control")
self.assertContains(response, "Queue dry-run")
self.assertContains(response, "Queue backup")
@@ -639,6 +640,68 @@ class ViewTests(TestCase):
self.assertContains(response, "d14")
self.assertContains(response, "w8")
def test_run_host_preflight_stores_remote_check_result(self) -> None:
    """Posting to the preflight view runs three checks and stores a passing result."""
    self.client.force_login(self.staff_user)
    GlobalConfig.objects.create(name="default", backup_root="/backups", rsync_args=["--archive"])
    host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
    preflight_url = reverse("run_host_preflight", args=[host.host])
    # Every patched subprocess call reports success with no output.
    successful_run = subprocess.CompletedProcess(args=["ssh"], returncode=0, stdout="", stderr="")

    with patch("pobsync_backend.preflight.subprocess.run", return_value=successful_run) as run_mock:
        response = self.client.post(preflight_url, follow=True)

    self.assertRedirects(response, reverse("host_detail", args=[host.host]))
    expected_fragments = (
        "Connection preflight passed for web-01.",
        "Connection Preflight",
        "SSH reachability",
        "Remote rsync",
        "Remote source root",
    )
    for fragment in expected_fragments:
        self.assertContains(response, fragment)
    self.assertEqual(run_mock.call_count, 3)

    host.refresh_from_db()
    stored = host.config["last_preflight"]
    self.assertTrue(stored["ok"])
    self.assertEqual(stored["target"], "root@web-01.example.test")
def test_queue_manual_backup_blocks_real_backup_after_failed_remote_preflight(self) -> None:
    """A stored failed preflight result prevents a real backup from being queued."""
    failed_rsync_check = {
        "name": "Remote rsync",
        "ok": False,
        "exit_code": 127,
        "message": "Remote rsync failed.",
        "detail": "rsync missing",
    }
    stored_preflight = {
        "ok": False,
        "target": "root@web-01.example.test",
        "source_root": "/",
        "rsync_binary": "rsync",
        "checks": [failed_rsync_check],
    }
    self.client.force_login(self.staff_user)

    with TemporaryDirectory() as tmp:
        backup_root = Path(tmp)
        GlobalConfig.objects.create(name="default", backup_root=str(backup_root), rsync_args=["--archive"])
        host = HostConfig.objects.create(
            host="web-01",
            address="web-01.example.test",
            config={"last_preflight": stored_preflight},
        )
        # Create the per-host directory layout under the temporary backup root.
        host_root = backup_root / host.host
        for subdir in ("scheduled", "manual", ".incomplete"):
            (host_root / subdir).mkdir(parents=True)

        queue_url = reverse("queue_manual_backup", args=[host.host])
        response = self.client.post(queue_url, {"prune_max_delete": "10"}, follow=True)

        self.assertRedirects(response, reverse("host_detail", args=[host.host]))
        self.assertContains(response, "Cannot queue real backup until failed preflight checks are resolved")
        self.assertContains(response, "Remote preflight")
        self.assertFalse(BackupRun.objects.exists())
def test_host_detail_renders_backup_trends(self) -> None:
self.client.force_login(self.staff_user)
GlobalConfig.objects.create(name="default", backup_root="/backups")

View File

@@ -31,7 +31,7 @@ from .forms import (
)
from .host_ops import ensure_host_directories
from .models import BackupRun, GlobalConfig, HostConfig, ScheduleConfig, SnapshotRecord, SshCredential
from .preflight import collect_backup_gate, effective_host_config_preview
from .preflight import collect_backup_gate, effective_host_config_preview, run_remote_preflight
from .retention import run_sql_retention_apply, run_sql_retention_plan
from .self_check import collect_self_checks, summarize_self_checks
from .scheduler import next_due_after
@@ -280,6 +280,7 @@ def host_detail(request, host: str):
"host_checks": backup_gate.checks,
"host_check_summary": summarize_self_checks(backup_gate.checks),
"backup_gate": backup_gate,
"last_preflight": (host_config.config or {}).get("last_preflight") if isinstance(host_config.config, dict) else {},
"effective_config": effective_host_config_preview(host_config, global_config) if global_config else {},
"stats_summary": stats_summary,
"manual_backup_form": ManualBackupForm(initial=_default_manual_backup_initial(host_config)),
@@ -336,6 +337,34 @@ def scan_host_known_key(request, host: str):
return redirect("host_detail", host=host_config.host)
@staff_member_required
@require_POST
def run_host_preflight(request, host: str):
    """Run the remote connection preflight for one host and report the outcome.

    Staff-only POST view. The preflight is refused for a disabled host and
    when no global config named "default" exists. In every case the user is
    redirected to the host detail page with a flash message describing the
    result.
    """
    host_config = get_object_or_404(HostConfig, host=host)

    if not host_config.enabled:
        messages.error(request, f"Cannot run preflight for disabled host {host_config.host}.")
        return redirect("host_detail", host=host_config.host)

    default_config_exists = GlobalConfig.objects.filter(name="default").exists()
    if not default_config_exists:
        messages.error(request, "Create the default global config before running preflight.")
        return redirect("host_detail", host=host_config.host)

    try:
        result = run_remote_preflight(host_config)
    except Exception as exc:
        # View boundary: surface any failure to the operator as a flash message.
        messages.error(request, f"Connection preflight failed for {host_config.host}: {exc}")
        return redirect("host_detail", host=host_config.host)

    if result.get("ok"):
        messages.success(request, f"Connection preflight passed for {host_config.host}.")
        return redirect("host_detail", host=host_config.host)

    # Name the checks that did not pass so the message points at the problem.
    failed_names = ", ".join(
        str(check.get("name"))
        for check in result.get("checks", [])
        if isinstance(check, dict) and not check.get("ok")
    )
    messages.error(request, f"Connection preflight failed for {host_config.host}: {failed_names}.")
    return redirect("host_detail", host=host_config.host)
@staff_member_required
@require_POST
def queue_manual_backup(request, host: str):

View File

@@ -21,6 +21,7 @@ urlpatterns = [
path("hosts/<str:host>/config/", views.edit_host_config, name="edit_host_config"),
path("hosts/<str:host>/prepare-directories/", views.prepare_host_directories, name="prepare_host_directories"),
path("hosts/<str:host>/scan-known-key/", views.scan_host_known_key, name="scan_host_known_key"),
path("hosts/<str:host>/preflight/", views.run_host_preflight, name="run_host_preflight"),
path("hosts/<str:host>/queue-backup/", views.queue_manual_backup, name="queue_manual_backup"),
path("hosts/<str:host>/discover-snapshots/", views.discover_host_snapshots, name="discover_host_snapshots"),
path("hosts/<str:host>/retention-apply/", views.apply_host_retention, name="apply_host_retention"),