(feature) Add backup safety and preflight validation #13
@@ -1,11 +1,15 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import shlex
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from pobsync.config.merge import build_effective_config
|
||||
from pobsync.rsync import build_ssh_command
|
||||
|
||||
from .config_repository import global_config_object_data, host_config_object_data
|
||||
from .config_source import DjangoConfigSource
|
||||
from .host_ops import collect_host_checks
|
||||
from .models import GlobalConfig, HostConfig
|
||||
from .self_check import SelfCheck
|
||||
@@ -43,6 +47,9 @@ class BackupGate:
|
||||
|
||||
def collect_backup_gate(host: HostConfig, global_config: GlobalConfig | None = None) -> BackupGate:
|
||||
checks = collect_host_checks(host, global_config)
|
||||
remote_preflight_check = _remote_preflight_self_check(host)
|
||||
if remote_preflight_check is not None:
|
||||
checks.append(remote_preflight_check)
|
||||
real_blockers = [check for check in checks if check.status == "failed"]
|
||||
dry_run_blockers = [check for check in real_blockers if check.name in DRY_RUN_BLOCKING_CHECKS]
|
||||
warnings = [check for check in checks if check.status == "warning"]
|
||||
@@ -67,6 +74,61 @@ def collect_backup_gate(host: HostConfig, global_config: GlobalConfig | None = N
|
||||
)
|
||||
|
||||
|
||||
def run_remote_preflight(host: HostConfig, *, timeout_seconds: int = 20) -> dict[str, Any]:
    """Run the SSH connection preflight for ``host`` and store the outcome.

    Three checks are run through ``_run_remote_check``: SSH reachability,
    presence of the rsync binary on the remote side, and existence plus
    readability of the source root. The combined result is written to
    ``host.config["last_preflight"]`` and the host is saved.

    Args:
        host: Host record; its effective configuration supplies the address,
            SSH settings, rsync binary and source root (each with a fallback).
        timeout_seconds: Timeout handed to every individual check.

    Returns:
        The stored result dict with the keys ``ok``, ``checks``, ``target``,
        ``source_root``, ``rsync_binary`` and ``timeout_seconds``.
    """
    config = DjangoConfigSource().effective_config_for_host(host.host)
    ssh_cfg = config.get("ssh", {}) or {}
    rsync_cfg = config.get("rsync", {}) or {}
    address = str(config.get("address") or host.address)
    user = str(ssh_cfg.get("user") or "root")
    source_root = str(config.get("source_root") or (config.get("defaults", {}) or {}).get("source_root") or "/")
    rsync_binary = str(rsync_cfg.get("binary") or "rsync")
    target = f"{user}@{address}"
    ssh_cmd = build_ssh_command(ssh_cfg)

    def remote_sh(script: str) -> list[str]:
        # ssh joins every argument after the target with single spaces and
        # lets the remote shell parse the result again. Passing "sh", "-lc"
        # and the script as separate arguments would therefore make the
        # remote sh execute only the first word of the script. Quoting the
        # script into one argument keeps it intact on the remote side.
        return [*ssh_cmd, "-oBatchMode=yes", target, f"sh -lc {shlex.quote(script)}"]

    quoted_root = shlex.quote(source_root)
    checks = [
        _run_remote_check(
            name="SSH reachability",
            command=[*ssh_cmd, "-oBatchMode=yes", target, "true"],
            timeout_seconds=timeout_seconds,
        ),
        _run_remote_check(
            name="Remote rsync",
            command=remote_sh(f"command -v {shlex.quote(rsync_binary)} >/dev/null"),
            timeout_seconds=timeout_seconds,
        ),
        _run_remote_check(
            name="Remote source root",
            command=remote_sh(f"test -e {quoted_root} && test -r {quoted_root}"),
            timeout_seconds=timeout_seconds,
        ),
    ]
    result = {
        "ok": all(check["ok"] for check in checks),
        "checks": checks,
        "target": target,
        "source_root": source_root,
        "rsync_binary": rsync_binary,
        "timeout_seconds": timeout_seconds,
    }
    # Keep any other keys already stored on the host config.
    host.config = {**(host.config or {}), "last_preflight": result}
    host.save(update_fields=["config", "updated_at"])
    return result
|
||||
|
||||
|
||||
def effective_host_config_preview(host: HostConfig, global_config: GlobalConfig) -> dict[str, Any]:
|
||||
config = build_effective_config(global_config_object_data(global_config), host_config_object_data(host))
|
||||
credential = host.ssh_credential or global_config.default_ssh_credential
|
||||
@@ -98,3 +160,73 @@ def effective_host_config_preview(host: HostConfig, global_config: GlobalConfig)
|
||||
"yearly": retention.get("yearly", 0),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _run_remote_check(*, name: str, command: list[str], timeout_seconds: int) -> dict[str, Any]:
    """Run one preflight command and describe the outcome as a plain dict.

    Args:
        name: Human-readable check name, used in the messages.
        command: Argument list passed to ``subprocess.run`` (no shell).
        timeout_seconds: Seconds to wait before the check counts as timed out.

    Returns:
        A dict with the keys ``name``, ``ok``, ``exit_code``, ``message`` and
        ``detail``. A timeout is reported with exit code 124, a command that
        could not be started with exit code ``None``.
    """
    try:
        result = subprocess.run(
            command,
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout_seconds,
        )
    except subprocess.TimeoutExpired as exc:
        # TimeoutExpired carries the captured output as bytes even when
        # text=True was requested, so decode it before treating it as text.
        captured = exc.stderr or exc.stdout or ""
        if isinstance(captured, bytes):
            captured = captured.decode(errors="replace")
        return {
            "name": name,
            "ok": False,
            "exit_code": 124,
            "message": f"{name} timed out after {timeout_seconds}s.",
            "detail": _clip_output(captured.strip()),
        }
    except OSError as exc:
        return {
            "name": name,
            "ok": False,
            "exit_code": None,
            "message": f"{name} could not start.",
            "detail": str(exc),
        }

    return {
        "name": name,
        "ok": result.returncode == 0,
        "exit_code": result.returncode,
        "message": f"{name} passed." if result.returncode == 0 else f"{name} failed.",
        # stderr takes precedence; stdout is only used when stderr is empty.
        "detail": _clip_output((result.stderr or result.stdout or "").strip()),
    }
|
||||
|
||||
|
||||
def _remote_preflight_self_check(host: HostConfig) -> SelfCheck | None:
    """Translate the stored ``last_preflight`` result of ``host`` into a SelfCheck.

    A missing result yields a warning, a result without a ``checks`` list or
    with at least one failing check yields a failure, anything else is ok.
    """
    title = "Remote preflight"
    stored = (host.config or {}).get("last_preflight")
    if not isinstance(stored, dict):
        return SelfCheck(
            title,
            "warning",
            "No remote connection preflight has been run yet.",
            "Run connection preflight before the first real backup.",
        )

    entries = stored.get("checks")
    if not isinstance(entries, list):
        return SelfCheck(title, "failed", "Stored remote preflight result is invalid.")

    # Entries that are not dicts are skipped, exactly like passing checks.
    failed_names = []
    for entry in entries:
        if isinstance(entry, dict) and not entry.get("ok"):
            failed_names.append(str(entry.get("name", "unknown")))

    if failed_names:
        return SelfCheck(title, "failed", "Remote connection preflight failed.", ", ".join(failed_names))

    target = stored.get("target", "")
    source_root = stored.get("source_root", "")
    return SelfCheck(
        title,
        "ok",
        "Remote connection preflight passed.",
        f"{target} {source_root}".strip(),
    )
|
||||
|
||||
|
||||
def _clip_output(value: str, *, max_chars: int = 800) -> str:
|
||||
if len(value) <= max_chars:
|
||||
return value
|
||||
return f"{value[:max_chars]}..."
|
||||
|
||||
@@ -21,6 +21,10 @@
|
||||
{% csrf_token %}
|
||||
<button type="submit" class="secondary">Scan SSH host key</button>
|
||||
</form>
|
||||
<form method="post" action="{% url 'run_host_preflight' host.host %}">
|
||||
{% csrf_token %}
|
||||
<button type="submit" class="secondary">Run connection preflight</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="grid" aria-label="Host summary">
|
||||
@@ -199,6 +203,38 @@
|
||||
</table>
|
||||
</section>
|
||||
|
||||
{% if last_preflight %}
|
||||
<section class="panel">
|
||||
<h2>Connection Preflight</h2>
|
||||
<div class="stack spaced">
|
||||
<div><strong>Status:</strong> <span class="status {% if last_preflight.ok %}ok{% else %}failed{% endif %}">{% if last_preflight.ok %}ok{% else %}failed{% endif %}</span></div>
|
||||
<div><strong>Target:</strong> {{ last_preflight.target }}</div>
|
||||
<div><strong>Source root:</strong> {{ last_preflight.source_root }}</div>
|
||||
<div><strong>Remote rsync:</strong> {{ last_preflight.rsync_binary }}</div>
|
||||
</div>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Status</th>
|
||||
<th>Check</th>
|
||||
<th>Message</th>
|
||||
<th>Detail</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for check in last_preflight.checks %}
|
||||
<tr>
|
||||
<td><span class="status {% if check.ok %}ok{% else %}failed{% endif %}">{% if check.ok %}ok{% else %}failed{% endif %}</span></td>
|
||||
<td>{{ check.name }}</td>
|
||||
<td>{{ check.message }}</td>
|
||||
<td class="muted">{{ check.detail }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
{% endif %}
|
||||
|
||||
<section class="panel">
|
||||
<h2>Backup Control</h2>
|
||||
<div class="operator-state">
|
||||
|
||||
@@ -586,6 +586,7 @@ class ViewTests(TestCase):
|
||||
self.assertContains(response, "Discover snapshots")
|
||||
self.assertContains(response, "Edit schedule")
|
||||
self.assertContains(response, "Edit config")
|
||||
self.assertContains(response, "Run connection preflight")
|
||||
self.assertContains(response, "Backup Control")
|
||||
self.assertContains(response, "Queue dry-run")
|
||||
self.assertContains(response, "Queue backup")
|
||||
@@ -639,6 +640,68 @@ class ViewTests(TestCase):
|
||||
self.assertContains(response, "d14")
|
||||
self.assertContains(response, "w8")
|
||||
|
||||
def test_run_host_preflight_stores_remote_check_result(self) -> None:
    """Posting to the preflight view runs three checks and stores a passing result."""
    self.client.force_login(self.staff_user)
    GlobalConfig.objects.create(name="default", backup_root="/backups", rsync_args=["--archive"])
    host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
    preflight_url = reverse("run_host_preflight", args=[host.host])
    detail_url = reverse("host_detail", args=[host.host])
    # Every patched subprocess call reports success with no output.
    succeeded = subprocess.CompletedProcess(args=["ssh"], returncode=0, stdout="", stderr="")

    with patch("pobsync_backend.preflight.subprocess.run", return_value=succeeded) as run:
        response = self.client.post(preflight_url, follow=True)

    self.assertRedirects(response, detail_url)
    for expected_text in (
        "Connection preflight passed for web-01.",
        "Connection Preflight",
        "SSH reachability",
        "Remote rsync",
        "Remote source root",
    ):
        self.assertContains(response, expected_text)
    self.assertEqual(run.call_count, 3)

    host.refresh_from_db()
    stored = host.config["last_preflight"]
    self.assertTrue(stored["ok"])
    self.assertEqual(stored["target"], "root@web-01.example.test")
|
||||
|
||||
def test_queue_manual_backup_blocks_real_backup_after_failed_remote_preflight(self) -> None:
    """A stored failed preflight keeps a real backup from being queued."""
    self.client.force_login(self.staff_user)
    failed_rsync_check = {
        "name": "Remote rsync",
        "ok": False,
        "exit_code": 127,
        "message": "Remote rsync failed.",
        "detail": "rsync missing",
    }
    failed_preflight = {
        "ok": False,
        "target": "root@web-01.example.test",
        "source_root": "/",
        "rsync_binary": "rsync",
        "checks": [failed_rsync_check],
    }

    with TemporaryDirectory() as tmp:
        backup_root = Path(tmp)
        GlobalConfig.objects.create(name="default", backup_root=str(backup_root), rsync_args=["--archive"])
        host = HostConfig.objects.create(
            host="web-01",
            address="web-01.example.test",
            config={"last_preflight": failed_preflight},
        )
        # Create the host directories up front so only the preflight result fails.
        host_dir = backup_root / host.host
        for subdir in ("scheduled", "manual", ".incomplete"):
            (host_dir / subdir).mkdir(parents=True)

        queue_url = reverse("queue_manual_backup", args=[host.host])
        response = self.client.post(queue_url, {"prune_max_delete": "10"}, follow=True)

    self.assertRedirects(response, reverse("host_detail", args=[host.host]))
    self.assertContains(response, "Cannot queue real backup until failed preflight checks are resolved")
    self.assertContains(response, "Remote preflight")
    self.assertFalse(BackupRun.objects.exists())
|
||||
|
||||
def test_host_detail_renders_backup_trends(self) -> None:
|
||||
self.client.force_login(self.staff_user)
|
||||
GlobalConfig.objects.create(name="default", backup_root="/backups")
|
||||
|
||||
@@ -31,7 +31,7 @@ from .forms import (
|
||||
)
|
||||
from .host_ops import ensure_host_directories
|
||||
from .models import BackupRun, GlobalConfig, HostConfig, ScheduleConfig, SnapshotRecord, SshCredential
|
||||
from .preflight import collect_backup_gate, effective_host_config_preview
|
||||
from .preflight import collect_backup_gate, effective_host_config_preview, run_remote_preflight
|
||||
from .retention import run_sql_retention_apply, run_sql_retention_plan
|
||||
from .self_check import collect_self_checks, summarize_self_checks
|
||||
from .scheduler import next_due_after
|
||||
@@ -280,6 +280,7 @@ def host_detail(request, host: str):
|
||||
"host_checks": backup_gate.checks,
|
||||
"host_check_summary": summarize_self_checks(backup_gate.checks),
|
||||
"backup_gate": backup_gate,
|
||||
"last_preflight": (host_config.config or {}).get("last_preflight") if isinstance(host_config.config, dict) else {},
|
||||
"effective_config": effective_host_config_preview(host_config, global_config) if global_config else {},
|
||||
"stats_summary": stats_summary,
|
||||
"manual_backup_form": ManualBackupForm(initial=_default_manual_backup_initial(host_config)),
|
||||
@@ -336,6 +337,34 @@ def scan_host_known_key(request, host: str):
|
||||
return redirect("host_detail", host=host_config.host)
|
||||
|
||||
|
||||
@staff_member_required
@require_POST
def run_host_preflight(request, host: str):
    """Run the remote connection preflight for one host and report the outcome.

    Disabled hosts and a missing default global config are rejected with an
    error message. Every path ends in a redirect to the host detail page.
    """
    host_config = get_object_or_404(HostConfig, host=host)

    if not host_config.enabled:
        messages.error(request, f"Cannot run preflight for disabled host {host_config.host}.")
        return redirect("host_detail", host=host_config.host)

    default_config_exists = GlobalConfig.objects.filter(name="default").exists()
    if not default_config_exists:
        messages.error(request, "Create the default global config before running preflight.")
        return redirect("host_detail", host=host_config.host)

    try:
        result = run_remote_preflight(host_config)
    except Exception as exc:
        # View boundary: report any preflight error to the operator instead of a 500.
        messages.error(request, f"Connection preflight failed for {host_config.host}: {exc}")
        return redirect("host_detail", host=host_config.host)

    if result.get("ok"):
        messages.success(request, f"Connection preflight passed for {host_config.host}.")
        return redirect("host_detail", host=host_config.host)

    failed_names = []
    for check in result.get("checks", []):
        if isinstance(check, dict) and not check.get("ok"):
            failed_names.append(str(check.get("name")))
    messages.error(request, f"Connection preflight failed for {host_config.host}: {', '.join(failed_names)}.")
    return redirect("host_detail", host=host_config.host)
|
||||
|
||||
|
||||
@staff_member_required
|
||||
@require_POST
|
||||
def queue_manual_backup(request, host: str):
|
||||
|
||||
@@ -21,6 +21,7 @@ urlpatterns = [
|
||||
path("hosts/<str:host>/config/", views.edit_host_config, name="edit_host_config"),
|
||||
path("hosts/<str:host>/prepare-directories/", views.prepare_host_directories, name="prepare_host_directories"),
|
||||
path("hosts/<str:host>/scan-known-key/", views.scan_host_known_key, name="scan_host_known_key"),
|
||||
path("hosts/<str:host>/preflight/", views.run_host_preflight, name="run_host_preflight"),
|
||||
path("hosts/<str:host>/queue-backup/", views.queue_manual_backup, name="queue_manual_backup"),
|
||||
path("hosts/<str:host>/discover-snapshots/", views.discover_host_snapshots, name="discover_host_snapshots"),
|
||||
path("hosts/<str:host>/retention-apply/", views.apply_host_retention, name="apply_host_retention"),
|
||||
|
||||
Reference in New Issue
Block a user