(feature) Add backup safety and preflight validation #13
@@ -1,11 +1,15 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import shlex
|
||||||
|
import subprocess
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from pobsync.config.merge import build_effective_config
|
from pobsync.config.merge import build_effective_config
|
||||||
|
from pobsync.rsync import build_ssh_command
|
||||||
|
|
||||||
from .config_repository import global_config_object_data, host_config_object_data
|
from .config_repository import global_config_object_data, host_config_object_data
|
||||||
|
from .config_source import DjangoConfigSource
|
||||||
from .host_ops import collect_host_checks
|
from .host_ops import collect_host_checks
|
||||||
from .models import GlobalConfig, HostConfig
|
from .models import GlobalConfig, HostConfig
|
||||||
from .self_check import SelfCheck
|
from .self_check import SelfCheck
|
||||||
@@ -43,6 +47,9 @@ class BackupGate:
|
|||||||
|
|
||||||
def collect_backup_gate(host: HostConfig, global_config: GlobalConfig | None = None) -> BackupGate:
|
def collect_backup_gate(host: HostConfig, global_config: GlobalConfig | None = None) -> BackupGate:
|
||||||
checks = collect_host_checks(host, global_config)
|
checks = collect_host_checks(host, global_config)
|
||||||
|
remote_preflight_check = _remote_preflight_self_check(host)
|
||||||
|
if remote_preflight_check is not None:
|
||||||
|
checks.append(remote_preflight_check)
|
||||||
real_blockers = [check for check in checks if check.status == "failed"]
|
real_blockers = [check for check in checks if check.status == "failed"]
|
||||||
dry_run_blockers = [check for check in real_blockers if check.name in DRY_RUN_BLOCKING_CHECKS]
|
dry_run_blockers = [check for check in real_blockers if check.name in DRY_RUN_BLOCKING_CHECKS]
|
||||||
warnings = [check for check in checks if check.status == "warning"]
|
warnings = [check for check in checks if check.status == "warning"]
|
||||||
@@ -67,6 +74,61 @@ def collect_backup_gate(host: HostConfig, global_config: GlobalConfig | None = N
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_remote_preflight(host: HostConfig, *, timeout_seconds: int = 20) -> dict[str, Any]:
    """Run the remote connection preflight for ``host`` and store the result.

    Three checks are executed over SSH, in order: plain reachability, presence
    of the configured rsync binary on the remote side, and existence plus
    readability of the source root. All three always run, even when an earlier
    one fails.

    Args:
        host: Host whose effective configuration is probed. Its ``config`` is
            updated with a ``"last_preflight"`` entry and saved.
        timeout_seconds: Per-check timeout handed to each remote command.

    Returns:
        A dict with the overall ``"ok"`` flag, the individual ``"checks"``, and
        the ``"target"``, ``"source_root"``, ``"rsync_binary"`` and
        ``"timeout_seconds"`` values that were used.
    """
    config = DjangoConfigSource().effective_config_for_host(host.host)
    ssh_cfg = config.get("ssh", {}) or {}
    rsync_cfg = config.get("rsync", {}) or {}
    address = str(config.get("address") or host.address)
    user = str(ssh_cfg.get("user") or "root")
    source_root = str(config.get("source_root") or (config.get("defaults", {}) or {}).get("source_root") or "/")
    rsync_binary = str(rsync_cfg.get("binary") or "rsync")
    target = f"{user}@{address}"
    # BatchMode keeps ssh from blocking on an interactive password prompt.
    ssh_prefix = [*build_ssh_command(ssh_cfg), "-oBatchMode=yes", target]

    quoted_rsync = shlex.quote(rsync_binary)
    quoted_root = shlex.quote(source_root)
    checks = [
        _run_remote_check(
            name="SSH reachability",
            command=[*ssh_prefix, "true"],
            timeout_seconds=timeout_seconds,
        ),
        _run_remote_check(
            name="Remote rsync",
            command=_remote_shell_command(ssh_prefix, f"command -v {quoted_rsync} >/dev/null"),
            timeout_seconds=timeout_seconds,
        ),
        _run_remote_check(
            name="Remote source root",
            command=_remote_shell_command(ssh_prefix, f"test -e {quoted_root} && test -r {quoted_root}"),
            timeout_seconds=timeout_seconds,
        ),
    ]
    result = {
        "ok": all(check["ok"] for check in checks),
        "checks": checks,
        "target": target,
        "source_root": source_root,
        "rsync_binary": rsync_binary,
        "timeout_seconds": timeout_seconds,
    }
    # Persist next to the rest of the host config so the backup gate and the
    # host detail page can read the most recent outcome.
    host.config = {**(host.config or {}), "last_preflight": result}
    host.save(update_fields=["config", "updated_at"])
    return result


def _remote_shell_command(ssh_prefix: list[str], script: str) -> list[str]:
    """Return the argv that runs ``script`` through a login ``sh`` on the remote host."""
    # ssh joins every trailing argument with spaces and hands the resulting
    # string to the remote user's shell. Passing "sh", "-lc", script as three
    # separate arguments therefore makes the remote side run only the first
    # word of the script. Sending one pre-quoted word keeps the script intact.
    return [*ssh_prefix, f"sh -lc {shlex.quote(script)}"]
|
||||||
|
|
||||||
|
|
||||||
def effective_host_config_preview(host: HostConfig, global_config: GlobalConfig) -> dict[str, Any]:
|
def effective_host_config_preview(host: HostConfig, global_config: GlobalConfig) -> dict[str, Any]:
|
||||||
config = build_effective_config(global_config_object_data(global_config), host_config_object_data(host))
|
config = build_effective_config(global_config_object_data(global_config), host_config_object_data(host))
|
||||||
credential = host.ssh_credential or global_config.default_ssh_credential
|
credential = host.ssh_credential or global_config.default_ssh_credential
|
||||||
@@ -98,3 +160,73 @@ def effective_host_config_preview(host: HostConfig, global_config: GlobalConfig)
|
|||||||
"yearly": retention.get("yearly", 0),
|
"yearly": retention.get("yearly", 0),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _run_remote_check(*, name: str, command: list[str], timeout_seconds: int) -> dict[str, Any]:
    """Run one preflight command and summarise the outcome as a plain dict.

    Args:
        name: Human-readable check name, used in the result and its messages.
        command: Full argv to execute (no shell is involved locally).
        timeout_seconds: Maximum time the command may run.

    Returns:
        A dict with ``"name"``, ``"ok"``, ``"exit_code"``, ``"message"`` and
        ``"detail"``. A timeout is reported with exit code 124, a command that
        could not be started with exit code ``None``; neither raises.
    """
    try:
        result = subprocess.run(
            command,
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout_seconds,
        )
    except subprocess.TimeoutExpired as exc:
        # Output attached to TimeoutExpired is bytes even with text=True, so
        # decode it before it ends up in a result that is stored and rendered.
        partial = exc.stderr or exc.stdout or ""
        if isinstance(partial, bytes):
            partial = partial.decode("utf-8", errors="replace")
        return {
            "name": name,
            "ok": False,
            "exit_code": 124,
            "message": f"{name} timed out after {timeout_seconds}s.",
            "detail": _clip_output(partial.strip()),
        }
    except OSError as exc:
        return {
            "name": name,
            "ok": False,
            "exit_code": None,
            "message": f"{name} could not start.",
            "detail": str(exc),
        }

    succeeded = result.returncode == 0
    return {
        "name": name,
        "ok": succeeded,
        "exit_code": result.returncode,
        "message": f"{name} passed." if succeeded else f"{name} failed.",
        # stderr is preferred because that is where ssh reports its errors.
        "detail": _clip_output((result.stderr or result.stdout or "").strip()),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _remote_preflight_self_check(host: HostConfig) -> SelfCheck | None:
    """Turn the stored ``last_preflight`` result of ``host`` into a ``SelfCheck``.

    Returns:
        * a ``"warning"`` check when no preflight result is stored,
        * a ``"failed"`` check when the stored result is malformed (``checks``
          missing, empty, or containing non-dict entries) or when at least one
          stored check is not ok,
        * an ``"ok"`` check otherwise.
    """
    # host.config is free-form data; anything that is not a dict carries no
    # preflight result.
    config = host.config if isinstance(host.config, dict) else {}
    preflight = config.get("last_preflight")
    if not isinstance(preflight, dict):
        return SelfCheck(
            "Remote preflight",
            "warning",
            "No remote connection preflight has been run yet.",
            "Run connection preflight before the first real backup.",
        )

    checks = preflight.get("checks")
    # An empty list or stray non-dict entries prove nothing about the remote
    # host, so they must not be reported as a passing preflight.
    if not isinstance(checks, list) or not checks or not all(isinstance(check, dict) for check in checks):
        return SelfCheck("Remote preflight", "failed", "Stored remote preflight result is invalid.")

    failed = [str(check.get("name", "unknown")) for check in checks if not check.get("ok")]
    if failed:
        return SelfCheck(
            "Remote preflight",
            "failed",
            "Remote connection preflight failed.",
            ", ".join(failed),
        )
    return SelfCheck(
        "Remote preflight",
        "ok",
        "Remote connection preflight passed.",
        f"{preflight.get('target', '')} {preflight.get('source_root', '')}".strip(),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _clip_output(value: str | bytes, *, max_chars: int = 800) -> str:
    """Return ``value`` as text, shortened to ``max_chars`` characters.

    Args:
        value: Captured command output. Bytes are decoded as UTF-8 with
            undecodable sequences replaced, so the result is always ``str``.
        max_chars: Maximum number of characters kept before ``"..."`` is
            appended.

    Returns:
        ``value`` unchanged when it fits, otherwise its first ``max_chars``
        characters followed by ``"..."``.
    """
    if isinstance(value, bytes):
        # Without this, short bytes would be returned as bytes and long bytes
        # would be formatted as their "b'...'" repr.
        value = value.decode("utf-8", errors="replace")
    if len(value) <= max_chars:
        return value
    return f"{value[:max_chars]}..."
|
||||||
|
|||||||
@@ -21,6 +21,10 @@
|
|||||||
{% csrf_token %}
|
{% csrf_token %}
|
||||||
<button type="submit" class="secondary">Scan SSH host key</button>
|
<button type="submit" class="secondary">Scan SSH host key</button>
|
||||||
</form>
|
</form>
|
||||||
|
<form method="post" action="{% url 'run_host_preflight' host.host %}">
|
||||||
|
{% csrf_token %}
|
||||||
|
<button type="submit" class="secondary">Run connection preflight</button>
|
||||||
|
</form>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
<section class="grid" aria-label="Host summary">
|
<section class="grid" aria-label="Host summary">
|
||||||
@@ -199,6 +203,38 @@
|
|||||||
</table>
|
</table>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
{% if last_preflight %}
|
||||||
|
<section class="panel">
|
||||||
|
<h2>Connection Preflight</h2>
|
||||||
|
<div class="stack spaced">
|
||||||
|
<div><strong>Status:</strong> <span class="status {% if last_preflight.ok %}ok{% else %}failed{% endif %}">{% if last_preflight.ok %}ok{% else %}failed{% endif %}</span></div>
|
||||||
|
<div><strong>Target:</strong> {{ last_preflight.target }}</div>
|
||||||
|
<div><strong>Source root:</strong> {{ last_preflight.source_root }}</div>
|
||||||
|
<div><strong>Remote rsync:</strong> {{ last_preflight.rsync_binary }}</div>
|
||||||
|
</div>
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Status</th>
|
||||||
|
<th>Check</th>
|
||||||
|
<th>Message</th>
|
||||||
|
<th>Detail</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for check in last_preflight.checks %}
|
||||||
|
<tr>
|
||||||
|
<td><span class="status {% if check.ok %}ok{% else %}failed{% endif %}">{% if check.ok %}ok{% else %}failed{% endif %}</span></td>
|
||||||
|
<td>{{ check.name }}</td>
|
||||||
|
<td>{{ check.message }}</td>
|
||||||
|
<td class="muted">{{ check.detail }}</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</section>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
<section class="panel">
|
<section class="panel">
|
||||||
<h2>Backup Control</h2>
|
<h2>Backup Control</h2>
|
||||||
<div class="operator-state">
|
<div class="operator-state">
|
||||||
|
|||||||
@@ -586,6 +586,7 @@ class ViewTests(TestCase):
|
|||||||
self.assertContains(response, "Discover snapshots")
|
self.assertContains(response, "Discover snapshots")
|
||||||
self.assertContains(response, "Edit schedule")
|
self.assertContains(response, "Edit schedule")
|
||||||
self.assertContains(response, "Edit config")
|
self.assertContains(response, "Edit config")
|
||||||
|
self.assertContains(response, "Run connection preflight")
|
||||||
self.assertContains(response, "Backup Control")
|
self.assertContains(response, "Backup Control")
|
||||||
self.assertContains(response, "Queue dry-run")
|
self.assertContains(response, "Queue dry-run")
|
||||||
self.assertContains(response, "Queue backup")
|
self.assertContains(response, "Queue backup")
|
||||||
@@ -639,6 +640,68 @@ class ViewTests(TestCase):
|
|||||||
self.assertContains(response, "d14")
|
self.assertContains(response, "d14")
|
||||||
self.assertContains(response, "w8")
|
self.assertContains(response, "w8")
|
||||||
|
|
||||||
|
def test_run_host_preflight_stores_remote_check_result(self) -> None:
    """A passing preflight POST runs three remote commands and stores the result."""
    self.client.force_login(self.staff_user)
    GlobalConfig.objects.create(name="default", backup_root="/backups", rsync_args=["--archive"])
    host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
    # Every remote command "succeeds" without touching the network.
    completed = subprocess.CompletedProcess(args=["ssh"], returncode=0, stdout="", stderr="")
    preflight_url = reverse("run_host_preflight", args=[host.host])

    with patch("pobsync_backend.preflight.subprocess.run", return_value=completed) as run:
        response = self.client.post(preflight_url, follow=True)

    self.assertRedirects(response, reverse("host_detail", args=[host.host]))
    for expected_text in (
        "Connection preflight passed for web-01.",
        "Connection Preflight",
        "SSH reachability",
        "Remote rsync",
        "Remote source root",
    ):
        self.assertContains(response, expected_text)
    self.assertEqual(run.call_count, 3)

    host.refresh_from_db()
    stored = host.config["last_preflight"]
    self.assertTrue(stored["ok"])
    self.assertEqual(stored["target"], "root@web-01.example.test")
|
||||||
|
|
||||||
|
def test_queue_manual_backup_blocks_real_backup_after_failed_remote_preflight(self) -> None:
    """A stored failed preflight must keep a real backup from being queued."""
    failed_rsync_check = {
        "name": "Remote rsync",
        "ok": False,
        "exit_code": 127,
        "message": "Remote rsync failed.",
        "detail": "rsync missing",
    }
    stored_preflight = {
        "ok": False,
        "target": "root@web-01.example.test",
        "source_root": "/",
        "rsync_binary": "rsync",
        "checks": [failed_rsync_check],
    }

    self.client.force_login(self.staff_user)
    with TemporaryDirectory() as tmp:
        backup_root = Path(tmp)
        GlobalConfig.objects.create(name="default", backup_root=str(backup_root), rsync_args=["--archive"])
        host = HostConfig.objects.create(
            host="web-01",
            address="web-01.example.test",
            config={"last_preflight": stored_preflight},
        )
        # Create the host directories so the stored preflight is the only
        # thing left that can block the backup.
        host_root = backup_root / host.host
        for subdir in ("scheduled", "manual", ".incomplete"):
            (host_root / subdir).mkdir(parents=True)

        queue_url = reverse("queue_manual_backup", args=[host.host])
        response = self.client.post(queue_url, {"prune_max_delete": "10"}, follow=True)

        self.assertRedirects(response, reverse("host_detail", args=[host.host]))
        self.assertContains(response, "Cannot queue real backup until failed preflight checks are resolved")
        self.assertContains(response, "Remote preflight")
        self.assertFalse(BackupRun.objects.exists())
|
||||||
|
|
||||||
def test_host_detail_renders_backup_trends(self) -> None:
|
def test_host_detail_renders_backup_trends(self) -> None:
|
||||||
self.client.force_login(self.staff_user)
|
self.client.force_login(self.staff_user)
|
||||||
GlobalConfig.objects.create(name="default", backup_root="/backups")
|
GlobalConfig.objects.create(name="default", backup_root="/backups")
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from .forms import (
|
|||||||
)
|
)
|
||||||
from .host_ops import ensure_host_directories
|
from .host_ops import ensure_host_directories
|
||||||
from .models import BackupRun, GlobalConfig, HostConfig, ScheduleConfig, SnapshotRecord, SshCredential
|
from .models import BackupRun, GlobalConfig, HostConfig, ScheduleConfig, SnapshotRecord, SshCredential
|
||||||
from .preflight import collect_backup_gate, effective_host_config_preview
|
from .preflight import collect_backup_gate, effective_host_config_preview, run_remote_preflight
|
||||||
from .retention import run_sql_retention_apply, run_sql_retention_plan
|
from .retention import run_sql_retention_apply, run_sql_retention_plan
|
||||||
from .self_check import collect_self_checks, summarize_self_checks
|
from .self_check import collect_self_checks, summarize_self_checks
|
||||||
from .scheduler import next_due_after
|
from .scheduler import next_due_after
|
||||||
@@ -280,6 +280,7 @@ def host_detail(request, host: str):
|
|||||||
"host_checks": backup_gate.checks,
|
"host_checks": backup_gate.checks,
|
||||||
"host_check_summary": summarize_self_checks(backup_gate.checks),
|
"host_check_summary": summarize_self_checks(backup_gate.checks),
|
||||||
"backup_gate": backup_gate,
|
"backup_gate": backup_gate,
|
||||||
|
"last_preflight": (host_config.config or {}).get("last_preflight") if isinstance(host_config.config, dict) else {},
|
||||||
"effective_config": effective_host_config_preview(host_config, global_config) if global_config else {},
|
"effective_config": effective_host_config_preview(host_config, global_config) if global_config else {},
|
||||||
"stats_summary": stats_summary,
|
"stats_summary": stats_summary,
|
||||||
"manual_backup_form": ManualBackupForm(initial=_default_manual_backup_initial(host_config)),
|
"manual_backup_form": ManualBackupForm(initial=_default_manual_backup_initial(host_config)),
|
||||||
@@ -336,6 +337,34 @@ def scan_host_known_key(request, host: str):
|
|||||||
return redirect("host_detail", host=host_config.host)
|
return redirect("host_detail", host=host_config.host)
|
||||||
|
|
||||||
|
|
||||||
|
@staff_member_required
@require_POST
def run_host_preflight(request, host: str):
    """Run the remote connection preflight for one host and report the outcome.

    The preflight is refused for a disabled host and when no ``default``
    global config exists. Every path ends in a redirect to the host detail
    page, with the result attached as a Django message.
    """
    host_config = get_object_or_404(HostConfig, host=host)
    if not host_config.enabled:
        messages.error(request, f"Cannot run preflight for disabled host {host_config.host}.")
        return redirect("host_detail", host=host_config.host)
    if not GlobalConfig.objects.filter(name="default").exists():
        messages.error(request, "Create the default global config before running preflight.")
        return redirect("host_detail", host=host_config.host)

    try:
        result = run_remote_preflight(host_config)
    except Exception as exc:
        # View boundary: show the operator a readable error instead of a 500,
        # but keep the traceback in the log so the cause is not lost.
        import logging

        logging.getLogger(__name__).exception("Connection preflight crashed for host %s", host_config.host)
        messages.error(request, f"Connection preflight failed for {host_config.host}: {exc}")
        return redirect("host_detail", host=host_config.host)

    if result.get("ok"):
        messages.success(request, f"Connection preflight passed for {host_config.host}.")
    else:
        # Use the same "unknown" fallback as the stored-preflight self check,
        # so a nameless entry never shows up as the literal text "None".
        failed = [
            str(check.get("name", "unknown"))
            for check in result.get("checks", [])
            if isinstance(check, dict) and not check.get("ok")
        ]
        messages.error(request, f"Connection preflight failed for {host_config.host}: {', '.join(failed)}.")
    return redirect("host_detail", host=host_config.host)
|
||||||
|
|
||||||
|
|
||||||
@staff_member_required
|
@staff_member_required
|
||||||
@require_POST
|
@require_POST
|
||||||
def queue_manual_backup(request, host: str):
|
def queue_manual_backup(request, host: str):
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ urlpatterns = [
|
|||||||
path("hosts/<str:host>/config/", views.edit_host_config, name="edit_host_config"),
|
path("hosts/<str:host>/config/", views.edit_host_config, name="edit_host_config"),
|
||||||
path("hosts/<str:host>/prepare-directories/", views.prepare_host_directories, name="prepare_host_directories"),
|
path("hosts/<str:host>/prepare-directories/", views.prepare_host_directories, name="prepare_host_directories"),
|
||||||
path("hosts/<str:host>/scan-known-key/", views.scan_host_known_key, name="scan_host_known_key"),
|
path("hosts/<str:host>/scan-known-key/", views.scan_host_known_key, name="scan_host_known_key"),
|
||||||
|
path("hosts/<str:host>/preflight/", views.run_host_preflight, name="run_host_preflight"),
|
||||||
path("hosts/<str:host>/queue-backup/", views.queue_manual_backup, name="queue_manual_backup"),
|
path("hosts/<str:host>/queue-backup/", views.queue_manual_backup, name="queue_manual_backup"),
|
||||||
path("hosts/<str:host>/discover-snapshots/", views.discover_host_snapshots, name="discover_host_snapshots"),
|
path("hosts/<str:host>/discover-snapshots/", views.discover_host_snapshots, name="discover_host_snapshots"),
|
||||||
path("hosts/<str:host>/retention-apply/", views.apply_host_retention, name="apply_host_retention"),
|
path("hosts/<str:host>/retention-apply/", views.apply_host_retention, name="apply_host_retention"),
|
||||||
|
|||||||
Reference in New Issue
Block a user