(feature) Add host backup preflight gates

Introduce a host preflight layer that separates dry-run blockers from real backup blockers.
Show the effective per-host backup configuration in Django before queueing a run.

Block real backup queueing when failed host checks remain, while still allowing dry-runs
when only local storage preparation is missing.
This commit is contained in:
2026-05-21 00:41:45 +02:00
parent 155ff63a73
commit 64a0ff8322
6 changed files with 290 additions and 34 deletions

View File

@@ -77,6 +77,14 @@ def _host_yaml_data(host_config: HostConfig) -> dict[str, Any]:
return validate_dict(data, HOST_SCHEMA, path="host")
def global_config_object_data(global_config: GlobalConfig) -> dict[str, Any]:
    """Return the config dict produced by ``_global_yaml_data`` for a model instance.

    Unlike ``global_config_data``, which looks the record up by name, this
    accepts an already-loaded ``GlobalConfig`` object.
    """
    data = _global_yaml_data(global_config)
    return data
def host_config_object_data(host_config: HostConfig) -> dict[str, Any]:
    """Return the config dict produced by ``_host_yaml_data`` for a model instance.

    Public wrapper for callers that already hold a ``HostConfig`` object.
    """
    data = _host_yaml_data(host_config)
    return data
def global_config_data(name: str = "default") -> dict[str, Any]:
try:
global_config = GlobalConfig.objects.get(name=name)

View File

@@ -0,0 +1,100 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from pobsync.config.merge import build_effective_config
from .config_repository import global_config_object_data, host_config_object_data
from .host_ops import collect_host_checks
from .models import GlobalConfig, HostConfig
from .self_check import SelfCheck
# Names of host checks that must pass before even a dry-run may be queued.
# A failed check whose name is not listed here blocks real backups only.
# Stored as a frozenset so this shared module-level constant cannot be
# mutated at runtime; membership tests behave exactly as with a plain set.
DRY_RUN_BLOCKING_CHECKS = frozenset(
    {
        "Host global config",
        "Host address",
        "Host SSH key file",
        "Host effective source root",
        "Host effective SSH user",
        "Host effective SSH port",
        "Host effective SSH credential",
        "Host effective rsync recursion",
    }
)
@dataclass(frozen=True)
class BackupGate:
    """Immutable outcome of a host's backup preflight evaluation.

    Bundles the overall state and message with every collected check and
    the subsets of checks that block real runs, block dry-runs, or only warn.
    """

    # Overall gate state label (e.g. "blocked", "warning", "ready").
    state: str
    # Human-readable explanation that accompanies the state.
    message: str
    # Every check that was collected for the host.
    checks: list[SelfCheck]
    # Checks that prevent a real backup from being queued.
    real_blockers: list[SelfCheck]
    # Checks that prevent a dry-run from being queued.
    dry_run_blockers: list[SelfCheck]
    # Checks reported as warnings.
    warnings: list[SelfCheck]

    @property
    def can_queue_real(self) -> bool:
        """Return True when no real-backup blockers remain."""
        if self.real_blockers:
            return False
        return True

    @property
    def can_queue_dry_run(self) -> bool:
        """Return True when no dry-run blockers remain."""
        if self.dry_run_blockers:
            return False
        return True
def collect_backup_gate(host: HostConfig, global_config: GlobalConfig | None = None) -> BackupGate:
    """Collect the host checks and classify them into a ``BackupGate``.

    Every check with status ``"failed"`` blocks real backups. Of those, only
    checks whose name appears in ``DRY_RUN_BLOCKING_CHECKS`` also block
    dry-runs. Checks with status ``"warning"`` are reported separately.

    Args:
        host: Host whose checks are collected.
        global_config: Optional global config forwarded to ``collect_host_checks``.

    Returns:
        A ``BackupGate`` whose state is ``"blocked"``, ``"warning"`` or ``"ready"``.
    """
    checks = collect_host_checks(host, global_config)

    failed = []
    dry_run_failed = []
    cautions = []
    for check in checks:
        if check.status == "failed":
            failed.append(check)
            if check.name in DRY_RUN_BLOCKING_CHECKS:
                dry_run_failed.append(check)
        elif check.status == "warning":
            cautions.append(check)

    # Failures take precedence over warnings when summarizing the gate.
    if failed:
        state, message = "blocked", "Real backups are blocked until failed host checks are resolved."
    elif cautions:
        state, message = "warning", "Backups can run, but review the warnings first."
    else:
        state, message = "ready", "This host is ready for backup runs."

    return BackupGate(
        state=state,
        message=message,
        checks=checks,
        real_blockers=failed,
        dry_run_blockers=dry_run_failed,
        warnings=cautions,
    )
def effective_host_config_preview(host: HostConfig, global_config: GlobalConfig) -> dict[str, Any]:
    """Build a display-oriented summary of the merged global and host config.

    The global and host model objects are converted to dicts, merged with
    ``build_effective_config``, and reduced to the fields listed in the
    returned mapping. Missing values fall back to ``""``, ``0`` or an empty
    list, matching the defaults used for each key below.

    Args:
        host: Host configuration to preview.
        global_config: Global configuration the host is merged with.

    Returns:
        A nested dict with ``source_root``, ``destination_subdir``,
        ``includes``, ``excludes``, ``ssh``, ``rsync`` and ``retention`` keys.
    """
    global_data = global_config_object_data(global_config)
    host_data = host_config_object_data(host)
    merged = build_effective_config(global_data, host_data)

    def section(key: str) -> dict[str, Any]:
        # A missing or falsy section is treated as an empty mapping.
        return merged.get(key, {}) or {}

    # The host's own credential wins; otherwise use the global default.
    credential = host.ssh_credential or global_config.default_ssh_credential
    if credential:
        credential_label = str(credential)
    else:
        credential_label = ""

    ssh_section = section("ssh")
    rsync_section = section("rsync")
    retention_section = section("retention")
    defaults_section = section("defaults")

    ssh_preview = {
        "user": ssh_section.get("user", ""),
        "port": ssh_section.get("port", ""),
        "options": list(ssh_section.get("options") or []),
        "credential": credential_label,
    }
    rsync_preview = {
        "binary": rsync_section.get("binary", ""),
        "args": list(rsync_section.get("args_effective") or []),
        "timeout_seconds": rsync_section.get("timeout_seconds", 0),
        "bwlimit_kbps": rsync_section.get("bwlimit_kbps", 0),
    }
    retention_preview = {
        "daily": retention_section.get("daily", 0),
        "weekly": retention_section.get("weekly", 0),
        "monthly": retention_section.get("monthly", 0),
        "yearly": retention_section.get("yearly", 0),
    }

    return {
        "source_root": merged.get("source_root", ""),
        "destination_subdir": defaults_section.get("destination_subdir", ""),
        "includes": list(merged.get("includes") or []),
        "excludes": list(merged.get("excludes_effective") or []),
        "ssh": ssh_preview,
        "rsync": rsync_preview,
        "retention": retention_preview,
    }

View File

@@ -81,6 +81,7 @@
.status.success { color: var(--success); border-color: #a7d8b9; background: #edf8f1; }
.status.ok { color: var(--success); border-color: #a7d8b9; background: #edf8f1; }
.status.failed { color: var(--failed); border-color: #e8b4b4; background: #fff0f0; }
.status.blocked { color: var(--failed); border-color: #e8b4b4; background: #fff0f0; }
.status.running { color: var(--running); border-color: #e7cf8a; background: #fff8df; }
.status.warning { color: var(--running); border-color: #e7cf8a; background: #fff8df; }
.status.queued { color: var(--link); border-color: #b5cdea; background: #eef6ff; }

View File

@@ -64,6 +64,46 @@
</section>
</div>
{% if effective_config %}
<section class="panel">
<h2>Effective Config</h2>
<div class="two-col">
<div class="stack">
<div><strong>Source root:</strong> {{ effective_config.source_root }}</div>
<div><strong>Destination subdir:</strong> {{ effective_config.destination_subdir|default:"none" }}</div>
<div><strong>SSH:</strong> {{ effective_config.ssh.user }}@{{ host.address }}:{{ effective_config.ssh.port }}</div>
<div><strong>SSH key:</strong> {{ effective_config.ssh.credential|default:"none selected" }}</div>
<div><strong>SSH options:</strong> {{ effective_config.ssh.options|join:" " }}</div>
<div><strong>Rsync binary:</strong> {{ effective_config.rsync.binary }}</div>
<div><strong>Rsync args:</strong> {{ effective_config.rsync.args|join:" " }}</div>
<div><strong>Timeout:</strong> {{ effective_config.rsync.timeout_seconds }}s</div>
<div><strong>Bandwidth limit:</strong> {{ effective_config.rsync.bwlimit_kbps }} KB/s</div>
<div>
<strong>Retention:</strong>
d{{ effective_config.retention.daily }}
w{{ effective_config.retention.weekly }}
m{{ effective_config.retention.monthly }}
y{{ effective_config.retention.yearly }}
</div>
</div>
<div class="stack">
<div><strong>Includes:</strong> {{ effective_config.includes|length }}</div>
{% if effective_config.includes %}
<pre>{{ effective_config.includes|join:"&#10;" }}</pre>
{% else %}
<div class="muted">No include rules configured.</div>
{% endif %}
<div><strong>Excludes:</strong> {{ effective_config.excludes|length }}</div>
{% if effective_config.excludes %}
<pre>{{ effective_config.excludes|join:"&#10;" }}</pre>
{% else %}
<div class="muted">No exclude rules configured.</div>
{% endif %}
</div>
</div>
</section>
{% endif %}
<section class="panel">
<h2>Snapshot Discovery</h2>
<div class="stack">
@@ -165,8 +205,9 @@
{% if active_run %}
<span class="status {{ active_run.status }}">{{ active_run.status }}</span>
<a href="{% url 'run_detail' active_run.id %}">Run {{ active_run.id }}</a>
{% elif can_queue_backup %}
<span class="status success">ready</span>
{% elif has_global_config and host.enabled %}
<span class="status {{ backup_gate.state }}">{{ backup_gate.state }}</span>
<span class="muted">{{ backup_gate.message }}</span>
{% elif not host.enabled %}
<span class="status failed">disabled</span>
{% elif not has_global_config %}
@@ -180,20 +221,24 @@
<input type="hidden" name="dry_run" value="on">
<input type="hidden" name="verbose_output" value="on">
<input type="hidden" name="prune_max_delete" value="10">
<button type="submit" class="secondary" {% if not can_queue_backup %}disabled{% endif %}>Queue dry-run</button>
<button type="submit" class="secondary" {% if not can_queue_dry_run %}disabled{% endif %}>Queue dry-run</button>
</form>
<form class="inline-form" method="post" action="{% url 'queue_manual_backup' host.host %}">
{% csrf_token %}
<input type="hidden" name="prune_max_delete" value="10">
<button type="submit" {% if not can_queue_backup %}disabled{% endif %}>Queue backup</button>
<button type="submit" {% if not can_queue_real_backup %}disabled{% endif %}>Queue backup</button>
</form>
</section>
{% if not can_queue_backup %}
{% if active_run %}
<p class="muted">Wait for the active run to finish, or cancel it from the run detail page.</p>
{% elif not can_queue_dry_run or not can_queue_real_backup %}
{% if not has_global_config %}
<p class="muted">Create the default global config before queueing backups.</p>
{% elif not host.enabled %}
<p class="muted">Enable this host before queueing backups.</p>
{% elif backup_gate.real_blockers %}
<p class="muted">Real backups are blocked by failed preflight checks. Dry-runs may still be available when storage-only checks fail.</p>
{% endif %}
{% endif %}
@@ -212,7 +257,7 @@
{% endfor %}
<div class="actions">
<button type="submit" {% if not can_queue_backup %}disabled{% endif %}>Queue with options</button>
<button type="submit" {% if not can_queue_dry_run and not can_queue_real_backup %}disabled{% endif %}>Queue with options</button>
</div>
</form>
</section>

View File

@@ -557,13 +557,17 @@ class ViewTests(TestCase):
def test_host_detail_renders_config_schedule_runs_and_snapshots(self) -> None:
self.client.force_login(self.staff_user)
GlobalConfig.objects.create(name="default", backup_root="/backups")
with TemporaryDirectory() as tmp:
backup_root = Path(tmp)
GlobalConfig.objects.create(name="default", backup_root=str(backup_root), rsync_args=["--archive"])
host = HostConfig.objects.create(
host="web-01",
address="web-01.example.test",
source_root="/srv",
retention_daily=7,
)
for subdir in ("scheduled", "manual", ".incomplete"):
(backup_root / host.host / subdir).mkdir(parents=True)
ScheduleConfig.objects.create(host=host, cron_expr="15 2 * * *", prune=True, last_status="success")
snapshot = self._snapshot(host, "20260519-021500Z__ABCDEFGH")
BackupRun.objects.create(host=host, status=BackupRun.Status.SUCCESS, snapshot=snapshot)
@@ -587,12 +591,54 @@ class ViewTests(TestCase):
self.assertContains(response, "Queue backup")
self.assertContains(response, "Host Check")
self.assertContains(response, reverse("prepare_host_directories", args=[host.host]))
self.assertContains(response, "ready")
self.assertContains(response, "warning")
self.assertContains(response, "Snapshot Discovery")
self.assertContains(response, reverse("queue_manual_backup", args=[host.host]))
self.assertContains(response, reverse("run_detail", args=[BackupRun.objects.get().id]))
self.assertContains(response, reverse("snapshot_detail", args=[snapshot.id]))
def test_host_detail_renders_effective_config_preview(self) -> None:
"""The host detail page renders the "Effective Config" panel with merged global and host values."""
self.client.force_login(self.staff_user)
credential = SshCredential.objects.create(name="default-key", key_path="/var/lib/pobsync/id_ed25519")
# Global config supplies the default credential, SSH settings, rsync settings,
# default excludes and retention values.
GlobalConfig.objects.create(
name="default",
backup_root="/backups",
default_ssh_credential=credential,
ssh_user="root",
ssh_port=2222,
ssh_options=["-oBatchMode=yes"],
rsync_args=["--archive", "--numeric-ids"],
rsync_extra_args=["--delete"],
rsync_timeout_seconds=300,
rsync_bwlimit_kbps=2048,
default_source_root="/",
excludes_default=["/proc/***", "/sys/***"],
retention_daily=14,
retention_weekly=4,
retention_monthly=2,
retention_yearly=1,
)
# The host adds its own include rule, an extra exclude and an extra rsync argument.
host = HostConfig.objects.create(
host="web-01",
address="web-01.example.test",
includes=["/srv/www/***"],
excludes_add=["/srv/www/cache/***"],
rsync_extra_args=["--one-file-system"],
)
response = self.client.get(reverse("host_detail", args=[host.host]))
self.assertEqual(response.status_code, 200)
self.assertContains(response, "Effective Config")
# SSH target is rendered as user@address:port.
self.assertContains(response, "root@web-01.example.test:2222")
self.assertContains(response, "default-key")
self.assertContains(response, "-oBatchMode=yes")
# Expected order: global rsync_args, then global rsync_extra_args, then the host's rsync_extra_args.
self.assertContains(response, "--archive --numeric-ids --delete --one-file-system")
self.assertContains(response, "/srv/www/***")
self.assertContains(response, "/srv/www/cache/***")
self.assertContains(response, "d14")
# NOTE(review): the global config above sets retention_weekly=4, yet "w8" is expected here.
# Presumably a host-level retention value takes precedence in the merge — confirm against
# the HostConfig field defaults.
self.assertContains(response, "w8")
def test_host_detail_renders_backup_trends(self) -> None:
self.client.force_login(self.staff_user)
GlobalConfig.objects.create(name="default", backup_root="/backups")
@@ -779,7 +825,7 @@ class ViewTests(TestCase):
def test_queue_manual_backup_creates_queued_run_and_redirects_to_run_detail(self) -> None:
self.client.force_login(self.staff_user)
GlobalConfig.objects.create(name="default", backup_root="/backups")
GlobalConfig.objects.create(name="default", backup_root="/backups", rsync_args=["--archive"])
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
response = self.client.post(
@@ -812,8 +858,12 @@ class ViewTests(TestCase):
def test_queue_manual_backup_quick_action_can_queue_real_backup(self) -> None:
self.client.force_login(self.staff_user)
GlobalConfig.objects.create(name="default", backup_root="/backups")
with TemporaryDirectory() as tmp:
backup_root = Path(tmp)
GlobalConfig.objects.create(name="default", backup_root=str(backup_root), rsync_args=["--archive"])
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
for subdir in ("scheduled", "manual", ".incomplete"):
(backup_root / host.host / subdir).mkdir(parents=True)
response = self.client.post(
reverse("queue_manual_backup", args=[host.host]),
@@ -834,6 +884,41 @@ class ViewTests(TestCase):
},
)
def test_queue_manual_backup_blocks_real_backup_when_host_directories_are_missing(self) -> None:
"""A real (non-dry-run) backup request is rejected when the host's backup directories were never created."""
self.client.force_login(self.staff_user)
with TemporaryDirectory() as tmp:
backup_root = Path(tmp)
GlobalConfig.objects.create(name="default", backup_root=str(backup_root), rsync_args=["--archive"])
# No per-host subdirectories are created under the temporary backup root in this test.
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
# The POST carries no "dry_run" field, so it requests a real backup.
response = self.client.post(
reverse("queue_manual_backup", args=[host.host]),
{"prune_max_delete": "10"},
follow=True,
)
# The request is sent back to the host detail page with an error naming the failed check.
self.assertRedirects(response, reverse("host_detail", args=[host.host]))
self.assertContains(response, "Cannot queue real backup until failed preflight checks are resolved")
self.assertContains(response, "Host backup root")
# Nothing may have been queued.
self.assertFalse(BackupRun.objects.exists())
def test_queue_manual_backup_allows_dry_run_with_only_storage_preflight_failures(self) -> None:
"""A dry-run request is still queued when the host's backup directories have not been created."""
self.client.force_login(self.staff_user)
with TemporaryDirectory() as tmp:
backup_root = Path(tmp)
GlobalConfig.objects.create(name="default", backup_root=str(backup_root), rsync_args=["--archive"])
# As in the real-backup test, no per-host subdirectories are created under the backup root.
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
# "dry_run": "on" marks this request as a dry-run.
response = self.client.post(
reverse("queue_manual_backup", args=[host.host]),
{"dry_run": "on", "verbose_output": "on", "prune_max_delete": "10"},
follow=True,
)
# A run must exist for the host, and the response must land on its detail page.
run = BackupRun.objects.get(host=host)
self.assertRedirects(response, reverse("run_detail", args=[run.id]))
self.assertEqual(run.result["requested"]["dry_run"], True)
def test_queue_manual_backup_requires_default_global_config(self) -> None:
self.client.force_login(self.staff_user)
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
@@ -846,7 +931,7 @@ class ViewTests(TestCase):
def test_queue_manual_backup_rejects_disabled_host(self) -> None:
self.client.force_login(self.staff_user)
GlobalConfig.objects.create(name="default", backup_root="/backups")
GlobalConfig.objects.create(name="default", backup_root="/backups", rsync_args=["--archive"])
host = HostConfig.objects.create(host="web-01", address="web-01.example.test", enabled=False)
response = self.client.post(reverse("queue_manual_backup", args=[host.host]), {"dry_run": "on"}, follow=True)

View File

@@ -29,8 +29,9 @@ from .forms import (
ScheduleConfigForm,
SshCredentialForm,
)
from .host_ops import collect_host_checks, ensure_host_directories
from .host_ops import ensure_host_directories
from .models import BackupRun, GlobalConfig, HostConfig, ScheduleConfig, SnapshotRecord, SshCredential
from .preflight import collect_backup_gate, effective_host_config_preview
from .retention import run_sql_retention_apply, run_sql_retention_plan
from .self_check import collect_self_checks, summarize_self_checks
from .scheduler import next_due_after
@@ -260,14 +261,15 @@ def create_host_config(request):
@staff_member_required
def host_detail(request, host: str):
host_config = get_object_or_404(HostConfig, host=host)
global_config = GlobalConfig.objects.filter(name="default").first()
schedule = _schedule_for_host(host_config)
queued_runs = host_config.runs.filter(status=BackupRun.Status.QUEUED)
running_runs = host_config.runs.filter(status=BackupRun.Status.RUNNING)
active_run = host_config.runs.filter(
status__in=[BackupRun.Status.QUEUED, BackupRun.Status.RUNNING]
).order_by("created_at", "id").first()
has_global_config = GlobalConfig.objects.filter(name="default").exists()
host_checks = collect_host_checks(host_config)
has_global_config = global_config is not None
backup_gate = collect_backup_gate(host_config, global_config)
stats_summary = collect_host_stats(host=host_config, limit=10)
context = {
"host": host_config,
@@ -275,11 +277,14 @@ def host_detail(request, host: str):
"next_run_at": _next_run_for_schedule(schedule, host_config),
"scheduler_timezone": timezone.get_current_timezone_name(),
"discovery": inspect_snapshot_discovery(host=host_config),
"host_checks": host_checks,
"host_check_summary": summarize_self_checks(host_checks),
"host_checks": backup_gate.checks,
"host_check_summary": summarize_self_checks(backup_gate.checks),
"backup_gate": backup_gate,
"effective_config": effective_host_config_preview(host_config, global_config) if global_config else {},
"stats_summary": stats_summary,
"manual_backup_form": ManualBackupForm(initial=_default_manual_backup_initial(host_config)),
"can_queue_backup": host_config.enabled and has_global_config,
"can_queue_dry_run": host_config.enabled and has_global_config and backup_gate.can_queue_dry_run and active_run is None,
"can_queue_real_backup": host_config.enabled and has_global_config and backup_gate.can_queue_real and active_run is None,
"has_global_config": has_global_config,
"active_run": active_run,
"latest_runs": host_config.runs.select_related("snapshot").order_by("-created_at")[:10],
@@ -338,7 +343,8 @@ def queue_manual_backup(request, host: str):
if not host_config.enabled:
messages.error(request, f"Cannot queue backup for disabled host {host_config.host}.")
return redirect("host_detail", host=host_config.host)
if not GlobalConfig.objects.filter(name="default").exists():
global_config = GlobalConfig.objects.filter(name="default").first()
if global_config is None:
messages.error(request, "Create the default global config before queueing backups.")
return redirect("host_detail", host=host_config.host)
@@ -347,6 +353,17 @@ def queue_manual_backup(request, host: str):
messages.error(request, "Manual backup options are invalid.")
return redirect("host_detail", host=host_config.host)
backup_gate = collect_backup_gate(host_config, global_config)
if form.cleaned_data["dry_run"]:
if not backup_gate.can_queue_dry_run:
blockers = ", ".join(check.name for check in backup_gate.dry_run_blockers)
messages.error(request, f"Cannot queue dry-run until failed preflight checks are resolved: {blockers}.")
return redirect("host_detail", host=host_config.host)
elif not backup_gate.can_queue_real:
blockers = ", ".join(check.name for check in backup_gate.real_blockers)
messages.error(request, f"Cannot queue real backup until failed preflight checks are resolved: {blockers}.")
return redirect("host_detail", host=host_config.host)
run = queue_backup_run(
host=host_config,
dry_run=form.cleaned_data["dry_run"],