(feature) Improve retention UX and prune safety #14

Merged
parkel merged 5 commits from issue-4-retention-ux-and-safety into master 2026-05-21 01:27:20 +02:00
8 changed files with 297 additions and 9 deletions
Showing only changes of commit 50eb7cf2f3 - Show all commits

View File

@@ -23,6 +23,7 @@ def run_sql_retention_plan(*, host: str, kind: str, protect_bases: bool) -> dict
host_config = _enabled_host_config(host) host_config = _enabled_host_config(host)
retention = _retention_for_host(host_config) retention = _retention_for_host(host_config)
snapshots = _snapshots_for_retention(host_config=host_config, kind=kind) snapshots = _snapshots_for_retention(host_config=host_config, kind=kind)
incomplete_snapshots = _incomplete_snapshots_for_host(host_config)
plan = build_retention_plan( plan = build_retention_plan(
snapshots=snapshots, snapshots=snapshots,
@@ -36,6 +37,7 @@ def run_sql_retention_plan(*, host: str, kind: str, protect_bases: bool) -> dict
keep, reasons = apply_base_protection(snapshots=snapshots, keep=keep, reasons=reasons) keep, reasons = apply_base_protection(snapshots=snapshots, keep=keep, reasons=reasons)
delete = [snapshot for snapshot in snapshots if snapshot.dirname not in keep] delete = [snapshot for snapshot in snapshots if snapshot.dirname not in keep]
keep_items = [snapshot for snapshot in snapshots if snapshot.dirname in keep]
return { return {
"ok": True, "ok": True,
@@ -45,7 +47,12 @@ def run_sql_retention_plan(*, host: str, kind: str, protect_bases: bool) -> dict
"retention": retention, "retention": retention,
"source": "sql", "source": "sql",
"keep": sorted(keep), "keep": sorted(keep),
"delete": [_snapshot_to_delete_item(snapshot) for snapshot in delete], "keep_items": [_snapshot_to_item(snapshot, reasons=reasons.get(snapshot.dirname, [])) for snapshot in keep_items],
"delete": [_snapshot_to_item(snapshot, reasons=["outside retention policy"]) for snapshot in delete],
"incomplete": [
_snapshot_to_item(snapshot, reasons=["incomplete snapshot; excluded from retention cleanup"])
for snapshot in incomplete_snapshots
],
"reasons": reasons, "reasons": reasons,
} }
@@ -146,6 +153,15 @@ def _snapshots_for_retention(*, host_config: HostConfig, kind: str) -> list[Snap
return [_snapshot_from_record(record) for record in records] return [_snapshot_from_record(record) for record in records]
def _incomplete_snapshots_for_host(host_config: HostConfig) -> list[Snapshot]:
    """Return the host's incomplete snapshot records converted to ``Snapshot`` objects.

    Only records whose kind is ``SnapshotRecord.Kind.INCOMPLETE`` are selected.
    Results are ordered by ``started_at`` descending, then by ``dirname``.
    """
    queryset = SnapshotRecord.objects.filter(
        host=host_config,
        kind=SnapshotRecord.Kind.INCOMPLETE,
    )
    # Pull the related "base" row in the same query, then apply the ordering.
    queryset = queryset.select_related("base").order_by("-started_at", "dirname")
    return list(map(_snapshot_from_record, queryset))
def _snapshot_from_record(record: SnapshotRecord) -> Snapshot: def _snapshot_from_record(record: SnapshotRecord) -> Snapshot:
return Snapshot( return Snapshot(
kind=record.kind, kind=record.kind,
@@ -173,13 +189,15 @@ def _base_meta_from_record(record: SnapshotRecord) -> dict[str, str] | None:
return None return None
def _snapshot_to_item(snapshot: Snapshot, *, reasons: list[str]) -> dict[str, Any]:
    """Serialize a snapshot and the reasons attached to it into a plain dict.

    Args:
        snapshot: Object exposing ``dirname``, ``kind``, ``path``, ``status``
            and a ``dt`` value with an ``isoformat()`` method.
        reasons: Human-readable reasons explaining why the snapshot is in the
            list being built (kept, deleted, incomplete, ...).

    Returns:
        A dict with the snapshot fields, ``dt`` as an ISO-8601 string,
        ``reasons`` as a list and ``reason`` as the same entries joined
        with ``", "``.
    """
    # Copy the list so the item never aliases the caller's object (for kept
    # snapshots that is an entry of the plan-wide reasons mapping); mutating
    # one must not silently change the other.
    reason_list = list(reasons)
    return {
        "dirname": snapshot.dirname,
        "kind": snapshot.kind,
        "path": snapshot.path,
        "dt": snapshot.dt.isoformat(),
        "status": snapshot.status,
        "reasons": reason_list,
        "reason": ", ".join(reason_list),
    }

View File

@@ -231,6 +231,17 @@
.host-card-stat.wide { .host-card-stat.wide {
grid-column: 1 / -1; grid-column: 1 / -1;
} }
/* Retention warning banner shown inside a host card; a wrapping flex row so
   the status badge and the individual warning sentences can flow onto
   multiple lines. */
.host-card-warning {
background: #fffaf0;
border: 1px solid #e7cf8a;
border-radius: 6px;
color: var(--running);
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 14px;
padding: 10px;
}
.messages { display: grid; gap: 8px; margin-bottom: 18px; } .messages { display: grid; gap: 8px; margin-bottom: 18px; }
.message { .message {
background: var(--panel); background: var(--panel);

View File

@@ -118,6 +118,20 @@
</div> </div>
</div> </div>
</div> </div>
{% if host.retention_warning.has_warning %}
<div class="host-card-warning">
<span class="status warning">retention</span>
{% if host.retention_warning.prune_exceeded %}
Scheduled prune would delete {{ host.retention_warning.delete_count }} snapshot(s), above max {{ host.retention_warning.max_delete }}.
{% endif %}
{% if host.retention_warning.incomplete_count %}
{{ host.retention_warning.incomplete_count }} incomplete snapshot(s) need review.
{% endif %}
{% if host.retention_warning.error %}
{{ host.retention_warning.error }}
{% endif %}
</div>
{% endif %}
</article> </article>
{% empty %} {% empty %}
<p class="muted">No hosts configured yet.</p> <p class="muted">No hosts configured yet.</p>

View File

@@ -68,6 +68,30 @@
</section> </section>
</div> </div>
{% if retention_warning.has_warning %}
<section class="panel highlight warning">
<h2>Retention Warnings</h2>
<div class="stack">
{% if retention_warning.prune_exceeded %}
<div>
Scheduled pruning would delete {{ retention_warning.delete_count }} snapshot(s), above max delete
{{ retention_warning.max_delete }}. Scheduled pruning will refuse this plan until the limit or retention
selection is adjusted.
</div>
{% endif %}
{% if retention_warning.incomplete_count %}
<div>
{{ retention_warning.incomplete_count }} incomplete snapshot(s) exist. Retention does not delete incomplete
snapshots automatically; inspect them before cleanup.
</div>
{% endif %}
{% if retention_warning.error %}
<div>{{ retention_warning.error }}</div>
{% endif %}
</div>
</section>
{% endif %}
{% if effective_config %} {% if effective_config %}
<section class="panel"> <section class="panel">
<h2>Effective Config</h2> <h2>Effective Config</h2>

View File

@@ -18,8 +18,31 @@
<div class="metric"><div class="label">Kind</div><div class="value">{{ plan.kind }}</div></div> <div class="metric"><div class="label">Kind</div><div class="value">{{ plan.kind }}</div></div>
<div class="metric"><div class="label">Keep</div><div class="value">{{ plan.keep|length }}</div></div> <div class="metric"><div class="label">Keep</div><div class="value">{{ plan.keep|length }}</div></div>
<div class="metric"><div class="label">Would Delete</div><div class="value">{{ plan.delete|length }}</div></div> <div class="metric"><div class="label">Would Delete</div><div class="value">{{ plan.delete|length }}</div></div>
<div class="metric"><div class="label">Scheduled Limit</div><div class="value">{{ scheduled_prune_limit|default:"none" }}</div></div>
<div class="metric"><div class="label">Incomplete</div><div class="value">{{ plan.incomplete|length }}</div></div>
</section> </section>
{% if scheduled_prune_exceeded %}
<section class="panel highlight warning">
<h2>Scheduled Prune Limit</h2>
<p>
This plan would delete {{ plan.delete|length }} snapshot(s), which exceeds the scheduled prune limit of
{{ scheduled_prune_limit }}. Scheduled pruning will refuse to apply this plan until the limit or retention
selection is adjusted.
</p>
</section>
{% endif %}
{% if plan.incomplete %}
<section class="panel highlight warning">
<h2>Incomplete Snapshots</h2>
<p>
{{ plan.incomplete|length }} incomplete snapshot(s) exist for this host. Retention does not delete incomplete
snapshots automatically because they can indicate an interrupted backup that should be inspected first.
</p>
</section>
{% endif %}
<section class="panel"> <section class="panel">
<h2>Policy</h2> <h2>Policy</h2>
<div class="stack"> <div class="stack">
@@ -28,6 +51,17 @@
<div><strong>Monthly:</strong> {{ plan.retention.monthly }}</div> <div><strong>Monthly:</strong> {{ plan.retention.monthly }}</div>
<div><strong>Yearly:</strong> {{ plan.retention.yearly }}</div> <div><strong>Yearly:</strong> {{ plan.retention.yearly }}</div>
<div><strong>Protect bases:</strong> {{ protect_bases|yesno:"yes,no" }}</div> <div><strong>Protect bases:</strong> {{ protect_bases|yesno:"yes,no" }}</div>
<div class="muted">
{% if protect_bases %}
Base snapshots referenced by kept snapshots are also kept and marked with a base-of reason.
{% else %}
Base snapshots are only kept when they match the regular retention policy.
{% endif %}
</div>
{% if schedule %}
<div><strong>Schedule pruning:</strong> {{ schedule.prune|yesno:"enabled,disabled" }}</div>
<div><strong>Schedule max delete:</strong> {{ schedule.prune_max_delete }}</div>
{% endif %}
</div> </div>
</section> </section>
@@ -40,6 +74,7 @@
<th>Dirname</th> <th>Dirname</th>
<th>Started</th> <th>Started</th>
<th>Status</th> <th>Status</th>
<th>Reason</th>
<th>Path</th> <th>Path</th>
</tr> </tr>
</thead> </thead>
@@ -50,10 +85,11 @@
<td>{{ snapshot.dirname }}</td> <td>{{ snapshot.dirname }}</td>
<td>{{ snapshot.dt }}</td> <td>{{ snapshot.dt }}</td>
<td>{{ snapshot.status|default:"" }}</td> <td>{{ snapshot.status|default:"" }}</td>
<td>{{ snapshot.reason }}</td>
<td class="muted">{{ snapshot.path }}</td> <td class="muted">{{ snapshot.path }}</td>
</tr> </tr>
{% empty %} {% empty %}
<tr><td colspan="5" class="muted">Retention would not delete snapshots for this selection.</td></tr> <tr><td colspan="6" class="muted">Retention would not delete snapshots for this selection.</td></tr>
{% endfor %} {% endfor %}
</tbody> </tbody>
</table> </table>
@@ -71,7 +107,7 @@
{{ apply_form.max_delete.errors }} {{ apply_form.max_delete.errors }}
<label for="{{ apply_form.max_delete.id_for_label }}">Max delete</label> <label for="{{ apply_form.max_delete.id_for_label }}">Max delete</label>
{{ apply_form.max_delete }} {{ apply_form.max_delete }}
<div class="helptext">Must be at least the number of snapshots shown in Would Delete.</div> <div class="helptext">Must be at least {{ plan.delete|length }} for the snapshots shown in Would Delete.</div>
</div> </div>
<div class="field"> <div class="field">
@@ -99,20 +135,54 @@
<table> <table>
<thead> <thead>
<tr> <tr>
<th>Kind</th>
<th>Dirname</th> <th>Dirname</th>
<th>Started</th>
<th>Status</th>
<th>Reasons</th> <th>Reasons</th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
{% for dirname, reasons in plan.reasons.items %} {% for snapshot in plan.keep_items %}
<tr> <tr>
<td>{{ dirname }}</td> <td>{{ snapshot.kind }}</td>
<td>{{ reasons|join:", " }}</td> <td>{{ snapshot.dirname }}</td>
<td>{{ snapshot.dt }}</td>
<td>{{ snapshot.status|default:"" }}</td>
<td>{{ snapshot.reason }}</td>
</tr> </tr>
{% empty %} {% empty %}
<tr><td colspan="2" class="muted">No snapshots matched this retention selection.</td></tr> <tr><td colspan="5" class="muted">No snapshots matched this retention selection.</td></tr>
{% endfor %} {% endfor %}
</tbody> </tbody>
</table> </table>
</section> </section>
{% if plan.incomplete %}
<section class="panel">
<h2>Incomplete Snapshots</h2>
<table>
<thead>
<tr>
<th>Dirname</th>
<th>Started</th>
<th>Status</th>
<th>Reason</th>
<th>Path</th>
</tr>
</thead>
<tbody>
{% for snapshot in plan.incomplete %}
<tr>
<td>{{ snapshot.dirname }}</td>
<td>{{ snapshot.dt }}</td>
<td>{{ snapshot.status|default:"" }}</td>
<td>{{ snapshot.reason }}</td>
<td class="muted">{{ snapshot.path }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
{% endif %}
{% endblock %} {% endblock %}

View File

@@ -32,7 +32,10 @@ class SqlRetentionTests(TestCase):
self.assertEqual(plan["source"], "sql") self.assertEqual(plan["source"], "sql")
self.assertEqual(plan["keep"], [new.dirname]) self.assertEqual(plan["keep"], [new.dirname])
self.assertEqual([item["dirname"] for item in plan["keep_items"]], [new.dirname])
self.assertEqual([item["dirname"] for item in plan["delete"]], [old.dirname]) self.assertEqual([item["dirname"] for item in plan["delete"]], [old.dirname])
self.assertEqual(plan["delete"][0]["reason"], "outside retention policy")
self.assertEqual(plan["incomplete"], [])
def test_plan_can_protect_base_snapshot_from_sql_relation(self) -> None: def test_plan_can_protect_base_snapshot_from_sql_relation(self) -> None:
host = HostConfig.objects.create( host = HostConfig.objects.create(

View File

@@ -99,6 +99,35 @@ class ViewTests(TestCase):
self.assertContains(response, "manual") self.assertContains(response, "manual")
self.assertContains(response, "1000") self.assertContains(response, "1000")
def test_dashboard_surfaces_retention_warnings(self) -> None:
    """The dashboard shows both the prune-limit and the incomplete-snapshot warning."""
    self.client.force_login(self.staff_user)

    # Zero retention in every bucket, with a scheduled max-delete of 1.
    zero_retention = {
        "retention_daily": 0,
        "retention_weekly": 0,
        "retention_monthly": 0,
        "retention_yearly": 0,
    }
    host = HostConfig.objects.create(host="web-01", address="web-01.example.test", **zero_retention)
    ScheduleConfig.objects.create(
        host=host,
        cron_expr="15 2 * * *",
        enabled=True,
        prune=True,
        prune_max_delete=1,
    )
    for dirname in (
        "20260517-021500Z__OLDSNP1",
        "20260518-021500Z__OLDSNP2",
        "20260519-021500Z__NEWSNAP",
    ):
        self._snapshot(host, dirname)

    # One failed, incomplete snapshot that should be counted separately.
    SnapshotRecord.objects.create(
        host=host,
        kind=SnapshotRecord.Kind.INCOMPLETE,
        dirname="20260519-031500Z__BROKEN01",
        path=f"/backups/{host.host}/.incomplete/20260519-031500Z__BROKEN01",
        status="failed",
        started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
    )

    response = self.client.get(reverse("dashboard"))

    self.assertEqual(response.status_code, 200)
    for expected_text in (
        "Scheduled prune would delete 2 snapshot(s), above max 1.",
        "1 incomplete snapshot(s) need review.",
    ):
        self.assertContains(response, expected_text)
def test_dashboard_links_latest_snapshot_for_each_host(self) -> None: def test_dashboard_links_latest_snapshot_for_each_host(self) -> None:
self.client.force_login(self.staff_user) self.client.force_login(self.staff_user)
host = HostConfig.objects.create(host="web-01", address="web-01.example.test") host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
@@ -1351,6 +1380,30 @@ class ViewTests(TestCase):
self.assertContains(response, new_snapshot.dirname) self.assertContains(response, new_snapshot.dirname)
self.assertContains(response, "newest") self.assertContains(response, "newest")
self.assertContains(response, "Would Delete") self.assertContains(response, "Would Delete")
self.assertContains(response, "outside retention policy")
def test_retention_plan_warns_when_scheduled_prune_limit_is_exceeded(self) -> None:
    """The retention plan page warns when the plan deletes more than the scheduled limit."""
    self.client.force_login(self.staff_user)

    zero_retention = {
        "retention_daily": 0,
        "retention_weekly": 0,
        "retention_monthly": 0,
        "retention_yearly": 0,
    }
    host = HostConfig.objects.create(host="web-01", address="web-01.example.test", **zero_retention)
    ScheduleConfig.objects.create(
        host=host,
        cron_expr="15 2 * * *",
        enabled=True,
        prune=True,
        prune_max_delete=1,
    )
    for dirname in (
        "20260517-021500Z__OLDSNP1",
        "20260518-021500Z__OLDSNP2",
        "20260519-021500Z__NEWSNAP",
    ):
        self._snapshot(host, dirname)

    plan_url = reverse("host_retention_plan", args=[host.host])
    response = self.client.get(plan_url)

    self.assertEqual(response.status_code, 200)
    for expected_text in (
        "Scheduled Prune Limit",
        "would delete 2 snapshot(s)",
        "scheduled prune limit of",
        "Schedule max delete:</strong> 1",
    ):
        self.assertContains(response, expected_text)
def test_retention_plan_can_enable_base_protection(self) -> None: def test_retention_plan_can_enable_base_protection(self) -> None:
self.client.force_login(self.staff_user) self.client.force_login(self.staff_user)
@@ -1371,8 +1424,58 @@ class ViewTests(TestCase):
self.assertEqual(response.status_code, 200) self.assertEqual(response.status_code, 200)
self.assertContains(response, "Protect bases:</strong> yes") self.assertContains(response, "Protect bases:</strong> yes")
self.assertContains(response, "Base snapshots referenced by kept snapshots")
self.assertContains(response, f"base-of:{child.dirname}") self.assertContains(response, f"base-of:{child.dirname}")
def test_retention_plan_surfaces_incomplete_snapshots_without_deleting_them(self) -> None:
    """The retention plan page lists incomplete snapshots with their exclusion reason."""
    self.client.force_login(self.staff_user)

    zero_retention = {
        "retention_daily": 0,
        "retention_weekly": 0,
        "retention_monthly": 0,
        "retention_yearly": 0,
    }
    host = HostConfig.objects.create(host="web-01", address="web-01.example.test", **zero_retention)
    for dirname in ("20260518-021500Z__OLDSNAP", "20260519-021500Z__NEWSNAP"):
        self._snapshot(host, dirname)

    # A failed run recorded with the INCOMPLETE kind.
    SnapshotRecord.objects.create(
        host=host,
        kind=SnapshotRecord.Kind.INCOMPLETE,
        dirname="20260519-031500Z__BROKEN01",
        path=f"/backups/{host.host}/.incomplete/20260519-031500Z__BROKEN01",
        status="failed",
        started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
    )

    plan_url = reverse("host_retention_plan", args=[host.host])
    response = self.client.get(plan_url)

    self.assertEqual(response.status_code, 200)
    for expected_text in (
        "Incomplete Snapshots",
        "20260519-031500Z__BROKEN01",
        "excluded from retention cleanup",
    ):
        self.assertContains(response, expected_text)
def test_host_detail_surfaces_retention_warnings(self) -> None:
    """The host detail page shows the retention warning panel when the prune limit is exceeded."""
    self.client.force_login(self.staff_user)

    zero_retention = {
        "retention_daily": 0,
        "retention_weekly": 0,
        "retention_monthly": 0,
        "retention_yearly": 0,
    }
    host = HostConfig.objects.create(host="web-01", address="web-01.example.test", **zero_retention)
    ScheduleConfig.objects.create(
        host=host,
        cron_expr="15 2 * * *",
        enabled=True,
        prune=True,
        prune_max_delete=1,
    )
    for dirname in (
        "20260517-021500Z__OLDSNP1",
        "20260518-021500Z__OLDSNP2",
        "20260519-021500Z__NEWSNAP",
    ):
        self._snapshot(host, dirname)

    detail_url = reverse("host_detail", args=[host.host])
    response = self.client.get(detail_url)

    self.assertEqual(response.status_code, 200)
    self.assertContains(response, "Retention Warnings")
    self.assertContains(response, "Scheduled pruning would delete 2 snapshot(s), above max delete")
def test_retention_plan_rejects_invalid_kind(self) -> None: def test_retention_plan_rejects_invalid_kind(self) -> None:
self.client.force_login(self.staff_user) self.client.force_login(self.staff_user)
host = HostConfig.objects.create(host="web-01", address="web-01.example.test") host = HostConfig.objects.create(host="web-01", address="web-01.example.test")

View File

@@ -55,6 +55,7 @@ def dashboard(request):
.first() .first()
) )
host_config.next_run_at = _next_run_for_host(host_config) host_config.next_run_at = _next_run_for_host(host_config)
host_config.retention_warning = _retention_warning_for_host(host_config, _schedule_for_host(host_config))
stats_summary = collect_dashboard_stats(hosts=hosts, global_config=global_config) stats_summary = collect_dashboard_stats(hosts=hosts, global_config=global_config)
context = { context = {
"hosts": hosts, "hosts": hosts,
@@ -274,6 +275,7 @@ def host_detail(request, host: str):
context = { context = {
"host": host_config, "host": host_config,
"schedule": schedule, "schedule": schedule,
"retention_warning": _retention_warning_for_host(host_config, schedule),
"next_run_at": _next_run_for_schedule(schedule, host_config), "next_run_at": _next_run_for_schedule(schedule, host_config),
"scheduler_timezone": timezone.get_current_timezone_name(), "scheduler_timezone": timezone.get_current_timezone_name(),
"discovery": inspect_snapshot_discovery(host=host_config), "discovery": inspect_snapshot_discovery(host=host_config),
@@ -526,17 +528,23 @@ def host_retention_plan(request, host: str):
except PobsyncError as exc: except PobsyncError as exc:
messages.error(request, str(exc)) messages.error(request, str(exc))
return redirect("host_detail", host=host_config.host) return redirect("host_detail", host=host_config.host)
schedule = _schedule_for_host(host_config)
scheduled_prune_limit = schedule.prune_max_delete if schedule and schedule.prune else None
delete_count = len(plan["delete"])
context = { context = {
"host": host_config, "host": host_config,
"kind": kind, "kind": kind,
"protect_bases": protect_bases, "protect_bases": protect_bases,
"plan": plan, "plan": plan,
"schedule": schedule,
"scheduled_prune_limit": scheduled_prune_limit,
"scheduled_prune_exceeded": scheduled_prune_limit is not None and delete_count > scheduled_prune_limit,
"apply_form": RetentionApplyForm( "apply_form": RetentionApplyForm(
host_name=host_config.host, host_name=host_config.host,
initial={ initial={
"kind": kind, "kind": kind,
"protect_bases": protect_bases, "protect_bases": protect_bases,
"max_delete": len(plan["delete"]), "max_delete": delete_count,
}, },
), ),
} }
@@ -652,6 +660,43 @@ def _next_run_for_schedule(schedule: ScheduleConfig | None, host_config: HostCon
return None return None
def _retention_warning_for_host(host_config: HostConfig, schedule: ScheduleConfig | None) -> dict[str, object]:
    """Build the retention warning summary rendered for a host.

    The result always contains ``has_warning`` and ``incomplete_count``.
    When the host is enabled and has a schedule with pruning turned on, the
    scheduled retention plan is evaluated as well: on success the result gains
    ``delete_count``, ``max_delete``, ``protect_bases`` and ``prune_exceeded``;
    if planning raises ``PobsyncError`` it gains ``error`` instead.

    Args:
        host_config: Host whose snapshots and retention plan are inspected.
        schedule: The host's schedule, or ``None`` when it has none.

    Returns:
        A dict describing the warnings; ``has_warning`` is true when there
        are incomplete snapshots, planning failed, or the plan would delete
        more snapshots than ``schedule.prune_max_delete``.
    """
    incomplete_total = host_config.snapshots.filter(kind=SnapshotRecord.Kind.INCOMPLETE).count()
    result: dict[str, object] = {
        "has_warning": incomplete_total > 0,
        "incomplete_count": incomplete_total,
    }

    # The prune-limit check only applies when scheduled pruning can actually run.
    prune_is_scheduled = schedule is not None and schedule.prune and host_config.enabled
    if not prune_is_scheduled:
        return result

    try:
        protect = bool(schedule.prune_protect_bases)
        plan = run_sql_retention_plan(host=host_config.host, kind="scheduled", protect_bases=protect)
    except PobsyncError as exc:
        # Surface the planning failure as a warning instead of failing the page.
        result["has_warning"] = True
        result["error"] = str(exc)
        return result

    to_delete = len(plan.get("delete") or [])
    result["delete_count"] = to_delete
    result["max_delete"] = schedule.prune_max_delete
    result["protect_bases"] = protect
    result["prune_exceeded"] = to_delete > schedule.prune_max_delete
    if result["prune_exceeded"]:
        result["has_warning"] = True
    return result
def _default_schedule_initial() -> dict[str, object]: def _default_schedule_initial() -> dict[str, object]:
return { return {
"cron_expr": "15 2 * * *", "cron_expr": "15 2 * * *",