(release) Add explicit incomplete snapshot cleanup

Add a dedicated cleanup path for incomplete snapshots instead of letting
retention prune them implicitly. The retention plan now exposes a guarded
form that requires host and delete-count confirmation before removing
.incomplete snapshot directories and their SQL records.

Keep scheduled/manual retention behavior unchanged, add path safety checks,
and cover cleanup success, confirmation failures, max-delete limits, and
unexpected paths in tests.

Refs #10
This commit is contained in:
2026-05-21 03:26:21 +02:00
parent 4c8ed24561
commit d0c23deb72
7 changed files with 329 additions and 2 deletions

View File

@@ -274,6 +274,36 @@ class RetentionApplyForm(forms.Form):
return value
class IncompleteCleanupForm(forms.Form):
max_delete = forms.IntegerField(min_value=0, initial=0)
confirm_delete_count = forms.IntegerField(min_value=0)
confirm_host = forms.CharField()
def __init__(self, *args, host_name: str, expected_delete_count: int, **kwargs) -> None:
self.host_name = host_name
self.expected_delete_count = expected_delete_count
super().__init__(*args, **kwargs)
self.fields["confirm_host"].help_text = f"Type {host_name} to confirm incomplete snapshot cleanup."
self.fields["confirm_delete_count"].help_text = (
f"Type {expected_delete_count} to confirm the current number of incomplete snapshots."
)
self.fields["max_delete"].help_text = (
f"Must be at least {expected_delete_count} for the incomplete snapshots shown here."
)
def clean_confirm_host(self) -> str:
value = self.cleaned_data["confirm_host"].strip()
if value != self.host_name:
raise forms.ValidationError(f"Type {self.host_name} to confirm.")
return value
def clean_confirm_delete_count(self) -> int:
value = self.cleaned_data["confirm_delete_count"]
if value != self.expected_delete_count:
raise forms.ValidationError(f"Type {self.expected_delete_count} to confirm the incomplete count.")
return value
class ScheduleConfigForm(forms.ModelForm):
cron_expr = forms.CharField(
label="Schedule expression",

View File

@@ -131,6 +131,76 @@ def run_sql_retention_apply(
return _do_apply()
def run_incomplete_cleanup(
*,
prefix: Path,
host: str,
yes: bool,
max_delete: int,
acquire_lock: bool = True,
) -> dict[str, Any]:
host = sanitize_host(host)
if not yes:
raise ConfigError("Refusing to delete incomplete snapshots without --yes")
if max_delete < 0:
raise ConfigError("--max-delete must be >= 0")
paths = PobsyncPaths(home=prefix)
def _do_cleanup() -> dict[str, Any]:
host_config = _enabled_host_config(host)
incomplete_list = [
_snapshot_to_item(snapshot, reasons=["manual incomplete cleanup"])
for snapshot in _incomplete_snapshots_for_host(host_config)
]
if max_delete == 0 and len(incomplete_list) > 0:
raise ConfigError("Incomplete cleanup blocked by --max-delete=0")
if len(incomplete_list) > max_delete:
raise ConfigError(
f"Refusing to delete {len(incomplete_list)} incomplete snapshots (exceeds --max-delete={max_delete})"
)
actions: list[str] = []
deleted: list[dict[str, Any]] = []
for item in incomplete_list:
dirname = item["dirname"]
snap_path = Path(item["path"])
path = _snapshot_delete_path(path=snap_path, dirname=dirname)
_validate_incomplete_delete_path(host=host, path=path, dirname=dirname)
if not path.exists():
actions.append(f"skip missing incomplete/{dirname}")
elif not path.is_dir():
raise ConfigError(f"Refusing to delete non-directory path: {path}")
else:
_remove_snapshot_tree(path)
actions.append(f"deleted incomplete {dirname}")
SnapshotRecord.objects.filter(
host__host=host,
kind=SnapshotRecord.Kind.INCOMPLETE,
dirname=dirname,
).delete()
deleted.append({"dirname": dirname, "kind": SnapshotRecord.Kind.INCOMPLETE, "path": str(path)})
return {
"ok": True,
"host": host,
"kind": SnapshotRecord.Kind.INCOMPLETE,
"max_delete": max_delete,
"source": "sql",
"planned_delete_count": len(incomplete_list),
"deleted": deleted,
"actions": actions,
}
if acquire_lock:
with acquire_host_lock(paths.locks_dir, host, command="incomplete-cleanup"):
return _do_cleanup()
return _do_cleanup()
def _enabled_host_config(host: str) -> HostConfig:
try:
return HostConfig.objects.get(host=host, enabled=True)
@@ -212,6 +282,15 @@ def _snapshot_delete_path(*, path: Path, dirname: str) -> Path:
return path
def _validate_incomplete_delete_path(*, host: str, path: Path, dirname: str) -> None:
path_parts = path.parts
if path.name != dirname or ".incomplete" not in path_parts or host not in path_parts:
raise ConfigError(f"Refusing to delete unexpected incomplete snapshot path: {path}")
incomplete_index = path_parts.index(".incomplete")
if incomplete_index == 0 or path_parts[incomplete_index - 1] != host:
raise ConfigError(f"Refusing to delete incomplete snapshot outside host backup root: {path}")
def _remove_snapshot_tree(path: Path) -> None:
_make_directories_user_writable(path)
shutil.rmtree(path)

View File

@@ -40,6 +40,10 @@
{{ plan.incomplete|length }} incomplete snapshot(s) exist for this host. Retention does not delete incomplete
snapshots automatically because they can indicate an interrupted backup that should be inspected first.
</p>
<p>
After inspection, use the dedicated cleanup form below to delete only incomplete snapshot directories and their
SQL records. Successful scheduled and manual snapshots are not touched by this cleanup.
</p>
</section>
{% endif %}
@@ -190,6 +194,37 @@
{% endfor %}
</tbody>
</table>
<h3>Cleanup Incomplete Snapshots</h3>
<form method="post" action="{% url 'cleanup_host_incomplete_snapshots' host.host %}" class="form-grid">
{% csrf_token %}
{{ incomplete_cleanup_form.non_field_errors }}
<div class="field">
{{ incomplete_cleanup_form.max_delete.errors }}
<label for="{{ incomplete_cleanup_form.max_delete.id_for_label }}">Max delete</label>
{{ incomplete_cleanup_form.max_delete }}
<div class="helptext">{{ incomplete_cleanup_form.max_delete.help_text }}</div>
</div>
<div class="field">
{{ incomplete_cleanup_form.confirm_host.errors }}
<label for="{{ incomplete_cleanup_form.confirm_host.id_for_label }}">Confirm host</label>
{{ incomplete_cleanup_form.confirm_host }}
<div class="helptext">{{ incomplete_cleanup_form.confirm_host.help_text }}</div>
</div>
<div class="field">
{{ incomplete_cleanup_form.confirm_delete_count.errors }}
<label for="{{ incomplete_cleanup_form.confirm_delete_count.id_for_label }}">Confirm incomplete count</label>
{{ incomplete_cleanup_form.confirm_delete_count }}
<div class="helptext">{{ incomplete_cleanup_form.confirm_delete_count.help_text }}</div>
</div>
<div class="actions">
<button type="submit">Delete incomplete snapshots</button>
</div>
</form>
</section>
{% endif %}
{% endblock %}

View File

@@ -12,7 +12,7 @@ from django.test import TestCase
from pobsync.errors import ConfigError
from pobsync_backend.models import HostConfig, SnapshotRecord
from pobsync_backend.retention import run_sql_retention_apply, run_sql_retention_plan
from pobsync_backend.retention import run_incomplete_cleanup, run_sql_retention_apply, run_sql_retention_plan
class SqlRetentionTests(TestCase):
@@ -152,6 +152,78 @@ class SqlRetentionTests(TestCase):
acquire_lock=False,
)
def test_incomplete_cleanup_deletes_directory_and_record(self) -> None:
with TemporaryDirectory() as tmp:
prefix = Path(tmp) / "home"
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
incomplete_dir = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-031500Z__BROKEN01"
incomplete_dir.mkdir(parents=True)
incomplete_dir.joinpath("partial-file").write_text("interrupted\n")
record = SnapshotRecord.objects.create(
host=host,
kind=SnapshotRecord.Kind.INCOMPLETE,
dirname=incomplete_dir.name,
path=str(incomplete_dir),
status="failed",
started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
)
result = run_incomplete_cleanup(
prefix=prefix,
host=host.host,
yes=True,
max_delete=1,
acquire_lock=False,
)
self.assertFalse(incomplete_dir.exists())
self.assertFalse(SnapshotRecord.objects.filter(pk=record.pk).exists())
self.assertEqual(
result["deleted"],
[{"dirname": incomplete_dir.name, "kind": SnapshotRecord.Kind.INCOMPLETE, "path": str(incomplete_dir)}],
)
self.assertEqual(result["planned_delete_count"], 1)
def test_incomplete_cleanup_respects_max_delete(self) -> None:
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
SnapshotRecord.objects.create(
host=host,
kind=SnapshotRecord.Kind.INCOMPLETE,
dirname="20260519-031500Z__BROKEN01",
path=f"/backups/{host.host}/.incomplete/20260519-031500Z__BROKEN01",
status="failed",
started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
)
with self.assertRaisesRegex(ConfigError, "blocked by --max-delete=0"):
run_incomplete_cleanup(
prefix=Path("/tmp/pobsync-test"),
host=host.host,
yes=True,
max_delete=0,
acquire_lock=False,
)
def test_incomplete_cleanup_rejects_unexpected_path(self) -> None:
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
SnapshotRecord.objects.create(
host=host,
kind=SnapshotRecord.Kind.INCOMPLETE,
dirname="20260519-031500Z__BROKEN01",
path=f"/backups/{host.host}/scheduled/20260519-031500Z__BROKEN01",
status="failed",
started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
)
with self.assertRaisesRegex(ConfigError, "unexpected incomplete snapshot path"):
run_incomplete_cleanup(
prefix=Path("/tmp/pobsync-test"),
host=host.host,
yes=True,
max_delete=1,
acquire_lock=False,
)
def test_management_command_plans_from_sql(self) -> None:
host = HostConfig.objects.create(
host="web-01",

View File

@@ -1620,6 +1620,68 @@ class ViewTests(TestCase):
self.assertContains(response, "Incomplete Snapshots")
self.assertContains(response, "20260519-031500Z__BROKEN01")
self.assertContains(response, "excluded from retention cleanup")
self.assertContains(response, "Delete incomplete snapshots")
self.assertContains(response, "Type 1 to confirm the current number of incomplete snapshots.")
def test_incomplete_cleanup_deletes_incomplete_snapshot_after_confirmation(self) -> None:
self.client.force_login(self.staff_user)
with TemporaryDirectory() as tmp:
home = Path(tmp) / "home"
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
incomplete_dir = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-031500Z__BROKEN01"
incomplete_dir.mkdir(parents=True)
incomplete_dir.joinpath("partial-file").write_text("interrupted\n")
record = SnapshotRecord.objects.create(
host=host,
kind=SnapshotRecord.Kind.INCOMPLETE,
dirname=incomplete_dir.name,
path=str(incomplete_dir),
status="failed",
started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
)
with override_settings(POBSYNC_HOME=str(home)):
response = self.client.post(
reverse("cleanup_host_incomplete_snapshots", args=[host.host]),
{
"max_delete": "1",
"confirm_host": host.host,
"confirm_delete_count": "1",
},
follow=True,
)
self.assertFalse(incomplete_dir.exists())
self.assertRedirects(response, reverse("host_retention_plan", args=[host.host]))
self.assertContains(response, "Deleted 1 incomplete snapshot(s) for web-01.")
self.assertFalse(SnapshotRecord.objects.filter(pk=record.pk).exists())
def test_incomplete_cleanup_rejects_bad_confirmation(self) -> None:
self.client.force_login(self.staff_user)
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
SnapshotRecord.objects.create(
host=host,
kind=SnapshotRecord.Kind.INCOMPLETE,
dirname="20260519-031500Z__BROKEN01",
path=f"/backups/{host.host}/.incomplete/20260519-031500Z__BROKEN01",
status="failed",
started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
)
response = self.client.post(
reverse("cleanup_host_incomplete_snapshots", args=[host.host]),
{
"max_delete": "1",
"confirm_host": "wrong",
"confirm_delete_count": "1",
},
follow=True,
)
self.assertRedirects(response, reverse("host_retention_plan", args=[host.host]))
self.assertContains(response, "Incomplete cleanup confirmation is invalid.")
self.assertEqual(SnapshotRecord.objects.filter(kind=SnapshotRecord.Kind.INCOMPLETE).count(), 1)
def test_host_detail_surfaces_retention_warnings(self) -> None:
self.client.force_login(self.staff_user)

View File

@@ -25,6 +25,7 @@ from .forms import (
CreateHostConfigForm,
GlobalConfigForm,
HostConfigForm,
IncompleteCleanupForm,
ManualBackupForm,
RetentionApplyForm,
SshCredentialGenerateForm,
@@ -34,7 +35,7 @@ from .forms import (
from .host_ops import ensure_host_directories
from .models import BackupRun, GlobalConfig, HostConfig, ScheduleConfig, SnapshotRecord, SshCredential
from .preflight import collect_backup_gate, effective_host_config_preview, run_remote_preflight
from .retention import run_sql_retention_apply, run_sql_retention_plan
from .retention import run_incomplete_cleanup, run_sql_retention_apply, run_sql_retention_plan
from .self_check import collect_self_checks, summarize_self_checks
from .scheduler import next_due_after
from .snapshot_discovery import discover_snapshots, inspect_snapshot_discovery
@@ -569,6 +570,7 @@ def host_retention_plan(request, host: str):
schedule = _schedule_for_host(host_config)
scheduled_prune_limit = schedule.prune_max_delete if schedule and schedule.prune else None
delete_count = len(plan["delete"])
incomplete_count = len(plan["incomplete"])
context = {
"host": host_config,
"kind": kind,
@@ -587,6 +589,14 @@ def host_retention_plan(request, host: str):
"confirm_delete_count": delete_count,
},
),
"incomplete_cleanup_form": IncompleteCleanupForm(
host_name=host_config.host,
expected_delete_count=incomplete_count,
initial={
"max_delete": incomplete_count,
"confirm_delete_count": incomplete_count,
},
),
}
return render(request, "pobsync_backend/retention_plan.html", context)
@@ -643,6 +653,40 @@ def apply_host_retention(request, host: str):
return target
@staff_member_required
@require_POST
def cleanup_host_incomplete_snapshots(request, host: str):
host_config = get_object_or_404(HostConfig, host=host)
try:
plan = run_sql_retention_plan(host=host_config.host, kind="all", protect_bases=True)
except PobsyncError as exc:
messages.error(request, str(exc))
return redirect("host_retention_plan", host=host_config.host)
incomplete_count = len(plan.get("incomplete") or [])
form = IncompleteCleanupForm(
request.POST,
host_name=host_config.host,
expected_delete_count=incomplete_count,
)
if not form.is_valid():
messages.error(request, "Incomplete cleanup confirmation is invalid.")
return redirect("host_retention_plan", host=host_config.host)
try:
result = run_incomplete_cleanup(
prefix=Path(settings.POBSYNC_HOME),
host=host_config.host,
yes=True,
max_delete=form.cleaned_data["max_delete"],
)
except PobsyncError as exc:
messages.error(request, str(exc))
else:
messages.success(request, f"Deleted {len(result['deleted'])} incomplete snapshot(s) for {host_config.host}.")
return redirect("host_retention_plan", host=host_config.host)
@staff_member_required
def edit_host_config(request, host: str):
host_config = get_object_or_404(HostConfig, host=host)

View File

@@ -27,6 +27,11 @@ urlpatterns = [
path("hosts/<str:host>/discover-snapshots/", views.discover_host_snapshots, name="discover_host_snapshots"),
path("hosts/<str:host>/retention-apply/", views.apply_host_retention, name="apply_host_retention"),
path("hosts/<str:host>/retention-plan/", views.host_retention_plan, name="host_retention_plan"),
path(
"hosts/<str:host>/incomplete-cleanup/",
views.cleanup_host_incomplete_snapshots,
name="cleanup_host_incomplete_snapshots",
),
path("hosts/<str:host>/schedule/", views.edit_host_schedule, name="edit_host_schedule"),
path("runs/<int:run_id>/", views.run_detail, name="run_detail"),
path("runs/<int:run_id>/rsync-log/", views.run_rsync_log, name="run_rsync_log"),