(release) Add explicit incomplete snapshot cleanup

Add a dedicated cleanup path for incomplete snapshots instead of letting
retention prune them implicitly. The retention plan now exposes a guarded
form that requires host and delete-count confirmation before removing
.incomplete snapshot directories and their SQL records.

Keep scheduled/manual retention behavior unchanged, add path safety checks,
and cover cleanup success, confirmation failures, max-delete limits, and
unexpected paths in tests.

Refs #10
This commit is contained in:
2026-05-21 03:26:21 +02:00
parent 4c8ed24561
commit d0c23deb72
7 changed files with 329 additions and 2 deletions

View File

@@ -274,6 +274,36 @@ class RetentionApplyForm(forms.Form):
return value return value
class IncompleteCleanupForm(forms.Form):
    """Confirmation form guarding the deletion of incomplete snapshots.

    The caller supplies the host name and the number of incomplete snapshots
    currently known; the user has to type both back before the form validates.
    """

    max_delete = forms.IntegerField(min_value=0, initial=0)
    confirm_delete_count = forms.IntegerField(min_value=0)
    confirm_host = forms.CharField()

    def __init__(self, *args, host_name: str, expected_delete_count: int, **kwargs) -> None:
        """Remember the expected confirmation values and fill in the help texts."""
        self.host_name = host_name
        self.expected_delete_count = expected_delete_count
        super().__init__(*args, **kwargs)
        # Help texts depend on per-request values, so they are set per instance.
        help_by_field = {
            "confirm_host": f"Type {host_name} to confirm incomplete snapshot cleanup.",
            "confirm_delete_count": (
                f"Type {expected_delete_count} to confirm the current number of incomplete snapshots."
            ),
            "max_delete": (
                f"Must be at least {expected_delete_count} for the incomplete snapshots shown here."
            ),
        }
        for field_name, text in help_by_field.items():
            self.fields[field_name].help_text = text

    def clean_confirm_host(self) -> str:
        """Accept the typed host only when it equals the expected host name."""
        typed_host = self.cleaned_data["confirm_host"].strip()
        if typed_host == self.host_name:
            return typed_host
        raise forms.ValidationError(f"Type {self.host_name} to confirm.")

    def clean_confirm_delete_count(self) -> int:
        """Accept the typed count only when it equals the expected count."""
        typed_count = self.cleaned_data["confirm_delete_count"]
        if typed_count == self.expected_delete_count:
            return typed_count
        raise forms.ValidationError(f"Type {self.expected_delete_count} to confirm the incomplete count.")
class ScheduleConfigForm(forms.ModelForm): class ScheduleConfigForm(forms.ModelForm):
cron_expr = forms.CharField( cron_expr = forms.CharField(
label="Schedule expression", label="Schedule expression",

View File

@@ -131,6 +131,76 @@ def run_sql_retention_apply(
return _do_apply() return _do_apply()
def run_incomplete_cleanup(
    *,
    prefix: Path,
    host: str,
    yes: bool,
    max_delete: int,
    acquire_lock: bool = True,
) -> dict[str, Any]:
    """Delete the incomplete snapshots of ``host`` and their SQL records.

    Args:
        prefix: Home directory passed to ``PobsyncPaths``; its ``locks_dir``
            is used when a host lock is taken.
        host: Host name; passed through ``sanitize_host`` before use.
        yes: Explicit confirmation. The function refuses to run without it.
        max_delete: Upper bound on the number of incomplete snapshots that
            may be removed in this call. ``0`` blocks any deletion.
        acquire_lock: When true, the cleanup runs inside ``acquire_host_lock``.

    Returns:
        A summary dict with the keys ``ok``, ``host``, ``kind``,
        ``max_delete``, ``source``, ``planned_delete_count``, ``deleted``
        and ``actions``.

    Raises:
        ConfigError: If confirmation is missing, ``max_delete`` is negative
            or exceeded, or any target path fails validation or is not a
            directory.
    """
    host = sanitize_host(host)
    if not yes:
        raise ConfigError("Refusing to delete incomplete snapshots without --yes")
    if max_delete < 0:
        raise ConfigError("--max-delete must be >= 0")

    paths = PobsyncPaths(home=prefix)

    def _do_cleanup() -> dict[str, Any]:
        host_config = _enabled_host_config(host)
        incomplete_list = [
            _snapshot_to_item(snapshot, reasons=["manual incomplete cleanup"])
            for snapshot in _incomplete_snapshots_for_host(host_config)
        ]
        if max_delete == 0 and incomplete_list:
            raise ConfigError("Incomplete cleanup blocked by --max-delete=0")
        if len(incomplete_list) > max_delete:
            raise ConfigError(
                f"Refusing to delete {len(incomplete_list)} incomplete snapshots (exceeds --max-delete={max_delete})"
            )

        # Preflight: validate every target before anything is removed, so a
        # bad path late in the list cannot leave earlier snapshots already
        # deleted when the error is raised.
        targets: list[tuple[str, Path]] = []
        for item in incomplete_list:
            dirname = item["dirname"]
            path = _snapshot_delete_path(path=Path(item["path"]), dirname=dirname)
            _validate_incomplete_delete_path(host=host, path=path, dirname=dirname)
            if path.exists() and not path.is_dir():
                raise ConfigError(f"Refusing to delete non-directory path: {path}")
            targets.append((dirname, path))

        actions: list[str] = []
        deleted: list[dict[str, Any]] = []
        for dirname, path in targets:
            # The filesystem state is re-checked at deletion time; the
            # preflight above only guarantees that no target was invalid
            # when the run started.
            if not path.exists():
                actions.append(f"skip missing incomplete/{dirname}")
            elif not path.is_dir():
                raise ConfigError(f"Refusing to delete non-directory path: {path}")
            else:
                _remove_snapshot_tree(path)
                actions.append(f"deleted incomplete {dirname}")
            # The SQL record is dropped whether or not the directory still existed.
            SnapshotRecord.objects.filter(
                host__host=host,
                kind=SnapshotRecord.Kind.INCOMPLETE,
                dirname=dirname,
            ).delete()
            deleted.append({"dirname": dirname, "kind": SnapshotRecord.Kind.INCOMPLETE, "path": str(path)})

        return {
            "ok": True,
            "host": host,
            "kind": SnapshotRecord.Kind.INCOMPLETE,
            "max_delete": max_delete,
            "source": "sql",
            "planned_delete_count": len(incomplete_list),
            "deleted": deleted,
            "actions": actions,
        }

    if acquire_lock:
        with acquire_host_lock(paths.locks_dir, host, command="incomplete-cleanup"):
            return _do_cleanup()
    return _do_cleanup()
def _enabled_host_config(host: str) -> HostConfig: def _enabled_host_config(host: str) -> HostConfig:
try: try:
return HostConfig.objects.get(host=host, enabled=True) return HostConfig.objects.get(host=host, enabled=True)
@@ -212,6 +282,15 @@ def _snapshot_delete_path(*, path: Path, dirname: str) -> Path:
return path return path
def _validate_incomplete_delete_path(*, host: str, path: Path, dirname: str) -> None:
path_parts = path.parts
if path.name != dirname or ".incomplete" not in path_parts or host not in path_parts:
raise ConfigError(f"Refusing to delete unexpected incomplete snapshot path: {path}")
incomplete_index = path_parts.index(".incomplete")
if incomplete_index == 0 or path_parts[incomplete_index - 1] != host:
raise ConfigError(f"Refusing to delete incomplete snapshot outside host backup root: {path}")
def _remove_snapshot_tree(path: Path) -> None: def _remove_snapshot_tree(path: Path) -> None:
_make_directories_user_writable(path) _make_directories_user_writable(path)
shutil.rmtree(path) shutil.rmtree(path)

View File

@@ -40,6 +40,10 @@
{{ plan.incomplete|length }} incomplete snapshot(s) exist for this host. Retention does not delete incomplete {{ plan.incomplete|length }} incomplete snapshot(s) exist for this host. Retention does not delete incomplete
snapshots automatically because they can indicate an interrupted backup that should be inspected first. snapshots automatically because they can indicate an interrupted backup that should be inspected first.
</p> </p>
<p>
After inspection, use the dedicated cleanup form below to delete only incomplete snapshot directories and their
SQL records. Successful scheduled and manual snapshots are not touched by this cleanup.
</p>
</section> </section>
{% endif %} {% endif %}
@@ -190,6 +194,37 @@
{% endfor %} {% endfor %}
</tbody> </tbody>
</table> </table>
<h3>Cleanup Incomplete Snapshots</h3>
<form method="post" action="{% url 'cleanup_host_incomplete_snapshots' host.host %}" class="form-grid">
{% csrf_token %}
{{ incomplete_cleanup_form.non_field_errors }}
<div class="field">
{{ incomplete_cleanup_form.max_delete.errors }}
<label for="{{ incomplete_cleanup_form.max_delete.id_for_label }}">Max delete</label>
{{ incomplete_cleanup_form.max_delete }}
<div class="helptext">{{ incomplete_cleanup_form.max_delete.help_text }}</div>
</div>
<div class="field">
{{ incomplete_cleanup_form.confirm_host.errors }}
<label for="{{ incomplete_cleanup_form.confirm_host.id_for_label }}">Confirm host</label>
{{ incomplete_cleanup_form.confirm_host }}
<div class="helptext">{{ incomplete_cleanup_form.confirm_host.help_text }}</div>
</div>
<div class="field">
{{ incomplete_cleanup_form.confirm_delete_count.errors }}
<label for="{{ incomplete_cleanup_form.confirm_delete_count.id_for_label }}">Confirm incomplete count</label>
{{ incomplete_cleanup_form.confirm_delete_count }}
<div class="helptext">{{ incomplete_cleanup_form.confirm_delete_count.help_text }}</div>
</div>
<div class="actions">
<button type="submit">Delete incomplete snapshots</button>
</div>
</form>
</section> </section>
{% endif %} {% endif %}
{% endblock %} {% endblock %}

View File

@@ -12,7 +12,7 @@ from django.test import TestCase
from pobsync.errors import ConfigError from pobsync.errors import ConfigError
from pobsync_backend.models import HostConfig, SnapshotRecord from pobsync_backend.models import HostConfig, SnapshotRecord
from pobsync_backend.retention import run_sql_retention_apply, run_sql_retention_plan from pobsync_backend.retention import run_incomplete_cleanup, run_sql_retention_apply, run_sql_retention_plan
class SqlRetentionTests(TestCase): class SqlRetentionTests(TestCase):
@@ -152,6 +152,78 @@ class SqlRetentionTests(TestCase):
acquire_lock=False, acquire_lock=False,
) )
def test_incomplete_cleanup_deletes_directory_and_record(self) -> None:
    """Cleanup removes the on-disk incomplete directory and its SQL record."""
    with TemporaryDirectory() as scratch:
        scratch_root = Path(scratch)
        host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
        incomplete_dir = scratch_root / "backups" / host.host / ".incomplete" / "20260519-031500Z__BROKEN01"
        incomplete_dir.mkdir(parents=True)
        (incomplete_dir / "partial-file").write_text("interrupted\n")
        record = SnapshotRecord.objects.create(
            host=host,
            kind=SnapshotRecord.Kind.INCOMPLETE,
            dirname=incomplete_dir.name,
            path=str(incomplete_dir),
            status="failed",
            started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
        )

        result = run_incomplete_cleanup(
            prefix=scratch_root / "home",
            host=host.host,
            yes=True,
            max_delete=1,
            acquire_lock=False,
        )

        expected_entry = {
            "dirname": incomplete_dir.name,
            "kind": SnapshotRecord.Kind.INCOMPLETE,
            "path": str(incomplete_dir),
        }
        self.assertFalse(incomplete_dir.exists())
        self.assertFalse(SnapshotRecord.objects.filter(pk=record.pk).exists())
        self.assertEqual(result["deleted"], [expected_entry])
        self.assertEqual(result["planned_delete_count"], 1)
def test_incomplete_cleanup_respects_max_delete(self) -> None:
    """A max_delete of 0 blocks cleanup while an incomplete record exists."""
    snapshot_name = "20260519-031500Z__BROKEN01"
    host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
    SnapshotRecord.objects.create(
        host=host,
        kind=SnapshotRecord.Kind.INCOMPLETE,
        dirname=snapshot_name,
        path=f"/backups/{host.host}/.incomplete/{snapshot_name}",
        status="failed",
        started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
    )
    cleanup_kwargs = {
        "prefix": Path("/tmp/pobsync-test"),
        "host": host.host,
        "yes": True,
        "max_delete": 0,
        "acquire_lock": False,
    }

    with self.assertRaisesRegex(ConfigError, "blocked by --max-delete=0"):
        run_incomplete_cleanup(**cleanup_kwargs)
def test_incomplete_cleanup_rejects_unexpected_path(self) -> None:
    """A record whose path is not under .incomplete is refused."""
    snapshot_name = "20260519-031500Z__BROKEN01"
    host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
    SnapshotRecord.objects.create(
        host=host,
        kind=SnapshotRecord.Kind.INCOMPLETE,
        dirname=snapshot_name,
        # Deliberately points into "scheduled" instead of ".incomplete".
        path=f"/backups/{host.host}/scheduled/{snapshot_name}",
        status="failed",
        started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
    )
    cleanup_kwargs = {
        "prefix": Path("/tmp/pobsync-test"),
        "host": host.host,
        "yes": True,
        "max_delete": 1,
        "acquire_lock": False,
    }

    with self.assertRaisesRegex(ConfigError, "unexpected incomplete snapshot path"):
        run_incomplete_cleanup(**cleanup_kwargs)
def test_management_command_plans_from_sql(self) -> None: def test_management_command_plans_from_sql(self) -> None:
host = HostConfig.objects.create( host = HostConfig.objects.create(
host="web-01", host="web-01",

View File

@@ -1620,6 +1620,68 @@ class ViewTests(TestCase):
self.assertContains(response, "Incomplete Snapshots") self.assertContains(response, "Incomplete Snapshots")
self.assertContains(response, "20260519-031500Z__BROKEN01") self.assertContains(response, "20260519-031500Z__BROKEN01")
self.assertContains(response, "excluded from retention cleanup") self.assertContains(response, "excluded from retention cleanup")
self.assertContains(response, "Delete incomplete snapshots")
self.assertContains(response, "Type 1 to confirm the current number of incomplete snapshots.")
def test_incomplete_cleanup_deletes_incomplete_snapshot_after_confirmation(self) -> None:
    """A correctly confirmed POST deletes the directory and the SQL record."""
    self.client.force_login(self.staff_user)
    with TemporaryDirectory() as scratch:
        scratch_root = Path(scratch)
        home = scratch_root / "home"
        host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
        incomplete_dir = scratch_root / "backups" / host.host / ".incomplete" / "20260519-031500Z__BROKEN01"
        incomplete_dir.mkdir(parents=True)
        (incomplete_dir / "partial-file").write_text("interrupted\n")
        record = SnapshotRecord.objects.create(
            host=host,
            kind=SnapshotRecord.Kind.INCOMPLETE,
            dirname=incomplete_dir.name,
            path=str(incomplete_dir),
            status="failed",
            started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
        )
        cleanup_url = reverse("cleanup_host_incomplete_snapshots", args=[host.host])
        confirmation = {
            "max_delete": "1",
            "confirm_host": host.host,
            "confirm_delete_count": "1",
        }

        with override_settings(POBSYNC_HOME=str(home)):
            response = self.client.post(cleanup_url, confirmation, follow=True)

        self.assertFalse(incomplete_dir.exists())
        self.assertRedirects(response, reverse("host_retention_plan", args=[host.host]))
        self.assertContains(response, "Deleted 1 incomplete snapshot(s) for web-01.")
        self.assertFalse(SnapshotRecord.objects.filter(pk=record.pk).exists())
def test_incomplete_cleanup_rejects_bad_confirmation(self) -> None:
    """A wrong host confirmation leaves the incomplete record in place."""
    self.client.force_login(self.staff_user)
    snapshot_name = "20260519-031500Z__BROKEN01"
    host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
    SnapshotRecord.objects.create(
        host=host,
        kind=SnapshotRecord.Kind.INCOMPLETE,
        dirname=snapshot_name,
        path=f"/backups/{host.host}/.incomplete/{snapshot_name}",
        status="failed",
        started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
    )
    cleanup_url = reverse("cleanup_host_incomplete_snapshots", args=[host.host])
    bad_confirmation = {
        "max_delete": "1",
        "confirm_host": "wrong",
        "confirm_delete_count": "1",
    }

    response = self.client.post(cleanup_url, bad_confirmation, follow=True)

    self.assertRedirects(response, reverse("host_retention_plan", args=[host.host]))
    self.assertContains(response, "Incomplete cleanup confirmation is invalid.")
    remaining = SnapshotRecord.objects.filter(kind=SnapshotRecord.Kind.INCOMPLETE).count()
    self.assertEqual(remaining, 1)
def test_host_detail_surfaces_retention_warnings(self) -> None: def test_host_detail_surfaces_retention_warnings(self) -> None:
self.client.force_login(self.staff_user) self.client.force_login(self.staff_user)

View File

@@ -25,6 +25,7 @@ from .forms import (
CreateHostConfigForm, CreateHostConfigForm,
GlobalConfigForm, GlobalConfigForm,
HostConfigForm, HostConfigForm,
IncompleteCleanupForm,
ManualBackupForm, ManualBackupForm,
RetentionApplyForm, RetentionApplyForm,
SshCredentialGenerateForm, SshCredentialGenerateForm,
@@ -34,7 +35,7 @@ from .forms import (
from .host_ops import ensure_host_directories from .host_ops import ensure_host_directories
from .models import BackupRun, GlobalConfig, HostConfig, ScheduleConfig, SnapshotRecord, SshCredential from .models import BackupRun, GlobalConfig, HostConfig, ScheduleConfig, SnapshotRecord, SshCredential
from .preflight import collect_backup_gate, effective_host_config_preview, run_remote_preflight from .preflight import collect_backup_gate, effective_host_config_preview, run_remote_preflight
from .retention import run_sql_retention_apply, run_sql_retention_plan from .retention import run_incomplete_cleanup, run_sql_retention_apply, run_sql_retention_plan
from .self_check import collect_self_checks, summarize_self_checks from .self_check import collect_self_checks, summarize_self_checks
from .scheduler import next_due_after from .scheduler import next_due_after
from .snapshot_discovery import discover_snapshots, inspect_snapshot_discovery from .snapshot_discovery import discover_snapshots, inspect_snapshot_discovery
@@ -569,6 +570,7 @@ def host_retention_plan(request, host: str):
schedule = _schedule_for_host(host_config) schedule = _schedule_for_host(host_config)
scheduled_prune_limit = schedule.prune_max_delete if schedule and schedule.prune else None scheduled_prune_limit = schedule.prune_max_delete if schedule and schedule.prune else None
delete_count = len(plan["delete"]) delete_count = len(plan["delete"])
incomplete_count = len(plan["incomplete"])
context = { context = {
"host": host_config, "host": host_config,
"kind": kind, "kind": kind,
@@ -587,6 +589,14 @@ def host_retention_plan(request, host: str):
"confirm_delete_count": delete_count, "confirm_delete_count": delete_count,
}, },
), ),
"incomplete_cleanup_form": IncompleteCleanupForm(
host_name=host_config.host,
expected_delete_count=incomplete_count,
initial={
"max_delete": incomplete_count,
"confirm_delete_count": incomplete_count,
},
),
} }
return render(request, "pobsync_backend/retention_plan.html", context) return render(request, "pobsync_backend/retention_plan.html", context)
@@ -643,6 +653,40 @@ def apply_host_retention(request, host: str):
return target return target
@staff_member_required
@require_POST
def cleanup_host_incomplete_snapshots(request, host: str):
    """Handle the incomplete-snapshot cleanup form for one host.

    Recomputes the retention plan to learn the current incomplete count,
    validates the posted confirmation against it, runs the cleanup, and
    always redirects back to the host's retention plan page with a message.
    """
    host_config = get_object_or_404(HostConfig, host=host)
    host_name = host_config.host

    def _back_to_plan():
        # Every outcome lands on the same page; only the flash message differs.
        return redirect("host_retention_plan", host=host_name)

    try:
        plan = run_sql_retention_plan(host=host_name, kind="all", protect_bases=True)
    except PobsyncError as exc:
        messages.error(request, str(exc))
        return _back_to_plan()

    form = IncompleteCleanupForm(
        request.POST,
        host_name=host_name,
        expected_delete_count=len(plan.get("incomplete") or []),
    )
    if not form.is_valid():
        messages.error(request, "Incomplete cleanup confirmation is invalid.")
        return _back_to_plan()

    try:
        result = run_incomplete_cleanup(
            prefix=Path(settings.POBSYNC_HOME),
            host=host_name,
            yes=True,
            max_delete=form.cleaned_data["max_delete"],
        )
    except PobsyncError as exc:
        messages.error(request, str(exc))
        return _back_to_plan()

    messages.success(request, f"Deleted {len(result['deleted'])} incomplete snapshot(s) for {host_name}.")
    return _back_to_plan()
@staff_member_required @staff_member_required
def edit_host_config(request, host: str): def edit_host_config(request, host: str):
host_config = get_object_or_404(HostConfig, host=host) host_config = get_object_or_404(HostConfig, host=host)

View File

@@ -27,6 +27,11 @@ urlpatterns = [
path("hosts/<str:host>/discover-snapshots/", views.discover_host_snapshots, name="discover_host_snapshots"), path("hosts/<str:host>/discover-snapshots/", views.discover_host_snapshots, name="discover_host_snapshots"),
path("hosts/<str:host>/retention-apply/", views.apply_host_retention, name="apply_host_retention"), path("hosts/<str:host>/retention-apply/", views.apply_host_retention, name="apply_host_retention"),
path("hosts/<str:host>/retention-plan/", views.host_retention_plan, name="host_retention_plan"), path("hosts/<str:host>/retention-plan/", views.host_retention_plan, name="host_retention_plan"),
path(
"hosts/<str:host>/incomplete-cleanup/",
views.cleanup_host_incomplete_snapshots,
name="cleanup_host_incomplete_snapshots",
),
path("hosts/<str:host>/schedule/", views.edit_host_schedule, name="edit_host_schedule"), path("hosts/<str:host>/schedule/", views.edit_host_schedule, name="edit_host_schedule"),
path("runs/<int:run_id>/", views.run_detail, name="run_detail"), path("runs/<int:run_id>/", views.run_detail, name="run_detail"),
path("runs/<int:run_id>/rsync-log/", views.run_rsync_log, name="run_rsync_log"), path("runs/<int:run_id>/rsync-log/", views.run_rsync_log, name="run_rsync_log"),