(release) Add explicit incomplete snapshot cleanup
Add a dedicated cleanup path for incomplete snapshots instead of letting retention prune them implicitly. The retention plan now exposes a guarded form that requires host and delete-count confirmation before removing .incomplete snapshot directories and their SQL records. Keep scheduled/manual retention behavior unchanged, add path safety checks, and cover cleanup success, confirmation failures, max-delete limits, and unexpected paths in tests. Refs #10
This commit is contained in:
@@ -274,6 +274,36 @@ class RetentionApplyForm(forms.Form):
|
||||
return value
|
||||
|
||||
|
||||
class IncompleteCleanupForm(forms.Form):
|
||||
max_delete = forms.IntegerField(min_value=0, initial=0)
|
||||
confirm_delete_count = forms.IntegerField(min_value=0)
|
||||
confirm_host = forms.CharField()
|
||||
|
||||
def __init__(self, *args, host_name: str, expected_delete_count: int, **kwargs) -> None:
|
||||
self.host_name = host_name
|
||||
self.expected_delete_count = expected_delete_count
|
||||
super().__init__(*args, **kwargs)
|
||||
self.fields["confirm_host"].help_text = f"Type {host_name} to confirm incomplete snapshot cleanup."
|
||||
self.fields["confirm_delete_count"].help_text = (
|
||||
f"Type {expected_delete_count} to confirm the current number of incomplete snapshots."
|
||||
)
|
||||
self.fields["max_delete"].help_text = (
|
||||
f"Must be at least {expected_delete_count} for the incomplete snapshots shown here."
|
||||
)
|
||||
|
||||
def clean_confirm_host(self) -> str:
|
||||
value = self.cleaned_data["confirm_host"].strip()
|
||||
if value != self.host_name:
|
||||
raise forms.ValidationError(f"Type {self.host_name} to confirm.")
|
||||
return value
|
||||
|
||||
def clean_confirm_delete_count(self) -> int:
|
||||
value = self.cleaned_data["confirm_delete_count"]
|
||||
if value != self.expected_delete_count:
|
||||
raise forms.ValidationError(f"Type {self.expected_delete_count} to confirm the incomplete count.")
|
||||
return value
|
||||
|
||||
|
||||
class ScheduleConfigForm(forms.ModelForm):
|
||||
cron_expr = forms.CharField(
|
||||
label="Schedule expression",
|
||||
|
||||
@@ -131,6 +131,76 @@ def run_sql_retention_apply(
|
||||
return _do_apply()
|
||||
|
||||
|
||||
def run_incomplete_cleanup(
|
||||
*,
|
||||
prefix: Path,
|
||||
host: str,
|
||||
yes: bool,
|
||||
max_delete: int,
|
||||
acquire_lock: bool = True,
|
||||
) -> dict[str, Any]:
|
||||
host = sanitize_host(host)
|
||||
if not yes:
|
||||
raise ConfigError("Refusing to delete incomplete snapshots without --yes")
|
||||
if max_delete < 0:
|
||||
raise ConfigError("--max-delete must be >= 0")
|
||||
|
||||
paths = PobsyncPaths(home=prefix)
|
||||
|
||||
def _do_cleanup() -> dict[str, Any]:
|
||||
host_config = _enabled_host_config(host)
|
||||
incomplete_list = [
|
||||
_snapshot_to_item(snapshot, reasons=["manual incomplete cleanup"])
|
||||
for snapshot in _incomplete_snapshots_for_host(host_config)
|
||||
]
|
||||
if max_delete == 0 and len(incomplete_list) > 0:
|
||||
raise ConfigError("Incomplete cleanup blocked by --max-delete=0")
|
||||
if len(incomplete_list) > max_delete:
|
||||
raise ConfigError(
|
||||
f"Refusing to delete {len(incomplete_list)} incomplete snapshots (exceeds --max-delete={max_delete})"
|
||||
)
|
||||
|
||||
actions: list[str] = []
|
||||
deleted: list[dict[str, Any]] = []
|
||||
|
||||
for item in incomplete_list:
|
||||
dirname = item["dirname"]
|
||||
snap_path = Path(item["path"])
|
||||
path = _snapshot_delete_path(path=snap_path, dirname=dirname)
|
||||
_validate_incomplete_delete_path(host=host, path=path, dirname=dirname)
|
||||
|
||||
if not path.exists():
|
||||
actions.append(f"skip missing incomplete/{dirname}")
|
||||
elif not path.is_dir():
|
||||
raise ConfigError(f"Refusing to delete non-directory path: {path}")
|
||||
else:
|
||||
_remove_snapshot_tree(path)
|
||||
actions.append(f"deleted incomplete {dirname}")
|
||||
|
||||
SnapshotRecord.objects.filter(
|
||||
host__host=host,
|
||||
kind=SnapshotRecord.Kind.INCOMPLETE,
|
||||
dirname=dirname,
|
||||
).delete()
|
||||
deleted.append({"dirname": dirname, "kind": SnapshotRecord.Kind.INCOMPLETE, "path": str(path)})
|
||||
|
||||
return {
|
||||
"ok": True,
|
||||
"host": host,
|
||||
"kind": SnapshotRecord.Kind.INCOMPLETE,
|
||||
"max_delete": max_delete,
|
||||
"source": "sql",
|
||||
"planned_delete_count": len(incomplete_list),
|
||||
"deleted": deleted,
|
||||
"actions": actions,
|
||||
}
|
||||
|
||||
if acquire_lock:
|
||||
with acquire_host_lock(paths.locks_dir, host, command="incomplete-cleanup"):
|
||||
return _do_cleanup()
|
||||
return _do_cleanup()
|
||||
|
||||
|
||||
def _enabled_host_config(host: str) -> HostConfig:
|
||||
try:
|
||||
return HostConfig.objects.get(host=host, enabled=True)
|
||||
@@ -212,6 +282,15 @@ def _snapshot_delete_path(*, path: Path, dirname: str) -> Path:
|
||||
return path
|
||||
|
||||
|
||||
def _validate_incomplete_delete_path(*, host: str, path: Path, dirname: str) -> None:
|
||||
path_parts = path.parts
|
||||
if path.name != dirname or ".incomplete" not in path_parts or host not in path_parts:
|
||||
raise ConfigError(f"Refusing to delete unexpected incomplete snapshot path: {path}")
|
||||
incomplete_index = path_parts.index(".incomplete")
|
||||
if incomplete_index == 0 or path_parts[incomplete_index - 1] != host:
|
||||
raise ConfigError(f"Refusing to delete incomplete snapshot outside host backup root: {path}")
|
||||
|
||||
|
||||
def _remove_snapshot_tree(path: Path) -> None:
|
||||
_make_directories_user_writable(path)
|
||||
shutil.rmtree(path)
|
||||
|
||||
@@ -40,6 +40,10 @@
|
||||
{{ plan.incomplete|length }} incomplete snapshot(s) exist for this host. Retention does not delete incomplete
|
||||
snapshots automatically because they can indicate an interrupted backup that should be inspected first.
|
||||
</p>
|
||||
<p>
|
||||
After inspection, use the dedicated cleanup form below to delete only incomplete snapshot directories and their
|
||||
SQL records. Successful scheduled and manual snapshots are not touched by this cleanup.
|
||||
</p>
|
||||
</section>
|
||||
{% endif %}
|
||||
|
||||
@@ -190,6 +194,37 @@
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<h3>Cleanup Incomplete Snapshots</h3>
|
||||
<form method="post" action="{% url 'cleanup_host_incomplete_snapshots' host.host %}" class="form-grid">
|
||||
{% csrf_token %}
|
||||
{{ incomplete_cleanup_form.non_field_errors }}
|
||||
|
||||
<div class="field">
|
||||
{{ incomplete_cleanup_form.max_delete.errors }}
|
||||
<label for="{{ incomplete_cleanup_form.max_delete.id_for_label }}">Max delete</label>
|
||||
{{ incomplete_cleanup_form.max_delete }}
|
||||
<div class="helptext">{{ incomplete_cleanup_form.max_delete.help_text }}</div>
|
||||
</div>
|
||||
|
||||
<div class="field">
|
||||
{{ incomplete_cleanup_form.confirm_host.errors }}
|
||||
<label for="{{ incomplete_cleanup_form.confirm_host.id_for_label }}">Confirm host</label>
|
||||
{{ incomplete_cleanup_form.confirm_host }}
|
||||
<div class="helptext">{{ incomplete_cleanup_form.confirm_host.help_text }}</div>
|
||||
</div>
|
||||
|
||||
<div class="field">
|
||||
{{ incomplete_cleanup_form.confirm_delete_count.errors }}
|
||||
<label for="{{ incomplete_cleanup_form.confirm_delete_count.id_for_label }}">Confirm incomplete count</label>
|
||||
{{ incomplete_cleanup_form.confirm_delete_count }}
|
||||
<div class="helptext">{{ incomplete_cleanup_form.confirm_delete_count.help_text }}</div>
|
||||
</div>
|
||||
|
||||
<div class="actions">
|
||||
<button type="submit">Delete incomplete snapshots</button>
|
||||
</div>
|
||||
</form>
|
||||
</section>
|
||||
{% endif %}
|
||||
{% endblock %}
|
||||
|
||||
@@ -12,7 +12,7 @@ from django.test import TestCase
|
||||
|
||||
from pobsync.errors import ConfigError
|
||||
from pobsync_backend.models import HostConfig, SnapshotRecord
|
||||
from pobsync_backend.retention import run_sql_retention_apply, run_sql_retention_plan
|
||||
from pobsync_backend.retention import run_incomplete_cleanup, run_sql_retention_apply, run_sql_retention_plan
|
||||
|
||||
|
||||
class SqlRetentionTests(TestCase):
|
||||
@@ -152,6 +152,78 @@ class SqlRetentionTests(TestCase):
|
||||
acquire_lock=False,
|
||||
)
|
||||
|
||||
def test_incomplete_cleanup_deletes_directory_and_record(self) -> None:
|
||||
with TemporaryDirectory() as tmp:
|
||||
prefix = Path(tmp) / "home"
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
incomplete_dir = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-031500Z__BROKEN01"
|
||||
incomplete_dir.mkdir(parents=True)
|
||||
incomplete_dir.joinpath("partial-file").write_text("interrupted\n")
|
||||
record = SnapshotRecord.objects.create(
|
||||
host=host,
|
||||
kind=SnapshotRecord.Kind.INCOMPLETE,
|
||||
dirname=incomplete_dir.name,
|
||||
path=str(incomplete_dir),
|
||||
status="failed",
|
||||
started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
|
||||
)
|
||||
|
||||
result = run_incomplete_cleanup(
|
||||
prefix=prefix,
|
||||
host=host.host,
|
||||
yes=True,
|
||||
max_delete=1,
|
||||
acquire_lock=False,
|
||||
)
|
||||
|
||||
self.assertFalse(incomplete_dir.exists())
|
||||
self.assertFalse(SnapshotRecord.objects.filter(pk=record.pk).exists())
|
||||
self.assertEqual(
|
||||
result["deleted"],
|
||||
[{"dirname": incomplete_dir.name, "kind": SnapshotRecord.Kind.INCOMPLETE, "path": str(incomplete_dir)}],
|
||||
)
|
||||
self.assertEqual(result["planned_delete_count"], 1)
|
||||
|
||||
def test_incomplete_cleanup_respects_max_delete(self) -> None:
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
SnapshotRecord.objects.create(
|
||||
host=host,
|
||||
kind=SnapshotRecord.Kind.INCOMPLETE,
|
||||
dirname="20260519-031500Z__BROKEN01",
|
||||
path=f"/backups/{host.host}/.incomplete/20260519-031500Z__BROKEN01",
|
||||
status="failed",
|
||||
started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
|
||||
)
|
||||
|
||||
with self.assertRaisesRegex(ConfigError, "blocked by --max-delete=0"):
|
||||
run_incomplete_cleanup(
|
||||
prefix=Path("/tmp/pobsync-test"),
|
||||
host=host.host,
|
||||
yes=True,
|
||||
max_delete=0,
|
||||
acquire_lock=False,
|
||||
)
|
||||
|
||||
def test_incomplete_cleanup_rejects_unexpected_path(self) -> None:
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
SnapshotRecord.objects.create(
|
||||
host=host,
|
||||
kind=SnapshotRecord.Kind.INCOMPLETE,
|
||||
dirname="20260519-031500Z__BROKEN01",
|
||||
path=f"/backups/{host.host}/scheduled/20260519-031500Z__BROKEN01",
|
||||
status="failed",
|
||||
started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
|
||||
)
|
||||
|
||||
with self.assertRaisesRegex(ConfigError, "unexpected incomplete snapshot path"):
|
||||
run_incomplete_cleanup(
|
||||
prefix=Path("/tmp/pobsync-test"),
|
||||
host=host.host,
|
||||
yes=True,
|
||||
max_delete=1,
|
||||
acquire_lock=False,
|
||||
)
|
||||
|
||||
def test_management_command_plans_from_sql(self) -> None:
|
||||
host = HostConfig.objects.create(
|
||||
host="web-01",
|
||||
|
||||
@@ -1620,6 +1620,68 @@ class ViewTests(TestCase):
|
||||
self.assertContains(response, "Incomplete Snapshots")
|
||||
self.assertContains(response, "20260519-031500Z__BROKEN01")
|
||||
self.assertContains(response, "excluded from retention cleanup")
|
||||
self.assertContains(response, "Delete incomplete snapshots")
|
||||
self.assertContains(response, "Type 1 to confirm the current number of incomplete snapshots.")
|
||||
|
||||
def test_incomplete_cleanup_deletes_incomplete_snapshot_after_confirmation(self) -> None:
|
||||
self.client.force_login(self.staff_user)
|
||||
with TemporaryDirectory() as tmp:
|
||||
home = Path(tmp) / "home"
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
incomplete_dir = Path(tmp) / "backups" / host.host / ".incomplete" / "20260519-031500Z__BROKEN01"
|
||||
incomplete_dir.mkdir(parents=True)
|
||||
incomplete_dir.joinpath("partial-file").write_text("interrupted\n")
|
||||
record = SnapshotRecord.objects.create(
|
||||
host=host,
|
||||
kind=SnapshotRecord.Kind.INCOMPLETE,
|
||||
dirname=incomplete_dir.name,
|
||||
path=str(incomplete_dir),
|
||||
status="failed",
|
||||
started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
|
||||
)
|
||||
|
||||
with override_settings(POBSYNC_HOME=str(home)):
|
||||
response = self.client.post(
|
||||
reverse("cleanup_host_incomplete_snapshots", args=[host.host]),
|
||||
{
|
||||
"max_delete": "1",
|
||||
"confirm_host": host.host,
|
||||
"confirm_delete_count": "1",
|
||||
},
|
||||
follow=True,
|
||||
)
|
||||
|
||||
self.assertFalse(incomplete_dir.exists())
|
||||
|
||||
self.assertRedirects(response, reverse("host_retention_plan", args=[host.host]))
|
||||
self.assertContains(response, "Deleted 1 incomplete snapshot(s) for web-01.")
|
||||
self.assertFalse(SnapshotRecord.objects.filter(pk=record.pk).exists())
|
||||
|
||||
def test_incomplete_cleanup_rejects_bad_confirmation(self) -> None:
|
||||
self.client.force_login(self.staff_user)
|
||||
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
|
||||
SnapshotRecord.objects.create(
|
||||
host=host,
|
||||
kind=SnapshotRecord.Kind.INCOMPLETE,
|
||||
dirname="20260519-031500Z__BROKEN01",
|
||||
path=f"/backups/{host.host}/.incomplete/20260519-031500Z__BROKEN01",
|
||||
status="failed",
|
||||
started_at=datetime(2026, 5, 19, 3, 15, tzinfo=timezone.utc),
|
||||
)
|
||||
|
||||
response = self.client.post(
|
||||
reverse("cleanup_host_incomplete_snapshots", args=[host.host]),
|
||||
{
|
||||
"max_delete": "1",
|
||||
"confirm_host": "wrong",
|
||||
"confirm_delete_count": "1",
|
||||
},
|
||||
follow=True,
|
||||
)
|
||||
|
||||
self.assertRedirects(response, reverse("host_retention_plan", args=[host.host]))
|
||||
self.assertContains(response, "Incomplete cleanup confirmation is invalid.")
|
||||
self.assertEqual(SnapshotRecord.objects.filter(kind=SnapshotRecord.Kind.INCOMPLETE).count(), 1)
|
||||
|
||||
def test_host_detail_surfaces_retention_warnings(self) -> None:
|
||||
self.client.force_login(self.staff_user)
|
||||
|
||||
@@ -25,6 +25,7 @@ from .forms import (
|
||||
CreateHostConfigForm,
|
||||
GlobalConfigForm,
|
||||
HostConfigForm,
|
||||
IncompleteCleanupForm,
|
||||
ManualBackupForm,
|
||||
RetentionApplyForm,
|
||||
SshCredentialGenerateForm,
|
||||
@@ -34,7 +35,7 @@ from .forms import (
|
||||
from .host_ops import ensure_host_directories
|
||||
from .models import BackupRun, GlobalConfig, HostConfig, ScheduleConfig, SnapshotRecord, SshCredential
|
||||
from .preflight import collect_backup_gate, effective_host_config_preview, run_remote_preflight
|
||||
from .retention import run_sql_retention_apply, run_sql_retention_plan
|
||||
from .retention import run_incomplete_cleanup, run_sql_retention_apply, run_sql_retention_plan
|
||||
from .self_check import collect_self_checks, summarize_self_checks
|
||||
from .scheduler import next_due_after
|
||||
from .snapshot_discovery import discover_snapshots, inspect_snapshot_discovery
|
||||
@@ -569,6 +570,7 @@ def host_retention_plan(request, host: str):
|
||||
schedule = _schedule_for_host(host_config)
|
||||
scheduled_prune_limit = schedule.prune_max_delete if schedule and schedule.prune else None
|
||||
delete_count = len(plan["delete"])
|
||||
incomplete_count = len(plan["incomplete"])
|
||||
context = {
|
||||
"host": host_config,
|
||||
"kind": kind,
|
||||
@@ -587,6 +589,14 @@ def host_retention_plan(request, host: str):
|
||||
"confirm_delete_count": delete_count,
|
||||
},
|
||||
),
|
||||
"incomplete_cleanup_form": IncompleteCleanupForm(
|
||||
host_name=host_config.host,
|
||||
expected_delete_count=incomplete_count,
|
||||
initial={
|
||||
"max_delete": incomplete_count,
|
||||
"confirm_delete_count": incomplete_count,
|
||||
},
|
||||
),
|
||||
}
|
||||
return render(request, "pobsync_backend/retention_plan.html", context)
|
||||
|
||||
@@ -643,6 +653,40 @@ def apply_host_retention(request, host: str):
|
||||
return target
|
||||
|
||||
|
||||
@staff_member_required
|
||||
@require_POST
|
||||
def cleanup_host_incomplete_snapshots(request, host: str):
|
||||
host_config = get_object_or_404(HostConfig, host=host)
|
||||
try:
|
||||
plan = run_sql_retention_plan(host=host_config.host, kind="all", protect_bases=True)
|
||||
except PobsyncError as exc:
|
||||
messages.error(request, str(exc))
|
||||
return redirect("host_retention_plan", host=host_config.host)
|
||||
|
||||
incomplete_count = len(plan.get("incomplete") or [])
|
||||
form = IncompleteCleanupForm(
|
||||
request.POST,
|
||||
host_name=host_config.host,
|
||||
expected_delete_count=incomplete_count,
|
||||
)
|
||||
if not form.is_valid():
|
||||
messages.error(request, "Incomplete cleanup confirmation is invalid.")
|
||||
return redirect("host_retention_plan", host=host_config.host)
|
||||
|
||||
try:
|
||||
result = run_incomplete_cleanup(
|
||||
prefix=Path(settings.POBSYNC_HOME),
|
||||
host=host_config.host,
|
||||
yes=True,
|
||||
max_delete=form.cleaned_data["max_delete"],
|
||||
)
|
||||
except PobsyncError as exc:
|
||||
messages.error(request, str(exc))
|
||||
else:
|
||||
messages.success(request, f"Deleted {len(result['deleted'])} incomplete snapshot(s) for {host_config.host}.")
|
||||
return redirect("host_retention_plan", host=host_config.host)
|
||||
|
||||
|
||||
@staff_member_required
|
||||
def edit_host_config(request, host: str):
|
||||
host_config = get_object_or_404(HostConfig, host=host)
|
||||
|
||||
@@ -27,6 +27,11 @@ urlpatterns = [
|
||||
path("hosts/<str:host>/discover-snapshots/", views.discover_host_snapshots, name="discover_host_snapshots"),
|
||||
path("hosts/<str:host>/retention-apply/", views.apply_host_retention, name="apply_host_retention"),
|
||||
path("hosts/<str:host>/retention-plan/", views.host_retention_plan, name="host_retention_plan"),
|
||||
path(
|
||||
"hosts/<str:host>/incomplete-cleanup/",
|
||||
views.cleanup_host_incomplete_snapshots,
|
||||
name="cleanup_host_incomplete_snapshots",
|
||||
),
|
||||
path("hosts/<str:host>/schedule/", views.edit_host_schedule, name="edit_host_schedule"),
|
||||
path("runs/<int:run_id>/", views.run_detail, name="run_detail"),
|
||||
path("runs/<int:run_id>/rsync-log/", views.run_rsync_log, name="run_rsync_log"),
|
||||
|
||||
Reference in New Issue
Block a user