(bugfix) Mark retention failures as backup warnings

Add a warning status for BackupRun records so successful snapshots are not reported as failed when post-run SQL retention fails. Keep the prune error in the run result, link the successful snapshot, and let the management command complete with a warning instead of raising a backup failure. Include warning runs in backup trend summaries and add a regression test for successful backups with failed retention cleanup.
2026-05-19 23:20:52 +02:00
parent 1e04da9de8
commit 8bff241b12
6 changed files with 49 additions and 15 deletions
--- a/src/pobsync_backend/backup_runner.py
+++ b/src/pobsync_backend/backup_runner.py
@@ -111,7 +111,7 @@ def execute_backup_run(
            )
        except Exception as exc:
            result["prune"] = {"ok": False, "error": str(exc), "type": type(exc).__name__}
-            run.status = BackupRun.Status.FAILED
+            run.status = BackupRun.Status.WARNING
            run.result = result
            run.snapshot = snapshot_record
            run.save(
@@ -125,7 +125,6 @@ def execute_backup_run(
                    "result",
                ],
            )
-            raise

    run.snapshot = snapshot_record
    run.result = result
--- a/src/pobsync_backend/management/commands/run_pobsync_backup.py
+++ b/src/pobsync_backend/management/commands/run_pobsync_backup.py
@@ -60,5 +60,8 @@ class Command(BaseCommand):
        if run.status == BackupRun.Status.SUCCESS:
            self.stdout.write(self.style.SUCCESS(f"Backup completed for {host.host}."))
            return
+        if run.status == BackupRun.Status.WARNING:
+            self.stdout.write(self.style.WARNING(f"Backup completed with warnings for {host.host}; run id={run.id}"))
+            return

        raise CommandError(f"Backup failed for {host.host}; run id={run.id}")
--- a/src/pobsync_backend/migrations/0008_alter_backuprun_status.py
+++ b/src/pobsync_backend/migrations/0008_alter_backuprun_status.py
@@ -0,0 +1,27 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("pobsync_backend", "0007_filesystem_ssh_credentials"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="backuprun",
+            name="status",
+            field=models.CharField(
+                choices=[
+                    ("queued", "Queued"),
+                    ("running", "Running"),
+                    ("success", "Success"),
+                    ("warning", "Warning"),
+                    ("failed", "Failed"),
+                    ("cancelled", "Cancelled"),
+                ],
+                default="queued",
+                max_length=16,
+            ),
+        ),
+    ]
--- a/src/pobsync_backend/models.py
+++ b/src/pobsync_backend/models.py
@@ -105,6 +105,7 @@ class BackupRun(models.Model):
        QUEUED = "queued", "Queued"
        RUNNING = "running", "Running"
        SUCCESS = "success", "Success"
+        WARNING = "warning", "Warning"
        FAILED = "failed", "Failed"
        CANCELLED = "cancelled", "Cancelled"

--- a/src/pobsync_backend/stats_summary.py
+++ b/src/pobsync_backend/stats_summary.py
@@ -13,7 +13,7 @@ from .models import BackupRun, GlobalConfig, HostConfig, SnapshotRecord
 def collect_dashboard_stats(*, hosts: Iterable[HostConfig], global_config: GlobalConfig | None) -> dict[str, Any]:
    runs = list(
        BackupRun.objects.select_related("host", "snapshot")
-        .filter(status=BackupRun.Status.SUCCESS)
+        .filter(status__in=_COMPLETED_BACKUP_STATUSES)
        .order_by("-started_at", "-created_at")[:100]
    )
    real_runs = [_run_summary(run) for run in runs if _is_real_run(run)]
@@ -52,7 +52,7 @@ def collect_dashboard_stats(*, hosts: Iterable[HostConfig], global_config: Globa


 def collect_host_stats(*, host: HostConfig, limit: int = 8) -> dict[str, Any]:
-    runs = list(host.runs.select_related("snapshot").filter(status=BackupRun.Status.SUCCESS).order_by("-started_at", "-created_at")[:50])
+    runs = list(host.runs.select_related("snapshot").filter(status__in=_COMPLETED_BACKUP_STATUSES).order_by("-started_at", "-created_at")[:50])
    real_runs = [_run_summary(run) for run in runs if _is_real_run(run)]
    trend_runs = [run for run in real_runs if run["has_stats"]][:limit]
    latest_snapshot = host.snapshots.order_by("-started_at", "-discovered_at", "-id").first()
@@ -198,3 +198,6 @@ def _int_at(data: dict[str, Any], *keys: str) -> int | None:
    if isinstance(value, float):
        return int(value)
    return None
+
+
+_COMPLETED_BACKUP_STATUSES = [BackupRun.Status.SUCCESS, BackupRun.Status.WARNING]
--- a/src/pobsync_backend/tests/test_run_backup_records_snapshot.py
+++ b/src/pobsync_backend/tests/test_run_backup_records_snapshot.py
@@ -102,7 +102,7 @@ class RunBackupRecordsSnapshotTests(TestCase):
        self.assertEqual(run.status, BackupRun.Status.SUCCESS)
        self.assertEqual(run.result["prune"], {"ok": True, "source": "sql", "deleted": []})

-    def test_prune_failure_is_recorded_on_backup_run(self) -> None:
+    def test_prune_failure_marks_backup_run_as_warning(self) -> None:
        with TemporaryDirectory() as tmp:
            backup_root = Path(tmp) / "backups"
            GlobalConfig.objects.create(name="default", backup_root=str(backup_root))
@@ -128,19 +128,20 @@ class RunBackupRecordsSnapshotTests(TestCase):
                }
                retention_apply.side_effect = ConfigError("Deletion blocked by --max-delete=0")

-                with self.assertRaises(ConfigError):
+                output = StringIO()
                call_command(
                    "run_pobsync_backup",
                    host.host,
                    prefix=str(Path(tmp) / "home"),
                    prune=True,
                    prune_max_delete=0,
-                        stdout=StringIO(),
+                    stdout=output,
                )

        run = BackupRun.objects.get()
-        self.assertEqual(run.status, BackupRun.Status.FAILED)
+        self.assertEqual(run.status, BackupRun.Status.WARNING)
        self.assertIsNotNone(run.snapshot)
+        self.assertIn("completed with warnings", output.getvalue())
        self.assertEqual(run.result["prune"]["ok"], False)
        self.assertEqual(run.result["prune"]["type"], "ConfigError")
        self.assertEqual(run.result["prune"]["error"], "Deletion blocked by --max-delete=0")