(bugfix) Mark retention failures as backup warnings
Add a warning status for BackupRun records so successful snapshots are not reported as failed when post-run SQL retention fails. Keep the prune error in the run result, link the successful snapshot, and let the management command complete with a warning instead of raising a backup failure. Include warning runs in backup trend summaries and add a regression test for successful backups with failed retention cleanup.
This commit is contained in:
@@ -111,7 +111,7 @@ def execute_backup_run(
|
|||||||
)
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
result["prune"] = {"ok": False, "error": str(exc), "type": type(exc).__name__}
|
result["prune"] = {"ok": False, "error": str(exc), "type": type(exc).__name__}
|
||||||
run.status = BackupRun.Status.FAILED
|
run.status = BackupRun.Status.WARNING
|
||||||
run.result = result
|
run.result = result
|
||||||
run.snapshot = snapshot_record
|
run.snapshot = snapshot_record
|
||||||
run.save(
|
run.save(
|
||||||
@@ -125,7 +125,6 @@ def execute_backup_run(
|
|||||||
"result",
|
"result",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
raise
|
|
||||||
|
|
||||||
run.snapshot = snapshot_record
|
run.snapshot = snapshot_record
|
||||||
run.result = result
|
run.result = result
|
||||||
|
|||||||
@@ -60,5 +60,8 @@ class Command(BaseCommand):
|
|||||||
if run.status == BackupRun.Status.SUCCESS:
|
if run.status == BackupRun.Status.SUCCESS:
|
||||||
self.stdout.write(self.style.SUCCESS(f"Backup completed for {host.host}."))
|
self.stdout.write(self.style.SUCCESS(f"Backup completed for {host.host}."))
|
||||||
return
|
return
|
||||||
|
if run.status == BackupRun.Status.WARNING:
|
||||||
|
self.stdout.write(self.style.WARNING(f"Backup completed with warnings for {host.host}; run id={run.id}"))
|
||||||
|
return
|
||||||
|
|
||||||
raise CommandError(f"Backup failed for {host.host}; run id={run.id}")
|
raise CommandError(f"Backup failed for {host.host}; run id={run.id}")
|
||||||
|
|||||||
@@ -0,0 +1,27 @@
|
|||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
("pobsync_backend", "0007_filesystem_ssh_credentials"),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name="backuprun",
|
||||||
|
name="status",
|
||||||
|
field=models.CharField(
|
||||||
|
choices=[
|
||||||
|
("queued", "Queued"),
|
||||||
|
("running", "Running"),
|
||||||
|
("success", "Success"),
|
||||||
|
("warning", "Warning"),
|
||||||
|
("failed", "Failed"),
|
||||||
|
("cancelled", "Cancelled"),
|
||||||
|
],
|
||||||
|
default="queued",
|
||||||
|
max_length=16,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
]
|
||||||
@@ -105,6 +105,7 @@ class BackupRun(models.Model):
|
|||||||
QUEUED = "queued", "Queued"
|
QUEUED = "queued", "Queued"
|
||||||
RUNNING = "running", "Running"
|
RUNNING = "running", "Running"
|
||||||
SUCCESS = "success", "Success"
|
SUCCESS = "success", "Success"
|
||||||
|
WARNING = "warning", "Warning"
|
||||||
FAILED = "failed", "Failed"
|
FAILED = "failed", "Failed"
|
||||||
CANCELLED = "cancelled", "Cancelled"
|
CANCELLED = "cancelled", "Cancelled"
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from .models import BackupRun, GlobalConfig, HostConfig, SnapshotRecord
|
|||||||
def collect_dashboard_stats(*, hosts: Iterable[HostConfig], global_config: GlobalConfig | None) -> dict[str, Any]:
|
def collect_dashboard_stats(*, hosts: Iterable[HostConfig], global_config: GlobalConfig | None) -> dict[str, Any]:
|
||||||
runs = list(
|
runs = list(
|
||||||
BackupRun.objects.select_related("host", "snapshot")
|
BackupRun.objects.select_related("host", "snapshot")
|
||||||
.filter(status=BackupRun.Status.SUCCESS)
|
.filter(status__in=_COMPLETED_BACKUP_STATUSES)
|
||||||
.order_by("-started_at", "-created_at")[:100]
|
.order_by("-started_at", "-created_at")[:100]
|
||||||
)
|
)
|
||||||
real_runs = [_run_summary(run) for run in runs if _is_real_run(run)]
|
real_runs = [_run_summary(run) for run in runs if _is_real_run(run)]
|
||||||
@@ -52,7 +52,7 @@ def collect_dashboard_stats(*, hosts: Iterable[HostConfig], global_config: Globa
|
|||||||
|
|
||||||
|
|
||||||
def collect_host_stats(*, host: HostConfig, limit: int = 8) -> dict[str, Any]:
|
def collect_host_stats(*, host: HostConfig, limit: int = 8) -> dict[str, Any]:
|
||||||
runs = list(host.runs.select_related("snapshot").filter(status=BackupRun.Status.SUCCESS).order_by("-started_at", "-created_at")[:50])
|
runs = list(host.runs.select_related("snapshot").filter(status__in=_COMPLETED_BACKUP_STATUSES).order_by("-started_at", "-created_at")[:50])
|
||||||
real_runs = [_run_summary(run) for run in runs if _is_real_run(run)]
|
real_runs = [_run_summary(run) for run in runs if _is_real_run(run)]
|
||||||
trend_runs = [run for run in real_runs if run["has_stats"]][:limit]
|
trend_runs = [run for run in real_runs if run["has_stats"]][:limit]
|
||||||
latest_snapshot = host.snapshots.order_by("-started_at", "-discovered_at", "-id").first()
|
latest_snapshot = host.snapshots.order_by("-started_at", "-discovered_at", "-id").first()
|
||||||
@@ -198,3 +198,6 @@ def _int_at(data: dict[str, Any], *keys: str) -> int | None:
|
|||||||
if isinstance(value, float):
|
if isinstance(value, float):
|
||||||
return int(value)
|
return int(value)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
_COMPLETED_BACKUP_STATUSES = [BackupRun.Status.SUCCESS, BackupRun.Status.WARNING]
|
||||||
|
|||||||
@@ -102,7 +102,7 @@ class RunBackupRecordsSnapshotTests(TestCase):
|
|||||||
self.assertEqual(run.status, BackupRun.Status.SUCCESS)
|
self.assertEqual(run.status, BackupRun.Status.SUCCESS)
|
||||||
self.assertEqual(run.result["prune"], {"ok": True, "source": "sql", "deleted": []})
|
self.assertEqual(run.result["prune"], {"ok": True, "source": "sql", "deleted": []})
|
||||||
|
|
||||||
def test_prune_failure_is_recorded_on_backup_run(self) -> None:
|
def test_prune_failure_marks_backup_run_as_warning(self) -> None:
|
||||||
with TemporaryDirectory() as tmp:
|
with TemporaryDirectory() as tmp:
|
||||||
backup_root = Path(tmp) / "backups"
|
backup_root = Path(tmp) / "backups"
|
||||||
GlobalConfig.objects.create(name="default", backup_root=str(backup_root))
|
GlobalConfig.objects.create(name="default", backup_root=str(backup_root))
|
||||||
@@ -128,19 +128,20 @@ class RunBackupRecordsSnapshotTests(TestCase):
|
|||||||
}
|
}
|
||||||
retention_apply.side_effect = ConfigError("Deletion blocked by --max-delete=0")
|
retention_apply.side_effect = ConfigError("Deletion blocked by --max-delete=0")
|
||||||
|
|
||||||
with self.assertRaises(ConfigError):
|
output = StringIO()
|
||||||
call_command(
|
call_command(
|
||||||
"run_pobsync_backup",
|
"run_pobsync_backup",
|
||||||
host.host,
|
host.host,
|
||||||
prefix=str(Path(tmp) / "home"),
|
prefix=str(Path(tmp) / "home"),
|
||||||
prune=True,
|
prune=True,
|
||||||
prune_max_delete=0,
|
prune_max_delete=0,
|
||||||
stdout=StringIO(),
|
stdout=output,
|
||||||
)
|
)
|
||||||
|
|
||||||
run = BackupRun.objects.get()
|
run = BackupRun.objects.get()
|
||||||
self.assertEqual(run.status, BackupRun.Status.FAILED)
|
self.assertEqual(run.status, BackupRun.Status.WARNING)
|
||||||
self.assertIsNotNone(run.snapshot)
|
self.assertIsNotNone(run.snapshot)
|
||||||
|
self.assertIn("completed with warnings", output.getvalue())
|
||||||
self.assertEqual(run.result["prune"]["ok"], False)
|
self.assertEqual(run.result["prune"]["ok"], False)
|
||||||
self.assertEqual(run.result["prune"]["type"], "ConfigError")
|
self.assertEqual(run.result["prune"]["type"], "ConfigError")
|
||||||
self.assertEqual(run.result["prune"]["error"], "Deletion blocked by --max-delete=0")
|
self.assertEqual(run.result["prune"]["error"], "Deletion blocked by --max-delete=0")
|
||||||
|
|||||||
Reference in New Issue
Block a user