Merge pull request '(bugfix) Measure incomplete snapshot data from disk' (#70) from issue-69-data-overview-incomplete-sizes into master

Reviewed-on: #70
This commit was merged in pull request #70.
This commit is contained in:
2026-05-28 21:34:22 +02:00
6 changed files with 167 additions and 26 deletions

View File

@@ -120,12 +120,15 @@ def _backup_data_by_kind(host: HostConfig) -> dict[str, Any]:
row = rows.setdefault(snapshot.kind, _empty_snapshot_data_row())
allocated = summary.get("allocated_size_bytes") or summary.get("apparent_size_bytes") or 0
apparent = summary.get("apparent_size_bytes") or 0
unique_apparent = summary.get("unique_apparent_size_bytes") or 0
row["count"] += 1
row["allocated_size_bytes"] += int(allocated)
row["apparent_size_bytes"] += int(apparent)
row["unique_apparent_size_bytes"] += int(unique_apparent)
total["count"] += 1
total["allocated_size_bytes"] += int(allocated)
total["apparent_size_bytes"] += int(apparent)
total["unique_apparent_size_bytes"] += int(unique_apparent)
return {
"scheduled": rows[SnapshotRecord.Kind.SCHEDULED],
@@ -140,6 +143,7 @@ def _empty_snapshot_data_row() -> dict[str, int]:
"count": 0,
"allocated_size_bytes": 0,
"apparent_size_bytes": 0,
"unique_apparent_size_bytes": 0,
}
@@ -157,6 +161,7 @@ def _sum_backup_data_by_kind(rows: Iterable[dict[str, dict[str, int]]]) -> dict[
total_row["count"] += values.get("count", 0)
total_row["allocated_size_bytes"] += values.get("allocated_size_bytes", 0)
total_row["apparent_size_bytes"] += values.get("apparent_size_bytes", 0)
total_row["unique_apparent_size_bytes"] += values.get("unique_apparent_size_bytes", 0)
return total_rows
@@ -168,21 +173,28 @@ def _snapshot_summary(snapshot: SnapshotRecord | None) -> dict[str, Any]:
stats = metadata.get("stats") if isinstance(metadata.get("stats"), dict) else {}
storage = stats.get("storage") if isinstance(stats.get("storage"), dict) else {}
snapshot_storage = storage.get("snapshot") if isinstance(storage.get("snapshot"), dict) else {}
has_recorded_size = (
_int_at(snapshot_storage, "allocated_size_bytes") is not None
or _int_at(snapshot_storage, "apparent_size_bytes") is not None
)
if not has_recorded_size:
if snapshot.kind == SnapshotRecord.Kind.INCOMPLETE:
snapshot_storage = _snapshot_storage_from_filesystem(snapshot)
else:
has_recorded_size = (
_int_at(snapshot_storage, "allocated_size_bytes") is not None
or _int_at(snapshot_storage, "apparent_size_bytes") is not None
)
if not has_recorded_size:
snapshot_storage = _snapshot_storage_from_filesystem(snapshot)
apparent_size = _int_at(snapshot_storage, "apparent_size_bytes")
hardlinked_apparent = _int_at(snapshot_storage, "hardlinked_apparent_size_bytes") or 0
return {
"id": snapshot.id,
"dirname": snapshot.dirname,
"kind": snapshot.kind,
"status": snapshot.status,
"started_at": snapshot.started_at,
"apparent_size_bytes": _int_at(snapshot_storage, "apparent_size_bytes"),
"apparent_size_bytes": apparent_size,
"allocated_size_bytes": _int_at(snapshot_storage, "allocated_size_bytes"),
"hardlinked_files": _int_at(snapshot_storage, "hardlinked_files"),
"hardlinked_apparent_size_bytes": hardlinked_apparent,
"unique_apparent_size_bytes": max((apparent_size or 0) - hardlinked_apparent, 0),
}

View File

@@ -178,21 +178,28 @@
<div class="metric">
<div class="label">Scheduled</div>
<div class="value">{{ stats_summary.backup_data.scheduled.allocated_size_bytes|filesizeformat }}</div>
<div class="muted">unique {{ stats_summary.backup_data.scheduled.unique_apparent_size_bytes|filesizeformat }}</div>
</div>
<div class="metric">
<div class="label">Manual</div>
<div class="value">{{ stats_summary.backup_data.manual.allocated_size_bytes|filesizeformat }}</div>
<div class="muted">unique {{ stats_summary.backup_data.manual.unique_apparent_size_bytes|filesizeformat }}</div>
</div>
<div class="metric">
<div class="label">Incomplete</div>
<div class="value">{{ stats_summary.backup_data.incomplete.allocated_size_bytes|filesizeformat }}</div>
<div class="muted">measured from disk</div>
</div>
<div class="metric">
<div class="label">Total</div>
<div class="value">{{ stats_summary.backup_data.total.allocated_size_bytes|filesizeformat }}</div>
<div class="muted">unique {{ stats_summary.backup_data.total.unique_apparent_size_bytes|filesizeformat }}</div>
</div>
</section>
<p class="muted">Totals use the allocated snapshot size recorded in backup metadata, grouped by snapshot kind.</p>
<p class="muted">
Main totals use allocated snapshot size. Unique values estimate non-hardlinked visible data; incomplete
snapshots are measured from disk because their metadata can be stale.
</p>
</section>
{% if stats_summary.runs %}

View File

@@ -105,18 +105,22 @@
<div class="host-card-stat">
<div class="label">Scheduled data</div>
<div class="value">{{ host.stats_summary.backup_data.scheduled.allocated_size_bytes|filesizeformat }}</div>
<div class="muted">unique {{ host.stats_summary.backup_data.scheduled.unique_apparent_size_bytes|filesizeformat }}</div>
</div>
<div class="host-card-stat">
<div class="label">Manual data</div>
<div class="value">{{ host.stats_summary.backup_data.manual.allocated_size_bytes|filesizeformat }}</div>
<div class="muted">unique {{ host.stats_summary.backup_data.manual.unique_apparent_size_bytes|filesizeformat }}</div>
</div>
<div class="host-card-stat">
<div class="label">Incomplete data</div>
<div class="value">{{ host.stats_summary.backup_data.incomplete.allocated_size_bytes|filesizeformat }}</div>
<div class="muted">measured from disk</div>
</div>
<div class="host-card-stat">
<div class="label">Total data</div>
<div class="value">{{ host.stats_summary.backup_data.total.allocated_size_bytes|filesizeformat }}</div>
<div class="muted">unique {{ host.stats_summary.backup_data.total.unique_apparent_size_bytes|filesizeformat }}</div>
</div>
</div>
</div>

View File

@@ -130,18 +130,22 @@
<div>
<span class="label">Scheduled data</span>
<strong>{{ stats_summary.backup_data.scheduled.allocated_size_bytes|filesizeformat }}</strong>
<span class="muted">unique {{ stats_summary.backup_data.scheduled.unique_apparent_size_bytes|filesizeformat }}</span>
</div>
<div>
<span class="label">Manual data</span>
<strong>{{ stats_summary.backup_data.manual.allocated_size_bytes|filesizeformat }}</strong>
<span class="muted">unique {{ stats_summary.backup_data.manual.unique_apparent_size_bytes|filesizeformat }}</span>
</div>
<div>
<span class="label">Incomplete data</span>
<strong>{{ stats_summary.backup_data.incomplete.allocated_size_bytes|filesizeformat }}</strong>
<span class="muted">measured from disk</span>
</div>
<div>
<span class="label">Total snapshot data</span>
<strong>{{ stats_summary.backup_data.total.allocated_size_bytes|filesizeformat }}</strong>
<span class="muted">unique {{ stats_summary.backup_data.total.unique_apparent_size_bytes|filesizeformat }}</span>
</div>
</div>
</article>

View File

@@ -18,32 +18,44 @@ class StatsSummaryTests(TestCase):
self._snapshot(web, "20260519-021500Z__SCHED01", SnapshotRecord.Kind.SCHEDULED, allocated=100)
self._snapshot(web, "20260519-031500Z__MANUAL1", SnapshotRecord.Kind.MANUAL, allocated=200)
self._snapshot(db, "20260519-041500Z__SCHED02", SnapshotRecord.Kind.SCHEDULED, allocated=300)
self._snapshot(db, "20260519-051500Z__BROKEN1", SnapshotRecord.Kind.INCOMPLETE, allocated=400)
with TemporaryDirectory() as tmp:
incomplete_usage = self._incomplete_snapshot_on_disk(
db,
Path(tmp),
"20260519-051500Z__BROKEN1",
)
stats = collect_dashboard_stats(hosts=[web, db], global_config=None)
stats = collect_dashboard_stats(hosts=[web, db], global_config=None)
self.assertEqual(stats["backup_data"]["scheduled"]["count"], 2)
self.assertEqual(stats["backup_data"]["scheduled"]["allocated_size_bytes"], 400)
self.assertEqual(stats["backup_data"]["manual"]["allocated_size_bytes"], 200)
self.assertEqual(stats["backup_data"]["incomplete"]["allocated_size_bytes"], 400)
self.assertEqual(stats["backup_data"]["incomplete"]["allocated_size_bytes"], incomplete_usage["allocated_size_bytes"])
self.assertEqual(stats["backup_data"]["total"]["count"], 4)
self.assertEqual(stats["backup_data"]["total"]["allocated_size_bytes"], 1000)
self.assertEqual(stats["backup_data"]["total"]["allocated_size_bytes"], 600 + incomplete_usage["allocated_size_bytes"])
self.assertEqual(stats["backup_data"]["total"]["unique_apparent_size_bytes"], 1200 + incomplete_usage["apparent_size_bytes"])
def test_collect_host_stats_sums_backup_data_by_snapshot_kind(self) -> None:
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
self._snapshot(host, "20260519-021500Z__SCHED01", SnapshotRecord.Kind.SCHEDULED, allocated=100)
self._snapshot(host, "20260519-031500Z__SCHED02", SnapshotRecord.Kind.SCHEDULED, allocated=200)
self._snapshot(host, "20260519-041500Z__MANUAL1", SnapshotRecord.Kind.MANUAL, allocated=300)
self._snapshot(host, "20260519-051500Z__BROKEN1", SnapshotRecord.Kind.INCOMPLETE, allocated=400)
with TemporaryDirectory() as tmp:
incomplete_usage = self._incomplete_snapshot_on_disk(
host,
Path(tmp),
"20260519-051500Z__BROKEN1",
)
stats = collect_host_stats(host=host)
stats = collect_host_stats(host=host)
self.assertEqual(stats["backup_data"]["scheduled"]["count"], 2)
self.assertEqual(stats["backup_data"]["scheduled"]["allocated_size_bytes"], 300)
self.assertEqual(stats["backup_data"]["manual"]["allocated_size_bytes"], 300)
self.assertEqual(stats["backup_data"]["incomplete"]["allocated_size_bytes"], 400)
self.assertEqual(stats["backup_data"]["incomplete"]["allocated_size_bytes"], incomplete_usage["allocated_size_bytes"])
self.assertEqual(stats["backup_data"]["total"]["count"], 4)
self.assertEqual(stats["backup_data"]["total"]["allocated_size_bytes"], 1000)
self.assertEqual(stats["backup_data"]["total"]["allocated_size_bytes"], 600 + incomplete_usage["allocated_size_bytes"])
self.assertEqual(stats["backup_data"]["total"]["unique_apparent_size_bytes"], 1200 + incomplete_usage["apparent_size_bytes"])
def test_collect_host_stats_falls_back_to_filesystem_usage_for_snapshots_without_metadata(self) -> None:
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
@@ -81,8 +93,87 @@ class StatsSummaryTests(TestCase):
expected_usage["allocated_size_bytes"],
)
def test_collect_host_stats_measures_incomplete_data_from_disk_even_with_stale_metadata(self) -> None:
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
with TemporaryDirectory() as tmp:
incomplete_dir = Path(tmp) / host.host / ".incomplete" / "20260519-051500Z__BROKEN1"
data_dir = incomplete_dir / "data"
data_dir.mkdir(parents=True)
data_dir.joinpath("partial-file").write_text("interrupted backup data\n", encoding="utf-8")
expected_usage = tree_usage(data_dir)
SnapshotRecord.objects.create(
host=host,
kind=SnapshotRecord.Kind.INCOMPLETE,
dirname=incomplete_dir.name,
path=str(incomplete_dir),
status="failed",
metadata={
"stats": {
"storage": {
"snapshot": {
"apparent_size_bytes": 0,
"allocated_size_bytes": 0,
}
}
}
},
)
stats = collect_host_stats(host=host)
self.assertEqual(
stats["backup_data"]["incomplete"]["allocated_size_bytes"],
expected_usage["allocated_size_bytes"],
)
self.assertGreater(stats["backup_data"]["incomplete"]["apparent_size_bytes"], 0)
def test_collect_host_stats_reports_non_hardlinked_snapshot_data(self) -> None:
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
self._snapshot_with_sizes(
host,
"20260519-021500Z__SCHED01",
SnapshotRecord.Kind.SCHEDULED,
allocated=1_200,
apparent=2_000,
hardlinked_apparent=1_500,
)
stats = collect_host_stats(host=host)
self.assertEqual(stats["backup_data"]["scheduled"]["apparent_size_bytes"], 2_000)
self.assertEqual(stats["backup_data"]["scheduled"]["unique_apparent_size_bytes"], 500)
self.assertEqual(stats["backup_data"]["total"]["unique_apparent_size_bytes"], 500)
def _snapshot(self, host: HostConfig, dirname: str, kind: str, *, allocated: int) -> SnapshotRecord:
return self._snapshot_with_sizes(host, dirname, kind, allocated=allocated)
def _incomplete_snapshot_on_disk(self, host: HostConfig, root: Path, dirname: str) -> dict:
incomplete_dir = root / host.host / ".incomplete" / dirname
data_dir = incomplete_dir / "data"
data_dir.mkdir(parents=True)
data_dir.joinpath("partial-file").write_text("interrupted backup data\n", encoding="utf-8")
usage = tree_usage(data_dir)
SnapshotRecord.objects.create(
host=host,
kind=SnapshotRecord.Kind.INCOMPLETE,
dirname=dirname,
path=str(incomplete_dir),
status="failed",
)
return usage
def _snapshot_with_sizes(
self,
host: HostConfig,
dirname: str,
kind: str,
*,
allocated: int,
apparent: int | None = None,
hardlinked_apparent: int = 0,
) -> SnapshotRecord:
started_at = datetime.strptime(dirname.split("__", 1)[0], "%Y%m%d-%H%M%SZ").replace(tzinfo=timezone.utc)
apparent_size = apparent if apparent is not None else allocated * 2
return SnapshotRecord.objects.create(
host=host,
kind=kind,
@@ -94,8 +185,9 @@ class StatsSummaryTests(TestCase):
"stats": {
"storage": {
"snapshot": {
"apparent_size_bytes": allocated * 2,
"apparent_size_bytes": apparent_size,
"allocated_size_bytes": allocated,
"hardlinked_apparent_size_bytes": hardlinked_apparent,
}
}
}

View File

@@ -265,12 +265,23 @@ class ViewTests(TestCase):
db = HostConfig.objects.create(host="db-01", address="db-01.example.test")
scheduled = self._snapshot(web, "20260519-021500Z__SCHED01", kind=SnapshotRecord.Kind.SCHEDULED)
manual = self._snapshot(web, "20260519-031500Z__MANUAL1", kind=SnapshotRecord.Kind.MANUAL)
incomplete = self._snapshot(db, "20260519-041500Z__BROKEN1", kind=SnapshotRecord.Kind.INCOMPLETE)
self._set_snapshot_storage(scheduled, allocated=100)
self._set_snapshot_storage(manual, allocated=200)
self._set_snapshot_storage(incomplete, allocated=300)
with TemporaryDirectory() as tmp:
incomplete_dir = Path(tmp) / db.host / ".incomplete" / "20260519-041500Z__BROKEN1"
data_dir = incomplete_dir / "data"
data_dir.mkdir(parents=True)
data_dir.joinpath("partial-file").write_text("interrupted backup data\n", encoding="utf-8")
expected_usage = tree_usage(data_dir)
SnapshotRecord.objects.create(
host=db,
kind=SnapshotRecord.Kind.INCOMPLETE,
dirname=incomplete_dir.name,
path=str(incomplete_dir),
status="failed",
)
response = self.client.get(reverse("dashboard_priority_live"))
response = self.client.get(reverse("dashboard_priority_live"))
self.assertEqual(response.status_code, 200)
self.assertContains(response, "Scheduled data")
@@ -279,8 +290,8 @@ class ViewTests(TestCase):
self.assertContains(response, "Total snapshot data")
self.assertContains(response, "100&nbsp;bytes", html=True)
self.assertContains(response, "200&nbsp;bytes", html=True)
self.assertContains(response, "300&nbsp;bytes", html=True)
self.assertContains(response, "600&nbsp;bytes", html=True)
self.assertContains(response, filesizeformat(expected_usage["allocated_size_bytes"]))
self.assertContains(response, filesizeformat(300 + expected_usage["allocated_size_bytes"]))
def test_dashboard_hosts_live_returns_hosts_partial(self) -> None:
self.client.force_login(self.staff_user)
@@ -300,12 +311,23 @@ class ViewTests(TestCase):
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
scheduled = self._snapshot(host, "20260519-021500Z__SCHED01", kind=SnapshotRecord.Kind.SCHEDULED)
manual = self._snapshot(host, "20260519-031500Z__MANUAL1", kind=SnapshotRecord.Kind.MANUAL)
incomplete = self._snapshot(host, "20260519-041500Z__BROKEN1", kind=SnapshotRecord.Kind.INCOMPLETE)
self._set_snapshot_storage(scheduled, allocated=100)
self._set_snapshot_storage(manual, allocated=200)
self._set_snapshot_storage(incomplete, allocated=300)
with TemporaryDirectory() as tmp:
incomplete_dir = Path(tmp) / host.host / ".incomplete" / "20260519-041500Z__BROKEN1"
data_dir = incomplete_dir / "data"
data_dir.mkdir(parents=True)
data_dir.joinpath("partial-file").write_text("interrupted backup data\n", encoding="utf-8")
expected_usage = tree_usage(data_dir)
SnapshotRecord.objects.create(
host=host,
kind=SnapshotRecord.Kind.INCOMPLETE,
dirname=incomplete_dir.name,
path=str(incomplete_dir),
status="failed",
)
response = self.client.get(reverse("dashboard_hosts_live"))
response = self.client.get(reverse("dashboard_hosts_live"))
self.assertEqual(response.status_code, 200)
self.assertContains(response, "Scheduled data")
@@ -314,8 +336,8 @@ class ViewTests(TestCase):
self.assertContains(response, "Total data")
self.assertContains(response, "100&nbsp;bytes", html=True)
self.assertContains(response, "200&nbsp;bytes", html=True)
self.assertContains(response, "300&nbsp;bytes", html=True)
self.assertContains(response, "600&nbsp;bytes", html=True)
self.assertContains(response, filesizeformat(expected_usage["allocated_size_bytes"]))
self.assertContains(response, filesizeformat(300 + expected_usage["allocated_size_bytes"]))
def test_dashboard_host_cards_measure_incomplete_data_without_snapshot_metadata(self) -> None:
self.client.force_login(self.staff_user)