(bugfix) Bound snapshot storage metadata scans

Limit snapshot storage scans recorded by backup workers so very large backup targets cannot make run finalization walk unbounded file trees. Snapshot storage metadata now records scan_limited, entries_scanned, and max_entries whether or not the limit was reached; scan_limited is true when the scan stopped early at the entry cap.

Closes #100
This commit is contained in:
2026-06-08 23:18:50 +02:00
parent 42b3430274
commit 9ece39b72e
4 changed files with 77 additions and 7 deletions

View File

@@ -24,6 +24,7 @@ from ..util import ensure_dir, realpath_startswith, sanitize_host, write_yaml_at
DEFAULT_DRY_RUN_TIMEOUT_SECONDS = 900
RSYNC_PARTIAL_VANISHED_EXIT_CODE = 24
SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES = 200_000
def dry_run_log_path(host: str, run_id: int | None = None) -> Path:
@@ -100,7 +101,11 @@ def _collect_run_stats(
) -> dict[str, Any]:
stats: dict[str, Any] = {
"rsync": read_rsync_stats(log_path),
"storage": collect_storage_stats(backup_root=backup_root, snapshot_dir=snapshot_dir),
"storage": collect_storage_stats(
backup_root=backup_root,
snapshot_dir=snapshot_dir,
snapshot_max_entries=SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES if snapshot_dir is not None else None,
),
}
if duration_seconds is not None:
stats["duration_seconds"] = int(duration_seconds)

View File

@@ -62,7 +62,12 @@ def read_rsync_stats(log_path: Path) -> dict[str, Any]:
return parse_rsync_stats(text)
def collect_storage_stats(*, backup_root: Path, snapshot_dir: Path | None = None) -> dict[str, Any]:
def collect_storage_stats(
*,
backup_root: Path,
snapshot_dir: Path | None = None,
snapshot_max_entries: int | None = None,
) -> dict[str, Any]:
stats: dict[str, Any] = {
"backup_root": str(backup_root),
}
@@ -71,7 +76,7 @@ def collect_storage_stats(*, backup_root: Path, snapshot_dir: Path | None = None
stats["capacity"] = capacity
if snapshot_dir is not None:
snapshot_usage = tree_usage(snapshot_dir)
snapshot_usage = tree_usage(snapshot_dir, max_entries=snapshot_max_entries)
if snapshot_usage:
stats["snapshot"] = {
"path": str(snapshot_dir),
@@ -103,13 +108,15 @@ def filesystem_capacity(path: Path) -> dict[str, Any]:
}
def tree_usage(path: Path) -> dict[str, Any]:
def tree_usage(path: Path, *, max_entries: int | None = None) -> dict[str, Any]:
apparent_size = 0
allocated_size = 0
files = 0
directories = 0
hardlinked_files = 0
hardlinked_apparent_size = 0
entries_scanned = 0
scan_limited = False
seen_allocated_inodes: set[tuple[int, int]] = set()
try:
@@ -119,6 +126,7 @@ def tree_usage(path: Path) -> dict[str, Any]:
if path.is_file():
files = 1
entries_scanned = 1
apparent_size = root_stat.st_size
allocated_size = int(getattr(root_stat, "st_blocks", 0) * 512)
if root_stat.st_nlink > 1:
@@ -126,14 +134,25 @@ def tree_usage(path: Path) -> dict[str, Any]:
hardlinked_apparent_size = root_stat.st_size
else:
for current_root, dirnames, filenames in path.walk():
directories += len(dirnames)
for _dirname in dirnames:
if _scan_limit_reached(entries_scanned, max_entries):
scan_limited = True
break
directories += 1
entries_scanned += 1
if scan_limited:
break
for filename in filenames:
if _scan_limit_reached(entries_scanned, max_entries):
scan_limited = True
break
file_path = current_root / filename
try:
file_stat = file_path.lstat()
except OSError:
continue
files += 1
entries_scanned += 1
apparent_size += file_stat.st_size
inode_key = (file_stat.st_dev, file_stat.st_ino)
if inode_key not in seen_allocated_inodes:
@@ -142,6 +161,8 @@ def tree_usage(path: Path) -> dict[str, Any]:
if file_stat.st_nlink > 1:
hardlinked_files += 1
hardlinked_apparent_size += file_stat.st_size
if scan_limited:
break
return {
"path": str(path),
@@ -149,12 +170,19 @@ def tree_usage(path: Path) -> dict[str, Any]:
"allocated_size_bytes": int(allocated_size),
"files": files,
"directories": directories,
"entries_scanned": entries_scanned,
"scan_limited": scan_limited,
"max_entries": max_entries,
"hardlinked_files": hardlinked_files,
"hardlinked_apparent_size_bytes": int(hardlinked_apparent_size),
"hardlink_apparent_ratio": round(hardlinked_apparent_size / apparent_size, 4) if apparent_size else 0.0,
}
def _scan_limit_reached(entries_scanned: int, max_entries: int | None) -> bool:
return max_entries is not None and max_entries >= 0 and entries_scanned >= max_entries
def _parse_colon_stat(line: str, stats: dict[str, Any]) -> None:
if ":" not in line:
return

View File

@@ -6,7 +6,7 @@ from unittest.mock import patch
from django.test import SimpleTestCase
from pobsync.commands.run_scheduled import run_scheduled
from pobsync.commands.run_scheduled import SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES, run_scheduled
from pobsync.errors import ConfigError
from pobsync.rsync import RsyncResult
@@ -270,9 +270,12 @@ class RunScheduledConfigSourceTests(SimpleTestCase):
self.assertEqual(result["stats"]["rsync"]["files_transferred"], 2)
self.assertEqual(result["stats"]["rsync"]["link_dest_estimated_savings_bytes"], 1500)
self.assertIn("snapshot", result["stats"]["storage"])
self.assertEqual(result["stats"]["storage"]["snapshot"]["max_entries"], SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES)
self.assertFalse(result["stats"]["storage"]["snapshot"]["scan_limited"])
self.assertIn("capacity", result["stats"]["storage"])
self.assertIn("stats:", meta_text)
self.assertIn("files_total: 10", meta_text)
self.assertIn(f"max_entries: {SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES}", meta_text)
def test_real_run_reports_running_state_callback_before_rsync_returns(self) -> None:
states = []

View File

@@ -6,7 +6,7 @@ from tempfile import TemporaryDirectory
from django.test import SimpleTestCase
from pobsync.run_stats import parse_rsync_stats, tree_usage
from pobsync.run_stats import collect_storage_stats, parse_rsync_stats, tree_usage
class RunStatsTests(SimpleTestCase):
@@ -58,3 +58,37 @@ total size is 1.50M speedup is 125.00
self.assertEqual(stats["hardlinked_files"], 2)
self.assertEqual(stats["hardlinked_apparent_size_bytes"], 6)
self.assertEqual(stats["hardlink_apparent_ratio"], 1.0)
self.assertFalse(stats["scan_limited"])
def test_tree_usage_can_limit_large_scans(self) -> None:
    """A ``max_entries`` cap stops the scan early and is reported in the stats."""
    with TemporaryDirectory() as scratch:
        tree = Path(scratch)
        # Create more one-byte files than the cap allows so the limit is hit.
        for index in range(5):
            target = tree / f"file-{index}"
            target.write_bytes(b"x")
        usage = tree_usage(tree, max_entries=2)
        # Only the entries counted before the cap contribute to the totals.
        self.assertEqual(usage["files"], 2)
        self.assertEqual(usage["entries_scanned"], 2)
        self.assertEqual(usage["max_entries"], 2)
        self.assertTrue(usage["scan_limited"])
        self.assertEqual(usage["apparent_size_bytes"], 2)
def test_collect_storage_stats_marks_limited_snapshot_scan(self) -> None:
with TemporaryDirectory() as tmp:
root = Path(tmp)
snapshot = root / "snapshot"
snapshot.mkdir()
for index in range(4):
(snapshot / f"file-{index}").write_bytes(b"x")
stats = collect_storage_stats(
backup_root=root,
snapshot_dir=snapshot,
snapshot_max_entries=1,
)
self.assertEqual(stats["snapshot"]["files"], 1)
self.assertEqual(stats["snapshot"]["entries_scanned"], 1)
self.assertEqual(stats["snapshot"]["max_entries"], 1)
self.assertTrue(stats["snapshot"]["scan_limited"])