(bugfix) Bound snapshot storage metadata scans

Limit snapshot storage scans recorded by backup workers so that very large backup targets cannot make run finalization walk unbounded file trees. Snapshot storage metadata now records scan_limited, entries_scanned, and max_entries for every scan, so totals from a truncated scan can be told apart from complete ones.

Closes #100
This commit is contained in:
2026-06-08 23:18:50 +02:00
parent 42b3430274
commit 9ece39b72e
4 changed files with 77 additions and 7 deletions

View File

@@ -24,6 +24,7 @@ from ..util import ensure_dir, realpath_startswith, sanitize_host, write_yaml_at
DEFAULT_DRY_RUN_TIMEOUT_SECONDS = 900 DEFAULT_DRY_RUN_TIMEOUT_SECONDS = 900
RSYNC_PARTIAL_VANISHED_EXIT_CODE = 24 RSYNC_PARTIAL_VANISHED_EXIT_CODE = 24
SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES = 200_000
def dry_run_log_path(host: str, run_id: int | None = None) -> Path: def dry_run_log_path(host: str, run_id: int | None = None) -> Path:
@@ -100,7 +101,11 @@ def _collect_run_stats(
) -> dict[str, Any]: ) -> dict[str, Any]:
stats: dict[str, Any] = { stats: dict[str, Any] = {
"rsync": read_rsync_stats(log_path), "rsync": read_rsync_stats(log_path),
"storage": collect_storage_stats(backup_root=backup_root, snapshot_dir=snapshot_dir), "storage": collect_storage_stats(
backup_root=backup_root,
snapshot_dir=snapshot_dir,
snapshot_max_entries=SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES if snapshot_dir is not None else None,
),
} }
if duration_seconds is not None: if duration_seconds is not None:
stats["duration_seconds"] = int(duration_seconds) stats["duration_seconds"] = int(duration_seconds)

View File

@@ -62,7 +62,12 @@ def read_rsync_stats(log_path: Path) -> dict[str, Any]:
return parse_rsync_stats(text) return parse_rsync_stats(text)
def collect_storage_stats(*, backup_root: Path, snapshot_dir: Path | None = None) -> dict[str, Any]: def collect_storage_stats(
*,
backup_root: Path,
snapshot_dir: Path | None = None,
snapshot_max_entries: int | None = None,
) -> dict[str, Any]:
stats: dict[str, Any] = { stats: dict[str, Any] = {
"backup_root": str(backup_root), "backup_root": str(backup_root),
} }
@@ -71,7 +76,7 @@ def collect_storage_stats(*, backup_root: Path, snapshot_dir: Path | None = None
stats["capacity"] = capacity stats["capacity"] = capacity
if snapshot_dir is not None: if snapshot_dir is not None:
snapshot_usage = tree_usage(snapshot_dir) snapshot_usage = tree_usage(snapshot_dir, max_entries=snapshot_max_entries)
if snapshot_usage: if snapshot_usage:
stats["snapshot"] = { stats["snapshot"] = {
"path": str(snapshot_dir), "path": str(snapshot_dir),
@@ -103,13 +108,15 @@ def filesystem_capacity(path: Path) -> dict[str, Any]:
} }
def tree_usage(path: Path) -> dict[str, Any]: def tree_usage(path: Path, *, max_entries: int | None = None) -> dict[str, Any]:
apparent_size = 0 apparent_size = 0
allocated_size = 0 allocated_size = 0
files = 0 files = 0
directories = 0 directories = 0
hardlinked_files = 0 hardlinked_files = 0
hardlinked_apparent_size = 0 hardlinked_apparent_size = 0
entries_scanned = 0
scan_limited = False
seen_allocated_inodes: set[tuple[int, int]] = set() seen_allocated_inodes: set[tuple[int, int]] = set()
try: try:
@@ -119,6 +126,7 @@ def tree_usage(path: Path) -> dict[str, Any]:
if path.is_file(): if path.is_file():
files = 1 files = 1
entries_scanned = 1
apparent_size = root_stat.st_size apparent_size = root_stat.st_size
allocated_size = int(getattr(root_stat, "st_blocks", 0) * 512) allocated_size = int(getattr(root_stat, "st_blocks", 0) * 512)
if root_stat.st_nlink > 1: if root_stat.st_nlink > 1:
@@ -126,14 +134,25 @@ def tree_usage(path: Path) -> dict[str, Any]:
hardlinked_apparent_size = root_stat.st_size hardlinked_apparent_size = root_stat.st_size
else: else:
for current_root, dirnames, filenames in path.walk(): for current_root, dirnames, filenames in path.walk():
directories += len(dirnames) for _dirname in dirnames:
if _scan_limit_reached(entries_scanned, max_entries):
scan_limited = True
break
directories += 1
entries_scanned += 1
if scan_limited:
break
for filename in filenames: for filename in filenames:
if _scan_limit_reached(entries_scanned, max_entries):
scan_limited = True
break
file_path = current_root / filename file_path = current_root / filename
try: try:
file_stat = file_path.lstat() file_stat = file_path.lstat()
except OSError: except OSError:
continue continue
files += 1 files += 1
entries_scanned += 1
apparent_size += file_stat.st_size apparent_size += file_stat.st_size
inode_key = (file_stat.st_dev, file_stat.st_ino) inode_key = (file_stat.st_dev, file_stat.st_ino)
if inode_key not in seen_allocated_inodes: if inode_key not in seen_allocated_inodes:
@@ -142,6 +161,8 @@ def tree_usage(path: Path) -> dict[str, Any]:
if file_stat.st_nlink > 1: if file_stat.st_nlink > 1:
hardlinked_files += 1 hardlinked_files += 1
hardlinked_apparent_size += file_stat.st_size hardlinked_apparent_size += file_stat.st_size
if scan_limited:
break
return { return {
"path": str(path), "path": str(path),
@@ -149,12 +170,19 @@ def tree_usage(path: Path) -> dict[str, Any]:
"allocated_size_bytes": int(allocated_size), "allocated_size_bytes": int(allocated_size),
"files": files, "files": files,
"directories": directories, "directories": directories,
"entries_scanned": entries_scanned,
"scan_limited": scan_limited,
"max_entries": max_entries,
"hardlinked_files": hardlinked_files, "hardlinked_files": hardlinked_files,
"hardlinked_apparent_size_bytes": int(hardlinked_apparent_size), "hardlinked_apparent_size_bytes": int(hardlinked_apparent_size),
"hardlink_apparent_ratio": round(hardlinked_apparent_size / apparent_size, 4) if apparent_size else 0.0, "hardlink_apparent_ratio": round(hardlinked_apparent_size / apparent_size, 4) if apparent_size else 0.0,
} }
def _scan_limit_reached(entries_scanned: int, max_entries: int | None) -> bool:
return max_entries is not None and max_entries >= 0 and entries_scanned >= max_entries
def _parse_colon_stat(line: str, stats: dict[str, Any]) -> None: def _parse_colon_stat(line: str, stats: dict[str, Any]) -> None:
if ":" not in line: if ":" not in line:
return return

View File

@@ -6,7 +6,7 @@ from unittest.mock import patch
from django.test import SimpleTestCase from django.test import SimpleTestCase
from pobsync.commands.run_scheduled import run_scheduled from pobsync.commands.run_scheduled import SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES, run_scheduled
from pobsync.errors import ConfigError from pobsync.errors import ConfigError
from pobsync.rsync import RsyncResult from pobsync.rsync import RsyncResult
@@ -270,9 +270,12 @@ class RunScheduledConfigSourceTests(SimpleTestCase):
self.assertEqual(result["stats"]["rsync"]["files_transferred"], 2) self.assertEqual(result["stats"]["rsync"]["files_transferred"], 2)
self.assertEqual(result["stats"]["rsync"]["link_dest_estimated_savings_bytes"], 1500) self.assertEqual(result["stats"]["rsync"]["link_dest_estimated_savings_bytes"], 1500)
self.assertIn("snapshot", result["stats"]["storage"]) self.assertIn("snapshot", result["stats"]["storage"])
self.assertEqual(result["stats"]["storage"]["snapshot"]["max_entries"], SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES)
self.assertFalse(result["stats"]["storage"]["snapshot"]["scan_limited"])
self.assertIn("capacity", result["stats"]["storage"]) self.assertIn("capacity", result["stats"]["storage"])
self.assertIn("stats:", meta_text) self.assertIn("stats:", meta_text)
self.assertIn("files_total: 10", meta_text) self.assertIn("files_total: 10", meta_text)
self.assertIn(f"max_entries: {SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES}", meta_text)
def test_real_run_reports_running_state_callback_before_rsync_returns(self) -> None: def test_real_run_reports_running_state_callback_before_rsync_returns(self) -> None:
states = [] states = []

View File

@@ -6,7 +6,7 @@ from tempfile import TemporaryDirectory
from django.test import SimpleTestCase from django.test import SimpleTestCase
from pobsync.run_stats import parse_rsync_stats, tree_usage from pobsync.run_stats import collect_storage_stats, parse_rsync_stats, tree_usage
class RunStatsTests(SimpleTestCase): class RunStatsTests(SimpleTestCase):
@@ -58,3 +58,37 @@ total size is 1.50M speedup is 125.00
self.assertEqual(stats["hardlinked_files"], 2) self.assertEqual(stats["hardlinked_files"], 2)
self.assertEqual(stats["hardlinked_apparent_size_bytes"], 6) self.assertEqual(stats["hardlinked_apparent_size_bytes"], 6)
self.assertEqual(stats["hardlink_apparent_ratio"], 1.0) self.assertEqual(stats["hardlink_apparent_ratio"], 1.0)
self.assertFalse(stats["scan_limited"])
def test_tree_usage_can_limit_large_scans(self) -> None:
    """A scan capped at two entries stops early and flags the truncation."""
    with TemporaryDirectory() as scratch:
        base = Path(scratch)
        # Five one-byte files: more entries than the cap of two allows.
        for number in range(5):
            target = base / f"file-{number}"
            target.write_bytes(b"x")

        usage = tree_usage(base, max_entries=2)

        # Every file is one byte, so two scanned files mean two bytes.
        self.assertEqual(usage["files"], 2)
        self.assertEqual(usage["entries_scanned"], 2)
        self.assertEqual(usage["max_entries"], 2)
        self.assertTrue(usage["scan_limited"])
        self.assertEqual(usage["apparent_size_bytes"], 2)
def test_collect_storage_stats_marks_limited_snapshot_scan(self) -> None:
    """The snapshot entry in the storage stats carries the scan-limit fields."""
    with TemporaryDirectory() as scratch:
        backup_root = Path(scratch)
        snapshot_dir = backup_root / "snapshot"
        snapshot_dir.mkdir()
        # Four files against a budget of one entry forces a truncated scan.
        for number in range(4):
            target = snapshot_dir / f"file-{number}"
            target.write_bytes(b"x")

        storage = collect_storage_stats(
            backup_root=backup_root,
            snapshot_dir=snapshot_dir,
            snapshot_max_entries=1,
        )

        self.assertEqual(storage["snapshot"]["files"], 1)
        self.assertEqual(storage["snapshot"]["entries_scanned"], 1)
        self.assertEqual(storage["snapshot"]["max_entries"], 1)
        self.assertTrue(storage["snapshot"]["scan_limited"])