From 9ece39b72ef7435898146ed863e4924ac022e877 Mon Sep 17 00:00:00 2001 From: Peter van Arkel Date: Mon, 8 Jun 2026 23:18:50 +0200 Subject: [PATCH] (bugfix) Bound snapshot storage metadata scans Limit snapshot storage scans recorded by backup workers so very large backup targets cannot make run finalization walk unbounded file trees. Limited scans now record scan_limited, entries_scanned, and max_entries in snapshot storage metadata. Closes #100 --- src/pobsync/commands/run_scheduled.py | 7 +++- src/pobsync/run_stats.py | 36 ++++++++++++++++--- .../tests/test_run_scheduled_config_source.py | 5 ++- src/pobsync_backend/tests/test_run_stats.py | 36 ++++++++++++++++++- 4 files changed, 77 insertions(+), 7 deletions(-) diff --git a/src/pobsync/commands/run_scheduled.py b/src/pobsync/commands/run_scheduled.py index b9353f4..ee00ecd 100644 --- a/src/pobsync/commands/run_scheduled.py +++ b/src/pobsync/commands/run_scheduled.py @@ -24,6 +24,7 @@ from ..util import ensure_dir, realpath_startswith, sanitize_host, write_yaml_at DEFAULT_DRY_RUN_TIMEOUT_SECONDS = 900 RSYNC_PARTIAL_VANISHED_EXIT_CODE = 24 +SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES = 200_000 def dry_run_log_path(host: str, run_id: int | None = None) -> Path: @@ -100,7 +101,11 @@ def _collect_run_stats( ) -> dict[str, Any]: stats: dict[str, Any] = { "rsync": read_rsync_stats(log_path), - "storage": collect_storage_stats(backup_root=backup_root, snapshot_dir=snapshot_dir), + "storage": collect_storage_stats( + backup_root=backup_root, + snapshot_dir=snapshot_dir, + snapshot_max_entries=SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES if snapshot_dir is not None else None, + ), } if duration_seconds is not None: stats["duration_seconds"] = int(duration_seconds) diff --git a/src/pobsync/run_stats.py b/src/pobsync/run_stats.py index 5bf36d2..ee8ecb5 100644 --- a/src/pobsync/run_stats.py +++ b/src/pobsync/run_stats.py @@ -62,7 +62,12 @@ def read_rsync_stats(log_path: Path) -> dict[str, Any]: return parse_rsync_stats(text) -def collect_storage_stats(*, backup_root: Path, snapshot_dir: Path | None = None) -> dict[str, Any]: +def collect_storage_stats( + *, + backup_root: Path, + snapshot_dir: Path | None = None, + snapshot_max_entries: int | None = None, +) -> dict[str, Any]: stats: dict[str, Any] = { "backup_root": str(backup_root), } @@ -71,7 +76,7 @@ def collect_storage_stats(*, backup_root: Path, snapshot_dir: Path | None = None stats["capacity"] = capacity if snapshot_dir is not None: - snapshot_usage = tree_usage(snapshot_dir) + snapshot_usage = tree_usage(snapshot_dir, max_entries=snapshot_max_entries) if snapshot_usage: stats["snapshot"] = { "path": str(snapshot_dir), @@ -103,13 +108,15 @@ def filesystem_capacity(path: Path) -> dict[str, Any]: } -def tree_usage(path: Path) -> dict[str, Any]: +def tree_usage(path: Path, *, max_entries: int | None = None) -> dict[str, Any]: apparent_size = 0 allocated_size = 0 files = 0 directories = 0 hardlinked_files = 0 hardlinked_apparent_size = 0 + entries_scanned = 0 + scan_limited = False seen_allocated_inodes: set[tuple[int, int]] = set() try: @@ -119,6 +126,7 @@ def tree_usage(path: Path) -> dict[str, Any]: if path.is_file(): files = 1 + entries_scanned = 1 apparent_size = root_stat.st_size allocated_size = int(getattr(root_stat, "st_blocks", 0) * 512) if root_stat.st_nlink > 1: @@ -126,14 +134,25 @@ def tree_usage(path: Path) -> dict[str, Any]: hardlinked_apparent_size = root_stat.st_size else: for current_root, dirnames, filenames in path.walk(): - directories += len(dirnames) + for _dirname in dirnames: + if _scan_limit_reached(entries_scanned, max_entries): + scan_limited = True + break + directories += 1 + entries_scanned += 1 + if scan_limited: + break for filename in filenames: + if _scan_limit_reached(entries_scanned, max_entries): + scan_limited = True + break file_path = current_root / filename try: file_stat = file_path.lstat() except OSError: continue files += 1 + entries_scanned += 1 apparent_size += file_stat.st_size inode_key = (file_stat.st_dev, file_stat.st_ino) if inode_key not in seen_allocated_inodes: @@ -142,6 +161,8 @@ def tree_usage(path: Path) -> dict[str, Any]: if file_stat.st_nlink > 1: hardlinked_files += 1 hardlinked_apparent_size += file_stat.st_size + if scan_limited: + break return { "path": str(path), @@ -149,12 +170,19 @@ def tree_usage(path: Path) -> dict[str, Any]: "allocated_size_bytes": int(allocated_size), "files": files, "directories": directories, + "entries_scanned": entries_scanned, + "scan_limited": scan_limited, + "max_entries": max_entries, "hardlinked_files": hardlinked_files, "hardlinked_apparent_size_bytes": int(hardlinked_apparent_size), "hardlink_apparent_ratio": round(hardlinked_apparent_size / apparent_size, 4) if apparent_size else 0.0, } +def _scan_limit_reached(entries_scanned: int, max_entries: int | None) -> bool: + return max_entries is not None and max_entries >= 0 and entries_scanned >= max_entries + + def _parse_colon_stat(line: str, stats: dict[str, Any]) -> None: if ":" not in line: return diff --git a/src/pobsync_backend/tests/test_run_scheduled_config_source.py b/src/pobsync_backend/tests/test_run_scheduled_config_source.py index e15995c..1b875e3 100644 --- a/src/pobsync_backend/tests/test_run_scheduled_config_source.py +++ b/src/pobsync_backend/tests/test_run_scheduled_config_source.py @@ -6,7 +6,7 @@ from unittest.mock import patch from django.test import SimpleTestCase -from pobsync.commands.run_scheduled import run_scheduled +from pobsync.commands.run_scheduled import SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES, run_scheduled from pobsync.errors import ConfigError from pobsync.rsync import RsyncResult @@ -270,9 +270,12 @@ class RunScheduledConfigSourceTests(SimpleTestCase): self.assertEqual(result["stats"]["rsync"]["files_transferred"], 2) self.assertEqual(result["stats"]["rsync"]["link_dest_estimated_savings_bytes"], 1500) self.assertIn("snapshot", result["stats"]["storage"]) + self.assertEqual(result["stats"]["storage"]["snapshot"]["max_entries"], SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES) + self.assertFalse(result["stats"]["storage"]["snapshot"]["scan_limited"]) self.assertIn("capacity", result["stats"]["storage"]) self.assertIn("stats:", meta_text) self.assertIn("files_total: 10", meta_text) + self.assertIn(f"max_entries: {SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES}", meta_text) def test_real_run_reports_running_state_callback_before_rsync_returns(self) -> None: states = [] diff --git a/src/pobsync_backend/tests/test_run_stats.py b/src/pobsync_backend/tests/test_run_stats.py index 568dc39..8b289cc 100644 --- a/src/pobsync_backend/tests/test_run_stats.py +++ b/src/pobsync_backend/tests/test_run_stats.py @@ -6,7 +6,7 @@ from tempfile import TemporaryDirectory from django.test import SimpleTestCase -from pobsync.run_stats import parse_rsync_stats, tree_usage +from pobsync.run_stats import collect_storage_stats, parse_rsync_stats, tree_usage class RunStatsTests(SimpleTestCase): @@ -58,3 +58,37 @@ total size is 1.50M speedup is 125.00 self.assertEqual(stats["hardlinked_files"], 2) self.assertEqual(stats["hardlinked_apparent_size_bytes"], 6) self.assertEqual(stats["hardlink_apparent_ratio"], 1.0) + self.assertFalse(stats["scan_limited"]) + + def test_tree_usage_can_limit_large_scans(self) -> None: + with TemporaryDirectory() as tmp: + root = Path(tmp) + for index in range(5): + (root / f"file-{index}").write_bytes(b"x") + + stats = tree_usage(root, max_entries=2) + + self.assertEqual(stats["files"], 2) + self.assertEqual(stats["entries_scanned"], 2) + self.assertEqual(stats["max_entries"], 2) + self.assertTrue(stats["scan_limited"]) + self.assertEqual(stats["apparent_size_bytes"], 2) + + def test_collect_storage_stats_marks_limited_snapshot_scan(self) -> None: + with TemporaryDirectory() as tmp: + root = Path(tmp) + snapshot = root / "snapshot" + snapshot.mkdir() + for index in range(4): + (snapshot / f"file-{index}").write_bytes(b"x") + + stats = collect_storage_stats( + backup_root=root, + snapshot_dir=snapshot, + snapshot_max_entries=1, + ) + + self.assertEqual(stats["snapshot"]["files"], 1) + self.assertEqual(stats["snapshot"]["entries_scanned"], 1) + self.assertEqual(stats["snapshot"]["max_entries"], 1) + self.assertTrue(stats["snapshot"]["scan_limited"])