diff --git a/src/pobsync/commands/run_scheduled.py b/src/pobsync/commands/run_scheduled.py index b9353f4..ee00ecd 100644 --- a/src/pobsync/commands/run_scheduled.py +++ b/src/pobsync/commands/run_scheduled.py @@ -24,6 +24,7 @@ from ..util import ensure_dir, realpath_startswith, sanitize_host, write_yaml_at DEFAULT_DRY_RUN_TIMEOUT_SECONDS = 900 RSYNC_PARTIAL_VANISHED_EXIT_CODE = 24 +SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES = 200_000 def dry_run_log_path(host: str, run_id: int | None = None) -> Path: @@ -100,7 +101,11 @@ def _collect_run_stats( ) -> dict[str, Any]: stats: dict[str, Any] = { "rsync": read_rsync_stats(log_path), - "storage": collect_storage_stats(backup_root=backup_root, snapshot_dir=snapshot_dir), + "storage": collect_storage_stats( + backup_root=backup_root, + snapshot_dir=snapshot_dir, + snapshot_max_entries=SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES if snapshot_dir is not None else None, + ), } if duration_seconds is not None: stats["duration_seconds"] = int(duration_seconds) diff --git a/src/pobsync/run_stats.py b/src/pobsync/run_stats.py index 5bf36d2..ee8ecb5 100644 --- a/src/pobsync/run_stats.py +++ b/src/pobsync/run_stats.py @@ -62,7 +62,12 @@ def read_rsync_stats(log_path: Path) -> dict[str, Any]: return parse_rsync_stats(text) -def collect_storage_stats(*, backup_root: Path, snapshot_dir: Path | None = None) -> dict[str, Any]: +def collect_storage_stats( + *, + backup_root: Path, + snapshot_dir: Path | None = None, + snapshot_max_entries: int | None = None, +) -> dict[str, Any]: stats: dict[str, Any] = { "backup_root": str(backup_root), } @@ -71,7 +76,7 @@ def collect_storage_stats(*, backup_root: Path, snapshot_dir: Path | None = None stats["capacity"] = capacity if snapshot_dir is not None: - snapshot_usage = tree_usage(snapshot_dir) + snapshot_usage = tree_usage(snapshot_dir, max_entries=snapshot_max_entries) if snapshot_usage: stats["snapshot"] = { "path": str(snapshot_dir), @@ -103,13 +108,15 @@ def filesystem_capacity(path: Path) -> dict[str, Any]: } -def tree_usage(path: Path) -> dict[str, Any]: +def tree_usage(path: Path, *, max_entries: int | None = None) -> dict[str, Any]: apparent_size = 0 allocated_size = 0 files = 0 directories = 0 hardlinked_files = 0 hardlinked_apparent_size = 0 + entries_scanned = 0 + scan_limited = False seen_allocated_inodes: set[tuple[int, int]] = set() try: @@ -119,6 +126,7 @@ def tree_usage(path: Path) -> dict[str, Any]: if path.is_file(): files = 1 + entries_scanned = 1 apparent_size = root_stat.st_size allocated_size = int(getattr(root_stat, "st_blocks", 0) * 512) if root_stat.st_nlink > 1: @@ -126,14 +134,25 @@ def tree_usage(path: Path) -> dict[str, Any]: hardlinked_apparent_size = root_stat.st_size else: for current_root, dirnames, filenames in path.walk(): - directories += len(dirnames) + for _dirname in dirnames: + if _scan_limit_reached(entries_scanned, max_entries): + scan_limited = True + break + directories += 1 + entries_scanned += 1 + if scan_limited: + break for filename in filenames: + if _scan_limit_reached(entries_scanned, max_entries): + scan_limited = True + break file_path = current_root / filename try: file_stat = file_path.lstat() except OSError: continue files += 1 + entries_scanned += 1 apparent_size += file_stat.st_size inode_key = (file_stat.st_dev, file_stat.st_ino) if inode_key not in seen_allocated_inodes: @@ -142,6 +161,8 @@ def tree_usage(path: Path) -> dict[str, Any]: if file_stat.st_nlink > 1: hardlinked_files += 1 hardlinked_apparent_size += file_stat.st_size + if scan_limited: + break return { "path": str(path), @@ -149,12 +170,19 @@ def tree_usage(path: Path) -> dict[str, Any]: "allocated_size_bytes": int(allocated_size), "files": files, "directories": directories, + "entries_scanned": entries_scanned, + "scan_limited": scan_limited, + "max_entries": max_entries, "hardlinked_files": hardlinked_files, "hardlinked_apparent_size_bytes": int(hardlinked_apparent_size), "hardlink_apparent_ratio": round(hardlinked_apparent_size / apparent_size, 4) if apparent_size else 0.0, } +def _scan_limit_reached(entries_scanned: int, max_entries: int | None) -> bool: + return max_entries is not None and max_entries >= 0 and entries_scanned >= max_entries + + def _parse_colon_stat(line: str, stats: dict[str, Any]) -> None: if ":" not in line: return diff --git a/src/pobsync_backend/tests/test_run_scheduled_config_source.py b/src/pobsync_backend/tests/test_run_scheduled_config_source.py index e15995c..1b875e3 100644 --- a/src/pobsync_backend/tests/test_run_scheduled_config_source.py +++ b/src/pobsync_backend/tests/test_run_scheduled_config_source.py @@ -6,7 +6,7 @@ from unittest.mock import patch from django.test import SimpleTestCase -from pobsync.commands.run_scheduled import run_scheduled +from pobsync.commands.run_scheduled import SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES, run_scheduled from pobsync.errors import ConfigError from pobsync.rsync import RsyncResult @@ -270,9 +270,12 @@ class RunScheduledConfigSourceTests(SimpleTestCase): self.assertEqual(result["stats"]["rsync"]["files_transferred"], 2) self.assertEqual(result["stats"]["rsync"]["link_dest_estimated_savings_bytes"], 1500) self.assertIn("snapshot", result["stats"]["storage"]) + self.assertEqual(result["stats"]["storage"]["snapshot"]["max_entries"], SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES) + self.assertFalse(result["stats"]["storage"]["snapshot"]["scan_limited"]) self.assertIn("capacity", result["stats"]["storage"]) self.assertIn("stats:", meta_text) self.assertIn("files_total: 10", meta_text) + self.assertIn(f"max_entries: {SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES}", meta_text) def test_real_run_reports_running_state_callback_before_rsync_returns(self) -> None: states = [] diff --git a/src/pobsync_backend/tests/test_run_stats.py b/src/pobsync_backend/tests/test_run_stats.py index 568dc39..8b289cc 100644 --- a/src/pobsync_backend/tests/test_run_stats.py +++ b/src/pobsync_backend/tests/test_run_stats.py @@ -6,7 +6,7 @@ from tempfile import TemporaryDirectory from django.test import SimpleTestCase -from pobsync.run_stats import parse_rsync_stats, tree_usage +from pobsync.run_stats import collect_storage_stats, parse_rsync_stats, tree_usage class RunStatsTests(SimpleTestCase): @@ -58,3 +58,37 @@ total size is 1.50M speedup is 125.00 self.assertEqual(stats["hardlinked_files"], 2) self.assertEqual(stats["hardlinked_apparent_size_bytes"], 6) self.assertEqual(stats["hardlink_apparent_ratio"], 1.0) + self.assertFalse(stats["scan_limited"]) + + def test_tree_usage_can_limit_large_scans(self) -> None: + with TemporaryDirectory() as tmp: + root = Path(tmp) + for index in range(5): + (root / f"file-{index}").write_bytes(b"x") + + stats = tree_usage(root, max_entries=2) + + self.assertEqual(stats["files"], 2) + self.assertEqual(stats["entries_scanned"], 2) + self.assertEqual(stats["max_entries"], 2) + self.assertTrue(stats["scan_limited"]) + self.assertEqual(stats["apparent_size_bytes"], 2) + + def test_collect_storage_stats_marks_limited_snapshot_scan(self) -> None: + with TemporaryDirectory() as tmp: + root = Path(tmp) + snapshot = root / "snapshot" + snapshot.mkdir() + for index in range(4): + (snapshot / f"file-{index}").write_bytes(b"x") + + stats = collect_storage_stats( + backup_root=root, + snapshot_dir=snapshot, + snapshot_max_entries=1, + ) + + self.assertEqual(stats["snapshot"]["files"], 1) + self.assertEqual(stats["snapshot"]["entries_scanned"], 1) + self.assertEqual(stats["snapshot"]["max_entries"], 1) + self.assertTrue(stats["snapshot"]["scan_limited"])