(bugfix) Bound snapshot storage metadata scans #101

Merged
parkel merged 1 commit from issue-100-bound-storage-metadata-scans into master 2026-06-08 23:26:24 +02:00
4 changed files with 77 additions and 7 deletions
Showing only changes of commit 9ece39b72e - Show all commits

View File

@@ -24,6 +24,7 @@ from ..util import ensure_dir, realpath_startswith, sanitize_host, write_yaml_at
DEFAULT_DRY_RUN_TIMEOUT_SECONDS = 900
RSYNC_PARTIAL_VANISHED_EXIT_CODE = 24
SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES = 200_000
def dry_run_log_path(host: str, run_id: int | None = None) -> Path:
@@ -100,7 +101,11 @@ def _collect_run_stats(
) -> dict[str, Any]:
stats: dict[str, Any] = {
"rsync": read_rsync_stats(log_path),
"storage": collect_storage_stats(backup_root=backup_root, snapshot_dir=snapshot_dir),
"storage": collect_storage_stats(
backup_root=backup_root,
snapshot_dir=snapshot_dir,
snapshot_max_entries=SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES if snapshot_dir is not None else None,
),
}
if duration_seconds is not None:
stats["duration_seconds"] = int(duration_seconds)

View File

@@ -62,7 +62,12 @@ def read_rsync_stats(log_path: Path) -> dict[str, Any]:
return parse_rsync_stats(text)
def collect_storage_stats(*, backup_root: Path, snapshot_dir: Path | None = None) -> dict[str, Any]:
def collect_storage_stats(
*,
backup_root: Path,
snapshot_dir: Path | None = None,
snapshot_max_entries: int | None = None,
) -> dict[str, Any]:
stats: dict[str, Any] = {
"backup_root": str(backup_root),
}
@@ -71,7 +76,7 @@ def collect_storage_stats(*, backup_root: Path, snapshot_dir: Path | None = None
stats["capacity"] = capacity
if snapshot_dir is not None:
snapshot_usage = tree_usage(snapshot_dir)
snapshot_usage = tree_usage(snapshot_dir, max_entries=snapshot_max_entries)
if snapshot_usage:
stats["snapshot"] = {
"path": str(snapshot_dir),
@@ -103,13 +108,15 @@ def filesystem_capacity(path: Path) -> dict[str, Any]:
}
def tree_usage(path: Path) -> dict[str, Any]:
def tree_usage(path: Path, *, max_entries: int | None = None) -> dict[str, Any]:
apparent_size = 0
allocated_size = 0
files = 0
directories = 0
hardlinked_files = 0
hardlinked_apparent_size = 0
entries_scanned = 0
scan_limited = False
seen_allocated_inodes: set[tuple[int, int]] = set()
try:
@@ -119,6 +126,7 @@ def tree_usage(path: Path) -> dict[str, Any]:
if path.is_file():
files = 1
entries_scanned = 1
apparent_size = root_stat.st_size
allocated_size = int(getattr(root_stat, "st_blocks", 0) * 512)
if root_stat.st_nlink > 1:
@@ -126,14 +134,25 @@ def tree_usage(path: Path) -> dict[str, Any]:
hardlinked_apparent_size = root_stat.st_size
else:
for current_root, dirnames, filenames in path.walk():
directories += len(dirnames)
for _dirname in dirnames:
if _scan_limit_reached(entries_scanned, max_entries):
scan_limited = True
break
directories += 1
entries_scanned += 1
if scan_limited:
break
for filename in filenames:
if _scan_limit_reached(entries_scanned, max_entries):
scan_limited = True
break
file_path = current_root / filename
try:
file_stat = file_path.lstat()
except OSError:
continue
files += 1
entries_scanned += 1
apparent_size += file_stat.st_size
inode_key = (file_stat.st_dev, file_stat.st_ino)
if inode_key not in seen_allocated_inodes:
@@ -142,6 +161,8 @@ def tree_usage(path: Path) -> dict[str, Any]:
if file_stat.st_nlink > 1:
hardlinked_files += 1
hardlinked_apparent_size += file_stat.st_size
if scan_limited:
break
return {
"path": str(path),
@@ -149,12 +170,19 @@ def tree_usage(path: Path) -> dict[str, Any]:
"allocated_size_bytes": int(allocated_size),
"files": files,
"directories": directories,
"entries_scanned": entries_scanned,
"scan_limited": scan_limited,
"max_entries": max_entries,
"hardlinked_files": hardlinked_files,
"hardlinked_apparent_size_bytes": int(hardlinked_apparent_size),
"hardlink_apparent_ratio": round(hardlinked_apparent_size / apparent_size, 4) if apparent_size else 0.0,
}
def _scan_limit_reached(entries_scanned: int, max_entries: int | None) -> bool:
return max_entries is not None and max_entries >= 0 and entries_scanned >= max_entries
def _parse_colon_stat(line: str, stats: dict[str, Any]) -> None:
if ":" not in line:
return

View File

@@ -6,7 +6,7 @@ from unittest.mock import patch
from django.test import SimpleTestCase
from pobsync.commands.run_scheduled import run_scheduled
from pobsync.commands.run_scheduled import SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES, run_scheduled
from pobsync.errors import ConfigError
from pobsync.rsync import RsyncResult
@@ -270,9 +270,12 @@ class RunScheduledConfigSourceTests(SimpleTestCase):
self.assertEqual(result["stats"]["rsync"]["files_transferred"], 2)
self.assertEqual(result["stats"]["rsync"]["link_dest_estimated_savings_bytes"], 1500)
self.assertIn("snapshot", result["stats"]["storage"])
self.assertEqual(result["stats"]["storage"]["snapshot"]["max_entries"], SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES)
self.assertFalse(result["stats"]["storage"]["snapshot"]["scan_limited"])
self.assertIn("capacity", result["stats"]["storage"])
self.assertIn("stats:", meta_text)
self.assertIn("files_total: 10", meta_text)
self.assertIn(f"max_entries: {SNAPSHOT_STORAGE_SCAN_MAX_ENTRIES}", meta_text)
def test_real_run_reports_running_state_callback_before_rsync_returns(self) -> None:
states = []

View File

@@ -6,7 +6,7 @@ from tempfile import TemporaryDirectory
from django.test import SimpleTestCase
from pobsync.run_stats import parse_rsync_stats, tree_usage
from pobsync.run_stats import collect_storage_stats, parse_rsync_stats, tree_usage
class RunStatsTests(SimpleTestCase):
@@ -58,3 +58,37 @@ total size is 1.50M speedup is 125.00
self.assertEqual(stats["hardlinked_files"], 2)
self.assertEqual(stats["hardlinked_apparent_size_bytes"], 6)
self.assertEqual(stats["hardlink_apparent_ratio"], 1.0)
self.assertFalse(stats["scan_limited"])
def test_tree_usage_can_limit_large_scans(self) -> None:
    # Five one-byte files are created but the scan is capped at two entries,
    # so only two files (two bytes) may be counted and the result must be
    # flagged as limited.
    with TemporaryDirectory() as tmp:
        root = Path(tmp)
        for index in range(5):
            target = root / f"file-{index}"
            target.write_bytes(b"x")

        stats = tree_usage(root, max_entries=2)

    self.assertEqual(stats["files"], 2)
    self.assertEqual(stats["entries_scanned"], 2)
    self.assertEqual(stats["max_entries"], 2)
    self.assertTrue(stats["scan_limited"])
    self.assertEqual(stats["apparent_size_bytes"], 2)
def test_collect_storage_stats_marks_limited_snapshot_scan(self) -> None:
    # A snapshot directory holding four files is scanned with a cap of one
    # entry; the "snapshot" section of the result must report exactly one
    # file and be marked as limited.
    with TemporaryDirectory() as tmp:
        root = Path(tmp)
        snapshot = root / "snapshot"
        snapshot.mkdir()
        for index in range(4):
            target = snapshot / f"file-{index}"
            target.write_bytes(b"x")

        stats = collect_storage_stats(
            backup_root=root,
            snapshot_dir=snapshot,
            snapshot_max_entries=1,
        )

    snapshot_stats = stats["snapshot"]
    self.assertEqual(snapshot_stats["files"], 1)
    self.assertEqual(snapshot_stats["entries_scanned"], 1)
    self.assertEqual(snapshot_stats["max_entries"], 1)
    self.assertTrue(snapshot_stats["scan_limited"])