(feature) Improve run debugging and log filtering #12

Merged
parkel merged 2 commits from issue-2-run-detail-logging-polish into master 2026-05-21 00:26:45 +02:00
5 changed files with 222 additions and 14 deletions

View File

@@ -62,6 +62,10 @@
.metric .label { color: var(--muted); font-size: 12px; text-transform: uppercase; }
.metric .value { font-size: 26px; font-weight: 650; margin-top: 4px; }
.panel { padding: 16px; margin-bottom: 18px; overflow: auto; }
.panel.highlight { border-left: 4px solid var(--border); }
.panel.highlight.failed { border-left-color: var(--failed); background: #fff7f7; }
.panel.highlight.warning { border-left-color: var(--running); background: #fffaf0; }
.panel.highlight.success { border-left-color: var(--success); background: #f5fbf7; }
table { width: 100%; border-collapse: collapse; min-width: 640px; }
th, td { border-bottom: 1px solid var(--border); padding: 9px 8px; text-align: left; vertical-align: top; }
th { color: var(--muted); font-size: 12px; font-weight: 650; text-transform: uppercase; }

View File

@@ -29,6 +29,22 @@
{% endfor %}
</select>
</div>
<div class="field">
<label for="window">Time window</label>
<select id="window" name="window">
{% for value, label in time_windows.items %}
<option value="{{ value }}" {% if selected_window == value %}selected{% endif %}>{{ label }}</option>
{% endfor %}
</select>
</div>
<div class="field">
<label for="host">Host contains</label>
<input id="host" name="host" value="{{ host_filter }}" placeholder="web-01.example.test">
</div>
<div class="field">
<label for="run">Run</label>
<input id="run" name="run" value="{{ run_filter }}" inputmode="numeric" placeholder="12">
</div>
<div class="field">
<label for="q">Message contains</label>
<input id="q" name="q" value="{{ query }}">

View File

@@ -17,11 +17,22 @@
<section class="grid" aria-label="Run summary">
<div class="metric"><div class="label">Host</div><div class="value">{{ run.host.host }}</div></div>
<div class="metric"><div class="label">Status</div><div class="value">{{ run.status }}</div></div>
<div class="metric"><div class="label">Status</div><div class="value"><span class="status {{ run.status }}">{{ run.status }}</span></div></div>
<div class="metric"><div class="label">Type</div><div class="value">{{ run.run_type }}</div></div>
<div class="metric"><div class="label">Rsync</div><div class="value">{{ run.rsync_exit_code|default:"" }}</div></div>
</section>
{% if failure %}
<section class="panel highlight failed">
<h2>Failure</h2>
<div class="stack">
<div><strong>Category:</strong> {{ failure.category|default:"unknown" }}</div>
<div><strong>Summary:</strong> {{ failure.summary|default:"" }}</div>
<div><strong>Hint:</strong> {{ failure.hint|default:"" }}</div>
</div>
</section>
{% endif %}
<div class="two-col">
<section class="panel">
<h2>Timing</h2>
@@ -64,6 +75,36 @@
</section>
{% endif %}
<section class="panel">
<h2>Rsync Command</h2>
{% if rsync_command %}
<pre>{% for part in rsync_command %}{{ part }}{% if not forloop.last %}
{% endif %}{% endfor %}</pre>
{% else %}
<p class="muted">No rsync command recorded yet.</p>
{% endif %}
</section>
<section class="panel">
<h2>Rsync Log</h2>
<div class="stack spaced">
{% if rsync_log_exists %}
<div><a href="{% url 'run_rsync_log' run.id %}">Open full rsync log</a></div>
<div class="muted">{{ rsync_log_path }}</div>
{% elif rsync_log_path %}
<div class="muted">{{ rsync_log_path }} (missing)</div>
{% else %}
<div class="muted">No rsync log path recorded yet.</div>
{% endif %}
</div>
{% if rsync_log_tail %}
<pre>{% for line in rsync_log_tail %}{{ line }}{% if not forloop.last %}
{% endif %}{% endfor %}</pre>
{% else %}
<p class="muted">No recent rsync log output recorded yet.</p>
{% endif %}
</section>
{% if stats %}
<section class="panel">
<h2>Stats</h2>
@@ -86,8 +127,21 @@
</section>
{% endif %}
{% if has_prune_result %}
<section class="panel highlight {% if prune_result.ok %}success{% else %}warning{% endif %}">
<h2>Retention</h2>
<div class="stack">
<div><strong>Status:</strong> {% if prune_result.ok %}ok{% else %}warning{% endif %}</div>
{% if prune_result.source %}<div><strong>Source:</strong> {{ prune_result.source }}</div>{% endif %}
{% if prune_result.deleted %}<div><strong>Deleted:</strong> {{ prune_result.deleted|length }}</div>{% endif %}
{% if prune_result.error %}<div><strong>Error:</strong> {{ prune_result.error }}</div>{% endif %}
{% if prune_result.type %}<div><strong>Type:</strong> {{ prune_result.type }}</div>{% endif %}
</div>
</section>
{% endif %}
<section class="panel">
<h2>Result</h2>
<h2>Raw Result</h2>
<pre>{{ result_json }}</pre>
</section>
{% endblock %}

View File

@@ -150,21 +150,41 @@ class ViewTests(TestCase):
completed = subprocess.CompletedProcess(
args=["journalctl"],
returncode=0,
stdout="2026-05-19 pobsync-worker.service failed backup\n2026-05-19 pobsync-web.service started\n",
stdout=(
"2026-05-19 pobsync-worker.service web-01 failed backup run 12\n"
"2026-05-19 pobsync-worker.service web-02 failed backup run 12\n"
"2026-05-19 pobsync-web.service web-01 started run 12\n"
),
stderr="",
)
with patch("pobsync_backend.views.shutil.which", return_value="/usr/bin/journalctl"), patch(
"pobsync_backend.views.subprocess.run", return_value=completed
) as run:
response = self.client.get(reverse("logs"), {"unit": "pobsync-worker.service", "priority": "0..3", "q": "failed"})
response = self.client.get(
reverse("logs"),
{
"unit": "pobsync-worker.service",
"priority": "0..3",
"window": "6h",
"host": "web-01",
"run": "12",
"q": "failed",
},
)
self.assertEqual(response.status_code, 200)
self.assertContains(response, "Logs")
self.assertContains(response, "failed backup")
self.assertContains(response, "web-01 failed backup run 12")
self.assertNotContains(response, "web-02 failed backup run 12")
self.assertNotContains(response, "started")
self.assertIn("-u", run.call_args.args[0])
self.assertIn("pobsync-worker.service", run.call_args.args[0])
command = run.call_args.args[0]
self.assertIn("-u", command)
self.assertIn("pobsync-worker.service", command)
self.assertIn("-p", command)
self.assertIn("0..3", command)
self.assertIn("--since", command)
self.assertIn("6 hours ago", command)
def test_ssh_credentials_view_creates_key(self) -> None:
self.client.force_login(self.staff_user)
@@ -857,6 +877,11 @@ class ViewTests(TestCase):
result={
"ok": True,
"snapshot": snapshot.path,
"rsync": {
"command": ["rsync", "--archive", "root@web-01:/", snapshot.path],
"exit_code": 0,
"log_tail": ["sending incremental file list", "sent 500 bytes"],
},
"requested": {
"dry_run": True,
"verbose_output": True,
@@ -889,6 +914,10 @@ class ViewTests(TestCase):
self.assertContains(response, "Requested Options")
self.assertContains(response, "Dry run:</strong> yes")
self.assertContains(response, "Verbose rsync output:</strong> yes")
self.assertContains(response, "Rsync Command")
self.assertContains(response, "--archive")
self.assertContains(response, "Rsync Log")
self.assertContains(response, "sending incremental file list")
self.assertContains(response, "Stats")
self.assertContains(response, "Files seen:</strong> 10")
self.assertContains(response, "Estimated link-dest saving")
@@ -901,7 +930,7 @@ class ViewTests(TestCase):
with TemporaryDirectory() as tmp:
log_path = Path(tmp) / "snapshot" / "meta" / "rsync.log"
log_path.parent.mkdir(parents=True)
log_path.write_text("rsync log line\n", encoding="utf-8")
log_path.write_text("old line\nrsync log line\n", encoding="utf-8")
run = BackupRun.objects.create(
host=host,
status=BackupRun.Status.SUCCESS,
@@ -915,8 +944,40 @@ class ViewTests(TestCase):
self.assertContains(response, reverse("run_rsync_log", args=[run.id]))
self.assertContains(response, str(log_path))
self.assertContains(response, "rsync log line")
self.assertEqual(log_response.status_code, 200)
self.assertEqual(log_body, b"rsync log line\n")
self.assertEqual(log_body, b"old line\nrsync log line\n")
def test_run_detail_surfaces_failure_and_retention_warning(self) -> None:
self.client.force_login(self.staff_user)
host = HostConfig.objects.create(host="web-01", address="web-01.example.test")
run = BackupRun.objects.create(
host=host,
status=BackupRun.Status.WARNING,
rsync_exit_code=0,
result={
"ok": True,
"failure": {
"category": "transport",
"summary": "SSH connection dropped.",
"hint": "Check network connectivity.",
},
"prune": {
"ok": False,
"type": "ConfigError",
"error": "Deletion blocked by --max-delete=0",
},
},
)
response = self.client.get(reverse("run_detail", args=[run.id]))
self.assertEqual(response.status_code, 200)
self.assertContains(response, "Failure")
self.assertContains(response, "transport")
self.assertContains(response, "Check network connectivity.")
self.assertContains(response, "Retention")
self.assertContains(response, "Deletion blocked by --max-delete=0")
def test_run_detail_infers_rsync_log_from_snapshot_path(self) -> None:
self.client.force_login(self.staff_user)

View File

@@ -362,15 +362,26 @@ def queue_manual_backup(request, host: str):
@staff_member_required
def run_detail(request, run_id: int):
run = get_object_or_404(BackupRun.objects.select_related("host", "snapshot"), id=run_id)
run_stats = run.result.get("stats") if isinstance(run.result, dict) else {}
result = run.result if isinstance(run.result, dict) else {}
run_stats = result.get("stats") if isinstance(result.get("stats"), dict) else {}
rsync_result = result.get("rsync") if isinstance(result.get("rsync"), dict) else {}
failure = result.get("failure") if isinstance(result.get("failure"), dict) else {}
prune_result = result.get("prune") if isinstance(result.get("prune"), dict) else {}
rsync_log_path = _run_rsync_log_path(run)
rsync_log_tail = _run_rsync_log_tail(rsync_result, rsync_log_path)
context = {
"run": run,
"can_cancel": run.status in {BackupRun.Status.QUEUED, BackupRun.Status.RUNNING},
"requested": run.result.get("requested") if isinstance(run.result, dict) else {},
"requested": result.get("requested") if isinstance(result.get("requested"), dict) else {},
"stats": run_stats if isinstance(run_stats, dict) else {},
"rsync": rsync_result,
"rsync_command": _run_rsync_command(rsync_result),
"failure": failure,
"prune_result": prune_result,
"has_prune_result": bool(prune_result),
"rsync_log_path": str(rsync_log_path) if rsync_log_path is not None else "",
"rsync_log_exists": bool(rsync_log_path and rsync_log_path.exists()),
"rsync_log_tail": rsync_log_tail,
"result_json": _pretty_json(run.result),
}
return render(request, "pobsync_backend/run_detail.html", context)
@@ -662,6 +673,26 @@ def _run_rsync_log_path(run: BackupRun) -> Path | None:
return None
def _run_rsync_command(rsync_result: dict) -> list[str]:
command = rsync_result.get("command")
if not isinstance(command, list):
return []
return [str(part) for part in command]
def _run_rsync_log_tail(rsync_result: dict, log_path: Path | None, *, max_lines: int = 30) -> list[str]:
log_tail = rsync_result.get("log_tail")
if isinstance(log_tail, list):
return [str(line) for line in log_tail[-max_lines:]]
if log_path is None or not log_path.is_file():
return []
try:
with log_path.open("r", encoding="utf-8", errors="replace") as handle:
return handle.read().splitlines()[-max_lines:]
except OSError:
return []
def _log_context(request) -> dict[str, object]:
units = ("pobsync-web.service", "pobsync-worker.service", "pobsync-scheduler.service")
priorities = {
@@ -672,8 +703,26 @@ def _log_context(request) -> dict[str, object]:
"6": "Info",
"7": "Debug",
}
time_windows = {
"1h": "Last hour",
"6h": "Last 6 hours",
"24h": "Last 24 hours",
"7d": "Last 7 days",
"": "All available",
}
since_values = {
"1h": "1 hour ago",
"6h": "6 hours ago",
"24h": "24 hours ago",
"7d": "7 days ago",
}
selected_unit = request.GET.get("unit", "")
priority = request.GET.get("priority", "0..4")
time_window = request.GET.get("window", "24h")
if time_window not in time_windows:
time_window = "24h"
host_filter = request.GET.get("host", "").strip()
run_filter = request.GET.get("run", "").strip()
query = request.GET.get("q", "").strip()
lines = []
error = ""
@@ -682,6 +731,8 @@ def _log_context(request) -> dict[str, object]:
error = "journalctl is not available in this runtime."
else:
command = ["journalctl", "--no-pager", "-n", "300", "-o", "short-iso"]
if time_window:
command.extend(["--since", since_values[time_window]])
if selected_unit in units:
command.extend(["-u", selected_unit])
else:
@@ -694,16 +745,38 @@ def _log_context(request) -> dict[str, object]:
error = result.stderr.strip() or "Could not read journal logs."
else:
lines = result.stdout.splitlines()
if query:
lowered_query = query.lower()
lines = [line for line in lines if lowered_query in line.lower()]
lines = _filter_log_lines(lines, query=query, host=host_filter, run_id=run_filter)
return {
"units": units,
"priorities": priorities,
"time_windows": time_windows,
"selected_unit": selected_unit,
"selected_priority": priority,
"selected_window": time_window,
"host_filter": host_filter,
"run_filter": run_filter,
"query": query,
"lines": lines,
"error": error,
}
def _filter_log_lines(lines: list[str], *, query: str, host: str, run_id: str) -> list[str]:
filters = []
if query:
filters.append(lambda line: query.lower() in line.lower())
if host:
filters.append(lambda line: host.lower() in line.lower())
if run_id:
run_tokens = (
f"run {run_id}",
f"run={run_id}",
f"run_id={run_id}",
f"run-{run_id}",
f"#{run_id}",
)
filters.append(lambda line: any(token in line.lower() for token in run_tokens))
if not filters:
return lines
return [line for line in lines if all(matches(line) for matches in filters)]