From 00c914401022d14d271e5d6967a6daca9f681b2d Mon Sep 17 00:00:00 2001 From: Jan Novak Date: Thu, 26 Feb 2026 09:46:16 +0100 Subject: [PATCH] Fix DATA_DIR usage in stats/history paths, set env in Dockerfile, add validation docs - scraper_stats.py: respect DATA_DIR env var when writing stats_*.json files - generate_status.py: read stats files and write history from DATA_DIR instead of HERE - build/Dockerfile: set DATA_DIR=/app/data as default env var - docs/validation.md: end-to-end Docker validation recipe Co-Authored-By: Claude Sonnet 4.6 --- build/Dockerfile | 1 + docs/validation.md | 123 +++++++++++++++++++++++++++++++++++++++++++++ generate_status.py | 4 +- scraper_stats.py | 6 ++- 4 files changed, 130 insertions(+), 4 deletions(-) create mode 100644 docs/validation.md diff --git a/build/Dockerfile b/build/Dockerfile index 4dc54da..541f268 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -5,6 +5,7 @@ RUN apk add --no-cache curl bash tzdata \ && echo "Europe/Prague" > /etc/timezone ENV PYTHONUNBUFFERED=1 +ENV DATA_DIR=/app/data WORKDIR /app diff --git a/docs/validation.md b/docs/validation.md new file mode 100644 index 0000000..cc6a8da --- /dev/null +++ b/docs/validation.md @@ -0,0 +1,123 @@ +# Validation Recipe + +End-to-end check that scraping, data persistence, history, and the status page all work correctly in Docker. + +## What it verifies + +- All scrapers run and write output to `DATA_DIR` (`/app/data`) +- `stats_*.json` land in `/app/data/` (not in `/app/`) +- `status.json` and `scraper_history.json` land in `/app/data/` +- `/api/status`, `/api/status/history`, and `/scrapers-status` serve correct data +- History accumulates across runs + +## Steps + +### 1. Build the image + +```bash +make build +``` + +### 2. 
Start a clean validation container + +```bash +# Stop/remove any leftover container and volume from a previous run +docker stop maru-hleda-byt-validation 2>/dev/null; docker rm maru-hleda-byt-validation 2>/dev/null +docker volume rm maru-hleda-byt-validation-data 2>/dev/null + +docker run -d --name maru-hleda-byt-validation \ + -p 8081:8080 \ + -v maru-hleda-byt-validation-data:/app/data \ + maru-hleda-byt +``` + +Give the container ~3 seconds to start. The entrypoint launches a background full scrape automatically — suppress it so only controlled runs execute: + +```bash +sleep 3 +docker exec maru-hleda-byt-validation pkill -f run_all.sh 2>/dev/null || true +docker exec maru-hleda-byt-validation rm -f /app/data/scraper_running.json 2>/dev/null || true +``` + +### 3. Run a limited scrape (run 1) + +```bash +docker exec maru-hleda-byt-validation bash /app/run_all.sh --max-pages 1 --max-properties 10 +``` + +Expected output (last few lines): +``` +Status uložen: /app/data/status.json +Historie uložena: /app/data/scraper_history.json (1 záznamů) +``` + +### 4. Verify data files are in `/app/data/` + +```bash +docker exec maru-hleda-byt-validation ls /app/data/ +``` + +Expected files: +``` +byty_cityhome.json byty_idnes.json byty_merged.json +byty_realingo.json byty_sreality.json +mapa_bytu.html +scraper_history.json +stats_bezrealitky.json stats_cityhome.json stats_idnes.json +stats_realingo.json stats_sreality.json +status.json +``` + +### 5. Run a second limited scrape (run 2) + +```bash +docker exec maru-hleda-byt-validation bash /app/run_all.sh --max-pages 1 --max-properties 10 +``` + +Expected last line: `Historie uložena: /app/data/scraper_history.json (2 záznamů)` + +### 6. 
Verify history via API

```bash
curl -s http://localhost:8081/api/status/history | python3 -c "
import json, sys
h = json.load(sys.stdin)
print(f'{len(h)} entries:')
for i, e in enumerate(h):
    print(f' [{i}] {e[\"timestamp\"]} total={e[\"total_accepted\"]}')
"
```

Expected: 2 entries with different timestamps.

```bash
curl -s http://localhost:8081/api/status | python3 -c "
import json, sys; s=json.load(sys.stdin)
print(f'status={s[\"status\"]} total={s[\"total_accepted\"]} ts={s[\"timestamp\"]}')
"
```

Expected output of the form `status=done total=<number> ts=<ISO timestamp>`, e.g. `status=done total=10 ts=2026-02-26T10:00:00` (the exact total and timestamp depend on the run).

### 7. Check the status page

Open http://localhost:8081/scrapers-status in a browser (or `curl -s http://localhost:8081/scrapers-status | grep -c "clickable-row"` — should print `2`).

### 8. Clean up

```bash
docker stop maru-hleda-byt-validation && docker rm maru-hleda-byt-validation
docker volume rm maru-hleda-byt-validation-data
```

Or use the Makefile shortcut:

```bash
make validation-stop
```

## Notes

- PSN scraper does not support `--max-pages` and will always fail with this command; `success=False` in history is expected during validation.
- Bezrealitky may return 0 results with a 1-page limit; `byty_bezrealitky.json` will be absent from `/app/data/` in that case — this is normal.
- `make validation` (the Makefile target) runs the same limited scrape but does not suppress the background startup scrape, so two concurrent runs may occur. Use the manual steps above for a clean controlled test. 
diff --git a/generate_status.py b/generate_status.py index c3a5cb6..db559bb 100644 --- a/generate_status.py +++ b/generate_status.py @@ -58,7 +58,7 @@ def read_scraper_stats(path: Path) -> dict: def append_to_history(status: dict, keep: int) -> None: """Append the current status entry to scraper_history.json, keeping only `keep` latest.""" - history_path = HERE / HISTORY_FILE + history_path = DATA_DIR / HISTORY_FILE history: list = [] if history_path.exists(): try: @@ -98,7 +98,7 @@ def main(): info["name"] = name # Merge in stats from the per-scraper stats file (authoritative for run data) - stats = read_scraper_stats(HERE / STATS_FILES[name]) + stats = read_scraper_stats(DATA_DIR / STATS_FILES[name]) for key in ("accepted", "fetched", "pages", "cache_hits", "excluded", "excluded_total", "success", "duration_sec", "error"): if key in stats: diff --git a/scraper_stats.py b/scraper_stats.py index 2dd1f81..b605533 100644 --- a/scraper_stats.py +++ b/scraper_stats.py @@ -2,12 +2,14 @@ from __future__ import annotations import json +import os from pathlib import Path HERE = Path(__file__).parent +DATA_DIR = Path(os.environ.get("DATA_DIR", HERE)) def write_stats(filename: str, stats: dict) -> None: - """Write scraper run stats dict to a JSON file next to this module.""" - path = HERE / filename + """Write scraper run stats dict to the data directory.""" + path = DATA_DIR / filename path.write_text(json.dumps(stats, ensure_ascii=False, indent=2), encoding="utf-8")