From 00c914401022d14d271e5d6967a6daca9f681b2d Mon Sep 17 00:00:00 2001 From: Jan Novak Date: Thu, 26 Feb 2026 09:46:16 +0100 Subject: [PATCH] Fix DATA_DIR usage in stats/history paths, set env in Dockerfile, add validation docs - scraper_stats.py: respect DATA_DIR env var when writing stats_*.json files - generate_status.py: read stats files and write history from DATA_DIR instead of HERE - build/Dockerfile: set DATA_DIR=/app/data as default env var - docs/validation.md: end-to-end Docker validation recipe Co-Authored-By: Claude Sonnet 4.6 --- build/Dockerfile | 1 + docs/validation.md | 123 +++++++++++++++++++++++++++++++++++++++++++++ generate_status.py | 4 +- scraper_stats.py | 6 ++- 4 files changed, 130 insertions(+), 4 deletions(-) create mode 100644 docs/validation.md diff --git a/build/Dockerfile b/build/Dockerfile index 4dc54da..541f268 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -5,6 +5,7 @@ RUN apk add --no-cache curl bash tzdata \ && echo "Europe/Prague" > /etc/timezone ENV PYTHONUNBUFFERED=1 +ENV DATA_DIR=/app/data WORKDIR /app diff --git a/docs/validation.md b/docs/validation.md new file mode 100644 index 0000000..cc6a8da --- /dev/null +++ b/docs/validation.md @@ -0,0 +1,123 @@ +# Validation Recipe + +End-to-end check that scraping, data persistence, history, and the status page all work correctly in Docker. + +## What it verifies + +- All scrapers run and write output to `DATA_DIR` (`/app/data`) +- `stats_*.json` land in `/app/data/` (not in `/app/`) +- `status.json` and `scraper_history.json` land in `/app/data/` +- `/api/status`, `/api/status/history`, and `/scrapers-status` serve correct data +- History accumulates across runs + +## Steps + +### 1. Build the image + +```bash +make build +``` + +### 2. 
Start a clean validation container + +```bash +# Stop/remove any leftover container and volume from a previous run +docker stop maru-hleda-byt-validation 2>/dev/null; docker rm maru-hleda-byt-validation 2>/dev/null +docker volume rm maru-hleda-byt-validation-data 2>/dev/null + +docker run -d --name maru-hleda-byt-validation \ + -p 8081:8080 \ + -v maru-hleda-byt-validation-data:/app/data \ + maru-hleda-byt +``` + +Give the container ~3 seconds to start. The entrypoint launches a background full scrape automatically — suppress it so only controlled runs execute: + +```bash +sleep 3 +docker exec maru-hleda-byt-validation pkill -f run_all.sh 2>/dev/null || true +docker exec maru-hleda-byt-validation rm -f /app/data/scraper_running.json 2>/dev/null || true +``` + +### 3. Run a limited scrape (run 1) + +```bash +docker exec maru-hleda-byt-validation bash /app/run_all.sh --max-pages 1 --max-properties 10 +``` + +Expected output (last few lines): +``` +Status uložen: /app/data/status.json +Historie uložena: /app/data/scraper_history.json (1 záznamů) +``` + +### 4. Verify data files are in `/app/data/` + +```bash +docker exec maru-hleda-byt-validation ls /app/data/ +``` + +Expected files: +``` +byty_cityhome.json byty_idnes.json byty_merged.json +byty_realingo.json byty_sreality.json +mapa_bytu.html +scraper_history.json +stats_bezrealitky.json stats_cityhome.json stats_idnes.json +stats_realingo.json stats_sreality.json +status.json +``` + +### 5. Run a second limited scrape (run 2) + +```bash +docker exec maru-hleda-byt-validation bash /app/run_all.sh --max-pages 1 --max-properties 10 +``` + +Expected last line: `Historie uložena: /app/data/scraper_history.json (2 záznamů)` + +### 6. 
Verify history via API

```bash
curl -s http://localhost:8081/api/status/history | python3 -c "
import json, sys
h = json.load(sys.stdin)
print(f'{len(h)} entries:')
for i, e in enumerate(h):
    print(f' [{i}] {e[\"timestamp\"]} total={e[\"total_accepted\"]}')
"
```

Expected: 2 entries with different timestamps.

```bash
curl -s http://localhost:8081/api/status | python3 -c "
import json, sys; s=json.load(sys.stdin)
print(f'status={s[\"status\"]} total={s[\"total_accepted\"]} ts={s[\"timestamp\"]}')
"
```

Expected output of the form `status=done total=<number> ts=<ISO timestamp>`, e.g. `status=done total=10 ts=2026-02-26T10:00:00` (the exact total and timestamp depend on the run).

### 7. Check the status page

Open http://localhost:8081/scrapers-status in a browser (or `curl -s http://localhost:8081/scrapers-status | grep -c "clickable-row"` — should print `2`).

### 8. Clean up

```bash
docker stop maru-hleda-byt-validation && docker rm maru-hleda-byt-validation
docker volume rm maru-hleda-byt-validation-data
```

Or use the Makefile shortcut:

```bash
make validation-stop
```

## Notes

- PSN scraper does not support `--max-pages` and will always fail with this command; `success=False` in history is expected during validation.
- Bezrealitky may return 0 results with a 1-page limit; `byty_bezrealitky.json` will be absent from `/app/data/` in that case — this is normal.
- `make validation` (the Makefile target) runs the same limited scrape but does not suppress the background startup scrape, so two concurrent runs may occur. Use the manual steps above for a clean controlled test. 
diff --git a/generate_status.py b/generate_status.py index c3a5cb6..db559bb 100644 --- a/generate_status.py +++ b/generate_status.py @@ -58,7 +58,7 @@ def read_scraper_stats(path: Path) -> dict: def append_to_history(status: dict, keep: int) -> None: """Append the current status entry to scraper_history.json, keeping only `keep` latest.""" - history_path = HERE / HISTORY_FILE + history_path = DATA_DIR / HISTORY_FILE history: list = [] if history_path.exists(): try: @@ -98,7 +98,7 @@ def main(): info["name"] = name # Merge in stats from the per-scraper stats file (authoritative for run data) - stats = read_scraper_stats(HERE / STATS_FILES[name]) + stats = read_scraper_stats(DATA_DIR / STATS_FILES[name]) for key in ("accepted", "fetched", "pages", "cache_hits", "excluded", "excluded_total", "success", "duration_sec", "error"): if key in stats: diff --git a/scraper_stats.py b/scraper_stats.py index 2dd1f81..b605533 100644 --- a/scraper_stats.py +++ b/scraper_stats.py @@ -2,12 +2,14 @@ from __future__ import annotations import json +import os from pathlib import Path HERE = Path(__file__).parent +DATA_DIR = Path(os.environ.get("DATA_DIR", HERE)) def write_stats(filename: str, stats: dict) -> None: - """Write scraper run stats dict to a JSON file next to this module.""" - path = HERE / filename + """Write scraper run stats dict to the data directory.""" + path = DATA_DIR / filename path.write_text(json.dumps(stats, ensure_ascii=False, indent=2), encoding="utf-8")