From 5fb3b984b6619efef721a52a7578859303b16ff9 Mon Sep 17 00:00:00 2001 From: Jan Novak Date: Thu, 26 Feb 2026 00:30:25 +0100 Subject: [PATCH 1/3] Add status dashboard, server, scraper stats, and DATA_DIR support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Key changes: - Replace ratings_server.py + status.html with a unified server.py that serves the map, scraper status dashboard, and ratings API in one process - Add scraper_stats.py utility: each scraper writes per-run stats (fetched, accepted, excluded, duration) to stats_*.json for the status page - generate_status.py: respect DATA_DIR env var so status.json lands in the configured data directory instead of always the project root - run_all.sh: replace the {"status":"running"} overwrite of status.json with a dedicated scraper_running.json lock file; trap on EXIT ensures cleanup even on kill/error, preventing the previous run's results from being wiped - server.py: detect running state via scraper_running.json existence instead of status["status"] field, eliminating the dual-use race condition - Makefile: add serve (local dev), debug (Docker debug container) targets; add SERVER_PORT variable - build/Dockerfile + entrypoint.sh: switch to server.py, set DATA_DIR, adjust volume mounts - .gitignore: add *.json and *.log to keep runtime data files out of VCS - mapa_bytu.html: price-per-m² colouring, status link, UX tweaks Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 3 + Makefile | 30 +- build/Dockerfile | 5 +- build/entrypoint.sh | 9 +- generate_status.py | 163 ++++----- mapa_bytu.html | 781 ++++++------------------------------------ ratings_server.py | 116 ------- run_all.sh | 15 +- scrape_and_map.py | 38 +- scrape_bezrealitky.py | 40 ++- scrape_cityhome.py | 38 +- scrape_idnes.py | 40 ++- scrape_psn.py | 39 ++- scrape_realingo.py | 40 ++- scraper_stats.py | 13 + server.py | 477 ++++++++++++++++++++++++++ status.html | 204 ----------- 17 files changed, 929 
insertions(+), 1122 deletions(-) delete mode 100644 ratings_server.py create mode 100644 scraper_stats.py create mode 100644 server.py delete mode 100644 status.html diff --git a/.gitignore b/.gitignore index 885cbd0..c3c7fc7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ .vscode/ __pycache__/ byty_*.json +*.json +*.log + diff --git a/Makefile b/Makefile index c0fd73f..f11075c 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,13 @@ CONTAINER_NAME := maru-hleda-byt VOLUME_NAME := maru-hleda-byt-data VALIDATION_CONTAINER := maru-hleda-byt-validation VALIDATION_VOLUME := maru-hleda-byt-validation-data +DEBUG_CONTAINER := maru-hleda-byt-debug +DEBUG_VOLUME := maru-hleda-byt-debug-data +DEBUG_PORT ?= 8082 PORT := 8080 +SERVER_PORT ?= 8080 -.PHONY: build run stop logs scrape restart clean help validation validation-local validation-stop validation-local-debug +.PHONY: build run stop logs scrape restart clean help serve validation validation-local validation-stop validation-local-debug debug debug-stop help: @echo "Available targets:" @@ -20,6 +24,9 @@ help: @echo " validation-local-debug - Run validation locally with DEBUG logging" @echo " restart - Restart the container (stop and run again)" @echo " clean - Stop container and remove the Docker image" + @echo " serve - Start server.py locally on port 8080" + @echo " debug - Build and run debug Docker container with limited scrape (port $(DEBUG_PORT))" + @echo " debug-stop - Stop and remove the debug Docker container" @echo " help - Show this help message" build: @@ -59,6 +66,27 @@ validation-stop: @docker rm $(VALIDATION_CONTAINER) 2>/dev/null || true @echo "Validation container stopped and removed" +debug: build + @docker stop $(DEBUG_CONTAINER) 2>/dev/null || true + @docker rm $(DEBUG_CONTAINER) 2>/dev/null || true + docker run -d --name $(DEBUG_CONTAINER) \ + -p $(DEBUG_PORT):8080 \ + -v $(DEBUG_VOLUME):/app/data \ + -e LOG_LEVEL=DEBUG \ + $(IMAGE_NAME) + @sleep 2 + docker exec $(DEBUG_CONTAINER) bash 
/app/run_all.sh --max-pages 1 --max-properties 10 + @echo "Debug app at http://localhost:$(DEBUG_PORT)/mapa_bytu.html" + @echo "Debug status at http://localhost:$(DEBUG_PORT)/scrapers-status" + +debug-stop: + @docker stop $(DEBUG_CONTAINER) 2>/dev/null || true + @docker rm $(DEBUG_CONTAINER) 2>/dev/null || true + @echo "Debug container stopped and removed" + +serve: + DATA_DIR=. SERVER_PORT=$(SERVER_PORT) python3 server.py + validation-local: ./run_all.sh --max-pages 1 --max-properties 10 diff --git a/build/Dockerfile b/build/Dockerfile index a9bc15f..4dc54da 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -10,7 +10,8 @@ WORKDIR /app COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \ scrape_idnes.py scrape_psn.py scrape_cityhome.py \ - merge_and_map.py regen_map.py run_all.sh ratings_server.py ./ + merge_and_map.py regen_map.py generate_status.py scraper_stats.py \ + run_all.sh server.py ./ COPY build/crontab /etc/crontabs/root COPY build/entrypoint.sh /entrypoint.sh @@ -18,7 +19,7 @@ RUN chmod +x /entrypoint.sh run_all.sh RUN mkdir -p /app/data -EXPOSE 8080 8081 +EXPOSE 8080 HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \ CMD wget -q -O /dev/null http://localhost:8080/ || exit 1 diff --git a/build/entrypoint.sh b/build/entrypoint.sh index da84e83..0b3ad11 100644 --- a/build/entrypoint.sh +++ b/build/entrypoint.sh @@ -1,7 +1,7 @@ #!/bin/bash set -euo pipefail -DATA_DIR="/app/data" +export DATA_DIR="/app/data" # Create symlinks so scripts (which write to /app/) persist data to the volume for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \ @@ -18,8 +18,5 @@ crond -b -l 2 echo "[entrypoint] Starting initial scrape in background..." bash /app/run_all.sh & -echo "[entrypoint] Starting ratings API server on port 8081..." -DATA_DIR="$DATA_DIR" python3 /app/ratings_server.py & - -echo "[entrypoint] Starting HTTP server on port 8080..." 
-exec python3 -m http.server 8080 --directory "$DATA_DIR" +echo "[entrypoint] Starting server on port 8080..." +exec python3 /app/server.py diff --git a/generate_status.py b/generate_status.py index 8eaf1f5..ed505ae 100644 --- a/generate_status.py +++ b/generate_status.py @@ -1,16 +1,15 @@ #!/usr/bin/env python3 -"""Generate status.json from scraper JSON outputs and run log.""" +"""Generate status.json from scraper JSON outputs and per-scraper stats files.""" from __future__ import annotations +import argparse import json import os -import re -import sys from datetime import datetime from pathlib import Path -from typing import Optional HERE = Path(__file__).parent +DATA_DIR = Path(os.environ.get("DATA_DIR", HERE)) SOURCE_FILES = { "Sreality": "byty_sreality.json", @@ -21,7 +20,17 @@ SOURCE_FILES = { "CityHome": "byty_cityhome.json", } +STATS_FILES = { + "Sreality": "stats_sreality.json", + "Realingo": "stats_realingo.json", + "Bezrealitky": "stats_bezrealitky.json", + "iDNES": "stats_idnes.json", + "PSN": "stats_psn.json", + "CityHome": "stats_cityhome.json", +} + MERGED_FILE = "byty_merged.json" +HISTORY_FILE = "scraper_history.json" def count_source(path: Path) -> dict: @@ -36,105 +45,51 @@ def count_source(path: Path) -> dict: return {"accepted": 0, "error": str(e)} -def parse_log(log_path: str) -> dict[str, dict]: - """Parse scraper run log and extract per-source statistics. - - Scrapers log summary lines like: - ✓ Vyhovující byty: 12 - Vyloučeno (prodáno): 5 - Staženo stránek: 3 - Staženo inzerátů: 48 - Celkem bytů v cache: 120 - and section headers like: - [2/6] Realingo - """ - if not log_path or not os.path.exists(log_path): +def read_scraper_stats(path: Path) -> dict: + """Load a per-scraper stats JSON. 
Returns {} on missing or corrupt file.""" + if not path.exists(): + return {} + try: + data = json.loads(path.read_text(encoding="utf-8")) + return data if isinstance(data, dict) else {} + except Exception: return {} - with open(log_path, encoding="utf-8") as f: - content = f.read() - # Split into per-source sections by the [N/6] Step header - # Each section header looks like "[2/6] Realingo\n----..." - section_pattern = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE) - sections_found = list(section_pattern.finditer(content)) +def append_to_history(status: dict, keep: int) -> None: + """Append the current status entry to scraper_history.json, keeping only `keep` latest.""" + history_path = HERE / HISTORY_FILE + history: list = [] + if history_path.exists(): + try: + history = json.loads(history_path.read_text(encoding="utf-8")) + if not isinstance(history, list): + history = [] + except Exception: + history = [] - if not sections_found: - return {} + history.append(status) - stats = {} - for i, match in enumerate(sections_found): - step_name = match.group(2).strip() - start = match.end() - end = sections_found[i + 1].start() if i + 1 < len(sections_found) else len(content) - section_text = content[start:end] + # Keep only the N most recent entries + if keep > 0 and len(history) > keep: + history = history[-keep:] - # Identify which sources this section covers - # "PSN + CityHome" covers both - source_names = [] - for name in SOURCE_FILES: - if name.lower() in step_name.lower(): - source_names.append(name) - if not source_names: - continue - - # Parse numeric summary lines - def extract(pattern: str) -> Optional[int]: - m = re.search(pattern, section_text) - return int(m.group(1)) if m else None - - # Lines present in all/most scrapers - accepted = extract(r'Vyhovující byty[:\s]+(\d+)') - fetched = extract(r'Staženo inzerátů[:\s]+(\d+)') - pages = extract(r'Staženo stránek[:\s]+(\d+)') - cached = extract(r'Celkem bytů v cache[:\s]+(\d+)') - cache_hits = 
extract(r'Cache hit[:\s]+(\d+)') - - # Rejection reasons — collect all into a dict - excluded = {} - for m in re.finditer(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)', section_text): - excluded[m.group(1)] = int(m.group(2)) - # Also PSN-style "Vyloučeno (prodáno): N" - total_excluded = sum(excluded.values()) if excluded else extract(r'Vyloučen\w*[:\s]+(\d+)') - - entry = {} - if accepted is not None: - entry["accepted"] = accepted - if fetched is not None: - entry["fetched"] = fetched - if pages is not None: - entry["pages"] = pages - if cached is not None: - entry["cached"] = cached - if cache_hits is not None: - entry["cache_hits"] = cache_hits - if excluded: - entry["excluded"] = excluded - elif total_excluded is not None: - entry["excluded_total"] = total_excluded - - for name in source_names: - stats[name] = entry - - return stats + history_path.write_text(json.dumps(history, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"Historie uložena: {history_path} ({len(history)} záznamů)") def main(): - start_time = None - duration_sec = None + parser = argparse.ArgumentParser(description="Generate status.json from scraper outputs.") + parser.add_argument("--start-time", dest="start_time", default=None, + help="ISO timestamp of scrape start (default: now)") + parser.add_argument("--duration", dest="duration", type=int, default=None, + help="Run duration in seconds") + parser.add_argument("--keep", dest="keep", type=int, default=5, + help="Number of history entries to keep (default: 5, 0=unlimited)") + args = parser.parse_args() - if len(sys.argv) >= 3: - start_time = sys.argv[1] - try: - duration_sec = int(sys.argv[2]) - except ValueError: - pass - - if not start_time: - start_time = datetime.now().isoformat(timespec="seconds") - - log_path = sys.argv[3] if len(sys.argv) >= 4 else None - log_stats = parse_log(log_path) + start_time = args.start_time or datetime.now().isoformat(timespec="seconds") + duration_sec = args.duration sources = [] for name, filename in 
SOURCE_FILES.items(): @@ -142,14 +97,12 @@ def main(): info = count_source(path) info["name"] = name - # Merge log stats - ls = log_stats.get(name, {}) - for k in ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total"): - if k in ls: - info[k] = ls[k] - # Override accepted from log if available (log is authoritative for latest run) - if "accepted" in ls: - info["accepted"] = ls["accepted"] + # Merge in stats from the per-scraper stats file (authoritative for run data) + stats = read_scraper_stats(HERE / STATS_FILES[name]) + for key in ("accepted", "fetched", "pages", "cache_hits", "excluded", "excluded_total", + "success", "duration_sec", "error"): + if key in stats: + info[key] = stats[key] sources.append(info) @@ -168,17 +121,21 @@ def main(): duplicates_removed = total_accepted - deduplicated if deduplicated else 0 + # Top-level success: True if no source has an error + success = not any("error" in s for s in sources) + status = { "status": "done", "timestamp": start_time, "duration_sec": duration_sec, + "success": success, "total_accepted": total_accepted, "deduplicated": deduplicated, "duplicates_removed": duplicates_removed, "sources": sources, } - out = HERE / "status.json" + out = DATA_DIR / "status.json" out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8") print(f"Status uložen: {out}") print(f" Celkem bytů (před dedup): {total_accepted}") @@ -197,6 +154,8 @@ def main(): parts.append(f"[CHYBA: {err}]") print(" " + " ".join(parts)) + append_to_history(status, args.keep) + if __name__ == "__main__": main() diff --git a/mapa_bytu.html b/mapa_bytu.html index 55b4bec..9ef1109 100644 --- a/mapa_bytu.html +++ b/mapa_bytu.html @@ -3,7 +3,7 @@ -Byty v Praze — mapa (713 bytů) +Byty v Praze — mapa (132 bytů) \n' + ) + page_header = '

Scraper status

\n
maru-hleda-byt
\n' + footer = '' + + if status is None: + return ( + head_open + '\n\n' + page_header + + '

Status není k dispozici.

\n' + + footer + '\n\n' + ) + + if is_running: + return ( + head_open + + '\n' + + '\n\n' + page_header + + '
' + + '
Scraper právě běží…
\n' + + footer + '\n\n' + ) + + # ── Done state ──────────────────────────────────────────────────────────── + ts = status.get("timestamp", "") + duration = status.get("duration_sec") + total_accepted = status.get("total_accepted", 0) + deduplicated = status.get("deduplicated") + + ts_card = ( + '

Poslední scrape

' + f'
{_fmt_date(ts)}
' + + (f'
Trvání: {round(duration)}s
' if duration is not None else "") + + '
' + ) + + sum_card = ( + '

Souhrn

' + f'
Vyhovujících bytů' + f'{total_accepted}
' + + ( + f'
Po deduplikaci (v mapě)' + f'{deduplicated}
' + if deduplicated is not None else "" + ) + + '
' + ) + + rows_for_js = list(reversed(history)) + body = ( + page_header + + ts_card + "\n" + + sum_card + "\n" + + _sources_html(status.get("sources", [])) + "\n" + + _history_html(history) + "\n" + + footer + ) + modal = _modal_script(json.dumps(rows_for_js, ensure_ascii=False)) + return head_open + '\n\n' + body + '\n' + modal + '\n\n' + + +# ── HTTP handler ────────────────────────────────────────────────────────────── + +class Handler(SimpleHTTPRequestHandler): + def log_message(self, format, *args): + pass # suppress default access log; use our own where needed + + def _send_json(self, status: int, body, extra_headers=None): + payload = json.dumps(body, ensure_ascii=False).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json; charset=utf-8") + self.send_header("Content-Length", str(len(payload))) + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS") + self.send_header("Access-Control-Allow-Headers", "Content-Type") + if extra_headers: + for k, v in extra_headers.items(): + self.send_header(k, v) + self.end_headers() + self.wfile.write(payload) + + def do_OPTIONS(self): + self.send_response(204) + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS") + self.send_header("Access-Control-Allow-Headers", "Content-Type") + self.end_headers() + + def do_GET(self): + if self.path.startswith("/api/"): + self._handle_api_get() + elif self.path.rstrip("/") == "/scrapers-status": + self._serve_status_page() + else: + log.debug("GET %s → static file: %s", self.path, self.translate_path(self.path)) + super().do_GET() + + def _handle_api_get(self): + if self.path in ("/api/ratings", "/api/ratings/export"): + ratings = load_ratings() + extra = None + if self.path == "/api/ratings/export": + extra = {"Content-Disposition": 'attachment; filename="ratings.json"'} + log.info("GET %s → %d 
ratings", self.path, len(ratings)) + self._send_json(200, ratings, extra) + elif self.path == "/api/status": + data = _load_json(DATA_DIR / "status.json") + if data is None: + self._send_json(404, {"error": "status not available"}) + return + log.info("GET /api/status → ok") + self._send_json(200, data) + elif self.path == "/api/status/history": + data = _load_json(DATA_DIR / "scraper_history.json", default=[]) + if not isinstance(data, list): + data = [] + log.info("GET /api/status/history → %d entries", len(data)) + self._send_json(200, data) + else: + self._send_json(404, {"error": "not found"}) + + def _serve_status_page(self): + status = _load_json(DATA_DIR / "status.json") + history = _load_json(DATA_DIR / "scraper_history.json", default=[]) + if not isinstance(history, list): + history = [] + is_running = (DATA_DIR / "scraper_running.json").exists() + html = _render_status_html(status, history, is_running) + payload = html.encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(payload))) + self.end_headers() + self.wfile.write(payload) + + def do_POST(self): + if self.path == "/api/ratings": + length = int(self.headers.get("Content-Length", 0)) + if length == 0: + self._send_json(400, {"error": "empty body"}) + return + try: + raw = self.rfile.read(length) + data = json.loads(raw.decode("utf-8")) + except Exception as e: + log.warning("Bad request body: %s", e) + self._send_json(400, {"error": "invalid JSON"}) + return + if not isinstance(data, dict): + self._send_json(400, {"error": "expected JSON object"}) + return + save_ratings(data) + log.info("POST /api/ratings → saved %d ratings", len(data)) + self._send_json(200, {"ok": True, "count": len(data)}) + else: + self._send_json(404, {"error": "not found"}) + + +if __name__ == "__main__": + log.info("Server starting on port %d, data dir: %s", PORT, DATA_DIR) + handler = functools.partial(Handler, 
directory=str(DATA_DIR)) + server = HTTPServer(("0.0.0.0", PORT), handler) + try: + server.serve_forever() + except KeyboardInterrupt: + log.info("Stopped.") + sys.exit(0) diff --git a/status.html b/status.html deleted file mode 100644 index 3f6da1b..0000000 --- a/status.html +++ /dev/null @@ -1,204 +0,0 @@ - - - - - -Scraper status - - - - -

Scraper status

-
maru-hleda-byt
- -
-
-
-
Nacitam status...
-
-
- - - - - - From 44c02b45b44299d2d121eb45bf3e425b46e3b54f Mon Sep 17 00:00:00 2001 From: Jan Novak Date: Thu, 26 Feb 2026 08:53:27 +0100 Subject: [PATCH 2/3] Increase history retention to 20, run scrapers every 4 hours MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - generate_status.py: raise --keep default from 5 to 20 entries - build/crontab: change schedule from 06:00/18:00 to every 4 hours (*/4) covers 6 runs/day ≈ 3.3 days of history at default retention Co-Authored-By: Claude Sonnet 4.6 --- build/crontab | 2 +- generate_status.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build/crontab b/build/crontab index 1b3dfd8..bbc25db 100644 --- a/build/crontab +++ b/build/crontab @@ -1 +1 @@ -0 6,18 * * * cd /app && bash /app/run_all.sh >> /proc/1/fd/1 2>> /proc/1/fd/2 +0 */4 * * * cd /app && bash /app/run_all.sh >> /proc/1/fd/1 2>> /proc/1/fd/2 diff --git a/generate_status.py b/generate_status.py index ed505ae..c3a5cb6 100644 --- a/generate_status.py +++ b/generate_status.py @@ -84,8 +84,8 @@ def main(): help="ISO timestamp of scrape start (default: now)") parser.add_argument("--duration", dest="duration", type=int, default=None, help="Run duration in seconds") - parser.add_argument("--keep", dest="keep", type=int, default=5, - help="Number of history entries to keep (default: 5, 0=unlimited)") + parser.add_argument("--keep", dest="keep", type=int, default=20, + help="Number of history entries to keep (default: 20, 0=unlimited)") args = parser.parse_args() start_time = args.start_time or datetime.now().isoformat(timespec="seconds") From 00c914401022d14d271e5d6967a6daca9f681b2d Mon Sep 17 00:00:00 2001 From: Jan Novak Date: Thu, 26 Feb 2026 09:46:16 +0100 Subject: [PATCH 3/3] Fix DATA_DIR usage in stats/history paths, set env in Dockerfile, add validation docs - scraper_stats.py: respect DATA_DIR env var when writing stats_*.json files - generate_status.py: read stats files and write history from 
DATA_DIR instead of HERE - build/Dockerfile: set DATA_DIR=/app/data as default env var - docs/validation.md: end-to-end Docker validation recipe Co-Authored-By: Claude Sonnet 4.6 --- build/Dockerfile | 1 + docs/validation.md | 123 +++++++++++++++++++++++++++++++++++++++++++++ generate_status.py | 4 +- scraper_stats.py | 6 ++- 4 files changed, 130 insertions(+), 4 deletions(-) create mode 100644 docs/validation.md diff --git a/build/Dockerfile b/build/Dockerfile index 4dc54da..541f268 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -5,6 +5,7 @@ RUN apk add --no-cache curl bash tzdata \ && echo "Europe/Prague" > /etc/timezone ENV PYTHONUNBUFFERED=1 +ENV DATA_DIR=/app/data WORKDIR /app diff --git a/docs/validation.md b/docs/validation.md new file mode 100644 index 0000000..cc6a8da --- /dev/null +++ b/docs/validation.md @@ -0,0 +1,123 @@ +# Validation Recipe + +End-to-end check that scraping, data persistence, history, and the status page all work correctly in Docker. + +## What it verifies + +- All scrapers run and write output to `DATA_DIR` (`/app/data`) +- `stats_*.json` land in `/app/data/` (not in `/app/`) +- `status.json` and `scraper_history.json` land in `/app/data/` +- `/api/status`, `/api/status/history`, and `/scrapers-status` serve correct data +- History accumulates across runs + +## Steps + +### 1. Build the image + +```bash +make build +``` + +### 2. Start a clean validation container + +```bash +# Stop/remove any leftover container and volume from a previous run +docker stop maru-hleda-byt-validation 2>/dev/null; docker rm maru-hleda-byt-validation 2>/dev/null +docker volume rm maru-hleda-byt-validation-data 2>/dev/null + +docker run -d --name maru-hleda-byt-validation \ + -p 8081:8080 \ + -v maru-hleda-byt-validation-data:/app/data \ + maru-hleda-byt +``` + +Give the container ~3 seconds to start. 
The entrypoint launches a background full scrape automatically — suppress it so only controlled runs execute: + +```bash +sleep 3 +docker exec maru-hleda-byt-validation pkill -f run_all.sh 2>/dev/null || true +docker exec maru-hleda-byt-validation rm -f /app/data/scraper_running.json 2>/dev/null || true +``` + +### 3. Run a limited scrape (run 1) + +```bash +docker exec maru-hleda-byt-validation bash /app/run_all.sh --max-pages 1 --max-properties 10 +``` + +Expected output (last few lines): +``` +Status uložen: /app/data/status.json +Historie uložena: /app/data/scraper_history.json (1 záznamů) +``` + +### 4. Verify data files are in `/app/data/` + +```bash +docker exec maru-hleda-byt-validation ls /app/data/ +``` + +Expected files: +``` +byty_cityhome.json byty_idnes.json byty_merged.json +byty_realingo.json byty_sreality.json +mapa_bytu.html +scraper_history.json +stats_bezrealitky.json stats_cityhome.json stats_idnes.json +stats_realingo.json stats_sreality.json +status.json +``` + +### 5. Run a second limited scrape (run 2) + +```bash +docker exec maru-hleda-byt-validation bash /app/run_all.sh --max-pages 1 --max-properties 10 +``` + +Expected last line: `Historie uložena: /app/data/scraper_history.json (2 záznamů)` + +### 6. Verify history via API + +```bash +curl -s http://localhost:8081/api/status/history | python3 -c " +import json, sys +h = json.load(sys.stdin) +print(f'{len(h)} entries:') +for i, e in enumerate(h): + print(f' [{i}] {e[\"timestamp\"]} total={e[\"total_accepted\"]}') +" +``` + +Expected: 2 entries with different timestamps. + +```bash +curl -s http://localhost:8081/api/status | python3 -c " +import json, sys; s=json.load(sys.stdin) +print(f'status={s[\"status\"]} total={s[\"total_accepted\"]} ts={s[\"timestamp\"]}') +" +``` + +Expected: `status=done total= ts=` + +### 7. 
Check the status page + +Open http://localhost:8081/scrapers-status in a browser (or `curl -s http://localhost:8081/scrapers-status | grep -c "clickable-row"` — should print `2`). + +### 8. Clean up + +```bash +docker stop maru-hleda-byt-validation && docker rm maru-hleda-byt-validation +docker volume rm maru-hleda-byt-validation-data +``` + +Or use the Makefile shortcut: + +```bash +make validation-stop +``` + +## Notes + +- PSN scraper does not support `--max-pages` and will always fail with this command; `success=False` in history is expected during validation. +- Bezrealitky may return 0 results with a 1-page limit; `byty_bezrealitky.json` will be absent from `/app/data/` in that case — this is normal. +- `make validation` (the Makefile target) runs the same limited scrape but does not suppress the background startup scrape, so two concurrent runs may occur. Use the manual steps above for a clean controlled test. diff --git a/generate_status.py b/generate_status.py index c3a5cb6..db559bb 100644 --- a/generate_status.py +++ b/generate_status.py @@ -58,7 +58,7 @@ def read_scraper_stats(path: Path) -> dict: def append_to_history(status: dict, keep: int) -> None: """Append the current status entry to scraper_history.json, keeping only `keep` latest.""" - history_path = HERE / HISTORY_FILE + history_path = DATA_DIR / HISTORY_FILE history: list = [] if history_path.exists(): try: @@ -98,7 +98,7 @@ def main(): info["name"] = name # Merge in stats from the per-scraper stats file (authoritative for run data) - stats = read_scraper_stats(HERE / STATS_FILES[name]) + stats = read_scraper_stats(DATA_DIR / STATS_FILES[name]) for key in ("accepted", "fetched", "pages", "cache_hits", "excluded", "excluded_total", "success", "duration_sec", "error"): if key in stats: diff --git a/scraper_stats.py b/scraper_stats.py index 2dd1f81..b605533 100644 --- a/scraper_stats.py +++ b/scraper_stats.py @@ -2,12 +2,14 @@ from __future__ import annotations import json +import os from pathlib 
import Path HERE = Path(__file__).parent +DATA_DIR = Path(os.environ.get("DATA_DIR", HERE)) def write_stats(filename: str, stats: dict) -> None: - """Write scraper run stats dict to a JSON file next to this module.""" - path = HERE / filename + """Write scraper run stats dict to the data directory.""" + path = DATA_DIR / filename path.write_text(json.dumps(stats, ensure_ascii=False, indent=2), encoding="utf-8")