From 5fb3b984b6619efef721a52a7578859303b16ff9 Mon Sep 17 00:00:00 2001 From: Jan Novak Date: Thu, 26 Feb 2026 00:30:25 +0100 Subject: [PATCH] Add status dashboard, server, scraper stats, and DATA_DIR support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Key changes: - Replace ratings_server.py + status.html with a unified server.py that serves the map, scraper status dashboard, and ratings API in one process - Add scraper_stats.py utility: each scraper writes per-run stats (fetched, accepted, excluded, duration) to stats_*.json for the status page - generate_status.py: respect DATA_DIR env var so status.json lands in the configured data directory instead of always the project root - run_all.sh: replace the {"status":"running"} overwrite of status.json with a dedicated scraper_running.json lock file; trap on EXIT ensures cleanup even on kill/error, preventing the previous run's results from being wiped - server.py: detect running state via scraper_running.json existence instead of status["status"] field, eliminating the dual-use race condition - Makefile: add serve (local dev), debug (Docker debug container) targets; add SERVER_PORT variable - build/Dockerfile + entrypoint.sh: switch to server.py, set DATA_DIR, adjust volume mounts - .gitignore: add *.json and *.log to keep runtime data files out of VCS - mapa_bytu.html: price-per-m² colouring, status link, UX tweaks Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 3 + Makefile | 30 +- build/Dockerfile | 5 +- build/entrypoint.sh | 9 +- generate_status.py | 163 ++++----- mapa_bytu.html | 781 ++++++------------------------------------ ratings_server.py | 116 ------- run_all.sh | 15 +- scrape_and_map.py | 38 +- scrape_bezrealitky.py | 40 ++- scrape_cityhome.py | 38 +- scrape_idnes.py | 40 ++- scrape_psn.py | 39 ++- scrape_realingo.py | 40 ++- scraper_stats.py | 13 + server.py | 477 ++++++++++++++++++++++++++ status.html | 204 ----------- 17 files changed, 929 insertions(+), 
1122 deletions(-) delete mode 100644 ratings_server.py create mode 100644 scraper_stats.py create mode 100644 server.py delete mode 100644 status.html diff --git a/.gitignore b/.gitignore index 885cbd0..c3c7fc7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ .vscode/ __pycache__/ byty_*.json +*.json +*.log + diff --git a/Makefile b/Makefile index c0fd73f..f11075c 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,13 @@ CONTAINER_NAME := maru-hleda-byt VOLUME_NAME := maru-hleda-byt-data VALIDATION_CONTAINER := maru-hleda-byt-validation VALIDATION_VOLUME := maru-hleda-byt-validation-data +DEBUG_CONTAINER := maru-hleda-byt-debug +DEBUG_VOLUME := maru-hleda-byt-debug-data +DEBUG_PORT ?= 8082 PORT := 8080 +SERVER_PORT ?= 8080 -.PHONY: build run stop logs scrape restart clean help validation validation-local validation-stop validation-local-debug +.PHONY: build run stop logs scrape restart clean help serve validation validation-local validation-stop validation-local-debug debug debug-stop help: @echo "Available targets:" @@ -20,6 +24,9 @@ help: @echo " validation-local-debug - Run validation locally with DEBUG logging" @echo " restart - Restart the container (stop and run again)" @echo " clean - Stop container and remove the Docker image" + @echo " serve - Start server.py locally on port 8080" + @echo " debug - Build and run debug Docker container with limited scrape (port $(DEBUG_PORT))" + @echo " debug-stop - Stop and remove the debug Docker container" @echo " help - Show this help message" build: @@ -59,6 +66,27 @@ validation-stop: @docker rm $(VALIDATION_CONTAINER) 2>/dev/null || true @echo "Validation container stopped and removed" +debug: build + @docker stop $(DEBUG_CONTAINER) 2>/dev/null || true + @docker rm $(DEBUG_CONTAINER) 2>/dev/null || true + docker run -d --name $(DEBUG_CONTAINER) \ + -p $(DEBUG_PORT):8080 \ + -v $(DEBUG_VOLUME):/app/data \ + -e LOG_LEVEL=DEBUG \ + $(IMAGE_NAME) + @sleep 2 + docker exec $(DEBUG_CONTAINER) bash /app/run_all.sh 
--max-pages 1 --max-properties 10 + @echo "Debug app at http://localhost:$(DEBUG_PORT)/mapa_bytu.html" + @echo "Debug status at http://localhost:$(DEBUG_PORT)/scrapers-status" + +debug-stop: + @docker stop $(DEBUG_CONTAINER) 2>/dev/null || true + @docker rm $(DEBUG_CONTAINER) 2>/dev/null || true + @echo "Debug container stopped and removed" + +serve: + DATA_DIR=. SERVER_PORT=$(SERVER_PORT) python3 server.py + validation-local: ./run_all.sh --max-pages 1 --max-properties 10 diff --git a/build/Dockerfile b/build/Dockerfile index a9bc15f..4dc54da 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -10,7 +10,8 @@ WORKDIR /app COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \ scrape_idnes.py scrape_psn.py scrape_cityhome.py \ - merge_and_map.py regen_map.py run_all.sh ratings_server.py ./ + merge_and_map.py regen_map.py generate_status.py scraper_stats.py \ + run_all.sh server.py ./ COPY build/crontab /etc/crontabs/root COPY build/entrypoint.sh /entrypoint.sh @@ -18,7 +19,7 @@ RUN chmod +x /entrypoint.sh run_all.sh RUN mkdir -p /app/data -EXPOSE 8080 8081 +EXPOSE 8080 HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \ CMD wget -q -O /dev/null http://localhost:8080/ || exit 1 diff --git a/build/entrypoint.sh b/build/entrypoint.sh index da84e83..0b3ad11 100644 --- a/build/entrypoint.sh +++ b/build/entrypoint.sh @@ -1,7 +1,7 @@ #!/bin/bash set -euo pipefail -DATA_DIR="/app/data" +export DATA_DIR="/app/data" # Create symlinks so scripts (which write to /app/) persist data to the volume for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \ @@ -18,8 +18,5 @@ crond -b -l 2 echo "[entrypoint] Starting initial scrape in background..." bash /app/run_all.sh & -echo "[entrypoint] Starting ratings API server on port 8081..." -DATA_DIR="$DATA_DIR" python3 /app/ratings_server.py & - -echo "[entrypoint] Starting HTTP server on port 8080..." 
-exec python3 -m http.server 8080 --directory "$DATA_DIR" +echo "[entrypoint] Starting server on port 8080..." +exec python3 /app/server.py diff --git a/generate_status.py b/generate_status.py index 8eaf1f5..ed505ae 100644 --- a/generate_status.py +++ b/generate_status.py @@ -1,16 +1,15 @@ #!/usr/bin/env python3 -"""Generate status.json from scraper JSON outputs and run log.""" +"""Generate status.json from scraper JSON outputs and per-scraper stats files.""" from __future__ import annotations +import argparse import json import os -import re -import sys from datetime import datetime from pathlib import Path -from typing import Optional HERE = Path(__file__).parent +DATA_DIR = Path(os.environ.get("DATA_DIR", HERE)) SOURCE_FILES = { "Sreality": "byty_sreality.json", @@ -21,7 +20,17 @@ SOURCE_FILES = { "CityHome": "byty_cityhome.json", } +STATS_FILES = { + "Sreality": "stats_sreality.json", + "Realingo": "stats_realingo.json", + "Bezrealitky": "stats_bezrealitky.json", + "iDNES": "stats_idnes.json", + "PSN": "stats_psn.json", + "CityHome": "stats_cityhome.json", +} + MERGED_FILE = "byty_merged.json" +HISTORY_FILE = "scraper_history.json" def count_source(path: Path) -> dict: @@ -36,105 +45,51 @@ def count_source(path: Path) -> dict: return {"accepted": 0, "error": str(e)} -def parse_log(log_path: str) -> dict[str, dict]: - """Parse scraper run log and extract per-source statistics. - - Scrapers log summary lines like: - ✓ Vyhovující byty: 12 - Vyloučeno (prodáno): 5 - Staženo stránek: 3 - Staženo inzerátů: 48 - Celkem bytů v cache: 120 - and section headers like: - [2/6] Realingo - """ - if not log_path or not os.path.exists(log_path): +def read_scraper_stats(path: Path) -> dict: + """Load a per-scraper stats JSON. 
Returns {} on missing or corrupt file.""" + if not path.exists(): + return {} + try: + data = json.loads(path.read_text(encoding="utf-8")) + return data if isinstance(data, dict) else {} + except Exception: return {} - with open(log_path, encoding="utf-8") as f: - content = f.read() - # Split into per-source sections by the [N/6] Step header - # Each section header looks like "[2/6] Realingo\n----..." - section_pattern = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE) - sections_found = list(section_pattern.finditer(content)) +def append_to_history(status: dict, keep: int) -> None: + """Append the current status entry to scraper_history.json, keeping only `keep` latest.""" + history_path = HERE / HISTORY_FILE + history: list = [] + if history_path.exists(): + try: + history = json.loads(history_path.read_text(encoding="utf-8")) + if not isinstance(history, list): + history = [] + except Exception: + history = [] - if not sections_found: - return {} + history.append(status) - stats = {} - for i, match in enumerate(sections_found): - step_name = match.group(2).strip() - start = match.end() - end = sections_found[i + 1].start() if i + 1 < len(sections_found) else len(content) - section_text = content[start:end] + # Keep only the N most recent entries + if keep > 0 and len(history) > keep: + history = history[-keep:] - # Identify which sources this section covers - # "PSN + CityHome" covers both - source_names = [] - for name in SOURCE_FILES: - if name.lower() in step_name.lower(): - source_names.append(name) - if not source_names: - continue - - # Parse numeric summary lines - def extract(pattern: str) -> Optional[int]: - m = re.search(pattern, section_text) - return int(m.group(1)) if m else None - - # Lines present in all/most scrapers - accepted = extract(r'Vyhovující byty[:\s]+(\d+)') - fetched = extract(r'Staženo inzerátů[:\s]+(\d+)') - pages = extract(r'Staženo stránek[:\s]+(\d+)') - cached = extract(r'Celkem bytů v cache[:\s]+(\d+)') - cache_hits = 
extract(r'Cache hit[:\s]+(\d+)') - - # Rejection reasons — collect all into a dict - excluded = {} - for m in re.finditer(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)', section_text): - excluded[m.group(1)] = int(m.group(2)) - # Also PSN-style "Vyloučeno (prodáno): N" - total_excluded = sum(excluded.values()) if excluded else extract(r'Vyloučen\w*[:\s]+(\d+)') - - entry = {} - if accepted is not None: - entry["accepted"] = accepted - if fetched is not None: - entry["fetched"] = fetched - if pages is not None: - entry["pages"] = pages - if cached is not None: - entry["cached"] = cached - if cache_hits is not None: - entry["cache_hits"] = cache_hits - if excluded: - entry["excluded"] = excluded - elif total_excluded is not None: - entry["excluded_total"] = total_excluded - - for name in source_names: - stats[name] = entry - - return stats + history_path.write_text(json.dumps(history, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"Historie uložena: {history_path} ({len(history)} záznamů)") def main(): - start_time = None - duration_sec = None + parser = argparse.ArgumentParser(description="Generate status.json from scraper outputs.") + parser.add_argument("--start-time", dest="start_time", default=None, + help="ISO timestamp of scrape start (default: now)") + parser.add_argument("--duration", dest="duration", type=int, default=None, + help="Run duration in seconds") + parser.add_argument("--keep", dest="keep", type=int, default=5, + help="Number of history entries to keep (default: 5, 0=unlimited)") + args = parser.parse_args() - if len(sys.argv) >= 3: - start_time = sys.argv[1] - try: - duration_sec = int(sys.argv[2]) - except ValueError: - pass - - if not start_time: - start_time = datetime.now().isoformat(timespec="seconds") - - log_path = sys.argv[3] if len(sys.argv) >= 4 else None - log_stats = parse_log(log_path) + start_time = args.start_time or datetime.now().isoformat(timespec="seconds") + duration_sec = args.duration sources = [] for name, filename in 
SOURCE_FILES.items(): @@ -142,14 +97,12 @@ def main(): info = count_source(path) info["name"] = name - # Merge log stats - ls = log_stats.get(name, {}) - for k in ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total"): - if k in ls: - info[k] = ls[k] - # Override accepted from log if available (log is authoritative for latest run) - if "accepted" in ls: - info["accepted"] = ls["accepted"] + # Merge in stats from the per-scraper stats file (authoritative for run data) + stats = read_scraper_stats(HERE / STATS_FILES[name]) + for key in ("accepted", "fetched", "pages", "cache_hits", "excluded", "excluded_total", + "success", "duration_sec", "error"): + if key in stats: + info[key] = stats[key] sources.append(info) @@ -168,17 +121,21 @@ def main(): duplicates_removed = total_accepted - deduplicated if deduplicated else 0 + # Top-level success: True if no source has an error + success = not any("error" in s for s in sources) + status = { "status": "done", "timestamp": start_time, "duration_sec": duration_sec, + "success": success, "total_accepted": total_accepted, "deduplicated": deduplicated, "duplicates_removed": duplicates_removed, "sources": sources, } - out = HERE / "status.json" + out = DATA_DIR / "status.json" out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8") print(f"Status uložen: {out}") print(f" Celkem bytů (před dedup): {total_accepted}") @@ -197,6 +154,8 @@ def main(): parts.append(f"[CHYBA: {err}]") print(" " + " ".join(parts)) + append_to_history(status, args.keep) + if __name__ == "__main__": main() diff --git a/mapa_bytu.html b/mapa_bytu.html index 55b4bec..9ef1109 100644 --- a/mapa_bytu.html +++ b/mapa_bytu.html @@ -3,7 +3,7 @@ -Byty v Praze — mapa (713 bytů) +Byty v Praze — mapa (132 bytů) \n' + ) + page_header = '

Scraper status

\n
maru-hleda-byt
\n' + footer = '' + + if status is None: + return ( + head_open + '\n\n' + page_header + + '

Status není k dispozici.

\n' + + footer + '\n\n' + ) + + if is_running: + return ( + head_open + + '\n' + + '\n\n' + page_header + + '
' + + '
Scraper právě běží…
\n' + + footer + '\n\n' + ) + + # ── Done state ──────────────────────────────────────────────────────────── + ts = status.get("timestamp", "") + duration = status.get("duration_sec") + total_accepted = status.get("total_accepted", 0) + deduplicated = status.get("deduplicated") + + ts_card = ( + '

Poslední scrape

' + f'
{_fmt_date(ts)}
' + + (f'
Trvání: {round(duration)}s
' if duration is not None else "") + + '
' + ) + + sum_card = ( + '

Souhrn

' + f'
Vyhovujících bytů' + f'{total_accepted}
' + + ( + f'
Po deduplikaci (v mapě)' + f'{deduplicated}
' + if deduplicated is not None else "" + ) + + '
' + ) + + rows_for_js = list(reversed(history)) + body = ( + page_header + + ts_card + "\n" + + sum_card + "\n" + + _sources_html(status.get("sources", [])) + "\n" + + _history_html(history) + "\n" + + footer + ) + modal = _modal_script(json.dumps(rows_for_js, ensure_ascii=False)) + return head_open + '\n\n' + body + '\n' + modal + '\n\n' + + +# ── HTTP handler ────────────────────────────────────────────────────────────── + +class Handler(SimpleHTTPRequestHandler): + def log_message(self, format, *args): + pass # suppress default access log; use our own where needed + + def _send_json(self, status: int, body, extra_headers=None): + payload = json.dumps(body, ensure_ascii=False).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json; charset=utf-8") + self.send_header("Content-Length", str(len(payload))) + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS") + self.send_header("Access-Control-Allow-Headers", "Content-Type") + if extra_headers: + for k, v in extra_headers.items(): + self.send_header(k, v) + self.end_headers() + self.wfile.write(payload) + + def do_OPTIONS(self): + self.send_response(204) + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS") + self.send_header("Access-Control-Allow-Headers", "Content-Type") + self.end_headers() + + def do_GET(self): + if self.path.startswith("/api/"): + self._handle_api_get() + elif self.path.rstrip("/") == "/scrapers-status": + self._serve_status_page() + else: + log.debug("GET %s → static file: %s", self.path, self.translate_path(self.path)) + super().do_GET() + + def _handle_api_get(self): + if self.path in ("/api/ratings", "/api/ratings/export"): + ratings = load_ratings() + extra = None + if self.path == "/api/ratings/export": + extra = {"Content-Disposition": 'attachment; filename="ratings.json"'} + log.info("GET %s → %d 
ratings", self.path, len(ratings)) + self._send_json(200, ratings, extra) + elif self.path == "/api/status": + data = _load_json(DATA_DIR / "status.json") + if data is None: + self._send_json(404, {"error": "status not available"}) + return + log.info("GET /api/status → ok") + self._send_json(200, data) + elif self.path == "/api/status/history": + data = _load_json(DATA_DIR / "scraper_history.json", default=[]) + if not isinstance(data, list): + data = [] + log.info("GET /api/status/history → %d entries", len(data)) + self._send_json(200, data) + else: + self._send_json(404, {"error": "not found"}) + + def _serve_status_page(self): + status = _load_json(DATA_DIR / "status.json") + history = _load_json(DATA_DIR / "scraper_history.json", default=[]) + if not isinstance(history, list): + history = [] + is_running = (DATA_DIR / "scraper_running.json").exists() + html = _render_status_html(status, history, is_running) + payload = html.encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(payload))) + self.end_headers() + self.wfile.write(payload) + + def do_POST(self): + if self.path == "/api/ratings": + length = int(self.headers.get("Content-Length", 0)) + if length == 0: + self._send_json(400, {"error": "empty body"}) + return + try: + raw = self.rfile.read(length) + data = json.loads(raw.decode("utf-8")) + except Exception as e: + log.warning("Bad request body: %s", e) + self._send_json(400, {"error": "invalid JSON"}) + return + if not isinstance(data, dict): + self._send_json(400, {"error": "expected JSON object"}) + return + save_ratings(data) + log.info("POST /api/ratings → saved %d ratings", len(data)) + self._send_json(200, {"ok": True, "count": len(data)}) + else: + self._send_json(404, {"error": "not found"}) + + +if __name__ == "__main__": + log.info("Server starting on port %d, data dir: %s", PORT, DATA_DIR) + handler = functools.partial(Handler, 
directory=str(DATA_DIR)) + server = HTTPServer(("0.0.0.0", PORT), handler) + try: + server.serve_forever() + except KeyboardInterrupt: + log.info("Stopped.") + sys.exit(0) diff --git a/status.html b/status.html deleted file mode 100644 index 3f6da1b..0000000 --- a/status.html +++ /dev/null @@ -1,204 +0,0 @@ - - - - - -Scraper status - - - - -

Scraper status

-
maru-hleda-byt
- -
-
-
-
Nacitam status...
-
-
- - - - - -