diff --git a/build/Dockerfile b/build/Dockerfile index f672cee..a9bc15f 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -10,7 +10,7 @@ WORKDIR /app COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \ scrape_idnes.py scrape_psn.py scrape_cityhome.py \ - merge_and_map.py regen_map.py run_all.sh ./ + merge_and_map.py regen_map.py run_all.sh ratings_server.py ./ COPY build/crontab /etc/crontabs/root COPY build/entrypoint.sh /entrypoint.sh @@ -18,7 +18,7 @@ RUN chmod +x /entrypoint.sh run_all.sh RUN mkdir -p /app/data -EXPOSE 8080 +EXPOSE 8080 8081 HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \ CMD wget -q -O /dev/null http://localhost:8080/ || exit 1 diff --git a/build/entrypoint.sh b/build/entrypoint.sh index 032afe5..da84e83 100644 --- a/build/entrypoint.sh +++ b/build/entrypoint.sh @@ -6,7 +6,7 @@ DATA_DIR="/app/data" # Create symlinks so scripts (which write to /app/) persist data to the volume for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \ byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \ - mapa_bytu.html; do + mapa_bytu.html ratings.json; do # Remove real file if it exists (e.g. baked into image) [ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f" ln -sf "$DATA_DIR/$f" "/app/$f" @@ -18,5 +18,8 @@ crond -b -l 2 echo "[entrypoint] Starting initial scrape in background..." bash /app/run_all.sh & +echo "[entrypoint] Starting ratings API server on port 8081..." +DATA_DIR="$DATA_DIR" python3 /app/ratings_server.py & + echo "[entrypoint] Starting HTTP server on port 8080..." 
# NOTE: this span comes from a whitespace-collapsed unified diff. The two
# lines below are the diff text that preceded this module on the same
# physical line, kept verbatim so nothing is lost:
#   exec python3 -m http.server 8080 --directory "$DATA_DIR"
#   diff --git a/generate_status.py b/generate_status.py new file mode 100644 index 0000000..8eaf1f5 --- /dev/null +++ b/generate_status.py @@ -0,0 +1,202 @@
#!/usr/bin/env python3
"""Generate status.json from scraper JSON outputs and run log."""

import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

HERE = Path(__file__).parent

SOURCE_FILES = {
    "Sreality": "byty_sreality.json",
    "Realingo": "byty_realingo.json",
    "Bezrealitky": "byty_bezrealitky.json",
    "iDNES": "byty_idnes.json",
    "PSN": "byty_psn.json",
    "CityHome": "byty_cityhome.json",
}

MERGED_FILE = "byty_merged.json"

# Section header in the run log, e.g. "[2/6] Realingo\n--------".
_SECTION_RE = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE)

# Per-reason rejection line, e.g. "Vyloučeno (prodáno): 5".
_EXCLUDED_REASON_RE = re.compile(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)')

# Output key -> regex for the numeric summary lines the scrapers print.
# Insertion order is the key order of the resulting entry.
_SUMMARY_PATTERNS = {
    "accepted": r'Vyhovující byty[:\s]+(\d+)',
    "fetched": r'Staženo inzerátů[:\s]+(\d+)',
    "pages": r'Staženo stránek[:\s]+(\d+)',
    "cached": r'Celkem bytů v cache[:\s]+(\d+)',
    "cache_hits": r'Cache hit[:\s]+(\d+)',
}

# Log-derived keys copied verbatim into the per-source status record.
_LOG_KEYS = ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total")


def count_source(path: Path) -> dict:
    """Read a scraper JSON and return accepted count + file mtime.

    Returns ``{"accepted": N, "updated_at": iso_mtime}`` on success, or
    ``{"accepted": 0, "error": message}`` when the file is missing,
    unreadable, not valid JSON, or holds a value without a length.
    """
    if not path.exists():
        return {"accepted": 0, "error": "soubor nenalezen"}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        mtime = datetime.fromtimestamp(path.stat().st_mtime).isoformat(timespec="seconds")
        return {"accepted": len(data), "updated_at": mtime}
    except (OSError, ValueError, TypeError) as e:
        # OSError: I/O failure; ValueError: bad JSON or bad UTF-8;
        # TypeError: JSON scalar such as null/number has no len().
        return {"accepted": 0, "error": str(e)}


def _first_int(pattern: str, text: str) -> Optional[int]:
    """Return the first capture group of *pattern* in *text* as an int, or None."""
    m = re.search(pattern, text)
    return int(m.group(1)) if m else None


def _parse_section(section_text: str) -> dict:
    """Extract the numeric summary of one log section into a dict."""
    entry: dict = {}
    for key, pattern in _SUMMARY_PATTERNS.items():
        value = _first_int(pattern, section_text)
        if value is not None:
            entry[key] = value

    # Rejection reasons: collect every "Vyloučeno (<reason>): N" line.
    excluded = {
        m.group(1): int(m.group(2))
        for m in _EXCLUDED_REASON_RE.finditer(section_text)
    }
    if excluded:
        entry["excluded"] = excluded
    else:
        # No per-reason lines: fall back to a single "Vyloučen...: N" total.
        total_excluded = _first_int(r'Vyloučen\w*[:\s]+(\d+)', section_text)
        if total_excluded is not None:
            entry["excluded_total"] = total_excluded
    return entry


def parse_log(log_path: str) -> Dict[str, dict]:
    """Parse scraper run log and extract per-source statistics.

    Scrapers log summary lines like:
        ✓ Vyhovující byty:        12
        Vyloučeno (prodáno):      5
        Staženo stránek:          3
        Staženo inzerátů:         48
        Celkem bytů v cache:      120
    and section headers like:
        [2/6] Realingo
        ----------------

    Args:
        log_path: Path of the run log; empty/None or a missing file yields {}.

    Returns:
        Mapping of source name (a key of SOURCE_FILES) to a dict with any of
        the keys "accepted", "fetched", "pages", "cached", "cache_hits" and
        either "excluded" (reason -> count) or "excluded_total".
    """
    if not log_path or not os.path.exists(log_path):
        return {}

    # errors="replace": a stray non-UTF-8 byte in scraper output must not
    # abort status generation; the numeric summary lines are unaffected.
    with open(log_path, encoding="utf-8", errors="replace") as f:
        content = f.read()

    sections_found = list(_SECTION_RE.finditer(content))
    stats: Dict[str, dict] = {}
    for i, match in enumerate(sections_found):
        step_name = match.group(2).strip().lower()
        # A header such as "PSN + CityHome" covers several sources.
        source_names = [name for name in SOURCE_FILES if name.lower() in step_name]
        if not source_names:
            continue

        # The section body runs up to the next header (or end of log).
        start = match.end()
        end = sections_found[i + 1].start() if i + 1 < len(sections_found) else len(content)
        entry = _parse_section(content[start:end])

        # NOTE(review): for a combined section every covered source receives
        # the same numbers (first match wins); confirm against the log format
        # whether they should be split per scraper.
        for name in source_names:
            # Give each source its own copy so that mutating one record
            # (or its nested "excluded" dict) cannot affect another.
            per_source = dict(entry)
            if "excluded" in per_source:
                per_source["excluded"] = dict(per_source["excluded"])
            stats[name] = per_source

    return stats


def _collect_sources(log_stats: Dict[str, dict]) -> List[dict]:
    """Build one status record per source from its JSON file and log stats."""
    sources = []
    for name, filename in SOURCE_FILES.items():
        info = count_source(HERE / filename)
        info["name"] = name

        ls = log_stats.get(name, {})
        for k in _LOG_KEYS:
            if k in ls:
                info[k] = ls[k]
        # Override accepted from log if available (log is authoritative for latest run)
        if "accepted" in ls:
            info["accepted"] = ls["accepted"]

        sources.append(info)
    return sources


def _print_summary(status: dict) -> None:
    """Print the human-readable summary of *status* to stdout."""
    print(f" Celkem bytů (před dedup): {status['total_accepted']}")
    print(f" Po deduplikaci: {status['deduplicated']}")
    if status["duplicates_removed"]:
        print(f" Odstraněno duplikátů: {status['duplicates_removed']}")
    for s in status["sources"]:
        acc = s.get("accepted", 0)
        err = s.get("error", "")
        exc = s.get("excluded", {})
        exc_total = sum(exc.values()) if exc else s.get("excluded_total", 0)
        parts = [f"{s['name']:12s}: {acc} bytů"]
        if exc_total:
            parts.append(f"({exc_total} vyloučeno)")
        if err:
            parts.append(f"[CHYBA: {err}]")
        print(" " + " ".join(parts))


def main(argv: Optional[List[str]] = None) -> None:
    """Write status.json next to this script and print a summary.

    Command line: ``generate_status.py [START_TIME [DURATION_SEC [LOG_PATH]]]``.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``. Each positional argument is optional and is read
            independently of the others.
    """
    args = sys.argv if argv is None else argv

    # The start time is honoured even when no duration follows it.
    start_time = args[1] if len(args) >= 2 else None
    if not start_time:
        start_time = datetime.now().isoformat(timespec="seconds")

    duration_sec = None
    if len(args) >= 3:
        try:
            duration_sec = int(args[2])
        except ValueError:
            # Non-numeric duration is reported as unknown (JSON null).
            duration_sec = None

    log_path = args[3] if len(args) >= 4 else None
    sources = _collect_sources(parse_log(log_path))

    # Total accepted before dedup
    total_accepted = sum(s.get("accepted", 0) for s in sources)

    # Merged / deduplicated count
    merged_path = HERE / MERGED_FILE
    deduplicated = 0
    if merged_path.exists():
        try:
            deduplicated = len(json.loads(merged_path.read_text(encoding="utf-8")))
        except (OSError, ValueError, TypeError) as e:
            # Best effort: keep going with 0, but say why instead of hiding it.
            print(f"Varování: {MERGED_FILE} nelze načíst: {e}", file=sys.stderr)

    duplicates_removed = total_accepted - deduplicated if deduplicated else 0

    status = {
        "status": "done",
        "timestamp": start_time,
        "duration_sec": duration_sec,
        "total_accepted": total_accepted,
        "deduplicated": deduplicated,
        "duplicates_removed": duplicates_removed,
        "sources": sources,
    }

    out = HERE / "status.json"
    out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Status uložen: {out}")
    _print_summary(status)


if __name__ == "__main__":
    main()

# Diff text that followed this module on the same physical line, kept verbatim:
#   diff --git a/mapa_bytu.html b/mapa_bytu.html index 2bb58eb..55b4bec 100644 --- a/mapa_bytu.html +++ b/mapa_bytu.html @@ -3,7 +3,7 @@
-