#!/usr/bin/env python3 """Generate status.json from scraper JSON outputs and run log.""" from __future__ import annotations import json import os import re import sys from datetime import datetime from pathlib import Path from typing import Optional HERE = Path(__file__).parent SOURCE_FILES = { "Sreality": "byty_sreality.json", "Realingo": "byty_realingo.json", "Bezrealitky": "byty_bezrealitky.json", "iDNES": "byty_idnes.json", "PSN": "byty_psn.json", "CityHome": "byty_cityhome.json", } MERGED_FILE = "byty_merged.json" def count_source(path: Path) -> dict: """Read a scraper JSON and return accepted count + file mtime.""" if not path.exists(): return {"accepted": 0, "error": "soubor nenalezen"} try: data = json.loads(path.read_text(encoding="utf-8")) mtime = datetime.fromtimestamp(path.stat().st_mtime).isoformat(timespec="seconds") return {"accepted": len(data), "updated_at": mtime} except Exception as e: return {"accepted": 0, "error": str(e)} def parse_log(log_path: str) -> dict[str, dict]: """Parse scraper run log and extract per-source statistics. Scrapers log summary lines like: ✓ Vyhovující byty: 12 Vyloučeno (prodáno): 5 Staženo stránek: 3 Staženo inzerátů: 48 Celkem bytů v cache: 120 and section headers like: [2/6] Realingo """ if not log_path or not os.path.exists(log_path): return {} with open(log_path, encoding="utf-8") as f: content = f.read() # Split into per-source sections by the [N/6] Step header # Each section header looks like "[2/6] Realingo\n----..." section_pattern = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE) sections_found = list(section_pattern.finditer(content)) if not sections_found: return {} stats = {} for i, match in enumerate(sections_found): step_name = match.group(2).strip() start = match.end() end = sections_found[i + 1].start() if i + 1 < len(sections_found) else len(content) section_text = content[start:end] # Identify which sources this section covers # "PSN + CityHome" covers both source_names = [] for name in SOURCE_FILES: if name.lower() in step_name.lower(): source_names.append(name) if not source_names: continue # Parse numeric summary lines def extract(pattern: str) -> Optional[int]: m = re.search(pattern, section_text) return int(m.group(1)) if m else None # Lines present in all/most scrapers accepted = extract(r'Vyhovující byty[:\s]+(\d+)') fetched = extract(r'Staženo inzerátů[:\s]+(\d+)') pages = extract(r'Staženo stránek[:\s]+(\d+)') cached = extract(r'Celkem bytů v cache[:\s]+(\d+)') cache_hits = extract(r'Cache hit[:\s]+(\d+)') # Rejection reasons — collect all into a dict excluded = {} for m in re.finditer(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)', section_text): excluded[m.group(1)] = int(m.group(2)) # Also PSN-style "Vyloučeno (prodáno): N" total_excluded = sum(excluded.values()) if excluded else extract(r'Vyloučen\w*[:\s]+(\d+)') entry = {} if accepted is not None: entry["accepted"] = accepted if fetched is not None: entry["fetched"] = fetched if pages is not None: entry["pages"] = pages if cached is not None: entry["cached"] = cached if cache_hits is not None: entry["cache_hits"] = cache_hits if excluded: entry["excluded"] = excluded elif total_excluded is not None: entry["excluded_total"] = total_excluded for name in source_names: stats[name] = entry return stats def main(): start_time = None duration_sec = None if len(sys.argv) >= 3: start_time = sys.argv[1] try: duration_sec = int(sys.argv[2]) except ValueError: pass if not start_time: start_time = datetime.now().isoformat(timespec="seconds") log_path = sys.argv[3] if len(sys.argv) >= 4 else None log_stats = parse_log(log_path) sources = [] for name, filename in SOURCE_FILES.items(): path = HERE / filename info = count_source(path) info["name"] = name # Merge log stats ls = log_stats.get(name, {}) for k in ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total"): if k in ls: info[k] = ls[k] # Override accepted from log if available (log is authoritative for latest run) if "accepted" in ls: info["accepted"] = ls["accepted"] sources.append(info) # Total accepted before dedup total_accepted = sum(s.get("accepted", 0) for s in sources) # Merged / deduplicated count merged_path = HERE / MERGED_FILE deduplicated = 0 if merged_path.exists(): try: merged = json.loads(merged_path.read_text(encoding="utf-8")) deduplicated = len(merged) except Exception: pass duplicates_removed = total_accepted - deduplicated if deduplicated else 0 status = { "status": "done", "timestamp": start_time, "duration_sec": duration_sec, "total_accepted": total_accepted, "deduplicated": deduplicated, "duplicates_removed": duplicates_removed, "sources": sources, } out = HERE / "status.json" out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8") print(f"Status uložen: {out}") print(f" Celkem bytů (před dedup): {total_accepted}") print(f" Po deduplikaci: {deduplicated}") if duplicates_removed: print(f" Odstraněno duplikátů: {duplicates_removed}") for s in sources: acc = s.get("accepted", 0) err = s.get("error", "") exc = s.get("excluded", {}) exc_total = sum(exc.values()) if exc else s.get("excluded_total", 0) parts = [f"{s['name']:12s}: {acc} bytů"] if exc_total: parts.append(f"({exc_total} vyloučeno)") if err: parts.append(f"[CHYBA: {err}]") print(" " + " ".join(parts)) if __name__ == "__main__": main()