# Changelog (from commit message):
# - Rewrite PSN scraper to use /api/units-list endpoint (single API call, no HTML parsing)
# - Fix CityHome scraper: GPS from multiple URL patterns, address from table cells, no 404 retries
# - Color map markers by price/m² instead of disposition (blue→green→orange→red scale)
# - Add persistent rating system (favorite/reject) with Flask ratings server and localStorage fallback
# - Rejected markers show original color at reduced opacity with 🚫 SVG overlay
# - Favorite markers shown as ⭐ star icons with gold pulse animation
# - Add "new today" marker logic (scraped_at == today) with larger pulsing green outline
# - Add filter panel with floor, price, hide-rejected controls and ☰/✕ toggle buttons
# - Add generate_status.py for scraper run statistics and status.html dashboard
# - Add scraped_at field to all scrapers for freshness tracking
# - Update run_all.sh with log capture and status generation
#!/usr/bin/env python3
|
|
"""Generate status.json from scraper JSON outputs and run log."""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
# Directory containing this script; all scraper JSON outputs are expected
# to live alongside it.
HERE = Path(__file__).parent

# Display name of each scraper -> the JSON file it writes.
# The display names are also matched (case-insensitively) against the
# "[N/6] <name>" section headers in the run log by parse_log().
SOURCE_FILES = {
    "Sreality": "byty_sreality.json",
    "Realingo": "byty_realingo.json",
    "Bezrealitky": "byty_bezrealitky.json",
    "iDNES": "byty_idnes.json",
    "PSN": "byty_psn.json",
    "CityHome": "byty_cityhome.json",
}

# Deduplicated union of all sources, produced by the merge step.
MERGED_FILE = "byty_merged.json"
|
|
|
|
|
|
def count_source(path: Path) -> dict:
    """Read a scraper JSON output and return its listing count + file mtime.

    Args:
        path: Path to a scraper's JSON file, expected to hold a list
            of listings.

    Returns:
        On success: ``{"accepted": <len of list>, "updated_at": <ISO mtime>}``.
        On a missing file or read/parse failure:
        ``{"accepted": 0, "error": <message>}``.
    """
    if not path.exists():
        return {"accepted": 0, "error": "soubor nenalezen"}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        mtime = datetime.fromtimestamp(path.stat().st_mtime).isoformat(timespec="seconds")
        return {"accepted": len(data), "updated_at": mtime}
    except (OSError, ValueError, TypeError) as e:
        # OSError: unreadable file / failed stat; ValueError covers
        # json.JSONDecodeError for malformed JSON; TypeError covers
        # len() on an unsized JSON value (e.g. a bare number).
        # Deliberately narrower than the previous `except Exception`,
        # which also hid genuine programming errors.
        return {"accepted": 0, "error": str(e)}
|
|
|
|
|
|
def parse_log(log_path: str) -> dict[str, dict]:
    """Parse the scraper run log and extract per-source statistics.

    Scrapers log summary lines like:
        ✓ Vyhovující byty: 12
        Vyloučeno (prodáno): 5
        Staženo stránek: 3
        Staženo inzerátů: 48
        Celkem bytů v cache: 120
    and section headers like:
        [2/6] Realingo

    Args:
        log_path: Path to the combined run log; ``None``/empty or a
            nonexistent path yields an empty result.

    Returns:
        Mapping of source name (a key of ``SOURCE_FILES``) to a dict with
        whichever of ``accepted``, ``fetched``, ``pages``, ``cached``,
        ``cache_hits``, ``excluded`` / ``excluded_total`` were found in
        that source's log section.
    """
    if not log_path or not os.path.exists(log_path):
        return {}

    with open(log_path, encoding="utf-8") as f:
        content = f.read()

    # Split into per-source sections by the [N/6] step header.
    # Each section header looks like "[2/6] Realingo\n----..."
    section_pattern = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE)
    sections_found = list(section_pattern.finditer(content))

    if not sections_found:
        return {}

    def extract(pattern: str, text: str) -> Optional[int]:
        """Return the first captured integer of *pattern* in *text*, or None."""
        m = re.search(pattern, text)
        return int(m.group(1)) if m else None

    stats: dict[str, dict] = {}
    for i, match in enumerate(sections_found):
        step_name = match.group(2).strip()
        start = match.end()
        end = sections_found[i + 1].start() if i + 1 < len(sections_found) else len(content)
        section_text = content[start:end]

        # Identify which sources this section covers —
        # a combined step like "PSN + CityHome" covers both.
        source_names = [
            name for name in SOURCE_FILES if name.lower() in step_name.lower()
        ]
        if not source_names:
            continue

        # Numeric summary lines present in all/most scrapers.
        accepted = extract(r'Vyhovující byty[:\s]+(\d+)', section_text)
        fetched = extract(r'Staženo inzerátů[:\s]+(\d+)', section_text)
        pages = extract(r'Staženo stránek[:\s]+(\d+)', section_text)
        cached = extract(r'Celkem bytů v cache[:\s]+(\d+)', section_text)
        cache_hits = extract(r'Cache hit[:\s]+(\d+)', section_text)

        # Rejection reasons — collect all "Vyloučeno (<reason>): N" into a dict.
        excluded = {}
        for m in re.finditer(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)', section_text):
            excluded[m.group(1)] = int(m.group(2))
        # Fallback for a bare "Vyloučeno: N" line with no reason in parentheses.
        total_excluded = (
            sum(excluded.values())
            if excluded
            else extract(r'Vyloučen\w*[:\s]+(\d+)', section_text)
        )

        entry = {}
        if accepted is not None:
            entry["accepted"] = accepted
        if fetched is not None:
            entry["fetched"] = fetched
        if pages is not None:
            entry["pages"] = pages
        if cached is not None:
            entry["cached"] = cached
        if cache_hits is not None:
            entry["cache_hits"] = cache_hits
        if excluded:
            entry["excluded"] = excluded
        elif total_excluded is not None:
            entry["excluded_total"] = total_excluded

        for name in source_names:
            # Copy per source: previously the SAME dict object was shared by
            # all sources of a combined section, so a later mutation of one
            # source's stats would silently corrupt the other's.
            stats[name] = dict(entry)

    return stats
|
|
|
|
|
|
def main():
    """Assemble status.json from scraper outputs, the run log, and CLI args.

    Usage: generate_status.py [start_time_iso] [duration_sec] [log_path]
    Writes status.json next to this script and prints a short summary.
    """
    run_started = None
    run_duration = None

    if len(sys.argv) >= 3:
        run_started = sys.argv[1]
        try:
            run_duration = int(sys.argv[2])
        except ValueError:
            # A non-numeric duration is simply ignored.
            pass

    if not run_started:
        run_started = datetime.now().isoformat(timespec="seconds")

    log_file = sys.argv[3] if len(sys.argv) >= 4 else None
    per_source_log = parse_log(log_file)

    sources = []
    for source_name, json_name in SOURCE_FILES.items():
        record = count_source(HERE / json_name)
        record["name"] = source_name

        # Fold in statistics parsed from the run log.
        logged = per_source_log.get(source_name, {})
        for key in ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total"):
            if key in logged:
                record[key] = logged[key]
        # The log is authoritative for the latest run, so it wins over the file count.
        if "accepted" in logged:
            record["accepted"] = logged["accepted"]

        sources.append(record)

    # Total accepted across all sources, before deduplication.
    total_accepted = sum(rec.get("accepted", 0) for rec in sources)

    # Count of the merged / deduplicated output, if present.
    deduplicated = 0
    merged_path = HERE / MERGED_FILE
    if merged_path.exists():
        try:
            deduplicated = len(json.loads(merged_path.read_text(encoding="utf-8")))
        except Exception:
            pass

    duplicates_removed = total_accepted - deduplicated if deduplicated else 0

    status = {
        "status": "done",
        "timestamp": run_started,
        "duration_sec": run_duration,
        "total_accepted": total_accepted,
        "deduplicated": deduplicated,
        "duplicates_removed": duplicates_removed,
        "sources": sources,
    }

    out = HERE / "status.json"
    out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"Status uložen: {out}")
    print(f" Celkem bytů (před dedup): {total_accepted}")
    print(f" Po deduplikaci: {deduplicated}")
    if duplicates_removed:
        print(f" Odstraněno duplikátů: {duplicates_removed}")
    for rec in sources:
        acc = rec.get("accepted", 0)
        err = rec.get("error", "")
        exc = rec.get("excluded", {})
        exc_total = sum(exc.values()) if exc else rec.get("excluded_total", 0)
        parts = [f"{rec['name']:12s}: {acc} bytů"]
        if exc_total:
            parts.append(f"({exc_total} vyloučeno)")
        if err:
            parts.append(f"[CHYBA: {err}]")
        print(" " + " ".join(parts))


if __name__ == "__main__":
    main()
|