Files
maru-hleda-byt/scrape_bezrealitky.py
2026-02-13 16:11:28 +00:00

352 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Bezrealitky.cz scraper.
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
Výstup: byty_bezrealitky.json
"""
from __future__ import annotations
import json
import math
import re
import time
import urllib.request
from pathlib import Path
# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 13_500_000  # CZK — maximum asking price
MIN_AREA = 69  # m² — minimum floor area
MIN_FLOOR = 2  # minimum storey ("NP" = above-ground floor)
PER_PAGE = 15 # Bezrealitky returns 15 listings per page
# Layouts (dispositions) we want to keep
WANTED_DISPOSITIONS = {
    "DISP_3_KK", "DISP_3_1",
    "DISP_4_KK", "DISP_4_1",
    "DISP_5_KK", "DISP_5_1",
    "DISP_6",
    "DISP_OTHER", # atypical layouts
}
# API disposition codes → human-readable Czech labels used in the output JSON
DISPOSITION_LABELS = {
    "DISP_1_KK": "1+kk", "DISP_1_1": "1+1",
    "DISP_2_KK": "2+kk", "DISP_2_1": "2+1",
    "DISP_3_KK": "3+kk", "DISP_3_1": "3+1",
    "DISP_4_KK": "4+kk", "DISP_4_1": "4+1",
    "DISP_5_KK": "5+kk", "DISP_5_1": "5+1",
    "DISP_6": "6+",
    "DISP_OTHER": "Atypický",
}
# Building construction codes → Czech labels
CONSTRUCTION_MAP = {
    "BRICK": "Cihlová",
    "PANEL": "Panelová",
    "WOOD": "Dřevostavba",
    "MIXED": "Smíšená",
    "MONTAGE": "Montovaná",
    "STEEL": "Ocelová",
}
# Ownership codes → Czech labels
OWNERSHIP_MAP = {
    "OSOBNI": "Osobní",
    "DRUZSTEVNI": "Družstevní",
    "STATNI": "Státní/obecní",
}
# Browser-like headers so the site serves the regular HTML page
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}
BASE_URL = "https://www.bezrealitky.cz"
def fetch_page(page: int) -> tuple[list[dict], int]:
    """
    Fetch one listing page from Bezrealitky.

    Parses the Next.js ``__NEXT_DATA__`` JSON payload embedded in the HTML
    and pulls advert records out of the normalized Apollo GraphQL cache.

    Returns:
        (list of advert dicts from the Apollo cache, total result count);
        ``([], 0)`` when the payload cannot be located in the HTML.
    """
    url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
    req = urllib.request.Request(url, headers=HEADERS)
    # Close the HTTP response deterministically (the original leaked it).
    with urllib.request.urlopen(req, timeout=30) as resp:
        html = resp.read().decode("utf-8")
    match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html, re.DOTALL
    )
    if not match:
        return [], 0
    data = json.loads(match.group(1))
    cache = data["props"]["pageProps"]["apolloCache"]
    # Collect advert entries from the cache; keys look like "Advert:<id>".
    adverts = [
        val
        for key, val in cache.items()
        if key.startswith("Advert:")
        and isinstance(val, dict)
        and val.get("__typename") == "Advert"
    ]
    # The total count lives under a listAdverts(...) key inside ROOT_QUERY;
    # several query variants may be cached, so keep the largest count seen.
    total = 0
    root = cache.get("ROOT_QUERY", {})
    for key, val in root.items():
        if "listAdverts" in key and isinstance(val, dict):
            tc = val.get("totalCount")
            if tc and tc > total:
                total = tc
    return adverts, total
def fetch_detail(uri: str) -> dict | None:
    """
    Fetch the detail page for a listing and return its full Advert dict.

    Detail pages expose extra fields (construction, etage, ownership) that
    list pages do not; the first cache entry carrying any of them is taken
    as the full record.

    Returns:
        The detail-level advert dict, or ``None`` when the page cannot be
        fetched or no such record is found. Failures are printed as a
        warning rather than raised, so one bad listing cannot abort a run.
    """
    try:
        url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
        req = urllib.request.Request(url, headers=HEADERS)
        # Close the HTTP response deterministically (the original leaked it).
        with urllib.request.urlopen(req, timeout=30) as resp:
            html = resp.read().decode("utf-8")
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            return None
        data = json.loads(match.group(1))
        cache = data["props"]["pageProps"]["apolloCache"]
        # Find the full advert in the cache — detail pages have many more fields.
        for key, val in cache.items():
            if key.startswith("Advert:") and isinstance(val, dict):
                if "construction" in val or "etage" in val or "ownership" in val:
                    return val
    except Exception as e:
        # Best-effort scraping: log and fall through to None.
        print(f" Warning: detail failed for {uri}: {e}")
    return None
def format_price(price: int) -> str:
    """
    Format an integer price with spaces as thousands separators.

    E.g. ``13500000`` -> ``"13 500 000"``. Uses the format-spec ``,``
    grouping instead of the original hand-rolled slicing loop, which also
    ended in a dead ``+ ""`` (apparently a lost currency suffix).
    """
    return f"{price:,}".replace(",", " ")
def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
    """Load listings from a previous run, keyed by their ``hash_id``.

    Returns an empty dict when the file is absent or unreadable, so a
    fresh run simply starts without a cache.
    """
    cache_file = Path(json_path)
    if not cache_file.exists():
        return {}
    try:
        entries = json.loads(cache_file.read_text(encoding="utf-8"))
        return {
            entry["hash_id"]: entry
            for entry in entries
            if "hash_id" in entry
        }
    except (json.JSONDecodeError, KeyError):
        return {}
def scrape():
    """
    Scrape Prague apartment sale listings from Bezrealitky and filter them.

    Three phases: (1) page through the list endpoint and dedupe adverts,
    (2) pre-filter on disposition/price/area/GPS from list data alone,
    (3) fetch each remaining detail page and apply construction/floor
    filters, reusing cached results when the price is unchanged.

    Returns a list of result dicts ready to be written as JSON.
    """
    cache = load_cache()
    print("=" * 60)
    print("Stahuji inzeráty z Bezrealitky.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA}")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)
    # Step 1: Fetch all listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_adverts = {} # id -> advert dict (dedup)
    page = 1
    total = None
    while True:
        print(f" Strana {page} ...")
        adverts, total_count = fetch_page(page)
        # Remember the total once, to compute the page count for the loop bound.
        if total is None and total_count > 0:
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")
        if not adverts:
            break
        for adv in adverts:
            adv_id = adv.get("id")
            if adv_id and adv_id not in all_adverts:
                all_adverts[adv_id] = adv
        page += 1
        # Stop once we are past the last expected page.
        if total and page > math.ceil(total / PER_PAGE):
            break
        time.sleep(0.5)
    print(f"\n Staženo: {len(all_adverts)} unikátních inzerátů")
    # Step 2: Pre-filter by disposition, price, area from list data
    pre_filtered = []
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0
    for adv in all_adverts.values():
        disp = adv.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue
        # Treat missing/zero price as unusable (likely "price on request").
        price = adv.get("price", 0) or 0
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            continue
        # A missing surface is NOT excluded — only a known-too-small one is.
        surface = adv.get("surface")
        if surface is not None and surface < MIN_AREA:
            excluded_area += 1
            continue
        # GPS is required for the downstream map output.
        gps = adv.get("gps", {})
        if not gps or not gps.get("lat") or not gps.get("lng"):
            excluded_no_gps += 1
            continue
        pre_filtered.append(adv)
    print(f"\nPo předfiltraci:")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Zbývá: {len(pre_filtered)}")
    # Step 3: Fetch details
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0
    for i, adv in enumerate(pre_filtered):
        uri = adv.get("uri", "")
        if not uri:
            excluded_detail += 1
            continue
        # Check cache — if hash_id exists and price unchanged, reuse the
        # previous result and skip the (slow) detail fetch entirely.
        adv_id = int(adv["id"])
        adv_price = adv.get("price", 0) or 0
        cached = cache.get(adv_id)
        if cached and cached.get("price") == adv_price:
            cache_hits += 1
            results.append(cached)
            continue
        time.sleep(0.4)
        detail = fetch_detail(uri)
        if not detail:
            excluded_detail += 1
            continue
        # Check construction — exclude panel buildings.
        construction = detail.get("construction", "")
        if construction == "PANEL":
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: panel")
            continue
        # Check situation — exclude housing estates ("sídliště");
        # counted in the same bucket as panel exclusions.
        situation = detail.get("situation", "")
        if situation and "HOUSING_ESTATE" in str(situation).upper():
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: sídliště")
            continue
        # Check floor (etage); an unknown floor is kept.
        etage = detail.get("etage")
        if etage is not None and etage < MIN_FLOOR:
            excluded_floor += 1
            continue
        gps = adv.get("gps", {})
        disp = adv.get("disposition", "")
        # Get address — Apollo cache keys embed the GraphQL arguments, e.g.
        # 'address({"locale":"CS"})'; prefer a variant without house number.
        address = ""
        for key in detail:
            if key.startswith("address(") and "withHouseNumber" not in key:
                address = detail[key]
                break
        if not address:
            for key in detail:
                if key.startswith("address("):
                    address = detail[key]
                    break
        if not address:
            address = adv.get('address({"locale":"CS"})', "Praha")
        result = {
            "hash_id": int(adv["id"]),
            "name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {adv.get('surface', '?')}",
            "price": adv.get("price", 0),
            "price_formatted": format_price(adv.get("price", 0)),
            "locality": address,
            "lat": gps["lat"],
            "lon": gps["lng"],
            "disposition": DISPOSITION_LABELS.get(disp, "?"),
            "floor": etage,
            "area": adv.get("surface"),
            "building_type": CONSTRUCTION_MAP.get(construction, construction or "neuvedeno"),
            "ownership": OWNERSHIP_MAP.get(detail.get("ownership", ""), detail.get("ownership") or "neuvedeno"),
            "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
            "source": "bezrealitky",
            "image": "",
        }
        results.append(result)
        # Progress report every 20 processed listings.
        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")
    print(f"\n{'=' * 60}")
    print(f"Výsledky Bezrealitky:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")
    return results
if __name__ == "__main__":
    # Run the scraper and persist the results as pretty-printed UTF-8 JSON.
    t0 = time.time()
    listings = scrape()
    if not listings:
        print("\nŽádné byty z Bezrealitek neodpovídají kritériím :(")
    else:
        json_path = Path("byty_bezrealitky.json")
        payload = json.dumps(listings, ensure_ascii=False, indent=2)
        json_path.write_text(payload, encoding="utf-8")
        elapsed = time.time() - t0
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")