#!/usr/bin/env python3
"""
Bezrealitky.cz scraper.

Downloads flats for sale in Prague and filters them by the criteria
configured below.
Output: byty_bezrealitky.json
"""
from __future__ import annotations

import json
import math
import re
import time
import urllib.request
from pathlib import Path

# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 13_500_000
MIN_AREA = 69
MIN_FLOOR = 2
PER_PAGE = 15  # Bezrealitky returns 15 listings per page

# File the results are written to; also read back as a cache on the next run.
OUTPUT_FILE = "byty_bezrealitky.json"

# Dispositions we want
WANTED_DISPOSITIONS = {
    "DISP_3_KK",
    "DISP_3_1",
    "DISP_4_KK",
    "DISP_4_1",
    "DISP_5_KK",
    "DISP_5_1",
    "DISP_6",
    "DISP_OTHER",  # atypical layouts
}

DISPOSITION_LABELS = {
    "DISP_1_KK": "1+kk",
    "DISP_1_1": "1+1",
    "DISP_2_KK": "2+kk",
    "DISP_2_1": "2+1",
    "DISP_3_KK": "3+kk",
    "DISP_3_1": "3+1",
    "DISP_4_KK": "4+kk",
    "DISP_4_1": "4+1",
    "DISP_5_KK": "5+kk",
    "DISP_5_1": "5+1",
    "DISP_6": "6+",
    "DISP_OTHER": "Atypický",
}

CONSTRUCTION_MAP = {
    "BRICK": "Cihlová",
    "PANEL": "Panelová",
    "WOOD": "Dřevostavba",
    "MIXED": "Smíšená",
    "MONTAGE": "Montovaná",
    "STEEL": "Ocelová",
}

OWNERSHIP_MAP = {
    "OSOBNI": "Osobní",
    "DRUZSTEVNI": "Družstevní",
    "STATNI": "Státní/obecní",
}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}

BASE_URL = "https://www.bezrealitky.cz"

# Captures the JSON payload embedded in the page. The code below reads
# data["props"]["pageProps"], which is the layout of the Next.js
# ``__NEXT_DATA__`` script tag, so the pattern targets that tag.
# NOTE(review): an empty pattern has no group 1 and makes ``group(1)`` raise
# IndexError; confirm this tag against the live page markup.
NEXT_DATA_RE = re.compile(
    r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>',
    re.DOTALL,
)


def _load_apollo_cache(url: str) -> dict | None:
    """Download *url* and return the Apollo cache embedded in the page.

    Returns None when the page does not contain the embedded JSON payload.
    Network errors, JSON errors and missing keys propagate to the caller.
    """
    req = urllib.request.Request(url, headers=HEADERS)
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(req, timeout=30) as resp:
        html = resp.read().decode("utf-8")

    match = NEXT_DATA_RE.search(html)
    if not match:
        return None

    data = json.loads(match.group(1))
    return data["props"]["pageProps"]["apolloCache"]


def fetch_page(page: int) -> tuple[list[dict], int]:
    """Fetch one listing page from Bezrealitky.

    Returns a tuple ``(adverts, total)``: the advert dicts found in the
    page's Apollo cache and the largest ``totalCount`` reported by any
    ``listAdverts`` entry under ``ROOT_QUERY`` (0 when none is present).
    Returns ``([], 0)`` when the page carries no embedded payload.
    """
    cache = _load_apollo_cache(
        f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
    )
    if cache is None:
        return [], 0

    # Extract adverts from the cache
    adverts = [
        val
        for key, val in cache.items()
        if key.startswith("Advert:")
        and isinstance(val, dict)
        and val.get("__typename") == "Advert"
    ]

    # Get the total count from ROOT_QUERY
    total = 0
    for key, val in cache.get("ROOT_QUERY", {}).items():
        if "listAdverts" in key and isinstance(val, dict):
            count = val.get("totalCount")
            if count and count > total:
                total = count

    return adverts, total


def fetch_detail(uri: str) -> dict | None:
    """Fetch the detail page for a listing.

    Returns the first ``Advert:`` cache entry that carries at least one of
    the detail-only fields (``construction``, ``etage``, ``ownership``), or
    None when no such entry exists or the download/parsing fails. Failures
    are reported with a warning on stdout and never raised.
    """
    try:
        cache = _load_apollo_cache(f"{BASE_URL}/nemovitosti-byty-domy/{uri}")
        if cache is None:
            return None

        # Find the full advert in the cache
        for key, val in cache.items():
            if key.startswith("Advert:") and isinstance(val, dict):
                # Detail pages have many more fields than list entries
                if "construction" in val or "etage" in val or "ownership" in val:
                    return val
    except Exception as e:
        # Deliberate best-effort boundary: one broken listing must not abort
        # the whole run, so the error is reported and the listing skipped.
        print(f" Warning: detail failed for {uri}: {e}")
    return None


def format_price(price: int) -> str:
    """Return *price* with space-separated thousands and a " Kč" suffix."""
    # The "," format spec groups digits correctly for negative and float
    # values too; the separator is then swapped for a space.
    return f"{price:,}".replace(",", " ") + " Kč"


def load_cache(json_path: str = OUTPUT_FILE) -> dict[int, dict]:
    """Load previously scraped data as a cache keyed by ``hash_id``.

    Returns an empty dict when the file is missing, unreadable, not valid
    JSON, or not a JSON list. Entries that are not dicts or lack a
    ``hash_id`` key are skipped.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return {}
    if not isinstance(data, list):
        return {}
    return {
        entry["hash_id"]: entry
        for entry in data
        if isinstance(entry, dict) and "hash_id" in entry
    }


def _download_listings() -> dict:
    """Walk the listing pages and return adverts keyed by id (deduplicated)."""
    all_adverts = {}  # id -> advert dict (dedup)
    page = 1
    total_pages = None
    while True:
        print(f" Strana {page} ...")
        adverts, total_count = fetch_page(page)
        if total_pages is None and total_count > 0:
            # Page count is derived once, from the first non-zero total.
            total_pages = math.ceil(total_count / PER_PAGE)
            print(f" → Celkem {total_count} inzerátů, ~{total_pages} stran")
        if not adverts:
            break
        for adv in adverts:
            adv_id = adv.get("id")
            if adv_id and adv_id not in all_adverts:
                all_adverts[adv_id] = adv
        page += 1
        if total_pages is not None and page > total_pages:
            break
        time.sleep(0.5)
    return all_adverts


def _prefilter(adverts) -> list[dict]:
    """Filter list-level adverts by disposition, price, area and GPS.

    Prints a summary of how many adverts each criterion excluded.
    """
    pre_filtered = []
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0

    for adv in adverts:
        if adv.get("disposition", "") not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue

        price = adv.get("price", 0) or 0
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            continue

        # A missing surface is not a reason to exclude the advert.
        surface = adv.get("surface")
        if surface is not None and surface < MIN_AREA:
            excluded_area += 1
            continue

        gps = adv.get("gps", {})
        if not gps or not gps.get("lat") or not gps.get("lng"):
            excluded_no_gps += 1
            continue

        pre_filtered.append(adv)

    print("\nPo předfiltraci:")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Zbývá: {len(pre_filtered)}")
    return pre_filtered


def _pick_address(detail: dict, adv: dict):
    """Choose the address for a listing; the key includes a locale parameter.

    Prefers an ``address(...)`` key without ``withHouseNumber``, then any
    ``address(...)`` key, then the list-level Czech address, else "Praha".
    """
    address = next(
        (
            detail[key]
            for key in detail
            if key.startswith("address(") and "withHouseNumber" not in key
        ),
        "",
    )
    if not address:
        address = next(
            (detail[key] for key in detail if key.startswith("address(")),
            "",
        )
    if not address:
        address = adv.get('address({"locale":"CS"})', "Praha")
    return address


def scrape():
    """Run the whole scrape and return the list of matching flats.

    Downloads all listing pages, pre-filters them on list-level data, then
    fetches the detail page of each remaining advert (reusing the entry
    from the previous run when its price is unchanged) and drops panel
    buildings, housing estates and flats below MIN_FLOOR. Progress and
    statistics are printed to stdout.
    """
    cache = load_cache()

    print("=" * 60)
    print("Stahuji inzeráty z Bezrealitky.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print("Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)

    # Step 1: Fetch all listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_adverts = _download_listings()
    print(f"\n Staženo: {len(all_adverts)} unikátních inzerátů")

    # Step 2: Pre-filter by disposition, price, area from list data
    pre_filtered = _prefilter(all_adverts.values())

    # Step 3: Fetch details
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0

    for i, adv in enumerate(pre_filtered):
        uri = adv.get("uri", "")
        if not uri:
            excluded_detail += 1
            continue

        # Check cache — if hash_id exists and price is unchanged, reuse
        adv_id = int(adv["id"])
        adv_price = adv.get("price", 0) or 0
        cached = cache.get(adv_id)
        if cached and cached.get("price") == adv_price:
            cache_hits += 1
            results.append(cached)
            continue

        time.sleep(0.4)
        detail = fetch_detail(uri)
        if not detail:
            excluded_detail += 1
            continue

        # Check construction — exclude panel buildings
        construction = detail.get("construction", "")
        if construction == "PANEL":
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: panel")
            continue

        # Check situation — exclude housing estates
        situation = detail.get("situation", "")
        if situation and "HOUSING_ESTATE" in str(situation).upper():
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: sídliště")
            continue

        # Check floor (etage); an unknown floor is kept
        etage = detail.get("etage")
        if etage is not None and etage < MIN_FLOOR:
            excluded_floor += 1
            continue

        gps = adv.get("gps", {})
        disp = adv.get("disposition", "")
        ownership = detail.get("ownership")

        result = {
            "hash_id": adv_id,
            "name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {adv.get('surface', '?')} m²",
            "price": adv.get("price", 0),
            "price_formatted": format_price(adv.get("price", 0)),
            "locality": _pick_address(detail, adv),
            "lat": gps["lat"],
            "lon": gps["lng"],
            "disposition": DISPOSITION_LABELS.get(disp, "?"),
            "floor": etage,
            "area": adv.get("surface"),
            "building_type": CONSTRUCTION_MAP.get(construction, construction or "neuvedeno"),
            "ownership": OWNERSHIP_MAP.get(detail.get("ownership", ""), ownership or "neuvedeno"),
            "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
            "source": "bezrealitky",
            "image": "",
        }
        results.append(result)

        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")

    print(f"\n{'=' * 60}")
    print("Výsledky Bezrealitky:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results


def main() -> None:
    """Run the scraper and write the results to OUTPUT_FILE.

    Nothing is written when no flat matches, so the previous output file
    (which doubles as the cache) is left untouched.
    """
    start = time.time()
    estates = scrape()

    if not estates:
        print("\nŽádné byty z Bezrealitek neodpovídají kritériím :(")
        return

    json_path = Path(OUTPUT_FILE)
    json_path.write_text(
        json.dumps(estates, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    elapsed = time.time() - start
    print(f"\n✓ Data uložena: {json_path.resolve()}")
    print(f"⏱ Celkový čas: {elapsed:.0f} s")


if __name__ == "__main__":
    main()