#!/usr/bin/env python3
"""
Realingo.cz scraper.

Downloads apartments for sale in Prague and filters them by the criteria
configured below.
Output: byty_realingo.json
"""
from __future__ import annotations

import argparse
from datetime import datetime
import json
import logging
import math
import re
import time
import urllib.error
import urllib.request
from pathlib import Path

logger = logging.getLogger(__name__)

# ── Configuration (shared with the Sreality scraper) ────────────────────────
MAX_PRICE = 13_500_000
MIN_AREA = 69
MIN_FLOOR = 2
PER_PAGE = 40  # Realingo returns 40 listings per page

# Categories we want (layouts 3+kk and larger)
WANTED_CATEGORIES = {
    "FLAT3_KK", "FLAT31",  # 3+kk, 3+1
    "FLAT4_KK", "FLAT41",  # 4+kk, 4+1
    "FLAT5_KK", "FLAT51",  # 5+kk, 5+1
    "FLAT6",  # 6+
    "OTHERS_FLAT",  # atypical — the area filter decides
}

# Mapping category → label
CATEGORY_LABELS = {
    "FLAT1_KK": "1+kk",
    "FLAT11": "1+1",
    "FLAT2_KK": "2+kk",
    "FLAT21": "2+1",
    "FLAT3_KK": "3+kk",
    "FLAT31": "3+1",
    "FLAT4_KK": "4+kk",
    "FLAT41": "4+1",
    "FLAT5_KK": "5+kk",
    "FLAT51": "5+1",
    "FLAT6": "6+",
    "OTHERS_FLAT": "Atypický",
}

# Mapping building type code → label used in the output
BUILDING_TYPE_LABELS = {
    "BRICK": "Cihlová",
    "PANEL": "Panelová",
    "WOOD": "Dřevostavba",
    "STEEL": "Ocelová",
    "MIXED": "Smíšená",
    "MONTAGE": "Montovaná",
}

# Mapping ownership code → label used in the output
OWNERSHIP_LABELS = {
    "PRIVATE": "Osobní",
    "COOPERATIVE": "Družstevní",
    "STATE": "Státní/obecní",
}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml",
}

BASE_URL = "https://www.realingo.cz"

# The page state is embedded as JSON in the Next.js "__NEXT_DATA__" script tag.
# Group 1 captures the JSON payload; DOTALL lets it span multiple lines.
NEXT_DATA_RE = re.compile(
    r'<script[^>]*\bid="__NEXT_DATA__"[^>]*>(.*?)</script>',
    re.DOTALL,
)


def extract_next_data(html: str) -> dict | None:
    """Return the parsed ``__NEXT_DATA__`` JSON payload embedded in *html*.

    Returns ``None`` when the page contains no ``__NEXT_DATA__`` script tag.
    Raises ``json.JSONDecodeError`` when the tag is present but its content
    is not valid JSON.
    """
    match = NEXT_DATA_RE.search(html)
    if match is None:
        return None
    return json.loads(match.group(1))


def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
    """Fetch a page of Prague listings.

    Args:
        page: 1-based page number; page 1 uses the bare listing URL, later
            pages use the ``{page}_strana/`` suffix.

    Returns:
        ``(items, total_count)``. ``([], 0)`` is returned when the page has
        no ``__NEXT_DATA__`` script.

    Raises:
        urllib.error.URLError, ConnectionError, OSError: the HTTP request
            failed (logged, then re-raised).
        json.JSONDecodeError, KeyError: the embedded payload does not have
            the expected shape.
    """
    if page == 1:
        url = f"{BASE_URL}/prodej_byty/praha/"
    else:
        url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/"

    logger.debug(f"HTTP GET request: {url}")
    logger.debug(f"Headers: {HEADERS}")
    req = urllib.request.Request(url, headers=HEADERS)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            status = resp.status
            html = resp.read().decode("utf-8")
    except (urllib.error.URLError, ConnectionError, OSError) as e:
        logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
        raise
    logger.debug(f"HTTP response: status={status}, size={len(html)} bytes")

    data = extract_next_data(html)
    if data is None:
        logger.debug("No __NEXT_DATA__ script found in HTML")
        return [], 0

    offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
    logger.debug(
        f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}"
    )
    return offer_list["data"], offer_list["total"]


def fetch_detail(listing_url: str) -> dict | None:
    """Fetch detail page for a listing to get floor, building type, etc.

    Args:
        listing_url: site-relative URL of the listing (appended to BASE_URL).

    Returns:
        The first entry of the page's ``offer.details`` mapping, or ``None``
        when the page cannot be fetched or parsed. This is deliberately
        best-effort: any failure is logged as a warning and reported as
        ``None`` so one bad listing does not abort the whole run.
    """
    try:
        url = f"{BASE_URL}{listing_url}"
        logger.debug(f"HTTP GET request: {url}")
        req = urllib.request.Request(url, headers=HEADERS)
        with urllib.request.urlopen(req, timeout=30) as resp:
            status = resp.status
            html = resp.read().decode("utf-8")
        logger.debug(f"HTTP response: status={status}, size={len(html)} bytes")

        data = extract_next_data(html)
        if data is None:
            logger.debug("No __NEXT_DATA__ script found in detail page")
            return None

        details = data["props"]["pageProps"]["store"]["offer"]["details"]
        # Get first (only) detail entry
        for detail_data in details.values():
            logger.debug(f"Detail fetched for {listing_url}")
            return detail_data
        return None
    except Exception as e:
        logger.warning(f"Detail fetch failed for {listing_url}: {e}", exc_info=True)
        return None


def format_price(price: int) -> str:
    """Format *price* with space-separated thousands and a " Kč" suffix.

    Example: ``13500000`` → ``"13 500 000 Kč"``.
    """
    s = str(price)
    parts = []
    # Peel off groups of three digits from the right.
    while s:
        parts.append(s[-3:])
        s = s[:-3]
    return " ".join(reversed(parts)) + " Kč"


def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
    """Load previously scraped data as cache keyed by hash_id.

    Returns an empty dict when the file does not exist, is not valid JSON,
    or does not contain a list. Entries that are not dicts or lack
    ``hash_id`` are skipped.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        return {}
    if not isinstance(data, list):
        return {}
    return {
        e["hash_id"]: e
        for e in data
        if isinstance(e, dict) and "hash_id" in e
    }


def scrape(max_pages: int | None = None, max_properties: int | None = None):
    """Scrape Realingo listings for Prague and return those matching the filters.

    The run has three steps: download the listing pages, pre-filter on
    category / price / area / GPS using listing data only, then fetch each
    remaining listing's detail page to filter on building type, building
    position and floor. Listings found in the cache from the previous run
    with an unchanged price are reused without a detail request.

    Args:
        max_pages: stop after this many listing pages (``None`` = all).
        max_properties: stop after this many newly fetched matching
            properties (``None`` = no limit). Cache hits do not count.

    Returns:
        A list of result dicts (one per matching apartment).
    """
    cache = load_cache()

    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z Realingo.cz")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA} m²")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info(f"Region: Praha")
    if cache:
        logger.info(f"Cache: {len(cache)} bytů z minulého běhu")
    if max_pages:
        logger.info(f"Max. stran: {max_pages}")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # Step 1: Fetch all listing pages
    logger.info("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = []
    page = 1
    total = None
    total_pages = 0  # set from the first page's total count

    while True:
        if max_pages and page > max_pages:
            logger.debug(f"Max pages limit reached: {max_pages}")
            break
        logger.info(f"Strana {page} ...")
        items, total_count = fetch_listing_page(page)
        if total is None:
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            logger.info(f"→ Celkem {total} inzerátů, {total_pages} stran")

        if not items:
            logger.debug(f"No items found on page {page}, stopping")
            break

        all_listings.extend(items)
        page += 1
        if page > total_pages:
            break
        time.sleep(0.5)  # be polite to the server between page requests

    logger.info(f"\nStaženo: {len(all_listings)} inzerátů")

    # Step 2: Pre-filter by category, price, area from listing data
    pre_filtered = []
    excluded_category = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0

    for item in all_listings:
        item_id = item.get("id")
        cat = item.get("category", "")
        if cat not in WANTED_CATEGORIES:
            excluded_category += 1
            logger.debug(f"Filter: id={item_id} - excluded (category {cat})")
            continue

        # A missing or zero price is treated as "price on request" and excluded.
        price = item.get("price", {}).get("total", 0) or 0
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            logger.debug(f"Filter: id={item_id} - excluded (price {price})")
            continue

        # Listings without an area are kept; only a known, too-small area excludes.
        area = item.get("area", {}).get("main")
        if area is not None and area < MIN_AREA:
            excluded_area += 1
            logger.debug(f"Filter: id={item_id} - excluded (area {area} m²)")
            continue

        loc = item.get("location", {})
        if not loc.get("latitude") or not loc.get("longitude"):
            excluded_no_gps += 1
            logger.debug(f"Filter: id={item_id} - excluded (no GPS)")
            continue

        pre_filtered.append(item)

    logger.info(f"\nPo předfiltraci:")
    logger.info(f" Vyloučeno (dispozice): {excluded_category}")
    logger.info(f" Vyloučeno (cena): {excluded_price}")
    logger.info(f" Vyloučeno (plocha): {excluded_area}")
    logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    logger.info(f" Zbývá: {len(pre_filtered)}")

    # Step 3: Fetch details for remaining listings (floor, building type)
    logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0
    properties_fetched = 0

    for i, item in enumerate(pre_filtered):
        if max_properties and properties_fetched >= max_properties:
            logger.debug(f"Max properties limit reached: {max_properties}")
            break

        # Check cache — if hash_id exists and price unchanged, reuse
        item_id = int(item["id"])
        item_price = item.get("price", {}).get("total", 0) or 0
        cached = cache.get(item_id)
        if cached and cached.get("price") == item_price:
            cache_hits += 1
            logger.debug(f"Cache hit for id={item_id}")
            results.append(cached)
            continue

        time.sleep(0.3)  # throttle detail requests
        detail_data = fetch_detail(item["url"])
        if not detail_data:
            excluded_detail += 1
            logger.debug(f"Filter: id={item_id} - excluded (detail fetch failed)")
            continue

        # The detail may sit under "offer.detail" or directly under "detail".
        detail = detail_data.get("offer", {}).get("detail", {})
        if not detail and "detail" in detail_data:
            detail = detail_data["detail"]

        # Check building type — exclude panel
        building_type = detail.get("buildingType", "")
        if building_type == "PANEL":
            excluded_panel += 1
            logger.debug(f"Filter: id={item['id']} - excluded (panel construction)")
            logger.info(f"✗ Vyloučen #{item['id']}: panel")
            continue

        # Check building position — exclude housing estates
        building_position = detail.get("buildingPosition", "")
        if building_position and "ESTATE" in str(building_position).upper():
            excluded_panel += 1
            logger.debug(f"Filter: id={item['id']} - excluded (building estate)")
            logger.info(f"✗ Vyloučen #{item['id']}: sídliště")
            continue

        # Check floor (an unknown floor is kept)
        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            logger.debug(f"Filter: id={item_id} - excluded (floor {floor})")
            continue

        cat = item.get("category", "")
        loc = item.get("location", {})
        ownership = detail.get("ownership", "")

        result = {
            "hash_id": int(item["id"]),
            "name": f"Prodej bytu {CATEGORY_LABELS.get(cat, '?')} {item.get('area', {}).get('main', '?')} m²",
            "price": item.get("price", {}).get("total", 0),
            "price_formatted": format_price(item.get("price", {}).get("total", 0)),
            "locality": loc.get("address", "Praha"),
            "lat": loc["latitude"],
            "lon": loc["longitude"],
            "disposition": CATEGORY_LABELS.get(cat, "?"),
            "floor": floor,
            "area": item.get("area", {}).get("main"),
            "building_type": BUILDING_TYPE_LABELS.get(building_type, building_type or "neuvedeno"),
            "ownership": OWNERSHIP_LABELS.get(ownership, detail.get("ownership") or "neuvedeno"),
            "url": f"{BASE_URL}{item['url']}",
            "source": "realingo",
            "image": "",
            "scraped_at": datetime.now().strftime("%Y-%m-%d"),
        }
        results.append(result)
        properties_fetched += 1

        if (i + 1) % 20 == 0:
            logger.info(f"Zpracováno {i + 1}/{len(pre_filtered)} ...")

    logger.info(f"\n{'=' * 60}")
    logger.info(f"Výsledky Realingo:")
    logger.info(f" Předfiltrováno: {len(pre_filtered)}")
    logger.info(f" Z cache (přeskočeno): {cache_hits}")
    logger.info(f" Vyloučeno (panel/síd): {excluded_panel}")
    logger.info(f" Vyloučeno (patro): {excluded_floor}")
    logger.info(f" Vyloučeno (bez detailu): {excluded_detail}")
    logger.info(f" ✓ Vyhovující byty: {len(results)}")
    logger.info(f"{'=' * 60}")

    return results


def main() -> None:
    """Parse CLI arguments, run the scraper and write byty_realingo.json."""
    parser = argparse.ArgumentParser(description="Scrape apartments from Realingo.cz")
    parser.add_argument("--max-pages", type=int, default=None,
                        help="Maximum number of listing pages to scrape")
    parser.add_argument("--max-properties", type=int, default=None,
                        help="Maximum number of properties to fetch details for")
    parser.add_argument("--log-level", type=str, default="INFO",
                        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging level (default: INFO)")
    args = parser.parse_args()

    # Configure logging
    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()]
    )

    start = time.time()
    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)

    if estates:
        json_path = Path("byty_realingo.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
        logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        # Nothing matched; the previous output file is left untouched.
        logger.info("\nŽádné byty z Realinga neodpovídají kritériím :(")


if __name__ == "__main__":
    main()