#!/usr/bin/env python3
"""Reality iDNES scraper.

Downloads flats for sale in Prague and filters them by the criteria
configured below. Output: byty_idnes.json
"""
from __future__ import annotations

import json
import math
import re
import time
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser
from pathlib import Path

# ── Configuration ────────────────────────────────────────────────────────────
MAX_PRICE = 13_500_000  # maximum price, CZK
MIN_AREA = 69           # minimum usable area, m²
MIN_FLOOR = 2           # minimum floor (NP = above-ground storey)
PER_PAGE = 26           # iDNES returns 26 listings per page

# Disposition codes for the s-qc[subtypeFlat] query parameter.
DISPOSITION_CODES = "3k|31|4k|41|5k|51|6k"

# Maps the disposition parsed from a listing title to an output label.
DISPOSITION_MAP = {
    "3+kk": "3+kk",
    "3+1": "3+1",
    "4+kk": "4+kk",
    "4+1": "4+1",
    "5+kk": "5+kk",
    "5+1": "5+1",
    "6+kk": "6+",
    "6+1": "6+",
    "6 a více": "6+",
}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "cs,en;q=0.9",
    "Accept-Encoding": "identity",
    "Connection": "keep-alive",
}

BASE_URL = "https://reality.idnes.cz"
MAX_RETRIES = 5


def fetch_url(url: str) -> str:
    """Fetch *url* and return the decoded HTML, retrying transient failures.

    Retries up to MAX_RETRIES times with a linearly growing back-off
    (3 s, 6 s, 9 s, ...). Re-raises the last error when all attempts fail.
    """
    for attempt in range(MAX_RETRIES):
        try:
            req = urllib.request.Request(url, headers=HEADERS)
            # Context manager guarantees the response/socket is closed
            # even if read() or decode() raises.
            with urllib.request.urlopen(req, timeout=30) as resp:
                return resp.read().decode("utf-8")
        except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e:
            if attempt < MAX_RETRIES - 1:
                wait = (attempt + 1) * 3  # 3, 6, 9, 12 s
                print(f" Retry {attempt + 1}/{MAX_RETRIES} (wait {wait}s): {e}")
                time.sleep(wait)
            else:
                raise
    raise RuntimeError("unreachable: retry loop exits only via return/raise")


def build_list_url(page: int = 0) -> str:
    """Build a listing-page URL with all configured filters applied."""
    base = f"{BASE_URL}/s/prodej/byty/cena-do-{MAX_PRICE}/praha/"
    params = {
        "s-qc[subtypeFlat]": DISPOSITION_CODES,
        "s-qc[usableAreaMin]": str(MIN_AREA),
    }
    url = f"{base}?{urllib.parse.urlencode(params)}"
    # Page 0 is the bare URL; subsequent pages add an explicit parameter.
    if page > 0:
        url += f"&page={page}"
    return url


def parse_total_count(html: str) -> int:
    """Extract the total listing count (e.g. "720 inzerátů") from a page."""
    match = re.search(r'(\d[\d\s]*)\s*inzerát', html)
    if match:
        # Digit groups may be separated by spaces or non-breaking spaces.
        return int(match.group(1).replace(" ", "").replace("\xa0", ""))
    return 0


def parse_listings(html: str) -> list[dict]:
    """Parse listing cards from a result page using regexes.

    Returns a list of dicts with keys: id, url, disposition, area, price,
    locality. Price is 0 when the listing says "na vyžádání" (on request).
    Ad cards (marked "advertisment" in the surrounding markup) are skipped.
    """
    results = []

    # Each card is wrapped in an <a class="c-products__link" href=".../detail/...">
    # anchor. NOTE(review): tag anchors reconstructed from context — the site
    # markup should be re-verified against a live page.
    link_pattern = re.compile(
        r'<a[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*class="c-products__link"[^>]*>',
        re.DOTALL,
    )
    # Also match when the class attribute comes before href.
    link_pattern2 = re.compile(
        r'<a[^>]*class="c-products__link"[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*>',
        re.DOTALL,
    )

    all_links = link_pattern.findall(html) + link_pattern2.findall(html)
    seen_urls = set()

    for link_url in all_links:
        if link_url in seen_urls:
            continue
        seen_urls.add(link_url)

        # Card body = everything between this href and the closing </a>.
        escaped_url = re.escape(link_url)
        context_match = re.search(escaped_url + r'(.*?)</a>', html, re.DOTALL)
        if not context_match:
            continue
        block = context_match.group(1)

        # Ensure an absolute URL.
        url = link_url
        if not url.startswith("http"):
            url = BASE_URL + url

        # Skip ads: the ad marker sits shortly before the link in the markup.
        ad_check_start = max(0, context_match.start() - 500)
        ad_block = html[ad_check_start:context_match.start()]
        if "advertisment" in ad_block or "advertisement" in ad_block:
            continue

        # Title, e.g. <h2 class="c-products__title">Prodej bytu 3+kk 79 m²</h2>
        title_match = re.search(
            r'class="c-products__title"[^>]*>(.*?)</h2>', block, re.DOTALL)
        title = re.sub(r'<[^>]+>', '', title_match.group(1)).strip().lower() if title_match else ""

        # Price, e.g. <p class="c-products__price"><strong>12 950 000 Kč</strong></p>
        price_match = re.search(
            r'c-products__price[^>]*>.*?<strong>(.*?)</strong>', block, re.DOTALL)
        price_text = re.sub(r'<[^>]+>', '', price_match.group(1)).strip() if price_match else ""

        # Address, e.g. <p class="c-products__info">Klečkova, Praha 5 - Stodůlky</p>
        info_match = re.search(
            r'class="c-products__info"[^>]*>(.*?)</p>', block, re.DOTALL)
        info = re.sub(r'<[^>]+>', '', info_match.group(1)).strip() if info_match else ""

        # Disposition ("3+kk", "4+1", ...) and area come from the title.
        disp_match = re.search(r'(\d\+(?:kk|\d))', title)
        area_match = re.search(r'(\d+)\s*m[²2]', title)
        disposition = disp_match.group(1) if disp_match else None
        area = int(area_match.group(1)) if area_match else None
        if not disposition and ("6 a" in title or "6+" in title):
            disposition = "6+"

        # Price stays 0 for "na vyžádání" (price on request).
        price = 0
        if price_text and "vyžádání" not in price_text.lower():
            price_clean = re.sub(r'[^\d]', '', price_text)
            if price_clean:
                price = int(price_clean)

        # Listing ID = the 24-char hex hash embedded in the detail URL.
        id_match = re.search(r'/([a-f0-9]{24})/?', url)
        listing_id = id_match.group(1) if id_match else url

        results.append({
            "id": listing_id,
            "url": url,
            "disposition": DISPOSITION_MAP.get(disposition, disposition or "?"),
            "area": area,
            "price": price,
            "locality": info,
        })

    return results


def parse_detail(html: str) -> dict:
    """Parse a detail page for GPS, floor, construction and ownership.

    Returns a dict with any of the keys: lat, lon, floor, construction,
    ownership — only those that could actually be extracted.
    """
    detail: dict = {}

    # 1. GPS comes from a dataLayer.push({... "listing_lat": ...}) JS object.
    dl_match = re.search(
        r'dataLayer\.push\(\s*(\{[^}]+?"listing_lat"[^}]+?\})\s*\)',
        html, re.DOTALL
    )
    if dl_match:
        js_obj = dl_match.group(1)
        # The object is JS, not guaranteed-valid JSON, so pick out just the
        # coordinate fields instead of attempting a full parse.
        try:
            lat_match = re.search(r'"listing_lat"\s*:\s*([\d.]+)', js_obj)
            lon_match = re.search(r'"listing_lon"\s*:\s*([\d.]+)', js_obj)
            if lat_match:
                detail["lat"] = float(lat_match.group(1))
            if lon_match:
                detail["lon"] = float(lon_match.group(1))
        except (ValueError, AttributeError):
            pass

    # 2. Attributes live in <dt>Label</dt><dd>Value</dd> pairs.
    dt_dd_pairs = re.findall(
        r'<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>', html, re.DOTALL
    )
    for dt, dd in dt_dd_pairs:
        dt_clean = re.sub(r'<[^>]+>', '', dt).strip().lower()
        dd_clean = re.sub(r'<[^>]+>', '', dd).strip()

        if "podlaží" in dt_clean or "podlazi" in dt_clean or "patro" in dt_clean:
            # Value looks like "2. patro (3. NP)" or "3. podlaží z celkem 5".
            # Prefer the explicit NP (above-ground storey) number.
            np_match = re.search(r'(\d+)\.\s*NP', dd_clean)
            if np_match:
                detail["floor"] = int(np_match.group(1))
            else:
                # Fall back to the first number; iDNES appears to use NP
                # directly here — TODO confirm against live pages.
                patro_match = re.search(r'(\d+)', dd_clean)
                if patro_match:
                    detail["floor"] = int(patro_match.group(1))
        if "konstrukce" in dt_clean or "stavba" in dt_clean:
            detail["construction"] = dd_clean.lower()
        if "vlastnictví" in dt_clean or "vlastnictvi" in dt_clean:
            detail["ownership"] = dd_clean

    return detail


def format_price(price: int) -> str:
    """Format an integer price as a Czech string, e.g. 13500000 → "13 500 000 Kč"."""
    s = str(price)
    parts = []
    # Slice off three-digit groups from the right.
    while s:
        parts.append(s[-3:])
        s = s[:-3]
    return " ".join(reversed(parts)) + " Kč"


def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
    """Load previously scraped data as a cache keyed by hash_id.

    Returns an empty dict when the file is missing or unreadable.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        return {str(e["hash_id"]): e for e in data if "hash_id" in e}
    except (json.JSONDecodeError, KeyError, TypeError):
        # TypeError covers valid JSON that is not a list of dicts.
        return {}


def scrape():
    """Run the full scrape: listing pages → pre-filter → detail pages.

    Returns a list of result dicts for flats matching all criteria.
    """
    cache = load_cache()

    print("=" * 60)
    print("Stahuji inzeráty z Reality iDNES")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print("Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)

    # Phase 1: collect listing cards from all result pages.
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = {}  # id -> listing dict (deduplicates across pages)
    page = 0
    total = None
    while True:
        url = build_list_url(page)
        print(f" Strana {page + 1} ...")
        html = fetch_url(url)

        if total is None:
            total = parse_total_count(html)
            total_pages = math.ceil(total / PER_PAGE) if total > 0 else 1
            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")

        listings = parse_listings(html)
        if not listings:
            break
        for item in listings:
            lid = item["id"]
            if lid not in all_listings:
                all_listings[lid] = item

        page += 1
        # Stop after the computed page count; otherwise iDNES keeps
        # serving the last page forever.
        if total and page >= math.ceil(total / PER_PAGE):
            break
        time.sleep(1.0)

    print(f"\n Staženo: {len(all_listings)} unikátních inzerátů")

    # Phase 1b: pre-filter by price/area/disposition known from the list view.
    pre_filtered = []
    excluded_price = 0
    excluded_area = 0
    excluded_disp = 0
    for item in all_listings.values():
        if item["price"] <= 0 or item["price"] > MAX_PRICE:
            excluded_price += 1
            continue
        if item["area"] is not None and item["area"] < MIN_AREA:
            excluded_area += 1
            continue
        if item["disposition"] == "?":
            excluded_disp += 1
            continue
        pre_filtered.append(item)

    print("\nPo předfiltraci:")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Zbývá: {len(pre_filtered)}")

    # Phase 2: fetch each detail page for GPS, floor, construction.
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_no_gps = 0
    excluded_detail = 0
    cache_hits = 0

    for i, item in enumerate(pre_filtered):
        # Cache hit: same hash_id and unchanged price → reuse previous record.
        cached = cache.get(str(item["id"]))
        if cached and cached.get("price") == item["price"]:
            cache_hits += 1
            results.append(cached)
            continue

        url = item["url"]
        time.sleep(0.4)
        try:
            html = fetch_url(url)
        except Exception as e:
            print(f" Warning: detail failed for {item['id']}: {e}")
            excluded_detail += 1
            continue

        detail = parse_detail(html)

        # GPS is mandatory for the output.
        if not detail.get("lat") or not detail.get("lon"):
            excluded_no_gps += 1
            continue

        # Exclude panel (prefab) buildings.
        construction = detail.get("construction", "")
        if "panel" in construction:
            excluded_panel += 1
            print(f" ✗ Vyloučen {item['id'][:12]}...: panel ({construction})")
            continue
        # Also exclude housing-estate ("sídliště") mentions.
        if "sídliště" in construction or "sidliste" in construction:
            excluded_panel += 1
            print(f" ✗ Vyloučen {item['id'][:12]}...: sídliště")
            continue

        # Exclude low floors; unknown floor passes through.
        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue

        # Map raw construction text to a Czech display label.
        building_type = "neuvedeno"
        if construction:
            if "cihlo" in construction or "cihla" in construction:
                building_type = "Cihlová"
            elif "smíšen" in construction or "smisen" in construction:
                building_type = "Smíšená"
            elif "skelet" in construction:
                building_type = "Skeletová"
            elif "dřevo" in construction or "drevo" in construction:
                building_type = "Dřevostavba"
            elif "mont" in construction:
                building_type = "Montovaná"
            else:
                building_type = construction.capitalize()

        result = {
            "hash_id": item["id"],
            "name": f"Prodej bytu {item['disposition']} {item.get('area', '?')} m²",
            "price": item["price"],
            "price_formatted": format_price(item["price"]),
            "locality": item["locality"],
            "lat": detail["lat"],
            "lon": detail["lon"],
            "disposition": item["disposition"],
            "floor": floor,
            "area": item["area"],
            "building_type": building_type,
            "ownership": detail.get("ownership", "neuvedeno"),
            "url": item["url"],
            "source": "idnes",
            "image": "",
        }
        results.append(result)

        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")

    print(f"\n{'=' * 60}")
    print("Výsledky Reality iDNES:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results


if __name__ == "__main__":
    start = time.time()
    estates = scrape()
    if estates:
        json_path = Path("byty_idnes.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z Reality iDNES neodpovídají kritériím :(")