#!/usr/bin/env python3
"""
Bezrealitky.cz scraper.

Downloads flats for sale in Prague and filters them by the criteria in the
configuration section below.
Output: byty_bezrealitky.json
"""
import json
import math
import re
import time
import urllib.request
from pathlib import Path
from typing import Optional

# ── Configuration ───────────────────────────────────────────────────────────

MAX_PRICE = 13_500_000
MIN_AREA = 69
MIN_FLOOR = 2
PER_PAGE = 15  # Bezrealitky returns 15 adverts per page

# Dispositions we are interested in
WANTED_DISPOSITIONS = {
    "DISP_3_KK", "DISP_3_1",
    "DISP_4_KK", "DISP_4_1",
    "DISP_5_KK", "DISP_5_1",
    "DISP_6",
    "DISP_OTHER",  # atypical layouts
}

DISPOSITION_LABELS = {
    "DISP_1_KK": "1+kk", "DISP_1_1": "1+1",
    "DISP_2_KK": "2+kk", "DISP_2_1": "2+1",
    "DISP_3_KK": "3+kk", "DISP_3_1": "3+1",
    "DISP_4_KK": "4+kk", "DISP_4_1": "4+1",
    "DISP_5_KK": "5+kk", "DISP_5_1": "5+1",
    "DISP_6": "6+",
    "DISP_OTHER": "Atypický",
}

CONSTRUCTION_MAP = {
    "BRICK": "Cihlová",
    "PANEL": "Panelová",
    "WOOD": "Dřevostavba",
    "MIXED": "Smíšená",
    "MONTAGE": "Montovaná",
    "STEEL": "Ocelová",
}

OWNERSHIP_MAP = {
    "OSOBNI": "Osobní",
    "DRUZSTEVNI": "Družstevní",
    "STATNI": "Státní/obecní",
}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}

BASE_URL = "https://www.bezrealitky.cz"

# NOTE(review): the pattern text was missing from the reviewed source (it was
# an empty string, which makes ``match.group(1)`` raise IndexError).  The code
# reads ``props.pageProps.apolloCache`` from the captured JSON, which is the
# shape of the Next.js page payload, so the payload <script> tag is matched by
# its ``__NEXT_DATA__`` id — confirm against the live markup.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]*\bid="__NEXT_DATA__"[^>]*>(.*?)</script>',
    re.DOTALL,
)

# Keys that only appear on the fully populated advert of a detail page.
_DETAIL_KEYS = ("construction", "etage", "ownership")


def _fetch_html(url: str) -> str:
    """Download *url* with the scraper headers and return the decoded body."""
    req = urllib.request.Request(url, headers=HEADERS)
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return resp.read().decode("utf-8")


def extract_apollo_cache(html: str) -> Optional[dict]:
    """Return the Apollo cache dict embedded in *html*, or None if absent.

    Raises:
        json.JSONDecodeError: the payload is present but is not valid JSON.
        KeyError: the payload lacks ``props.pageProps.apolloCache``.
    """
    match = _NEXT_DATA_RE.search(html)
    if not match:
        return None
    data = json.loads(match.group(1))
    return data["props"]["pageProps"]["apolloCache"]


def parse_listing_page(html: str) -> tuple[list[dict], int]:
    """Parse a listing page into ``(advert dicts, total advert count)``.

    Returns ``([], 0)`` when the page carries no embedded payload.
    """
    cache = extract_apollo_cache(html)
    if cache is None:
        return [], 0

    adverts = [
        val
        for key, val in cache.items()
        if key.startswith("Advert:")
        and isinstance(val, dict)
        and val.get("__typename") == "Advert"
    ]

    # The total is stored under a parameterised "listAdverts(...)" key of
    # ROOT_QUERY; take the largest one found.
    total = 0
    for key, val in cache.get("ROOT_QUERY", {}).items():
        if "listAdverts" in key and isinstance(val, dict):
            count = val.get("totalCount")
            if count and count > total:
                total = count

    return adverts, total


def parse_detail_page(html: str) -> Optional[dict]:
    """Return the fully populated advert dict of a detail page, or None."""
    cache = extract_apollo_cache(html)
    if cache is None:
        return None
    for key, val in cache.items():
        if key.startswith("Advert:") and isinstance(val, dict):
            # Only the detailed advert carries these fields.
            if any(field in val for field in _DETAIL_KEYS):
                return val
    return None


def fetch_page(page: int) -> tuple[list[dict], int]:
    """
    Fetch a listing page from Bezrealitky.

    Returns (list of advert dicts from Apollo cache, total count).
    Network and parsing errors propagate to the caller.
    """
    url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
    return parse_listing_page(_fetch_html(url))


def fetch_detail(uri: str) -> Optional[dict]:
    """Fetch the detail page for a listing.

    Best effort: any failure is reported as a warning and yields None so one
    broken advert does not abort the whole run.
    """
    try:
        url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
        return parse_detail_page(_fetch_html(url))
    except Exception as e:
        print(f" Warning: detail failed for {uri}: {e}")
        return None


def format_price(price: int) -> str:
    """Format *price* with space-separated thousands and a ``Kč`` suffix."""
    # int() keeps whole-number floats and negative values grouped correctly.
    return f"{int(price):,}".replace(",", " ") + " Kč"


def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
    """Load previously scraped data as cache keyed by hash_id.

    Returns an empty dict when the file is missing, unreadable as JSON, or
    not a list; entries that are not dicts with a ``hash_id`` are ignored.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError):
        return {}
    if not isinstance(data, list):
        return {}
    return {
        entry["hash_id"]: entry
        for entry in data
        if isinstance(entry, dict) and "hash_id" in entry
    }


def _collect_adverts() -> dict:
    """Download all listing pages; return adverts de-duplicated by id."""
    all_adverts = {}  # id -> advert dict (dedup)
    page = 1
    total = None

    while True:
        print(f" Strana {page} ...")
        adverts, total_count = fetch_page(page)

        if total is None and total_count > 0:
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")

        if not adverts:
            break

        known_before = len(all_adverts)
        for adv in adverts:
            adv_id = adv.get("id")
            if adv_id and adv_id not in all_adverts:
                all_adverts[adv_id] = adv

        page += 1
        if total:
            if page > math.ceil(total / PER_PAGE):
                break
        elif len(all_adverts) == known_before:
            # Total unknown and the page brought nothing new: stop instead of
            # requesting pages forever.
            break
        time.sleep(0.5)

    return all_adverts


def _prefilter(adverts) -> list:
    """Filter adverts by disposition, price, area and GPS using list data."""
    pre_filtered = []
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0

    for adv in adverts:
        if adv.get("disposition", "") not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue

        price = adv.get("price", 0) or 0
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            continue

        surface = adv.get("surface")
        if surface is not None and surface < MIN_AREA:
            excluded_area += 1
            continue

        gps = adv.get("gps", {})
        if not gps or not gps.get("lat") or not gps.get("lng"):
            excluded_no_gps += 1
            continue

        pre_filtered.append(adv)

    print(f"\nPo předfiltraci:")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Zbývá: {len(pre_filtered)}")
    return pre_filtered


def _pick_address(detail: dict, adv: dict) -> str:
    """Choose the address; the detail key includes a locale parameter."""
    address = next(
        (detail[k] for k in detail
         if k.startswith("address(") and "withHouseNumber" not in k),
        "",
    )
    if not address:
        address = next((detail[k] for k in detail if k.startswith("address(")), "")
    if not address:
        address = adv.get('address({"locale":"CS"})', "Praha")
    return address


def scrape():
    """Run the whole scrape and return the list of matching flats.

    Steps: download listing pages, pre-filter on list data, then fetch each
    remaining detail page (reusing cached entries whose price is unchanged)
    and apply the construction, housing-estate and floor filters.
    """
    cache = load_cache()

    print("=" * 60)
    print("Stahuji inzeráty z Bezrealitky.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)

    # Step 1: fetch all listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_adverts = _collect_adverts()
    print(f"\n Staženo: {len(all_adverts)} unikátních inzerátů")

    # Step 2: pre-filter by disposition, price, area from list data
    pre_filtered = _prefilter(all_adverts.values())

    # Step 3: fetch details
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0

    for i, adv in enumerate(pre_filtered):
        uri = adv.get("uri", "")
        if not uri:
            excluded_detail += 1
            continue

        # Cache check: same hash_id and unchanged price means reuse.
        adv_id = int(adv["id"])
        adv_price = adv.get("price", 0) or 0
        cached = cache.get(adv_id)
        if cached and cached.get("price") == adv_price:
            cache_hits += 1
            results.append(cached)
            continue

        time.sleep(0.4)
        detail = fetch_detail(uri)
        if not detail:
            excluded_detail += 1
            continue

        # Exclude panel buildings
        construction = detail.get("construction", "")
        if construction == "PANEL":
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: panel")
            continue

        # Exclude housing estates (counted together with panel buildings)
        situation = detail.get("situation", "")
        if situation and "HOUSING_ESTATE" in str(situation).upper():
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: sídliště")
            continue

        # Floor (etage)
        etage = detail.get("etage")
        if etage is not None and etage < MIN_FLOOR:
            excluded_floor += 1
            continue

        gps = adv.get("gps", {})
        disp = adv.get("disposition", "")
        surface = adv.get("surface")
        # Show "?" rather than "None" when the list data has no area.
        surface_label = "?" if surface is None else surface
        ownership = detail.get("ownership")

        results.append({
            "hash_id": adv_id,
            "name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {surface_label} m²",
            "price": adv.get("price", 0),
            "price_formatted": format_price(adv_price),
            "locality": _pick_address(detail, adv),
            "lat": gps["lat"],
            "lon": gps["lng"],
            "disposition": DISPOSITION_LABELS.get(disp, "?"),
            "floor": etage,
            "area": surface,
            "building_type": CONSTRUCTION_MAP.get(construction, construction or "neuvedeno"),
            "ownership": OWNERSHIP_MAP.get(ownership or "", ownership or "neuvedeno"),
            "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
            "source": "bezrealitky",
            "image": "",
        })

        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")

    print(f"\n{'=' * 60}")
    print(f"Výsledky Bezrealitky:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results


if __name__ == "__main__":
    start = time.time()
    estates = scrape()

    if estates:
        json_path = Path("byty_bezrealitky.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z Bezrealitek neodpovídají kritériím :(")
#!/usr/bin/env python3
"""
CityHome (city-home.cz) scraper.

Downloads flats for sale in Prague from CityHome/SATPO projects.
Output: byty_cityhome.json
"""
import json
import re
import time
import urllib.error
import urllib.request
from pathlib import Path

# ── Configuration ───────────────────────────────────────────────────────────

MAX_PRICE = 14_000_000
MIN_AREA = 69
MIN_FLOOR = 2

WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}

BASE_URL = "https://www.city-home.cz"

# NOTE(review): the HTML tag names inside these patterns were missing from the
# reviewed source.  <tr>, <a> and <td> are restored from the surrounding
# comments ("tr tag", "first cell", "cells") — confirm against live markup.
_ROW_RE = re.compile(r'<tr([^>]*data-cena="[^"]*"[^>]*)>(.*?)</tr>', re.DOTALL)
_LINK_RE = re.compile(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', re.DOTALL)
_CELL_RE = re.compile(r'<td[^>]*>(.*?)</td>', re.DOTALL)
_TAG_RE = re.compile(r'<[^>]+>')
_SLUG_RE = re.compile(r'/(?:projekty|bytove-domy)/([^/]+)/')

# NOTE(review): the element wrapping the project name was lost from the
# reviewed source, so any simple element whose content has no nested tag is
# accepted.  Only the coordinates are used by scrape(); verify the name
# capture against a real locality page.
_GPS_RE = re.compile(
    r"\['[^']*?<(?P<tag>[a-zA-Z][a-zA-Z0-9]*)[^>]*>(?P<name>[^<]+)</(?P=tag)>[^']*',"
    r"\s*'(?P<lat>[\d.]+)',\s*'(?P<lon>[\d.]+)'"
)

_MAX_ATTEMPTS = 3


def fetch_url(url: str) -> str:
    """Fetch URL and return the HTML string, retrying transient failures.

    Raises:
        urllib.error.URLError / ConnectionError: after the last attempt fails.
    """
    for attempt in range(_MAX_ATTEMPTS):
        try:
            req = urllib.request.Request(url, headers=HEADERS)
            # The context manager closes the response on every path.
            with urllib.request.urlopen(req, timeout=30) as resp:
                return resp.read().decode("utf-8")
        except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
            if attempt == _MAX_ATTEMPTS - 1:
                raise
            print(f" Retry {attempt + 1}: {e}")
            time.sleep((attempt + 1) * 2)


def format_price(price: int) -> str:
    """Format *price* with space-separated thousands and a ``Kč`` suffix."""
    return f"{int(price):,}".replace(",", " ") + " Kč"


def _data_attr(attrs: str, name: str, value_pattern: str = r'[^"]*'):
    """Return the value of ``data-<name>`` in *attrs*, or None if absent."""
    found = re.search(rf'data-{name}="({value_pattern})"', attrs)
    return found.group(1) if found else None


def _strip_tags(fragment: str) -> str:
    """Remove markup from *fragment* and trim surrounding whitespace."""
    return _TAG_RE.sub('', fragment).strip()


def _parse_floor(cell_texts):
    """Return the floor from cell texts like "3.NP" (or "2.PP" → -2), or None."""
    floor = None
    for text in cell_texts:
        above = re.search(r'(\d+)\.\s*NP', text)
        below = re.search(r'(\d+)\.\s*PP', text)
        if above:
            floor = int(above.group(1))
        elif below:
            floor = -int(below.group(1))  # underground floor
    return floor


def _parse_project_name(cell_texts, unit_name: str) -> str:
    """Return the first cell that is not a number, price, area, floor or unit."""
    markers = ("NP", "PP", "m²", "Kč", "EUR", "CZK")
    for text in cell_texts:
        if not text or re.match(r'^[\d\s.,]+$', text):
            continue
        if any(marker in text for marker in markers):
            continue
        if len(text) > 3 and text != unit_name:
            return text
    return ""


def parse_filter_page(html: str) -> list[dict]:
    """Parse all listing rows from the filter page.

    Each table row carrying a ``data-cena`` attribute becomes one dict with
    the keys price, area, unittype, free, project_id, transaction,
    disposition, location, url, unit_name, floor and project_name.  Rows
    whose price is not a plain integer are skipped.
    """
    listings = []

    for row in _ROW_RE.finditer(html):
        attrs, content = row.group(1), row.group(2)

        cena = _data_attr(attrs, "cena", r'\d+')
        if cena is None:
            continue

        # Detail URL and unit name come from the first link in the row.
        link = _LINK_RE.search(content)
        detail_url = link.group(1).strip() if link else ""
        unit_name = _strip_tags(link.group(2)) if link else ""
        if detail_url and not detail_url.startswith("http"):
            detail_url = BASE_URL + detail_url

        cell_texts = [_strip_tags(cell) for cell in _CELL_RE.findall(content)]

        plocha = _data_attr(attrs, "plocha", r'[\d.]+')
        try:
            area = float(plocha) if plocha else 0
        except ValueError:
            # e.g. a lone "." satisfies the attribute pattern but is no number
            area = 0
        unittype = _data_attr(attrs, "unittype", r'\d+')

        listings.append({
            "price": int(cena),
            "area": area,
            "unittype": int(unittype) if unittype else 0,
            "free": _data_attr(attrs, "free", r'yes|no') or "no",
            "project_id": _data_attr(attrs, "project", r'\d+') or "",
            "transaction": _data_attr(attrs, "transaction") or "",
            "disposition": _data_attr(attrs, "dispozition") or "",
            "location": _data_attr(attrs, "location") or "",
            "url": detail_url,
            "unit_name": unit_name,
            "floor": _parse_floor(cell_texts),
            "project_name": _parse_project_name(cell_texts, unit_name),
        })

    return listings


def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
    """Extract GPS coordinates for projects from locality pages.

    Looks for JS array entries of the form
    ``['<markup with name>', 'LAT', 'LON', ...]`` and returns
    ``{name: (lat, lon)}`` in document order.
    """
    gps_data = {}
    for match in _GPS_RE.finditer(html):
        try:
            coords = (float(match.group("lat")), float(match.group("lon")))
        except ValueError:
            continue  # e.g. "1.2.3" passes the character class but is no float
        gps_data[match.group("name").strip()] = coords
    return gps_data


def _project_slug(url: str) -> str:
    """Return the project slug from a detail URL, or "" if there is none."""
    # e.g. /projekty/zateckych-14/nabidka-nemovitosti/byt-a31
    found = _SLUG_RE.search(url)
    return found.group(1) if found else ""


def scrape():
    """Run the whole scrape and return the list of matching flats."""
    print("=" * 60)
    print("Stahuji inzeráty z CityHome (city-home.cz)")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print("=" * 60)

    # Step 1: fetch the main filter page
    print("\nFáze 1: Stahování seznamu bytů...")
    html = fetch_url(f"{BASE_URL}/filtr-nemovitosti1")
    all_listings = parse_filter_page(html)
    print(f" Nalezeno: {len(all_listings)} jednotek")

    # Step 2: collect unique project slugs from detail URLs to fetch GPS
    print("\nFáze 2: Stahování GPS souřadnic projektů...")
    project_slugs = {
        slug
        for slug in (_project_slug(item.get("url", "")) for item in all_listings)
        if slug
    }

    project_gps = {}
    for slug in sorted(project_slugs):
        time.sleep(0.5)
        try:
            # NOTE(review): slugs found under /bytove-domy/ are also requested
            # under /projekty/ — presumably both resolve; verify.
            loc_html = fetch_url(f"{BASE_URL}/projekty/{slug}/lokalita")
            gps = extract_project_gps(loc_html)
            if gps:
                # The first entry is taken to be the project itself.
                lat, lon = next(iter(gps.values()))
                project_gps[slug] = (lat, lon)
                print(f" ✓ {slug}: {lat}, {lon}")
            else:
                print(f" ✗ {slug}: GPS nenalezeno")
        except Exception as e:
            # Best effort: a failing project only loses its own listings.
            print(f" ✗ {slug}: chyba ({e})")

    # Step 3: filter listings
    print(f"\nFáze 3: Filtrování...")
    results = []
    excluded_sold = 0
    excluded_type = 0
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_floor = 0
    excluded_no_gps = 0

    for listing in all_listings:
        # Only available units
        if listing["free"] != "yes":
            excluded_sold += 1
            continue

        # Only apartments (unittype=2)
        if listing["unittype"] != 2:
            excluded_type += 1
            continue

        # Only sales
        if listing["transaction"] != "prodej":
            excluded_type += 1
            continue

        disp = listing["disposition"]
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue

        price = listing["price"]
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            continue

        area = listing["area"]
        if area < MIN_AREA:
            excluded_area += 1
            continue

        floor = listing["floor"]
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue

        # GPS is shared by all units of a project
        url = listing.get("url", "")
        slug = _project_slug(url)
        gps = project_gps.get(slug)
        if not gps:
            excluded_no_gps += 1
            continue
        lat, lon = gps

        results.append({
            "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
            "name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
            "price": price,
            "price_formatted": format_price(price),
            "locality": f"{listing['project_name']}, Praha",
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": "Cihlová",  # CityHome renovates brick houses
            "ownership": "neuvedeno",
            "url": url,
            "source": "cityhome",
            "image": "",
        })

    print(f"\n{'=' * 60}")
    print(f"Výsledky CityHome:")
    print(f" Celkem jednotek: {len(all_listings)}")
    print(f" Vyloučeno (prodáno): {excluded_sold}")
    print(f" Vyloučeno (typ): {excluded_type}")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results


if __name__ == "__main__":
    start = time.time()
    estates = scrape()

    if estates:
        json_path = Path("byty_cityhome.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z CityHome neodpovídají kritériím :(")
#!/usr/bin/env python3
"""
Reality iDNES scraper.

Downloads flats for sale in Prague and filters them by the criteria in the
configuration section below.
Output: byty_idnes.json
"""
import json
import math
import re
import time
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser
from pathlib import Path

# ── Configuration ───────────────────────────────────────────────────────────

MAX_PRICE = 13_500_000
MIN_AREA = 69
MIN_FLOOR = 2
PER_PAGE = 26  # iDNES returns 26 listings per page

# Dispositions — codes for the s-qc[subtypeFlat] query parameter
DISPOSITION_CODES = "3k|31|4k|41|5k|51|6k"

# Maps the disposition found in the title to its output label
DISPOSITION_MAP = {
    "3+kk": "3+kk", "3+1": "3+1",
    "4+kk": "4+kk", "4+1": "4+1",
    "5+kk": "5+kk", "5+1": "5+1",
    "6+kk": "6+", "6+1": "6+",
    "6 a více": "6+",
}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "cs,en;q=0.9",
    "Accept-Encoding": "identity",
    "Connection": "keep-alive",
}

BASE_URL = "https://reality.idnes.cz"

MAX_RETRIES = 5

# NOTE(review): the HTML tag names inside the patterns below were missing from
# the reviewed source.  They are restored from the surrounding comments
# (title → h2, price → strong, info → p, DT/DD pairs) — confirm against the
# live markup.
_LINK_RES = (
    # href before class
    re.compile(
        r'<a[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*class="c-products__link"[^>]*>',
        re.DOTALL,
    ),
    # class before href
    re.compile(
        r'<a[^>]*class="c-products__link"[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*>',
        re.DOTALL,
    ),
)
_TITLE_RE = re.compile(r'class="c-products__title"[^>]*>(.*?)</h[1-6]>', re.DOTALL)
_PRICE_RE = re.compile(
    r'c-products__price[^>]*>.*?<strong[^>]*>(.*?)</strong>', re.DOTALL
)
_INFO_RE = re.compile(r'class="c-products__info"[^>]*>(.*?)</p>', re.DOTALL)
_DT_DD_RE = re.compile(r'<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>', re.DOTALL)
_TAG_RE = re.compile(r'<[^>]+>')
_DATALAYER_RE = re.compile(
    r'dataLayer\.push\(\s*(\{[^}]*?"listing_lat"[^}]+?\})\s*\)', re.DOTALL
)
# A count is either digit groups split by a single (non-breaking/narrow)
# space, or a plain run of digits.
_TOTAL_RE = re.compile(r'(\d{1,3}(?:[ \u00a0\u202f]\d{3})+|\d+)\s*inzerát')

# (substrings to look for in the lower-cased construction text, label)
_BUILDING_TYPES = (
    (("cihlo", "cihla"), "Cihlová"),
    (("smíšen", "smisen"), "Smíšená"),
    (("skelet",), "Skeletová"),
    (("dřevo", "drevo"), "Dřevostavba"),
    (("mont",), "Montovaná"),
)


def fetch_url(url: str) -> str:
    """Fetch URL and return HTML string with retry logic.

    Waits 3, 6, 9 and 12 seconds between the attempts.

    Raises:
        OSError: (including URLError/ConnectionError) after the last attempt.
    """
    for attempt in range(MAX_RETRIES):
        try:
            req = urllib.request.Request(url, headers=HEADERS)
            # The context manager closes the response on every path.
            with urllib.request.urlopen(req, timeout=30) as resp:
                return resp.read().decode("utf-8")
        except (urllib.error.URLError, OSError) as e:
            if attempt == MAX_RETRIES - 1:
                raise
            wait = (attempt + 1) * 3
            print(f" Retry {attempt + 1}/{MAX_RETRIES} (wait {wait}s): {e}")
            time.sleep(wait)


def build_list_url(page: int = 0) -> str:
    """Build listing URL with all filters; *page* is zero-based."""
    base = f"{BASE_URL}/s/prodej/byty/cena-do-{MAX_PRICE}/praha/"
    params = {
        "s-qc[subtypeFlat]": DISPOSITION_CODES,
        "s-qc[usableAreaMin]": str(MIN_AREA),
    }
    url = f"{base}?{urllib.parse.urlencode(params)}"
    if page > 0:
        url += f"&page={page}"
    return url


def parse_total_count(html: str) -> int:
    """Extract total listing count from page ("720 inzerátů"); 0 if absent."""
    match = _TOTAL_RE.search(html)
    if not match:
        return 0
    # Drop the thousands separators, whatever kind of space they are.
    return int(re.sub(r'\D', '', match.group(1)))


def _strip_tags(fragment: str) -> str:
    """Remove markup from *fragment* and trim surrounding whitespace."""
    return _TAG_RE.sub('', fragment).strip()


def _first_text(pattern, block: str) -> str:
    """Return the tag-stripped first group of *pattern* in *block*, or ""."""
    found = pattern.search(block)
    return _strip_tags(found.group(1)) if found else ""


def parse_listings(html: str) -> list[dict]:
    """Parse listing cards from HTML using regex.

    Returns one dict per distinct detail link with the keys id, url,
    disposition, area, price and locality.  Cards preceded (within 500
    characters) by an advertisement marker are skipped.
    """
    # Every product link as (url position, anchor position, url), in
    # document order, keeping only the first anchor of each URL.
    hits = sorted(
        (m.start(1), m.start(), m.group(1))
        for pattern in _LINK_RES
        for m in pattern.finditer(html)
    )
    cards = []
    seen_urls = set()
    for url_pos, anchor_pos, link_url in hits:
        if link_url not in seen_urls:
            seen_urls.add(link_url)
            cards.append((url_pos, anchor_pos, link_url))

    results = []
    for index, (url_pos, _anchor_pos, link_url) in enumerate(cards):
        # NOTE(review): the closing-tag terminator of the card pattern was
        # lost from the reviewed source; the card is taken to extend up to
        # the next distinct detail link (or the end of the document).
        card_end = cards[index + 1][1] if index + 1 < len(cards) else len(html)
        block = html[url_pos + len(link_url):card_end]

        url = link_url if link_url.startswith("http") else BASE_URL + link_url

        # Skip ads ("advertisment" is the spelling used by the site's class)
        lead_in = html[max(0, url_pos - 500):url_pos]
        if "advertisment" in lead_in or "advertisement" in lead_in:
            continue

        title = _first_text(_TITLE_RE, block).lower()  # "prodej bytu 3+kk 79 m2"
        price_text = _first_text(_PRICE_RE, block)     # "12 950 000 Kč"
        info = _first_text(_INFO_RE, block)            # street, district

        disp_match = re.search(r'(\d\+(?:kk|\d))', title)
        area_match = re.search(r'(\d+)\s*m[²2]', title)
        disposition = disp_match.group(1) if disp_match else None
        area = int(area_match.group(1)) if area_match else None
        if not disposition and ("6 a" in title or "6+" in title):
            disposition = "6+"

        # "Cena na vyžádání" (price on request) counts as no price.
        price = 0
        if price_text and "vyžádání" not in price_text.lower():
            digits = re.sub(r'[^\d]', '', price_text)
            if digits:
                price = int(digits)

        # The listing id is the 24-hex-digit path segment of the URL.
        id_match = re.search(r'/([a-f0-9]{24})/?', url)

        results.append({
            "id": id_match.group(1) if id_match else url,
            "url": url,
            "disposition": DISPOSITION_MAP.get(disposition, disposition or "?"),
            "area": area,
            "price": price,
            "locality": info,
        })

    return results


def parse_detail(html: str) -> dict:
    """Parse detail page for GPS, floor, construction, ownership.

    Returns a dict that may contain ``lat``, ``lon``, ``floor``,
    ``construction`` (lower-cased) and ``ownership``; missing data is simply
    left out.
    """
    detail = {}

    # 1. GPS from the dataLayer.push({...}) object
    pushed = _DATALAYER_RE.search(html)
    if pushed:
        js_obj = pushed.group(1)
        for key, field in (("lat", "listing_lat"), ("lon", "listing_lon")):
            found = re.search(rf'"{field}"\s*:\s*([\d.]+)', js_obj)
            if not found:
                continue
            try:
                detail[key] = float(found.group(1))
            except ValueError:
                pass  # e.g. "1.2.3" — treat as missing

    # 2. <dt>Label</dt><dd>Value</dd> pairs
    for dt, dd in _DT_DD_RE.findall(html):
        label = _strip_tags(dt).lower()
        value = _strip_tags(dd)

        if "podlaží" in label or "podlazi" in label or "patro" in label:
            # "2. patro (3. NP)" or "3. podlaží z celkem 5": prefer the
            # explicit NP number, otherwise take the first number as given.
            number = re.search(r'(\d+)\.\s*NP', value) or re.search(r'(\d+)', value)
            if number:
                detail["floor"] = int(number.group(1))

        if "konstrukce" in label or "stavba" in label:
            detail["construction"] = value.lower()

        if "vlastnictví" in label or "vlastnictvi" in label:
            detail["ownership"] = value

    return detail


def format_price(price: int) -> str:
    """Format *price* with space-separated thousands and a ``Kč`` suffix."""
    return f"{int(price):,}".replace(",", " ") + " Kč"


def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
    """Load previously scraped data as cache keyed by ``str(hash_id)``.

    Returns an empty dict when the file is missing, unreadable as JSON, or
    not a list; entries that are not dicts with a ``hash_id`` are ignored.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError):
        return {}
    if not isinstance(data, list):
        return {}
    return {
        str(entry["hash_id"]): entry
        for entry in data
        if isinstance(entry, dict) and "hash_id" in entry
    }


def _building_type(construction: str) -> str:
    """Map the lower-cased construction text to its output label."""
    if not construction:
        return "neuvedeno"
    for needles, label in _BUILDING_TYPES:
        if any(needle in construction for needle in needles):
            return label
    return construction.capitalize()


def _collect_listings() -> dict:
    """Download all listing pages; return listings de-duplicated by id."""
    all_listings = {}  # id -> listing dict
    page = 0
    total = None

    while True:
        print(f" Strana {page + 1} ...")
        html = fetch_url(build_list_url(page))

        if total is None:
            total = parse_total_count(html)
            total_pages = math.ceil(total / PER_PAGE) if total > 0 else 1
            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")

        listings = parse_listings(html)
        if not listings:
            break

        known_before = len(all_listings)
        for item in listings:
            all_listings.setdefault(item["id"], item)

        page += 1
        if total:
            if page >= math.ceil(total / PER_PAGE):
                break
        elif len(all_listings) == known_before:
            # Total unknown and the page brought nothing new: stop instead of
            # requesting pages forever.
            break
        time.sleep(1.0)

    return all_listings


def scrape():
    """Run the whole scrape and return the list of matching flats.

    Steps: download listing pages, pre-filter on list data, then fetch each
    remaining detail page (reusing cached entries whose price is unchanged)
    and apply the GPS, construction and floor filters.
    """
    cache = load_cache()

    print("=" * 60)
    print("Stahuji inzeráty z Reality iDNES")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)

    # Step 1: fetch listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = _collect_listings()
    print(f"\n Staženo: {len(all_listings)} unikátních inzerátů")

    # Step 2: pre-filter by price and area from list data
    pre_filtered = []
    excluded_price = 0
    excluded_area = 0
    excluded_disp = 0

    for item in all_listings.values():
        if item["price"] <= 0 or item["price"] > MAX_PRICE:
            excluded_price += 1
            continue

        if item["area"] is not None and item["area"] < MIN_AREA:
            excluded_area += 1
            continue

        if item["disposition"] == "?":
            excluded_disp += 1
            continue

        pre_filtered.append(item)

    print(f"\nPo předfiltraci:")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Zbývá: {len(pre_filtered)}")

    # Step 3: fetch details for GPS, floor, construction
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_no_gps = 0
    excluded_detail = 0
    cache_hits = 0

    for i, item in enumerate(pre_filtered):
        # Cache check: same hash_id and unchanged price means reuse.
        cached = cache.get(str(item["id"]))
        if cached and cached.get("price") == item["price"]:
            cache_hits += 1
            results.append(cached)
            continue

        time.sleep(0.4)
        try:
            html = fetch_url(item["url"])
        except Exception as e:
            # Best effort: one unreachable detail page must not abort the run.
            print(f" Warning: detail failed for {item['id']}: {e}")
            excluded_detail += 1
            continue

        detail = parse_detail(html)

        # Must have GPS
        if not detail.get("lat") or not detail.get("lon"):
            excluded_no_gps += 1
            continue

        # Exclude panel buildings
        construction = detail.get("construction", "")
        if "panel" in construction:
            excluded_panel += 1
            print(f" ✗ Vyloučen {item['id'][:12]}...: panel ({construction})")
            continue

        # Exclude housing estates mentioned in the construction text
        if "sídliště" in construction or "sidliste" in construction:
            excluded_panel += 1
            print(f" ✗ Vyloučen {item['id'][:12]}...: sídliště")
            continue

        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue

        # Show "?" rather than "None" when the title carried no area.
        area_label = "?" if item["area"] is None else item["area"]

        results.append({
            "hash_id": item["id"],
            "name": f"Prodej bytu {item['disposition']} {area_label} m²",
            "price": item["price"],
            "price_formatted": format_price(item["price"]),
            "locality": item["locality"],
            "lat": detail["lat"],
            "lon": detail["lon"],
            "disposition": item["disposition"],
            "floor": floor,
            "area": item["area"],
            "building_type": _building_type(construction),
            "ownership": detail.get("ownership", "neuvedeno"),
            "url": item["url"],
            "source": "idnes",
            "image": "",
        })

        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")

    print(f"\n{'=' * 60}")
    print(f"Výsledky Reality iDNES:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results


if __name__ == "__main__":
    start = time.time()
    estates = scrape()

    if estates:
        json_path = Path("byty_idnes.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z Reality iDNES neodpovídají kritériím :(")
from __future__ import annotations

# PSN.cz scraper.
# Downloads flats for sale in Prague from PSN projects and filters them by the
# criteria configured below.
# Output: byty_psn.json

import json
import re
import subprocess
import time
from pathlib import Path

# ── Configuration ───────────────────────────────────────────────────────────

MAX_PRICE = 14_000_000
MIN_AREA = 69
MIN_FLOOR = 2

WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}

UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

BASE_URL = "https://psn.cz"

# Known Prague project slugs with GPS (from research)
PRAGUE_PROJECTS = [
    {"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
    {"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
    {"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
    {"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
    {"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
    {"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
    {"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
    {"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
    {"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
    {"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
    {"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
    {"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
    {"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
    {"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
    {"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
    {"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
]

# Safety limit on the number of listing pages fetched per project.
_MAX_PAGES = 10
# A page with fewer units than this is treated as the last page.
_FULL_PAGE_SIZE = 10
# How far back (in characters) to look for the "{" that opens a unit object.
_MAX_LOOKBACK = 3000

_UNIT_TITLE_RE = re.compile(r'"title":"Byt')
_INT_RE = re.compile(r'(-?\d+)')


def fetch_url(url: str) -> str:
    """Fetch *url* with the system ``curl`` binary and return the response body.

    curl is used because, per the original note, urllib's SSL is too old for
    Cloudflare.

    Raises:
        RuntimeError: if curl exits with a non-zero status.
        subprocess.TimeoutExpired: if the curl process exceeds 60 seconds.
    """
    result = subprocess.run(
        # -S keeps curl's error message on stderr even though -s silences
        # the progress meter; without it the RuntimeError text is empty.
        ["curl", "-s", "-S", "-L", "--max-time", "30",
         "-H", f"User-Agent: {UA}",
         "-H", "Accept: text/html",
         url],
        capture_output=True, text=True, timeout=60
    )
    if result.returncode != 0:
        raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
    return result.stdout


def extract_units_from_html(html: str) -> list[dict]:
    """Extract unit JSON objects from raw HTML that embeds escaped JSON.

    The page carries data with escaped quotes (``\\"key\\":\\"value\\"``).
    The quotes are unescaped first; then, for every ``"title":"Byt``
    occurrence, the text is scanned backwards (at most ``_MAX_LOOKBACK``
    characters) for the opening brace of the enclosing object, which is
    decoded with ``json``.  Only dict objects that contain a ``price_czk``
    key are returned.
    """
    cleaned = html.replace('\\"', '"')
    decoder = json.JSONDecoder()
    units = []

    for m in _UNIT_TITLE_RE.finditer(cleaned):
        pos = m.start()
        depth = 0
        # The stop value is exclusive; -1 lets the scan reach index 0 so an
        # object that begins at the very start of the text is not missed.
        stop = max(pos - _MAX_LOOKBACK, -1)
        for i in range(pos - 1, stop, -1):
            ch = cleaned[i]
            if ch == "}":
                depth += 1
            elif ch == "{":
                if depth:
                    # Closes a nested object passed on the way back.
                    depth -= 1
                    continue
                try:
                    obj, _end = decoder.raw_decode(cleaned, i)
                except ValueError:
                    # Not valid JSON from this brace; give up on this match.
                    pass
                else:
                    if isinstance(obj, dict) and "price_czk" in obj:
                        units.append(obj)
                break

    return units


def format_price(price: int) -> str:
    """Format *price* with space-separated thousands and a " Kč" suffix."""
    return f"{price:,}".replace(",", " ") + " Kč"


def _fetch_project_units(proj: dict) -> list[dict]:
    """Download every unit of one project, following ``?page=`` pagination."""
    project_units = []
    seen_slugs = set()

    for page in range(1, _MAX_PAGES + 1):
        url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
        print(f" {proj['name']} — strana {page} ...")
        time.sleep(0.5)

        try:
            html = fetch_url(url)
        except Exception as e:
            # Best effort: report and keep whatever was collected so far.
            print(f" Chyba: {e}")
            break

        units = extract_units_from_html(html)
        if not units:
            if page == 1:
                print(" → 0 jednotek")
            break

        # Stop when a page repeats only units already seen for this project
        # (e.g. the server ignores the page parameter).  Units without a
        # slug cannot be compared, so they always count as new.
        if page > 1 and all(
            u.get("slug") and u["slug"] in seen_slugs for u in units
        ):
            break

        # Add project info to each unit
        for unit in units:
            if not unit.get("latitude") or not unit.get("longitude"):
                unit["latitude"] = proj["lat"]
                unit["longitude"] = proj["lon"]
            unit["_project_name"] = proj["name"]
            unit["_project_slug"] = proj["slug"]
            if unit.get("slug"):
                seen_slugs.add(unit["slug"])

        project_units.extend(units)

        if page == 1:
            print(f" → {len(units)} jednotek na stránce")

        # A short page means there is nothing more to fetch.
        if len(units) < _FULL_PAGE_SIZE:
            break

    return project_units


def _dedupe_units(units: list[dict]) -> list[dict]:
    """Drop units whose slug was already seen; units without a slug are kept."""
    seen = set()
    unique = []
    for u in units:
        slug = u.get("slug", "")
        if slug:
            if slug in seen:
                continue
            seen.add(slug)
        unique.append(u)
    return unique


def _parse_floor(raw) -> int | None:
    """Return the floor as int, taking the first integer found in *raw*."""
    text = str(raw)
    if not text:
        return None
    try:
        return int(text)
    except ValueError:
        found = _INT_RE.search(text)
        return int(found.group(1)) if found else None


def _building_label(build_type: str) -> str:
    """Map a lower-cased build type string to the label stored in the output."""
    if not build_type or build_type == "nevybráno":
        return "neuvedeno"
    if "cihlo" in build_type or "cihla" in build_type:
        return "Cihlová"
    if "skelet" in build_type:
        return "Skeletová"
    return build_type.capitalize()


def scrape():
    """Download PSN units for all Prague projects and return the matching ones.

    Units are filtered by availability, category, disposition, price, area,
    floor and construction type; progress and a summary are printed.

    Returns:
        list[dict]: one result dict per unit that passed every filter.
    """
    print("=" * 60)
    print("Stahuji inzeráty z PSN.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
    print("=" * 60)

    # Fetch units from each Prague project
    all_units = []
    for proj in PRAGUE_PROJECTS:
        all_units.extend(_fetch_project_units(proj))

    # Deduplicate by slug
    unique_units = _dedupe_units(all_units)

    print(f"\n Staženo celkem: {len(unique_units)} unikátních jednotek")

    # Filter
    print("\nFiltrování...")
    results = []
    excluded = dict.fromkeys(
        ("sold", "type", "disp", "price", "area", "floor", "panel"), 0
    )

    for unit in unique_units:
        # Only free units
        if unit.get("is_sold", False) or not unit.get("is_free", False):
            excluded["sold"] += 1
            continue

        # Only apartments
        category = str(unit.get("category", "")).lower()
        if "byt" not in category and "ateliér" not in category:
            excluded["type"] += 1
            continue

        # Disposition
        disp = unit.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded["disp"] += 1
            continue

        # Price
        price = unit.get("price_czk") or unit.get("action_price_czk") or 0
        if price <= 0 or price > MAX_PRICE:
            excluded["price"] += 1
            continue

        # Area
        area = unit.get("total_area") or unit.get("floor_area") or 0
        if area < MIN_AREA:
            excluded["area"] += 1
            continue

        # Floor — an unknown floor does not exclude the unit
        floor = _parse_floor(unit.get("floor", ""))
        if floor is not None and floor < MIN_FLOOR:
            excluded["floor"] += 1
            continue

        # Construction — check for panel
        build_type = str(unit.get("build_type", "")).lower()
        if "panel" in build_type:
            excluded["panel"] += 1
            print(f" ✗ Vyloučen: panel ({build_type})")
            continue

        slug = unit.get("slug", "")
        project_slug = unit.get("_project_slug", "")
        detail_url = f"{BASE_URL}/projekt/{project_slug}"
        if slug:
            detail_url = f"{detail_url}/{slug}"

        project_name = unit.get("_project_name", "")
        # "or" also covers a street key that is present but null.
        street = unit.get("street") or project_name

        results.append({
            "hash_id": unit.get("id", slug),
            "name": f"Prodej bytu {disp} {area} m² — {project_name}",
            "price": int(price),
            "price_formatted": format_price(int(price)),
            "locality": f"{street}, Praha",
            "lat": unit.get("latitude", 0),
            "lon": unit.get("longitude", 0),
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": _building_label(build_type),
            "ownership": unit.get("ownership", "neuvedeno") or "neuvedeno",
            "url": detail_url,
            "source": "psn",
            "image": "",
        })

    print(f"\n{'=' * 60}")
    print("Výsledky PSN:")
    print(f" Celkem jednotek: {len(unique_units)}")
    print(f" Vyloučeno (prodáno): {excluded['sold']}")
    print(f" Vyloučeno (typ): {excluded['type']}")
    print(f" Vyloučeno (dispozice): {excluded['disp']}")
    print(f" Vyloučeno (cena): {excluded['price']}")
    print(f" Vyloučeno (plocha): {excluded['area']}")
    print(f" Vyloučeno (patro): {excluded['floor']}")
    print(f" Vyloučeno (panel): {excluded['panel']}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results


if __name__ == "__main__":
    start = time.time()
    estates = scrape()

    if estates:
        json_path = Path("byty_psn.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z PSN neodpovídají kritériím :(")
from __future__ import annotations

# Realingo.cz scraper.
# Downloads flats for sale in Prague and filters them by the criteria
# configured below.
# Output: byty_realingo.json

import json
import math
import re
import time
import urllib.request
from pathlib import Path

# ── Configuration (shared with the Sreality scraper) ────────────────────────

MAX_PRICE = 13_500_000
MIN_AREA = 69
MIN_FLOOR = 2
PER_PAGE = 40  # Realingo returns 40 listings per page

# Wanted categories (disposition 3+kk and larger)
WANTED_CATEGORIES = {
    "FLAT3_KK", "FLAT31",   # 3+kk, 3+1
    "FLAT4_KK", "FLAT41",   # 4+kk, 4+1
    "FLAT5_KK", "FLAT51",   # 5+kk, 5+1
    "FLAT6",                # 6+
    "OTHERS_FLAT",          # atypical — the area is checked instead
}

# Mapping category → label
CATEGORY_LABELS = {
    "FLAT1_KK": "1+kk", "FLAT11": "1+1",
    "FLAT2_KK": "2+kk", "FLAT21": "2+1",
    "FLAT3_KK": "3+kk", "FLAT31": "3+1",
    "FLAT4_KK": "4+kk", "FLAT41": "4+1",
    "FLAT5_KK": "5+kk", "FLAT51": "5+1",
    "FLAT6": "6+",
    "OTHERS_FLAT": "Atypický",
}

# Mapping of detail "buildingType" codes → output label
BUILDING_TYPE_LABELS = {
    "BRICK": "Cihlová",
    "PANEL": "Panelová",
    "WOOD": "Dřevostavba",
    "STEEL": "Ocelová",
    "MIXED": "Smíšená",
    "MONTAGE": "Montovaná",
}

# Mapping of detail "ownership" codes → output label
OWNERSHIP_LABELS = {
    "PRIVATE": "Osobní",
    "COOPERATIVE": "Družstevní",
    "STATE": "Státní/obecní",
}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml",
}

BASE_URL = "https://www.realingo.cz"

# NOTE(review): the pattern in the reviewed source was empty, which makes
# match.group(1) raise IndexError.  The props/pageProps path read below
# suggests the payload is the Next.js __NEXT_DATA__ script — confirm against
# a live page.
_NEXT_DATA_RE = re.compile(
    r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>',
    re.DOTALL,
)


def _fetch_next_data(url: str) -> dict | None:
    """Download *url* and return its embedded JSON payload, or None if absent."""
    req = urllib.request.Request(url, headers=HEADERS)
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(req, timeout=30) as resp:
        html = resp.read().decode("utf-8")

    match = _NEXT_DATA_RE.search(html)
    if not match:
        return None
    return json.loads(match.group(1))


def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
    """Fetch one page of Prague listings.

    Returns:
        ``(items, total_count)``; ``([], 0)`` when the page has no embedded
        data payload.

    Raises:
        Network, decoding and JSON errors propagate to the caller.
    """
    if page == 1:
        url = f"{BASE_URL}/prodej_byty/praha/"
    else:
        url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/"

    data = _fetch_next_data(url)
    if data is None:
        return [], 0

    offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
    return offer_list["data"], offer_list["total"]


def fetch_detail(listing_url: str) -> dict | None:
    """Fetch the detail page of a listing (floor, building type, etc.).

    Best effort: any failure is printed as a warning and ``None`` is
    returned, so one bad listing does not abort the whole run.
    """
    try:
        data = _fetch_next_data(f"{BASE_URL}{listing_url}")
        if data is None:
            return None
        details = data["props"]["pageProps"]["store"]["offer"]["details"]
        # Get first (only) detail entry
        return next(iter(details.values()), None)
    except Exception as e:
        print(f" Warning: detail fetch failed for {listing_url}: {e}")
    return None


def format_price(price: int) -> str:
    """Format *price* with space-separated thousands and a " Kč" suffix."""
    return f"{price:,}".replace(",", " ") + " Kč"


def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
    """Load previously scraped data as a cache keyed by ``hash_id``.

    Returns an empty dict when the file is missing, unreadable, not valid
    JSON, or not a JSON list.  Entries that are not dicts or lack
    ``hash_id`` are skipped.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return {}
    if not isinstance(data, list):
        return {}
    return {
        entry["hash_id"]: entry
        for entry in data
        if isinstance(entry, dict) and "hash_id" in entry
    }


def _price_total(item: dict) -> int:
    """Return the listing's total price, or 0 when it is missing or null."""
    return (item.get("price") or {}).get("total", 0) or 0


def _main_area(item: dict):
    """Return the listing's main area, or None when it is missing or null."""
    return (item.get("area") or {}).get("main")


def _fetch_all_listings() -> list[dict]:
    """Walk the listing pages and return every listing item found."""
    all_listings = []
    page = 1
    total = None
    total_pages = 0

    while True:
        print(f" Strana {page} ...")
        items, total_count = fetch_listing_page(page)
        if total is None:
            # The total reported by the first page fixes the page count.
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            print(f" → Celkem {total} inzerátů, {total_pages} stran")

        if not items:
            break

        all_listings.extend(items)
        page += 1
        if page > total_pages:
            break
        time.sleep(0.5)

    return all_listings


def _prefilter(all_listings: list[dict]) -> list[dict]:
    """Filter listings by category, price, area and GPS; print the counts."""
    pre_filtered = []
    excluded_category = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0

    for item in all_listings:
        if item.get("category", "") not in WANTED_CATEGORIES:
            excluded_category += 1
            continue

        price = _price_total(item)
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            continue

        # An unknown area does not exclude the listing.
        area = _main_area(item)
        if area is not None and area < MIN_AREA:
            excluded_area += 1
            continue

        loc = item.get("location") or {}
        if not loc.get("latitude") or not loc.get("longitude"):
            excluded_no_gps += 1
            continue

        pre_filtered.append(item)

    print("\nPo předfiltraci:")
    print(f" Vyloučeno (dispozice): {excluded_category}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Zbývá: {len(pre_filtered)}")

    return pre_filtered


def scrape():
    """Scrape Realingo listings for Prague and return the matching flats.

    Listings are pre-filtered on list data, then each remaining listing's
    detail page is fetched to check building type, building position and
    floor.  A listing whose ``hash_id`` is in the cache from the previous
    run with an unchanged price is reused without fetching its detail.

    Returns:
        list[dict]: one result dict per flat that passed every filter.
    """
    cache = load_cache()

    print("=" * 60)
    print("Stahuji inzeráty z Realingo.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print("Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)

    # Step 1: Fetch all listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = _fetch_all_listings()
    print(f"\n Staženo: {len(all_listings)} inzerátů")

    # Step 2: Pre-filter by category, price, area from listing data
    pre_filtered = _prefilter(all_listings)

    # Step 3: Fetch details for remaining listings (floor, building type)
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0

    for i, item in enumerate(pre_filtered):
        # Check cache — if hash_id exists and price unchanged, reuse
        item_id = int(item["id"])
        item_price = _price_total(item)
        cached = cache.get(item_id)
        if cached and cached.get("price") == item_price:
            cache_hits += 1
            results.append(cached)
            continue

        time.sleep(0.3)
        detail_data = fetch_detail(item["url"])

        if not detail_data:
            excluded_detail += 1
            continue

        detail = detail_data.get("offer", {}).get("detail", {})
        if not detail and "detail" in detail_data:
            detail = detail_data["detail"]

        # Check building type — exclude panel
        building_type = detail.get("buildingType", "")
        if building_type == "PANEL":
            excluded_panel += 1
            print(f" ✗ Vyloučen #{item['id']}: panel")
            continue

        # Check building position — exclude housing estates
        building_position = detail.get("buildingPosition", "")
        if building_position and "ESTATE" in str(building_position).upper():
            excluded_panel += 1
            print(f" ✗ Vyloučen #{item['id']}: sídliště")
            continue

        # Check floor — an unknown floor does not exclude the listing
        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue

        cat = item.get("category", "")
        loc = item.get("location") or {}
        area = _main_area(item)
        disposition = CATEGORY_LABELS.get(cat, "?")
        ownership = detail.get("ownership")

        results.append({
            "hash_id": item_id,
            "name": f"Prodej bytu {disposition} {area if area is not None else '?'} m²",
            "price": item_price,
            "price_formatted": format_price(item_price),
            "locality": loc.get("address", "Praha"),
            "lat": loc["latitude"],
            "lon": loc["longitude"],
            "disposition": disposition,
            "floor": floor,
            "area": area,
            "building_type": BUILDING_TYPE_LABELS.get(
                building_type, building_type or "neuvedeno"
            ),
            "ownership": OWNERSHIP_LABELS.get(ownership, ownership or "neuvedeno"),
            "url": f"{BASE_URL}{item['url']}",
            "source": "realingo",
            "image": "",
        })

        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")

    print(f"\n{'=' * 60}")
    print("Výsledky Realingo:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results


if __name__ == "__main__":
    start = time.time()
    estates = scrape()

    if estates:
        json_path = Path("byty_realingo.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z Realinga neodpovídají kritériím :(")