diff --git a/scrape_bezrealitky.py b/scrape_bezrealitky.py
new file mode 100644
index 0000000..21715f0
--- /dev/null
+++ b/scrape_bezrealitky.py
@@ -0,0 +1,351 @@
+#!/usr/bin/env python3
+"""
+Bezrealitky.cz scraper.
+Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
+Výstup: byty_bezrealitky.json
+"""
+from __future__ import annotations
+
+import json
+import math
+import re
+import time
+import urllib.request
+from pathlib import Path
+
# ── Configuration ───────────────────────────────────────────────────────────

# NOTE(review): MAX_PRICE is 13.5M here but 14M in the CityHome/PSN scrapers —
# confirm the difference is intentional.
MAX_PRICE = 13_500_000  # price ceiling, CZK
MIN_AREA = 69           # minimum usable area, m²
MIN_FLOOR = 2           # minimum floor, in NP (above-ground levels)
PER_PAGE = 15 # Bezrealitky returns 15 per page

# Dispositions (layouts) we want
WANTED_DISPOSITIONS = {
    "DISP_3_KK", "DISP_3_1",
    "DISP_4_KK", "DISP_4_1",
    "DISP_5_KK", "DISP_5_1",
    "DISP_6",
    "DISP_OTHER",  # atypical
}

# API disposition code -> human-readable label
DISPOSITION_LABELS = {
    "DISP_1_KK": "1+kk", "DISP_1_1": "1+1",
    "DISP_2_KK": "2+kk", "DISP_2_1": "2+1",
    "DISP_3_KK": "3+kk", "DISP_3_1": "3+1",
    "DISP_4_KK": "4+kk", "DISP_4_1": "4+1",
    "DISP_5_KK": "5+kk", "DISP_5_1": "5+1",
    "DISP_6": "6+",
    "DISP_OTHER": "Atypický",
}

# Construction code -> Czech label (used for the "building_type" output field)
CONSTRUCTION_MAP = {
    "BRICK": "Cihlová",
    "PANEL": "Panelová",
    "WOOD": "Dřevostavba",
    "MIXED": "Smíšená",
    "MONTAGE": "Montovaná",
    "STEEL": "Ocelová",
}

# Ownership code -> Czech label
OWNERSHIP_MAP = {
    "OSOBNI": "Osobní",
    "DRUZSTEVNI": "Družstevní",
    "STATNI": "Státní/obecní",
}

# Browser-like headers sent with every request (see fetch_page/fetch_detail).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}

BASE_URL = "https://www.bezrealitky.cz"
+
+
def fetch_page(page: int) -> tuple[list[dict], int]:
    """
    Fetch one listing page from Bezrealitky.

    Returns (adverts, total): the Advert dicts found in the embedded
    Apollo cache, and the total advert count (0 when not found).

    NOTE(review): the __NEXT_DATA__ pattern below is a reconstruction —
    the original regex literal was emptied when this patch was mangled in
    transfer (it had no capture group, so ``match.group(1)`` would have
    raised). Confirm against the live page source.
    """
    url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
    req = urllib.request.Request(url, headers=HEADERS)
    resp = urllib.request.urlopen(req, timeout=30)
    html = resp.read().decode("utf-8")

    # The site is a Next.js app; all page data sits in the __NEXT_DATA__ blob.
    match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html, re.DOTALL
    )
    if not match:
        return [], 0

    data = json.loads(match.group(1))
    cache = data["props"]["pageProps"]["apolloCache"]

    # Adverts are normalized Apollo cache entries keyed "Advert:<id>".
    adverts = []
    for key, val in cache.items():
        if key.startswith("Advert:") and isinstance(val, dict) and val.get("__typename") == "Advert":
            adverts.append(val)

    # The total count lives on a ROOT_QUERY "listAdverts(...)" entry.
    total = 0
    root = cache.get("ROOT_QUERY", {})
    for key, val in root.items():
        if "listAdverts" in key and isinstance(val, dict):
            tc = val.get("totalCount")
            if tc and tc > total:
                total = tc

    return adverts, total
+
+
def fetch_detail(uri: str) -> dict | None:
    """Fetch a listing's detail page and return its full Advert dict.

    Returns None when the __NEXT_DATA__ blob or a sufficiently detailed
    Advert entry is missing, and on any error (best-effort by design —
    callers simply skip the listing).

    NOTE(review): the __NEXT_DATA__ pattern is a reconstruction — the
    original regex literal was emptied when this patch was mangled in
    transfer. Confirm against the live page source.
    """
    try:
        url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
        req = urllib.request.Request(url, headers=HEADERS)
        resp = urllib.request.urlopen(req, timeout=30)
        html = resp.read().decode("utf-8")

        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            return None

        data = json.loads(match.group(1))
        cache = data["props"]["pageProps"]["apolloCache"]

        # Detail pages carry a richer Advert entry than list pages do;
        # use the presence of detail-only fields to pick it.
        for key, val in cache.items():
            if key.startswith("Advert:") and isinstance(val, dict):
                if "construction" in val or "etage" in val or "ownership" in val:
                    return val

    except Exception as e:
        print(f" Warning: detail failed for {uri}: {e}")
    # Falls through when no matching cache entry was found.
    return None
+
+
def format_price(price: int) -> str:
    """Format a CZK amount with space-separated thousands, e.g. "13 500 000 Kč".

    Uses the format-spec grouping instead of the previous hand-rolled
    three-digit slicing loop; output is identical for integer inputs.
    """
    return f"{price:,}".replace(",", " ") + " Kč"
+
+
def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
    """Load the previous run's output as a cache keyed by hash_id.

    Returns {} when the file is missing or unreadable, so a first run and
    a refresh behave the same.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        # TypeError guards against valid JSON that is not a list of dicts
        # (e.g. a bare number) — previously that crashed the scraper.
        return {e["hash_id"]: e for e in data if "hash_id" in e}
    except (json.JSONDecodeError, KeyError, TypeError):
        return {}
+
+
def scrape():
    """Scrape Bezrealitky end to end.

    Phase 1 pulls every listing page; a pre-filter then drops listings on
    data already present in the list payload (disposition, price, area,
    GPS); phase 2 fetches detail pages — reusing cached results when the
    price is unchanged — and applies the remaining criteria (construction,
    housing estate, floor). Returns the list of result dicts.
    """
    cache = load_cache()

    print("=" * 60)
    print("Stahuji inzeráty z Bezrealitky.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)

    # Step 1: Fetch all listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_adverts = {}  # id -> advert dict (dedup)
    page = 1
    total = None

    while True:
        print(f" Strana {page} ...")
        adverts, total_count = fetch_page(page)

        # Remember the first reported total to bound the page loop.
        if total is None and total_count > 0:
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")

        if not adverts:
            break

        for adv in adverts:
            adv_id = adv.get("id")
            if adv_id and adv_id not in all_adverts:
                all_adverts[adv_id] = adv

        page += 1
        if total and page > math.ceil(total / PER_PAGE):
            break
        time.sleep(0.5)  # be polite between page fetches

    print(f"\n Staženo: {len(all_adverts)} unikátních inzerátů")

    # Step 2: Pre-filter by disposition, price, area from list data
    pre_filtered = []
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0

    for adv in all_adverts.values():
        disp = adv.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue

        # Price 0 means "not listed" — excluded along with over-budget.
        price = adv.get("price", 0) or 0
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            continue

        # A missing surface is kept (checked later on the detail page data).
        surface = adv.get("surface")
        if surface is not None and surface < MIN_AREA:
            excluded_area += 1
            continue

        # GPS is required for the map output.
        gps = adv.get("gps", {})
        if not gps or not gps.get("lat") or not gps.get("lng"):
            excluded_no_gps += 1
            continue

        pre_filtered.append(adv)

    print(f"\nPo předfiltraci:")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Zbývá: {len(pre_filtered)}")

    # Step 3: Fetch details
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0

    for i, adv in enumerate(pre_filtered):
        uri = adv.get("uri", "")
        if not uri:
            excluded_detail += 1
            continue

        # Check cache — if hash_id exists and price unchanged, reuse
        adv_id = int(adv["id"])
        adv_price = adv.get("price", 0) or 0
        cached = cache.get(adv_id)
        if cached and cached.get("price") == adv_price:
            cache_hits += 1
            results.append(cached)
            continue

        time.sleep(0.4)  # be polite between detail fetches
        detail = fetch_detail(uri)

        if not detail:
            excluded_detail += 1
            continue

        # Check construction — exclude panel
        construction = detail.get("construction", "")
        if construction == "PANEL":
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: panel")
            continue

        # Check situation — exclude housing estates (sídliště)
        situation = detail.get("situation", "")
        if situation and "HOUSING_ESTATE" in str(situation).upper():
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: sídliště")
            continue

        # Check floor (etage); missing floor is kept.
        etage = detail.get("etage")
        if etage is not None and etage < MIN_FLOOR:
            excluded_floor += 1
            continue

        gps = adv.get("gps", {})
        disp = adv.get("disposition", "")

        # Get address — Apollo cache keys look like 'address({"locale":"CS"})';
        # prefer the variant without the house number, then any address key,
        # then the list payload's address, then a plain fallback.
        address = ""
        for key in detail:
            if key.startswith("address(") and "withHouseNumber" not in key:
                address = detail[key]
                break
        if not address:
            for key in detail:
                if key.startswith("address("):
                    address = detail[key]
                    break
        if not address:
            address = adv.get('address({"locale":"CS"})', "Praha")

        result = {
            "hash_id": int(adv["id"]),
            "name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {adv.get('surface', '?')} m²",
            "price": adv.get("price", 0),
            "price_formatted": format_price(adv.get("price", 0)),
            "locality": address,
            "lat": gps["lat"],
            "lon": gps["lng"],
            "disposition": DISPOSITION_LABELS.get(disp, "?"),
            "floor": etage,
            "area": adv.get("surface"),
            "building_type": CONSTRUCTION_MAP.get(construction, construction or "neuvedeno"),
            "ownership": OWNERSHIP_MAP.get(detail.get("ownership", ""), detail.get("ownership") or "neuvedeno"),
            "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
            "source": "bezrealitky",
            "image": "",
        }
        results.append(result)

        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")

    print(f"\n{'=' * 60}")
    print(f"Výsledky Bezrealitky:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results
+
+
if __name__ == "__main__":
    # Entry point: run the scrape and persist results to JSON
    # (the same file load_cache() reads on the next run).
    start = time.time()
    estates = scrape()

    if estates:
        json_path = Path("byty_bezrealitky.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z Bezrealitek neodpovídají kritériím :(")
diff --git a/scrape_cityhome.py b/scrape_cityhome.py
new file mode 100644
index 0000000..76482cd
--- /dev/null
+++ b/scrape_cityhome.py
@@ -0,0 +1,328 @@
+#!/usr/bin/env python3
+"""
+CityHome (city-home.cz) scraper.
+Stáhne byty na prodej v Praze z projektů CityHome/SATPO.
+Výstup: byty_cityhome.json
+"""
+from __future__ import annotations
+
+import json
+import re
+import time
+import urllib.request
+from pathlib import Path
+
# ── Configuration ───────────────────────────────────────────────────────────

MAX_PRICE = 14_000_000  # price ceiling, CZK
MIN_AREA = 69           # minimum usable area, m²
MIN_FLOOR = 2           # minimum floor, in NP (above-ground levels)

# Dispositions (layouts) we want, as they appear in data-dispozition attrs
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}

# Browser-like headers sent with every request (see fetch_url).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}

BASE_URL = "https://www.city-home.cz"
+
+
def fetch_url(url: str) -> str:
    """Fetch *url* and return the body decoded as UTF-8.

    Connection-level failures are retried twice with a linear back-off
    (2 s, then 4 s); the third failure propagates to the caller.
    """
    for attempt in range(3):
        request = urllib.request.Request(url, headers=HEADERS)
        try:
            response = urllib.request.urlopen(request, timeout=30)
            return response.read().decode("utf-8")
        except (ConnectionResetError, ConnectionError, urllib.error.URLError) as exc:
            if attempt >= 2:
                raise
            time.sleep((attempt + 1) * 2)
            print(f" Retry {attempt + 1}: {exc}")
+
+
def format_price(price: int) -> str:
    """Format a CZK amount with space-separated thousands, e.g. "14 000 000 Kč".

    Uses the format-spec grouping instead of the previous hand-rolled
    three-digit slicing loop; output is identical for integer inputs.
    """
    return f"{price:,}".replace(",", " ") + " Kč"
+
+
def parse_filter_page(html: str) -> list[dict]:
    """Parse all unit rows from the CityHome filter page.

    Each unit is a ``<tr>`` carrying data-* attributes (price, area, unit
    type, availability, project, transaction, disposition, location); the
    cells hold the detail link/unit name, the floor ("3. NP" / "2. PP")
    and the project name.

    NOTE(review): the original patch was mangled in transfer and the HTML
    tag fragments inside these regex literals were stripped — the
    ``<tr>``/``<td>``/``<a>`` parts below are reconstructions; verify
    against live markup. Two earlier, dead parsing attempts (an unused
    compiled pattern and a loop whose body ended in ``pass``) were removed.
    """
    listings = []

    # Find each <tr> that has a data-cena attribute, keeping its attribute
    # string so attributes can be extracted in any order.
    for match in re.finditer(r'<tr([^>]*data-cena="[^"]*"[^>]*)>(.*?)</tr>', html, re.DOTALL):
        attrs_str = match.group(1)
        row_content = match.group(2)

        # Extract all data attributes
        cena = re.search(r'data-cena="(\d+)"', attrs_str)
        plocha = re.search(r'data-plocha="([\d.]+)"', attrs_str)
        unittype = re.search(r'data-unittype="(\d+)"', attrs_str)
        free = re.search(r'data-free="(yes|no)"', attrs_str)
        project = re.search(r'data-project="(\d+)"', attrs_str)
        transaction = re.search(r'data-transaction="([^"]*)"', attrs_str)
        dispozition = re.search(r'data-dispozition="([^"]*)"', attrs_str)
        location = re.search(r'data-location="([^"]*)"', attrs_str)

        if not cena:
            continue

        # Detail URL and unit name come from the row's first link.
        link_match = re.search(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', row_content, re.DOTALL)
        detail_url = link_match.group(1).strip() if link_match else ""
        unit_name = re.sub(r'<[^>]+>', '', link_match.group(2)).strip() if link_match else ""

        if detail_url and not detail_url.startswith("http"):
            detail_url = BASE_URL + detail_url

        # Floor: look for "N. NP" (above ground) or "N. PP" (below ground).
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
        floor = None
        floor_text = ""
        project_name = ""

        for cell in cells:
            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
            np_match = re.search(r'(\d+)\.\s*NP', cell_text)
            pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
            if np_match:
                floor = int(np_match.group(1))
                floor_text = cell_text
            elif pp_match:
                floor = -int(pp_match.group(1))  # underground
                floor_text = cell_text

        # Project name — a textual cell that is not a number/price/floor
        # and differs from the unit name.
        for cell in cells:
            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
            if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "m²" not in cell_text and "Kč" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text:
                if len(cell_text) > 3 and cell_text != unit_name:
                    project_name = cell_text
                    break

        listing = {
            "price": int(cena.group(1)),
            "area": float(plocha.group(1)) if plocha else 0,
            "unittype": int(unittype.group(1)) if unittype else 0,
            "free": free.group(1) if free else "no",
            "project_id": project.group(1) if project else "",
            "transaction": transaction.group(1) if transaction else "",
            "disposition": dispozition.group(1) if dispozition else "",
            "location": location.group(1) if location else "",
            "url": detail_url,
            "unit_name": unit_name,
            "floor": floor,
            "project_name": project_name,
        }
        listings.append(listing)

    return listings
+
+
def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
    """Extract project GPS coordinates from a locality page's JS markers.

    Markers look like ``['... <b>Project Name</b> ...', 'LAT', 'LON', '1', 'Name']``
    and are mapped to ``{name: (lat, lon)}``.

    NOTE(review): the tag inside this regex literal was stripped when the
    original patch was mangled in transfer; ``<b>([^<]+)</b>`` is a
    reconstruction — confirm the actual tag against a live locality page.
    """
    gps_data = {}
    for match in re.finditer(r"\['[^']*<b>([^<]+)</b>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'", html):
        name = match.group(1).strip()
        lat = float(match.group(2))
        lon = float(match.group(3))
        gps_data[name] = (lat, lon)
    return gps_data
+
+
def scrape():
    """Scrape CityHome end to end.

    Phase 1 parses every unit from the single filter page; phase 2
    resolves one GPS coordinate per project (units inherit their
    project's location); phase 3 filters units on availability, type,
    transaction, disposition, price, area and floor. Returns the list
    of result dicts.
    """
    print("=" * 60)
    print("Stahuji inzeráty z CityHome (city-home.cz)")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print("=" * 60)

    # Step 1: Fetch the main filter page
    print("\nFáze 1: Stahování seznamu bytů...")
    html = fetch_url(f"{BASE_URL}/filtr-nemovitosti1")
    all_listings = parse_filter_page(html)
    print(f" Nalezeno: {len(all_listings)} jednotek")

    # Step 2: Collect unique project slugs from detail URLs to fetch GPS
    print("\nFáze 2: Stahování GPS souřadnic projektů...")
    project_slugs = set()
    for listing in all_listings:
        url = listing.get("url", "")
        # /projekty/zateckych-14/nabidka-nemovitosti/byt-a31
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        if slug_match:
            project_slugs.add(slug_match.group(1))

    # Fetch GPS for each project from locality pages
    project_gps = {}
    for slug in sorted(project_slugs):
        time.sleep(0.5)  # be polite between project fetches
        try:
            locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
            loc_html = fetch_url(locality_url)
            gps = extract_project_gps(loc_html)
            if gps:
                # Take first entry (the project itself)
                first_name, (lat, lon) = next(iter(gps.items()))
                project_gps[slug] = (lat, lon)
                print(f" ✓ {slug}: {lat}, {lon}")
            else:
                print(f" ✗ {slug}: GPS nenalezeno")
        except Exception as e:
            # Best-effort: a project without GPS just loses its units later.
            print(f" ✗ {slug}: chyba ({e})")

    # Step 3: Filter listings
    print(f"\nFáze 3: Filtrování...")
    results = []
    excluded_sold = 0
    excluded_type = 0
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_floor = 0
    excluded_no_gps = 0

    for listing in all_listings:
        # Only available units
        if listing["free"] != "yes":
            excluded_sold += 1
            continue

        # Only apartments (unittype=2)
        if listing["unittype"] != 2:
            excluded_type += 1
            continue

        # Only sales
        if listing["transaction"] != "prodej":
            excluded_type += 1
            continue

        # Disposition
        disp = listing["disposition"]
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue

        # Price
        price = listing["price"]
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            continue

        # Area
        area = listing["area"]
        if area < MIN_AREA:
            excluded_area += 1
            continue

        # Floor (missing floor is kept)
        floor = listing["floor"]
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue

        # GPS from project
        url = listing.get("url", "")
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        slug = slug_match.group(1) if slug_match else ""
        gps = project_gps.get(slug)

        if not gps:
            excluded_no_gps += 1
            continue

        lat, lon = gps

        result = {
            "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
            "name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
            "price": price,
            "price_formatted": format_price(price),
            "locality": f"{listing['project_name']}, Praha",
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            # NOTE(review): hard-coded on the assumption that CityHome
            # renovates brick buildings — confirm.
            "building_type": "Cihlová",
            "ownership": "neuvedeno",
            "url": url,
            "source": "cityhome",
            "image": "",
        }
        results.append(result)

    print(f"\n{'=' * 60}")
    print(f"Výsledky CityHome:")
    print(f" Celkem jednotek: {len(all_listings)}")
    print(f" Vyloučeno (prodáno): {excluded_sold}")
    print(f" Vyloučeno (typ): {excluded_type}")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results
+
+
if __name__ == "__main__":
    # Entry point: run the scrape and persist results to JSON.
    start = time.time()
    estates = scrape()

    if estates:
        json_path = Path("byty_cityhome.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z CityHome neodpovídají kritériím :(")
diff --git a/scrape_idnes.py b/scrape_idnes.py
new file mode 100644
index 0000000..ff7c47d
--- /dev/null
+++ b/scrape_idnes.py
@@ -0,0 +1,464 @@
+#!/usr/bin/env python3
+"""
+Reality iDNES scraper.
+Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
+Výstup: byty_idnes.json
+"""
+from __future__ import annotations
+
+import json
+import math
+import re
+import time
+import urllib.request
+import urllib.parse
+from html.parser import HTMLParser
+from pathlib import Path
+
# ── Configuration ───────────────────────────────────────────────────────────

MAX_PRICE = 13_500_000  # price ceiling, CZK
MIN_AREA = 69           # minimum usable area, m²
MIN_FLOOR = 2           # minimum floor, in NP (above-ground levels)
PER_PAGE = 26 # iDNES returns 26 per page

# Dispositions — codes for the s-qc[subtypeFlat] query parameter
DISPOSITION_CODES = "3k|31|4k|41|5k|51|6k"

# Map a disposition parsed from a listing title to its output label
DISPOSITION_MAP = {
    "3+kk": "3+kk", "3+1": "3+1",
    "4+kk": "4+kk", "4+1": "4+1",
    "5+kk": "5+kk", "5+1": "5+1",
    "6+kk": "6+", "6+1": "6+",
    "6 a více": "6+",
}

# Browser-like headers sent with every request (see fetch_url).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "cs,en;q=0.9",
    # identity: ask for an uncompressed body so resp.read().decode() works.
    "Accept-Encoding": "identity",
    "Connection": "keep-alive",
}

BASE_URL = "https://reality.idnes.cz"

MAX_RETRIES = 5  # per-URL retry budget in fetch_url
+
+
def fetch_url(url: str) -> str:
    """Fetch *url* and return its body decoded as UTF-8.

    Connection-level failures are retried up to MAX_RETRIES times with a
    linear back-off (3 s, 6 s, ...); the final failure is re-raised.
    """
    for attempt in range(MAX_RETRIES):
        request = urllib.request.Request(url, headers=HEADERS)
        try:
            response = urllib.request.urlopen(request, timeout=30)
            body = response.read()
        except (ConnectionResetError, ConnectionError, urllib.error.URLError,
                OSError) as exc:
            if attempt == MAX_RETRIES - 1:
                raise
            wait = (attempt + 1) * 3  # 3, 6, 9, 12s
            print(f" Retry {attempt + 1}/{MAX_RETRIES} (wait {wait}s): {exc}")
            time.sleep(wait)
        else:
            return body.decode("utf-8")
+
+
def build_list_url(page: int = 0) -> str:
    """Compose the iDNES search URL with all configured filters.

    Page 0 is the first page; later pages append a ``page`` query param.
    """
    query = urllib.parse.urlencode({
        "s-qc[subtypeFlat]": DISPOSITION_CODES,
        "s-qc[usableAreaMin]": str(MIN_AREA),
    })
    url = f"{BASE_URL}/s/prodej/byty/cena-do-{MAX_PRICE}/praha/?{query}"
    return url if page <= 0 else f"{url}&page={page}"
+
+
def parse_total_count(html: str) -> int:
    """Extract the total number of listings, e.g. from "720 inzerátů".

    Returns 0 when no count is found on the page.
    """
    found = re.search(r'(\d[\d\s]*)\s*inzerát', html)
    if not found:
        return 0
    digits = found.group(1).replace(" ", "").replace("\xa0", "")
    return int(digits)
+
+
def parse_listings(html: str) -> list[dict]:
    """Parse listing cards from an iDNES result page using regexes.

    Each card is anchored by an ``<a class="c-products__link">`` pointing
    at a /detail/ URL; title, price and address are read from the
    surrounding card markup, and the 24-hex listing id from the URL.

    NOTE(review): this patch was mangled in transfer and the HTML tag
    fragments inside these regex literals were stripped — the
    ``<a>``/``</h2>``/``<strong>``/``</p>``/``</div>`` parts below are
    reconstructions; verify against live markup. An unused earlier
    ``c-products__item`` findall (dead code) was removed.
    """
    results = []

    # Card anchors: href before class ...
    link_pattern = re.compile(
        r'<a[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*class="c-products__link"[^>]*>',
        re.DOTALL
    )
    # ... and class before href.
    link_pattern2 = re.compile(
        r'<a[^>]*class="c-products__link"[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*>',
        re.DOTALL
    )

    all_links = link_pattern.findall(html) + link_pattern2.findall(html)
    seen_urls = set()

    for link_url in all_links:
        if link_url in seen_urls:
            continue
        seen_urls.add(link_url)

        # The card body runs from the link to the closing wrapper divs.
        escaped_url = re.escape(link_url)
        context_match = re.search(
            escaped_url + r'(.*?)</div>\s*</div>',
            html, re.DOTALL
        )
        if not context_match:
            continue

        block = context_match.group(1)

        # Ensure full URL
        url = link_url
        if not url.startswith("http"):
            url = BASE_URL + url

        # Skip ads: an "advertisment" marker appears shortly before the link.
        ad_check_start = max(0, context_match.start() - 500)
        ad_block = html[ad_check_start:context_match.start()]
        if "advertisment" in ad_block or "advertisement" in ad_block:
            continue

        # Title, e.g. "prodej bytu 3+kk 79 m2"
        title_match = re.search(r'class="c-products__title"[^>]*>(.*?)</h2>', block, re.DOTALL)
        title = re.sub(r'<[^>]+>', '', title_match.group(1)).strip().lower() if title_match else ""

        # Price, e.g. "<strong>12 950 000 Kč</strong>"
        price_match = re.search(r'c-products__price[^>]*>.*?<strong>(.*?)</strong>', block, re.DOTALL)
        price_text = re.sub(r'<[^>]+>', '', price_match.group(1)).strip() if price_match else ""

        # Address, e.g. "Klečkova, Praha 5 - Stodůlky"
        info_match = re.search(r'class="c-products__info"[^>]*>(.*?)</p>', block, re.DOTALL)
        info = re.sub(r'<[^>]+>', '', info_match.group(1)).strip() if info_match else ""

        # Disposition and area come from the title text.
        disp_match = re.search(r'(\d\+(?:kk|\d))', title)
        area_match = re.search(r'(\d+)\s*m[²2]', title)

        disposition = disp_match.group(1) if disp_match else None
        area = int(area_match.group(1)) if area_match else None

        if not disposition and ("6 a" in title or "6+" in title):
            disposition = "6+"

        # "cena na vyžádání" (price on request) is recorded as 0.
        price = 0
        if price_text and "vyžádání" not in price_text.lower():
            price_clean = re.sub(r'[^\d]', '', price_text)
            if price_clean:
                price = int(price_clean)

        # Detail URLs embed a 24-hex listing id; fall back to the URL itself.
        id_match = re.search(r'/([a-f0-9]{24})/?', url)
        listing_id = id_match.group(1) if id_match else url

        results.append({
            "id": listing_id,
            "url": url,
            "disposition": DISPOSITION_MAP.get(disposition, disposition or "?"),
            "area": area,
            "price": price,
            "locality": info,
        })

    return results
+
+
def parse_detail(html: str) -> dict:
    """Parse an iDNES detail page for GPS, floor, construction, ownership.

    Returns a dict with whichever of the keys "lat", "lon", "floor",
    "construction" (lowercased) and "ownership" could be extracted.

    NOTE(review): the ``<dt>``/``<dd>`` pattern below is a reconstruction —
    the tag fragments in the original regex literal were stripped when
    this patch was mangled in transfer; verify against live markup.
    """
    detail = {}

    # 1. GPS: the page pushes listing_lat/listing_lon into dataLayer.
    dl_match = re.search(
        r'dataLayer\.push\(\s*(\{[^}]+?"listing_lat"[^}]+?\})\s*\)',
        html, re.DOTALL
    )
    if dl_match:
        js_obj = dl_match.group(1)
        try:
            # Pull just the two coordinates instead of parsing the whole
            # JS object as JSON (it may not be strictly valid).
            lat_match = re.search(r'"listing_lat"\s*:\s*([\d.]+)', js_obj)
            lon_match = re.search(r'"listing_lon"\s*:\s*([\d.]+)', js_obj)
            if lat_match:
                detail["lat"] = float(lat_match.group(1))
            if lon_match:
                detail["lon"] = float(lon_match.group(1))
        except (ValueError, AttributeError):
            pass

    # 2. Parameter table: <dt>Label</dt><dd>Value</dd> pairs.
    dt_dd_pairs = re.findall(
        r'<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>',
        html, re.DOTALL
    )

    for dt, dd in dt_dd_pairs:
        dt_clean = re.sub(r'<[^>]+>', '', dt).strip().lower()
        dd_clean = re.sub(r'<[^>]+>', '', dd).strip()

        if "podlaží" in dt_clean or "podlazi" in dt_clean or "patro" in dt_clean:
            # Prefer the explicit NP number, e.g. "2. patro (3. NP)".
            np_match = re.search(r'(\d+)\.\s*NP', dd_clean)
            if np_match:
                detail["floor"] = int(np_match.group(1))
            else:
                # Fall back to the first number in the value
                # (iDNES appears to use NP directly — TODO confirm).
                patro_match = re.search(r'(\d+)', dd_clean)
                if patro_match:
                    detail["floor"] = int(patro_match.group(1))

        if "konstrukce" in dt_clean or "stavba" in dt_clean:
            detail["construction"] = dd_clean.lower()

        if "vlastnictví" in dt_clean or "vlastnictvi" in dt_clean:
            detail["ownership"] = dd_clean

    return detail
+
+
def format_price(price: int) -> str:
    """Format a CZK amount with space-separated thousands, e.g. "12 950 000 Kč".

    Uses the format-spec grouping instead of the previous hand-rolled
    three-digit slicing loop; output is identical for integer inputs.
    """
    return f"{price:,}".replace(",", " ") + " Kč"
+
+
def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
    """Load the previous run's output as a cache keyed by str(hash_id).

    Returns {} when the file is missing or unreadable, so a first run and
    a refresh behave the same.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        # TypeError guards against valid JSON that is not a list of dicts
        # (e.g. a bare number) — previously that crashed the scraper.
        return {str(e["hash_id"]): e for e in data if "hash_id" in e}
    except (json.JSONDecodeError, KeyError, TypeError):
        return {}
+
+
def scrape():
    """Scrape Reality iDNES end to end.

    Phase 1 walks the paginated search results; a pre-filter then drops
    listings on list-page data (price, area, disposition); phase 2
    fetches detail pages — reusing cached results when the price is
    unchanged — for GPS, floor, construction and ownership, and applies
    the remaining criteria. Returns the list of result dicts.
    """
    cache = load_cache()

    print("=" * 60)
    print("Stahuji inzeráty z Reality iDNES")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)

    # Step 1: Fetch listing pages (iDNES pages are 0-based)
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = {}  # id -> listing dict
    page = 0
    total = None

    while True:
        url = build_list_url(page)
        print(f" Strana {page + 1} ...")
        html = fetch_url(url)

        # Read the total once to bound the page loop.
        if total is None:
            total = parse_total_count(html)
            total_pages = math.ceil(total / PER_PAGE) if total > 0 else 1
            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")

        listings = parse_listings(html)

        if not listings:
            break

        for item in listings:
            lid = item["id"]
            if lid not in all_listings:
                all_listings[lid] = item

        page += 1
        if total and page >= math.ceil(total / PER_PAGE):
            break
        time.sleep(1.0)  # be polite between page fetches

    print(f"\n Staženo: {len(all_listings)} unikátních inzerátů")

    # Step 2: Pre-filter by price and area from list data
    pre_filtered = []
    excluded_price = 0
    excluded_area = 0
    excluded_disp = 0

    for item in all_listings.values():
        # Price 0 means "on request"/unparsed — excluded with over-budget.
        if item["price"] <= 0 or item["price"] > MAX_PRICE:
            excluded_price += 1
            continue

        if item["area"] is not None and item["area"] < MIN_AREA:
            excluded_area += 1
            continue

        # "?" means the disposition could not be parsed from the title.
        if item["disposition"] == "?":
            excluded_disp += 1
            continue

        pre_filtered.append(item)

    print(f"\nPo předfiltraci:")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Zbývá: {len(pre_filtered)}")

    # Step 3: Fetch details for GPS, floor, construction
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_no_gps = 0
    excluded_detail = 0
    cache_hits = 0

    for i, item in enumerate(pre_filtered):
        # Check cache — if hash_id exists and price unchanged, reuse
        cached = cache.get(str(item["id"]))
        if cached and cached.get("price") == item["price"]:
            cache_hits += 1
            results.append(cached)
            continue

        url = item["url"]
        time.sleep(0.4)  # be polite between detail fetches

        try:
            html = fetch_url(url)
        except Exception as e:
            print(f" Warning: detail failed for {item['id']}: {e}")
            excluded_detail += 1
            continue

        detail = parse_detail(html)

        # Must have GPS
        if not detail.get("lat") or not detail.get("lon"):
            excluded_no_gps += 1
            continue

        # Check construction — exclude panel
        construction = detail.get("construction", "")
        if "panel" in construction:
            excluded_panel += 1
            print(f" ✗ Vyloučen {item['id'][:12]}...: panel ({construction})")
            continue

        # Check for housing estates (sídliště) in the construction text
        if "sídliště" in construction or "sidliste" in construction:
            excluded_panel += 1
            print(f" ✗ Vyloučen {item['id'][:12]}...: sídliště")
            continue

        # Check floor (missing floor is kept)
        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue

        # Map the free-text construction to a Czech label
        building_type = "neuvedeno"
        if construction:
            if "cihlo" in construction or "cihla" in construction:
                building_type = "Cihlová"
            elif "smíšen" in construction or "smisen" in construction:
                building_type = "Smíšená"
            elif "skelet" in construction:
                building_type = "Skeletová"
            elif "dřevo" in construction or "drevo" in construction:
                building_type = "Dřevostavba"
            elif "mont" in construction:
                building_type = "Montovaná"
            else:
                building_type = construction.capitalize()

        result = {
            "hash_id": item["id"],
            "name": f"Prodej bytu {item['disposition']} {item.get('area', '?')} m²",
            "price": item["price"],
            "price_formatted": format_price(item["price"]),
            "locality": item["locality"],
            "lat": detail["lat"],
            "lon": detail["lon"],
            "disposition": item["disposition"],
            "floor": floor,
            "area": item["area"],
            "building_type": building_type,
            "ownership": detail.get("ownership", "neuvedeno"),
            "url": item["url"],
            "source": "idnes",
            "image": "",
        }
        results.append(result)

        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")

    print(f"\n{'=' * 60}")
    print(f"Výsledky Reality iDNES:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results
+
+
if __name__ == "__main__":
    # Entry point: run the scrape and persist results to JSON
    # (the same file load_cache() reads on the next run).
    start = time.time()
    estates = scrape()

    if estates:
        json_path = Path("byty_idnes.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z Reality iDNES neodpovídají kritériím :(")
diff --git a/scrape_psn.py b/scrape_psn.py
new file mode 100644
index 0000000..85cae64
--- /dev/null
+++ b/scrape_psn.py
@@ -0,0 +1,306 @@
+#!/usr/bin/env python3
+"""
+PSN.cz scraper.
+Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií.
+Výstup: byty_psn.json
+"""
+from __future__ import annotations
+
+import json
+import re
+import subprocess
+import time
+from pathlib import Path
+
# ── Configuration ───────────────────────────────────────────────────────────

# Hard filter criteria applied to every unit.
MAX_PRICE = 14_000_000  # CZK, inclusive upper bound
MIN_AREA = 69           # m², inclusive lower bound
MIN_FLOOR = 2           # lowest acceptable floor number

# Layouts we are interested in (3+kk and larger).
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}

# Desktop-browser User-Agent so the site serves the regular HTML.
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

BASE_URL = "https://psn.cz"

# Known Prague project slugs with GPS (from research).
# The coordinates serve as a fallback when a unit carries no GPS of its own.
PRAGUE_PROJECTS = [
    {"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
    {"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
    {"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
    {"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
    {"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
    {"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
    {"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
    {"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
    {"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
    {"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
    {"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
    {"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
    {"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
    {"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
    {"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
    {"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
]
+
+
def fetch_url(url: str) -> str:
    """Fetch URL via curl (urllib SSL too old for Cloudflare)."""
    cmd = [
        "curl", "-s", "-L", "--max-time", "30",
        "-H", f"User-Agent: {UA}",
        "-H", "Accept: text/html",
        url,
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    if proc.returncode != 0:
        # Surface a truncated stderr so a failing page is easy to diagnose.
        raise RuntimeError(f"curl failed ({proc.returncode}): {proc.stderr[:200]}")
    return proc.stdout
+
+
+def extract_units_from_html(html: str) -> list[dict]:
+ """Extract unit JSON objects from raw HTML with escaped quotes."""
+ # The HTML contains RSC data with escaped JSON: \\"key\\":\\"value\\"
+ # Step 1: Unescape the double-backslash-quotes to regular quotes
+ cleaned = html.replace('\\"', '"')
+
+ # Step 2: Find each unit by looking for "title":"Byt and walking back to {
+ units = []
+ decoder = json.JSONDecoder()
+
+ for m in re.finditer(r'"title":"Byt', cleaned):
+ pos = m.start()
+ # Walk backwards to find the opening brace
+ depth = 0
+ found = False
+ for i in range(pos - 1, max(pos - 3000, 0), -1):
+ if cleaned[i] == '}':
+ depth += 1
+ elif cleaned[i] == '{':
+ if depth == 0:
+ try:
+ obj, end = decoder.raw_decode(cleaned, i)
+ if isinstance(obj, dict) and 'price_czk' in obj:
+ units.append(obj)
+ found = True
+ except (json.JSONDecodeError, ValueError):
+ pass
+ break
+ depth -= 1
+
+ return units
+
+
def format_price(price: int) -> str:
    """Format *price* in CZK with space-separated thousands.

    E.g. ``13500000`` -> ``"13 500 000 Kč"``.

    Uses the ``,`` format spec and swaps commas for the Czech thousands
    space.  Unlike the previous manual digit-chunking loop, this also
    renders negative values correctly (the loop produced ``"- 500 Kč"``).
    """
    return f"{price:,}".replace(",", " ") + " Kč"
+
+
def scrape():
    """Scrape PSN.cz Prague projects and return filtered sale units.

    Two phases:
      1. download unit JSON from each known project's listing pages
         (paged, with a safety cap of 10 pages per project);
      2. filter by availability, category, disposition, price, area,
         floor and construction type, excluding panel buildings.

    Returns a list of normalized result dicts (same schema as the other
    scrapers: hash_id, name, price, locality, lat/lon, url, ...).
    """
    print("=" * 60)
    print("Stahuji inzeráty z PSN.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
    print("=" * 60)

    # Fetch units from each Prague project
    all_units = []

    for proj in PRAGUE_PROJECTS:
        page = 1
        project_units = []

        while True:
            url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
            print(f"  {proj['name']} — strana {page} ...")
            time.sleep(0.5)  # be polite to the server

            try:
                html = fetch_url(url)
            except Exception as e:
                # Network/curl failure: give up on this project, keep the rest.
                print(f"    Chyba: {e}")
                break

            units = extract_units_from_html(html)

            if not units:
                if page == 1:
                    print(f"    → 0 jednotek")
                break

            # Add project info to each unit
            for unit in units:
                # Fall back to the project's coordinates when a unit has no GPS.
                if not unit.get("latitude") or not unit.get("longitude"):
                    unit["latitude"] = proj["lat"]
                    unit["longitude"] = proj["lon"]
                unit["_project_name"] = proj["name"]
                unit["_project_slug"] = proj["slug"]

            project_units.extend(units)

            if page == 1:
                print(f"    → {len(units)} jednotek na stránce")

            # Check if there might be more pages
            # If we got fewer than expected or same units, stop
            if len(units) < 10:
                break

            page += 1
            if page > 10:  # Safety limit
                break

        all_units.extend(project_units)

    # Deduplicate by slug
    seen_slugs = set()
    unique_units = []
    for u in all_units:
        slug = u.get("slug", "")
        if slug and slug not in seen_slugs:
            seen_slugs.add(slug)
            unique_units.append(u)
        elif not slug:
            # Units without a slug cannot be deduplicated — keep them all.
            unique_units.append(u)

    print(f"\n  Staženo celkem: {len(unique_units)} unikátních jednotek")

    # Filter
    print(f"\nFiltrování...")
    results = []
    excluded_sold = 0
    excluded_type = 0
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_floor = 0
    excluded_panel = 0

    for unit in unique_units:
        # Only free units
        is_free = unit.get("is_free", False)
        is_sold = unit.get("is_sold", False)
        if is_sold or not is_free:
            excluded_sold += 1
            continue

        # Only apartments (and ateliers)
        category = str(unit.get("category", "")).lower()
        if "byt" not in category and "ateliér" not in category:
            excluded_type += 1
            continue

        # Disposition
        disp = unit.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue

        # Price — prefer the regular price, fall back to the action price.
        price = unit.get("price_czk") or unit.get("action_price_czk") or 0
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            continue

        # Area
        area = unit.get("total_area") or unit.get("floor_area") or 0
        if area < MIN_AREA:
            excluded_area += 1
            continue

        # Floor — may be a plain int or free text; fall back to the first
        # signed integer embedded in the string.
        floor_str = str(unit.get("floor", ""))
        floor = None
        if floor_str:
            try:
                floor = int(floor_str)
            except ValueError:
                floor_match = re.search(r'(-?\d+)', floor_str)
                if floor_match:
                    floor = int(floor_match.group(1))

        # NOTE(review): units whose floor cannot be parsed are kept, not excluded.
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue

        # Construction — check for panel
        build_type = str(unit.get("build_type", "")).lower()
        if "panel" in build_type:
            excluded_panel += 1
            print(f"  ✗ Vyloučen: panel ({build_type})")
            continue

        # Build construction label
        building_type = "neuvedeno"
        if build_type and build_type != "nevybráno":
            if "cihlo" in build_type or "cihla" in build_type:
                building_type = "Cihlová"
            elif "skelet" in build_type:
                building_type = "Skeletová"
            else:
                building_type = build_type.capitalize()

        lat = unit.get("latitude", 0)
        lon = unit.get("longitude", 0)

        slug = unit.get("slug", "")
        project_slug = unit.get("_project_slug", "")
        detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}"

        # Normalized record — schema shared with the other scrapers.
        result = {
            "hash_id": unit.get("id", slug),
            "name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}",
            "price": int(price),
            "price_formatted": format_price(int(price)),
            "locality": f"{unit.get('street', unit.get('_project_name', ''))}, Praha",
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": building_type,
            "ownership": unit.get("ownership", "neuvedeno") or "neuvedeno",
            "url": detail_url,
            "source": "psn",
            "image": "",
        }
        results.append(result)

    print(f"\n{'=' * 60}")
    print(f"Výsledky PSN:")
    print(f"  Celkem jednotek: {len(unique_units)}")
    print(f"  Vyloučeno (prodáno): {excluded_sold}")
    print(f"  Vyloučeno (typ): {excluded_type}")
    print(f"  Vyloučeno (dispozice): {excluded_disp}")
    print(f"  Vyloučeno (cena): {excluded_price}")
    print(f"  Vyloučeno (plocha): {excluded_area}")
    print(f"  Vyloučeno (patro): {excluded_floor}")
    print(f"  Vyloučeno (panel): {excluded_panel}")
    print(f"  ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results
+
+
if __name__ == "__main__":
    started_at = time.time()
    flats = scrape()

    if not flats:
        print("\nŽádné byty z PSN neodpovídají kritériím :(")
    else:
        # Persist the filtered flats for downstream use.
        out_file = Path("byty_psn.json")
        out_file.write_text(
            json.dumps(flats, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - started_at
        print(f"\n✓ Data uložena: {out_file.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
diff --git a/scrape_realingo.py b/scrape_realingo.py
new file mode 100644
index 0000000..45484df
--- /dev/null
+++ b/scrape_realingo.py
@@ -0,0 +1,311 @@
+#!/usr/bin/env python3
+"""
+Realingo.cz scraper.
+Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
+Výstup: byty_realingo.json
+"""
+from __future__ import annotations
+
+import json
+import math
+import re
+import time
+import urllib.request
+from pathlib import Path
+
# ── Configuration (shared with the Sreality scraper) ────────────────────────

# Hard filter criteria applied to every listing.
MAX_PRICE = 13_500_000  # CZK, inclusive upper bound
MIN_AREA = 69           # m², inclusive lower bound
MIN_FLOOR = 2           # lowest acceptable floor number
PER_PAGE = 40  # Realingo returns 40 listings per page

# Categories we want (layouts 3+kk and larger)
WANTED_CATEGORIES = {
    "FLAT3_KK", "FLAT31",  # 3+kk, 3+1
    "FLAT4_KK", "FLAT41",  # 4+kk, 4+1
    "FLAT5_KK", "FLAT51",  # 5+kk, 5+1
    "FLAT6",               # 6+
    "OTHERS_FLAT",         # atypical — area is checked separately
}

# Mapping category → human-readable label
CATEGORY_LABELS = {
    "FLAT1_KK": "1+kk", "FLAT11": "1+1",
    "FLAT2_KK": "2+kk", "FLAT21": "2+1",
    "FLAT3_KK": "3+kk", "FLAT31": "3+1",
    "FLAT4_KK": "4+kk", "FLAT41": "4+1",
    "FLAT5_KK": "5+kk", "FLAT51": "5+1",
    "FLAT6": "6+",
    "OTHERS_FLAT": "Atypický",
}

# Desktop-browser headers so the site serves the regular HTML.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml",
}

BASE_URL = "https://www.realingo.cz"
+
+
def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
    """Fetch a page of Prague listings. Returns (items, total_count).

    Parses the JSON state that Next.js embeds in the page's
    ``__NEXT_DATA__`` script tag.

    BUG FIX: the search pattern was an empty string (``r''``), which always
    matches at position 0 and has no capture group, so ``match.group(1)``
    raised ``IndexError`` on every call.  Restored the ``__NEXT_DATA__``
    script-tag pattern (attribute list matched loosely).
    """
    if page == 1:
        url = f"{BASE_URL}/prodej_byty/praha/"
    else:
        url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/"

    req = urllib.request.Request(url, headers=HEADERS)
    resp = urllib.request.urlopen(req, timeout=30)
    html = resp.read().decode("utf-8")

    match = re.search(
        r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>',
        html, re.DOTALL
    )
    if not match:
        return [], 0

    data = json.loads(match.group(1))
    offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
    return offer_list["data"], offer_list["total"]
+
+
def fetch_detail(listing_url: str) -> dict | None:
    """Fetch detail page for a listing to get floor, building type, etc.

    Returns the first entry of the Next.js store's ``offer.details`` map,
    or ``None`` on any failure (logged as a warning, never raised).

    BUG FIX: as in ``fetch_listing_page``, the regex pattern was empty, so
    ``match.group(1)`` raised on every call; restored the ``__NEXT_DATA__``
    script-tag pattern.
    """
    try:
        url = f"{BASE_URL}{listing_url}"
        req = urllib.request.Request(url, headers=HEADERS)
        resp = urllib.request.urlopen(req, timeout=30)
        html = resp.read().decode("utf-8")

        match = re.search(
            r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            return None

        data = json.loads(match.group(1))
        details = data["props"]["pageProps"]["store"]["offer"]["details"]
        # Get first (only) detail entry
        for detail_data in details.values():
            return detail_data
    except Exception as e:
        print(f"  Warning: detail fetch failed for {listing_url}: {e}")
    return None
+
+
def format_price(price: int) -> str:
    """Format *price* in CZK with space-separated thousands.

    E.g. ``13500000`` -> ``"13 500 000 Kč"``.

    Uses the ``,`` format spec and swaps commas for the Czech thousands
    space.  Unlike the previous manual digit-chunking loop, this also
    renders negative values correctly (the loop produced ``"- 500 Kč"``).
    """
    return f"{price:,}".replace(",", " ") + " Kč"
+
+
def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
    """Load previously scraped data as cache keyed by hash_id.

    Missing or unreadable files yield an empty cache — a fresh run simply
    re-fetches everything.
    """
    cache_file = Path(json_path)
    if not cache_file.exists():
        return {}
    try:
        entries = json.loads(cache_file.read_text(encoding="utf-8"))
        return {entry["hash_id"]: entry for entry in entries if "hash_id" in entry}
    except (json.JSONDecodeError, KeyError):
        return {}
+
+
def scrape():
    """Scrape Realingo.cz Prague sale listings and return filtered flats.

    Phase 1 pages through the listing index and pre-filters on data already
    present there (category, price, area, GPS).  Phase 2 fetches each
    remaining listing's detail page (floor, building type, ownership),
    reusing the previous run's JSON output as a cache whenever the listing's
    price is unchanged.  Returns a list of normalized result dicts.
    """
    cache = load_cache()

    print("=" * 60)
    print("Stahuji inzeráty z Realingo.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)

    # Step 1: Fetch all listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = []
    page = 1
    total = None

    while True:
        print(f"  Strana {page} ...")
        items, total_count = fetch_listing_page(page)
        if total is None:
            # First page reports the overall count → derive page count once.
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            print(f"  → Celkem {total} inzerátů, {total_pages} stran")

        if not items:
            break

        all_listings.extend(items)
        page += 1
        if page > total_pages:
            break
        time.sleep(0.5)  # be polite to the server

    print(f"\n  Staženo: {len(all_listings)} inzerátů")

    # Step 2: Pre-filter by category, price, area from listing data
    pre_filtered = []
    excluded_category = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0

    for item in all_listings:
        cat = item.get("category", "")
        if cat not in WANTED_CATEGORIES:
            excluded_category += 1
            continue

        price = item.get("price", {}).get("total", 0) or 0
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            continue

        # NOTE(review): listings with no area stated stay in — only a
        # known-too-small area excludes here.
        area = item.get("area", {}).get("main")
        if area is not None and area < MIN_AREA:
            excluded_area += 1
            continue

        loc = item.get("location", {})
        if not loc.get("latitude") or not loc.get("longitude"):
            excluded_no_gps += 1
            continue

        pre_filtered.append(item)

    print(f"\nPo předfiltraci:")
    print(f"  Vyloučeno (dispozice): {excluded_category}")
    print(f"  Vyloučeno (cena): {excluded_price}")
    print(f"  Vyloučeno (plocha): {excluded_area}")
    print(f"  Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f"  Zbývá: {len(pre_filtered)}")

    # Step 3: Fetch details for remaining listings (floor, building type)
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0

    for i, item in enumerate(pre_filtered):
        # Check cache — if hash_id exists and price unchanged, reuse
        item_id = int(item["id"])
        item_price = item.get("price", {}).get("total", 0) or 0
        cached = cache.get(item_id)
        if cached and cached.get("price") == item_price:
            cache_hits += 1
            results.append(cached)
            continue

        time.sleep(0.3)  # throttle detail requests
        detail_data = fetch_detail(item["url"])

        if not detail_data:
            excluded_detail += 1
            continue

        # The payload shape varies: try offer.detail, then a top-level
        # "detail" key.
        detail = detail_data.get("offer", {}).get("detail", {})
        if not detail and "detail" in detail_data:
            detail = detail_data["detail"]

        # Check building type — exclude panel
        building_type = detail.get("buildingType", "")
        if building_type == "PANEL":
            excluded_panel += 1
            print(f"  ✗ Vyloučen #{item['id']}: panel")
            continue

        # Check building position — exclude housing estates ("sídliště")
        building_position = detail.get("buildingPosition", "")
        if building_position and "ESTATE" in str(building_position).upper():
            excluded_panel += 1
            print(f"  ✗ Vyloučen #{item['id']}: sídliště")
            continue

        # Check floor — NOTE(review): a missing floor keeps the listing in.
        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue

        # Map building-type / ownership enums → Czech labels
        bt_map = {
            "BRICK": "Cihlová",
            "PANEL": "Panelová",
            "WOOD": "Dřevostavba",
            "STEEL": "Ocelová",
            "MIXED": "Smíšená",
            "MONTAGE": "Montovaná",
        }
        ownership_map = {
            "PRIVATE": "Osobní",
            "COOPERATIVE": "Družstevní",
            "STATE": "Státní/obecní",
        }

        cat = item.get("category", "")
        loc = item.get("location", {})

        # Normalized record — schema shared with the other scrapers.
        result = {
            "hash_id": int(item["id"]),
            "name": f"Prodej bytu {CATEGORY_LABELS.get(cat, '?')} {item.get('area', {}).get('main', '?')} m²",
            "price": item.get("price", {}).get("total", 0),
            "price_formatted": format_price(item.get("price", {}).get("total", 0)),
            "locality": loc.get("address", "Praha"),
            "lat": loc["latitude"],
            "lon": loc["longitude"],
            "disposition": CATEGORY_LABELS.get(cat, "?"),
            "floor": floor,
            "area": item.get("area", {}).get("main"),
            "building_type": bt_map.get(building_type, building_type or "neuvedeno"),
            "ownership": ownership_map.get(detail.get("ownership", ""), detail.get("ownership") or "neuvedeno"),
            "url": f"{BASE_URL}{item['url']}",
            "source": "realingo",
            "image": "",
        }
        results.append(result)

        if (i + 1) % 20 == 0:
            print(f"  Zpracováno {i + 1}/{len(pre_filtered)} ...")

    print(f"\n{'=' * 60}")
    print(f"Výsledky Realingo:")
    print(f"  Předfiltrováno: {len(pre_filtered)}")
    print(f"  Z cache (přeskočeno): {cache_hits}")
    print(f"  Vyloučeno (panel/síd): {excluded_panel}")
    print(f"  Vyloučeno (patro): {excluded_floor}")
    print(f"  Vyloučeno (bez detailu): {excluded_detail}")
    print(f"  ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results
+
+
if __name__ == "__main__":
    started_at = time.time()
    flats = scrape()

    if not flats:
        print("\nŽádné byty z Realinga neodpovídají kritériím :(")
    else:
        # Persist the filtered flats — also serves as next run's cache.
        out_file = Path("byty_realingo.json")
        out_file.write_text(
            json.dumps(flats, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - started_at
        print(f"\n✓ Data uložena: {out_file.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")