Upload files to "/"

v1 scrapery
This commit is contained in:
2026-02-13 16:11:28 +00:00
parent 82d1f94104
commit 846d0bd9f2
5 changed files with 1760 additions and 0 deletions

351
scrape_bezrealitky.py Normal file
View File

@@ -0,0 +1,351 @@
#!/usr/bin/env python3
"""
Bezrealitky.cz scraper.
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
Výstup: byty_bezrealitky.json
"""
from __future__ import annotations
import json
import math
import re
import time
import urllib.request
from pathlib import Path
# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 13_500_000   # price cap in CZK
MIN_AREA = 69            # minimum area — presumably m²; TODO confirm units
MIN_FLOOR = 2            # minimum floor (NP = above-ground floor numbering)
PER_PAGE = 15  # Bezrealitky returns 15 results per page
# Flat layouts (dispositions) we want to keep
WANTED_DISPOSITIONS = {
    "DISP_3_KK", "DISP_3_1",
    "DISP_4_KK", "DISP_4_1",
    "DISP_5_KK", "DISP_5_1",
    "DISP_6",
    "DISP_OTHER",  # atypical layouts
}
# API disposition code -> human-readable Czech label
DISPOSITION_LABELS = {
    "DISP_1_KK": "1+kk", "DISP_1_1": "1+1",
    "DISP_2_KK": "2+kk", "DISP_2_1": "2+1",
    "DISP_3_KK": "3+kk", "DISP_3_1": "3+1",
    "DISP_4_KK": "4+kk", "DISP_4_1": "4+1",
    "DISP_5_KK": "5+kk", "DISP_5_1": "5+1",
    "DISP_6": "6+",
    "DISP_OTHER": "Atypický",
}
# Construction type code -> Czech label
CONSTRUCTION_MAP = {
    "BRICK": "Cihlová",
    "PANEL": "Panelová",
    "WOOD": "Dřevostavba",
    "MIXED": "Smíšená",
    "MONTAGE": "Montovaná",
    "STEEL": "Ocelová",
}
# Ownership code -> Czech label
OWNERSHIP_MAP = {
    "OSOBNI": "Osobní",
    "DRUZSTEVNI": "Družstevní",
    "STATNI": "Státní/obecní",
}
# Browser-like headers to avoid trivial bot blocking
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}
BASE_URL = "https://www.bezrealitky.cz"
def fetch_page(page: int) -> tuple[list[dict], int]:
    """
    Fetch one listing page from Bezrealitky.

    Parses the Next.js __NEXT_DATA__ JSON blob embedded in the HTML and
    extracts Advert entries from the Apollo client cache.

    Args:
        page: 1-based listing page number.

    Returns:
        (adverts, total): raw Advert dicts from the Apollo cache and the
        total advert count reported by the server (0 if not found).
    """
    url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
    req = urllib.request.Request(url, headers=HEADERS)
    # Context manager closes the HTTP connection (the original leaked the
    # response object).
    with urllib.request.urlopen(req, timeout=30) as resp:
        html = resp.read().decode("utf-8")
    match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html, re.DOTALL
    )
    if not match:
        return [], 0
    data = json.loads(match.group(1))
    cache = data["props"]["pageProps"]["apolloCache"]
    # Adverts live in the cache under keys like "Advert:<id>"
    adverts = [
        val for key, val in cache.items()
        if key.startswith("Advert:") and isinstance(val, dict)
        and val.get("__typename") == "Advert"
    ]
    # Total count is stored on ROOT_QUERY under a listAdverts(...) key
    total = 0
    root = cache.get("ROOT_QUERY", {})
    for key, val in root.items():
        if "listAdverts" in key and isinstance(val, dict):
            tc = val.get("totalCount")
            if tc and tc > total:
                total = tc
    return adverts, total
def fetch_detail(uri: str) -> dict | None:
    """
    Fetch the detail page for one listing and return its full Advert dict.

    Detail pages embed a richer Apollo cache entry (construction, etage,
    ownership, ...) than the listing page.  Returns None when the page
    cannot be fetched or no detailed Advert entry is found; any error is
    printed and swallowed (best-effort scraping).
    """
    try:
        url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
        req = urllib.request.Request(url, headers=HEADERS)
        # with-block closes the connection (the original leaked it)
        with urllib.request.urlopen(req, timeout=30) as resp:
            html = resp.read().decode("utf-8")
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            return None
        data = json.loads(match.group(1))
        cache = data["props"]["pageProps"]["apolloCache"]
        # Find the full advert in the cache
        for key, val in cache.items():
            if key.startswith("Advert:") and isinstance(val, dict):
                # Only detail pages carry these extra fields
                if "construction" in val or "etage" in val or "ownership" in val:
                    return val
    except Exception as e:
        print(f" Warning: detail failed for {uri}: {e}")
    return None
def format_price(price: int) -> str:
    """Render *price* with a space between each 3-digit group, e.g. 13 500 000."""
    text = str(price)
    # Slice 3-char groups from the right, then join them left-to-right.
    # NOTE(review): the original appended an empty string — possibly a lost
    # " Kč" suffix; behavior (no suffix) is preserved here.
    groups = [text[max(i - 3, 0):i] for i in range(len(text), 0, -3)]
    return " ".join(reversed(groups))
def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
    """
    Load previously scraped listings as a cache keyed by hash_id.

    Returns an empty dict when the file is missing or unreadable, so the
    scraper simply re-fetches everything.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        return {e["hash_id"]: e for e in data if "hash_id" in e}
    except (json.JSONDecodeError, KeyError, TypeError, AttributeError, OSError):
        # Original caught only JSONDecodeError/KeyError; a cache file that
        # holds a non-list (TypeError/AttributeError) or a read failure
        # (OSError) would crash.  Corrupt cache is non-fatal: start fresh.
        return {}
def scrape():
    """
    Scrape Bezrealitky sale listings for Prague and filter them.

    Pipeline: (1) page through the listing index, (2) pre-filter on
    disposition/price/area/GPS from list data, (3) fetch each remaining
    detail page and drop panel buildings, housing estates and low floors.
    Previously saved results are reused when the price is unchanged.

    Returns a list of normalized result dicts (see the `result` literal).
    """
    cache = load_cache()
    print("=" * 60)
    print("Stahuji inzeráty z Bezrealitky.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA}")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)
    # Step 1: Fetch all listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_adverts = {}  # id -> advert dict (dedup)
    page = 1
    total = None
    while True:
        print(f" Strana {page} ...")
        adverts, total_count = fetch_page(page)
        if total is None and total_count > 0:
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")
        if not adverts:
            break
        for adv in adverts:
            adv_id = adv.get("id")
            if adv_id and adv_id not in all_adverts:
                all_adverts[adv_id] = adv
        page += 1
        if total and page > math.ceil(total / PER_PAGE):
            break
        time.sleep(0.5)  # be polite to the server
    print(f"\n Staženo: {len(all_adverts)} unikátních inzerátů")
    # Step 2: Pre-filter by disposition, price, area from list data
    pre_filtered = []
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0
    for adv in all_adverts.values():
        disp = adv.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue
        price = adv.get("price", 0) or 0
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            continue
        surface = adv.get("surface")
        # Listings without an area are kept — only a known-too-small area excludes
        if surface is not None and surface < MIN_AREA:
            excluded_area += 1
            continue
        gps = adv.get("gps", {})
        if not gps or not gps.get("lat") or not gps.get("lng"):
            excluded_no_gps += 1
            continue
        pre_filtered.append(adv)
    print(f"\nPo předfiltraci:")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Zbývá: {len(pre_filtered)}")
    # Step 3: Fetch details
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0
    for i, adv in enumerate(pre_filtered):
        uri = adv.get("uri", "")
        if not uri:
            excluded_detail += 1
            continue
        # Check cache — if hash_id exists and price unchanged, reuse
        adv_id = int(adv["id"])
        adv_price = adv.get("price", 0) or 0
        cached = cache.get(adv_id)
        if cached and cached.get("price") == adv_price:
            cache_hits += 1
            results.append(cached)
            continue
        time.sleep(0.4)  # throttle detail requests
        detail = fetch_detail(uri)
        if not detail:
            excluded_detail += 1
            continue
        # Check construction — exclude panel buildings
        construction = detail.get("construction", "")
        if construction == "PANEL":
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: panel")
            continue
        # Check situation — exclude housing estates (sídliště)
        situation = detail.get("situation", "")
        if situation and "HOUSING_ESTATE" in str(situation).upper():
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: sídliště")
            continue
        # Check floor (etage); missing floor is kept
        etage = detail.get("etage")
        if etage is not None and etage < MIN_FLOOR:
            excluded_floor += 1
            continue
        gps = adv.get("gps", {})
        disp = adv.get("disposition", "")
        # Get address — the Apollo cache key includes a locale parameter,
        # e.g. 'address({"locale":"CS"})'; prefer the variant without
        # house number, then any address key, then the list-page fallback.
        address = ""
        for key in detail:
            if key.startswith("address(") and "withHouseNumber" not in key:
                address = detail[key]
                break
        if not address:
            for key in detail:
                if key.startswith("address("):
                    address = detail[key]
                    break
        if not address:
            address = adv.get('address({"locale":"CS"})', "Praha")
        result = {
            "hash_id": int(adv["id"]),
            "name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {adv.get('surface', '?')}",
            "price": adv.get("price", 0),
            "price_formatted": format_price(adv.get("price", 0)),
            "locality": address,
            "lat": gps["lat"],
            "lon": gps["lng"],
            "disposition": DISPOSITION_LABELS.get(disp, "?"),
            "floor": etage,
            "area": adv.get("surface"),
            "building_type": CONSTRUCTION_MAP.get(construction, construction or "neuvedeno"),
            "ownership": OWNERSHIP_MAP.get(detail.get("ownership", ""), detail.get("ownership") or "neuvedeno"),
            "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
            "source": "bezrealitky",
            "image": "",
        }
        results.append(result)
        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")
    print(f"\n{'=' * 60}")
    print(f"Výsledky Bezrealitky:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")
    return results
if __name__ == "__main__":
    # CLI entry point: scrape and persist results as pretty-printed JSON.
    start = time.time()
    estates = scrape()
    if estates:
        json_path = Path("byty_bezrealitky.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z Bezrealitek neodpovídají kritériím :(")

328
scrape_cityhome.py Normal file
View File

@@ -0,0 +1,328 @@
#!/usr/bin/env python3
"""
CityHome (city-home.cz) scraper.
Stáhne byty na prodej v Praze z projektů CityHome/SATPO.
Výstup: byty_cityhome.json
"""
from __future__ import annotations
import json
import re
import time
import urllib.request
from pathlib import Path
# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 14_000_000   # price cap in CZK
MIN_AREA = 69            # minimum area — presumably m²; TODO confirm units
MIN_FLOOR = 2            # minimum floor (NP = above-ground floor numbering)
# Flat layouts (dispositions) to keep
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
# Browser-like headers to avoid trivial bot blocking
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}
BASE_URL = "https://www.city-home.cz"
def fetch_url(url: str) -> str:
    """
    Fetch *url* and return the decoded HTML.

    Retries up to 3 times on connection errors with linear backoff
    (2 s, 4 s); the last failure is re-raised.
    """
    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers=HEADERS)
            # Context manager closes the connection (the original leaked the
            # response object).  urllib.error is reachable because importing
            # urllib.request pulls it in.
            with urllib.request.urlopen(req, timeout=30) as resp:
                return resp.read().decode("utf-8")
        except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
            if attempt < 2:
                time.sleep((attempt + 1) * 2)
                print(f" Retry {attempt + 1}: {e}")
            else:
                raise
def format_price(price: int) -> str:
    """Render *price* with a space between each 3-digit group, e.g. 13 500 000."""
    text = str(price)
    # Slice 3-char groups from the right, then join them left-to-right.
    # NOTE(review): the original appended an empty string — possibly a lost
    # " Kč" suffix; behavior (no suffix) is preserved here.
    groups = [text[max(i - 3, 0):i] for i in range(len(text), 0, -3)]
    return " ".join(reversed(groups))
def parse_filter_page(html: str) -> list[dict]:
    """
    Parse all listing rows from the CityHome filter page.

    Each unit is a <tr> carrying data-* attributes (price, area, unit
    type, availability, project, transaction, disposition, location); the
    cells hold the detail link, the floor ("3.NP" / "2.PP") and the
    project name.

    Returns a list of plain dicts, one per row with a data-cena attribute.
    """
    listings = []
    # Find each <tr> tag together with all of its attributes.  (The
    # original also built an unused row regex and a loop whose result was
    # discarded — dead code, removed.)
    for match in re.finditer(r'<tr\s+([^>]*data-cena="[^"]*"[^>]*)>(.*?)</tr>', html, re.DOTALL):
        attrs_str = match.group(1)
        row_content = match.group(2)
        # Pull the individual data-* attributes out of the tag.
        cena = re.search(r'data-cena="(\d+)"', attrs_str)
        plocha = re.search(r'data-plocha="([\d.]+)"', attrs_str)
        unittype = re.search(r'data-unittype="(\d+)"', attrs_str)
        free = re.search(r'data-free="(yes|no)"', attrs_str)
        project = re.search(r'data-project="(\d+)"', attrs_str)
        transaction = re.search(r'data-transaction="([^"]*)"', attrs_str)
        dispozition = re.search(r'data-dispozition="([^"]*)"', attrs_str)
        location = re.search(r'data-location="([^"]*)"', attrs_str)
        if not cena:
            continue
        # Detail URL and unit name come from the first <a> in the row.
        link_match = re.search(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', row_content, re.DOTALL)
        detail_url = link_match.group(1).strip() if link_match else ""
        unit_name = re.sub(r'<[^>]+>', '', link_match.group(2)).strip() if link_match else ""
        if detail_url and not detail_url.startswith("http"):
            detail_url = BASE_URL + detail_url
        # Floor: look for "3.NP" (above ground) or "2.PP" (underground).
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
        floor = None
        project_name = ""
        for cell in cells:
            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
            np_match = re.search(r'(\d+)\.\s*NP', cell_text)
            pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
            if np_match:
                floor = int(np_match.group(1))
            elif pp_match:
                floor = -int(pp_match.group(1))  # underground floor
        # Project name: first textual cell that is not a number, a price,
        # a floor marker or the unit name itself.
        # NOTE(review): the original tested `"" not in cell_text`, which is
        # always False (the empty string is a substring of everything), so
        # project_name was never set — the currency symbols ("Kč", "€")
        # were evidently lost; restored here.
        for cell in cells:
            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
            if (cell_text and not re.match(r'^[\d\s.,]+$', cell_text)
                    and "NP" not in cell_text and "PP" not in cell_text
                    and "Kč" not in cell_text and "€" not in cell_text
                    and "EUR" not in cell_text and "CZK" not in cell_text):
                if len(cell_text) > 3 and cell_text != unit_name:
                    project_name = cell_text
                    break
        listing = {
            "price": int(cena.group(1)),
            "area": float(plocha.group(1)) if plocha else 0,
            "unittype": int(unittype.group(1)) if unittype else 0,
            "free": free.group(1) if free else "no",
            "project_id": project.group(1) if project else "",
            "transaction": transaction.group(1) if transaction else "",
            "disposition": dispozition.group(1) if dispozition else "",
            "location": location.group(1) if location else "",
            "url": detail_url,
            "unit_name": unit_name,
            "floor": floor,
            "project_name": project_name,
        }
        listings.append(listing)
    return listings
def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
    """Extract project GPS coordinates from a locality page's embedded JS."""
    # JS array entries look like: ['<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name']
    marker = re.compile(r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'")
    return {
        hit.group(1).strip(): (float(hit.group(2)), float(hit.group(3)))
        for hit in marker.finditer(html)
    }
def scrape():
    """
    Scrape CityHome unit listings for Prague and filter them.

    Pipeline: (1) fetch the single filter page holding every unit,
    (2) resolve project GPS from each project's locality page,
    (3) filter on availability, type, disposition, price, area, floor
    and GPS presence.

    Returns a list of normalized result dicts.
    """
    print("=" * 60)
    print("Stahuji inzeráty z CityHome (city-home.cz)")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA}")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print("=" * 60)
    # Step 1: Fetch the main filter page
    print("\nFáze 1: Stahování seznamu bytů...")
    html = fetch_url(f"{BASE_URL}/filtr-nemovitosti1")
    all_listings = parse_filter_page(html)
    print(f" Nalezeno: {len(all_listings)} jednotek")
    # Step 2: Collect unique project slugs from detail URLs to fetch GPS
    print("\nFáze 2: Stahování GPS souřadnic projektů...")
    project_slugs = set()
    for listing in all_listings:
        url = listing.get("url", "")
        # e.g. /projekty/zateckych-14/nabidka-nemovitosti/byt-a31
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        if slug_match:
            project_slugs.add(slug_match.group(1))
    # Fetch GPS for each project from its locality page
    project_gps = {}
    for slug in sorted(project_slugs):
        time.sleep(0.5)  # be polite to the server
        try:
            locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
            loc_html = fetch_url(locality_url)
            gps = extract_project_gps(loc_html)
            if gps:
                # Take first entry (the project itself)
                first_name, (lat, lon) = next(iter(gps.items()))
                project_gps[slug] = (lat, lon)
                print(f"{slug}: {lat}, {lon}")
            else:
                print(f"{slug}: GPS nenalezeno")
        except Exception as e:
            # Best-effort: a failed locality page only loses that project.
            print(f"{slug}: chyba ({e})")
    # Step 3: Filter listings
    print(f"\nFáze 3: Filtrování...")
    results = []
    excluded_sold = 0
    excluded_type = 0
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_floor = 0
    excluded_no_gps = 0
    for listing in all_listings:
        # Only available units
        if listing["free"] != "yes":
            excluded_sold += 1
            continue
        # Only apartments (unittype=2)
        if listing["unittype"] != 2:
            excluded_type += 1
            continue
        # Only sales
        if listing["transaction"] != "prodej":
            excluded_type += 1
            continue
        # Disposition
        disp = listing["disposition"]
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue
        # Price
        price = listing["price"]
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            continue
        # Area
        area = listing["area"]
        if area < MIN_AREA:
            excluded_area += 1
            continue
        # Floor (None = unknown, kept)
        floor = listing["floor"]
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue
        # GPS comes from the project, keyed by its URL slug
        url = listing.get("url", "")
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        slug = slug_match.group(1) if slug_match else ""
        gps = project_gps.get(slug)
        if not gps:
            excluded_no_gps += 1
            continue
        lat, lon = gps
        result = {
            "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
            "name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
            "price": price,
            "price_formatted": format_price(price),
            "locality": f"{listing['project_name']}, Praha",
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": "Cihlová",  # CityHome renovates brick buildings
            "ownership": "neuvedeno",
            "url": url,
            "source": "cityhome",
            "image": "",
        }
        results.append(result)
    print(f"\n{'=' * 60}")
    print(f"Výsledky CityHome:")
    print(f" Celkem jednotek: {len(all_listings)}")
    print(f" Vyloučeno (prodáno): {excluded_sold}")
    print(f" Vyloučeno (typ): {excluded_type}")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")
    return results
if __name__ == "__main__":
    # CLI entry point: scrape and persist results as pretty-printed JSON.
    start = time.time()
    estates = scrape()
    if estates:
        json_path = Path("byty_cityhome.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z CityHome neodpovídají kritériím :(")

464
scrape_idnes.py Normal file
View File

@@ -0,0 +1,464 @@
#!/usr/bin/env python3
"""
Reality iDNES scraper.
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
Výstup: byty_idnes.json
"""
from __future__ import annotations
import json
import math
import re
import time
import urllib.request
import urllib.parse
from html.parser import HTMLParser
from pathlib import Path
# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 13_500_000   # price cap in CZK
MIN_AREA = 69            # minimum area — presumably m²; TODO confirm units
MIN_FLOOR = 2            # minimum floor (NP = above-ground floor numbering)
PER_PAGE = 26  # iDNES returns 26 results per page
# Disposition codes for the s-qc[subtypeFlat] query parameter
DISPOSITION_CODES = "3k|31|4k|41|5k|51|6k"
# Disposition as written in the listing title -> normalized label
DISPOSITION_MAP = {
    "3+kk": "3+kk", "3+1": "3+1",
    "4+kk": "4+kk", "4+1": "4+1",
    "5+kk": "5+kk", "5+1": "5+1",
    "6+kk": "6+", "6+1": "6+",
    "6 a více": "6+",
}
# Browser-like headers; "identity" encoding keeps responses un-gzipped
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "cs,en;q=0.9",
    "Accept-Encoding": "identity",
    "Connection": "keep-alive",
}
BASE_URL = "https://reality.idnes.cz"
MAX_RETRIES = 5  # network retry attempts in fetch_url
def fetch_url(url: str) -> str:
    """
    Fetch *url* and return decoded HTML, retrying on connection errors.

    Retries up to MAX_RETRIES times with linear backoff (3, 6, 9, 12 s);
    the final failure is re-raised to the caller.
    """
    for attempt in range(MAX_RETRIES):
        try:
            req = urllib.request.Request(url, headers=HEADERS)
            # Context manager closes the connection (the original leaked
            # the response object).
            with urllib.request.urlopen(req, timeout=30) as resp:
                return resp.read().decode("utf-8")
        except (ConnectionResetError, ConnectionError, urllib.error.URLError,
                OSError) as e:
            if attempt < MAX_RETRIES - 1:
                wait = (attempt + 1) * 3  # 3, 6, 9, 12s
                print(f" Retry {attempt + 1}/{MAX_RETRIES} (wait {wait}s): {e}")
                time.sleep(wait)
            else:
                raise
def build_list_url(page: int = 0) -> str:
    """Build the filtered listing URL (price cap, dispositions, min area)."""
    query = urllib.parse.urlencode({
        "s-qc[subtypeFlat]": DISPOSITION_CODES,
        "s-qc[usableAreaMin]": str(MIN_AREA),
    })
    url = f"{BASE_URL}/s/prodej/byty/cena-do-{MAX_PRICE}/praha/?{query}"
    # Page 0 is the unpaginated first page; only later pages get &page=N.
    return url if page <= 0 else f"{url}&page={page}"
def parse_total_count(html: str) -> int:
    """
    Extract the total listing count, e.g. from "720 inzerátů".

    Returns 0 when no count is found.
    """
    match = re.search(r'(\d[\d\s]*)\s*inzerát', html)
    if not match:
        return 0
    # The captured group may contain any whitespace (space, NBSP, newline);
    # the original stripped only " " and "\xa0" and int() could fail on a
    # newline inside the number.
    return int(re.sub(r'\s', '', match.group(1)))
def parse_listings(html: str) -> list[dict]:
    """
    Parse listing cards from a result page using regexes.

    Strategy: find every c-products__link detail anchor (href before or
    after the class attribute), then parse the card block that follows
    each link for title, price and address.  Ad cards are skipped.
    (The original also ran a `re.findall` whose result was never used —
    dead work on every page, removed.)

    Returns a list of dicts with id/url/disposition/area/price/locality.
    """
    results = []
    # href attribute before class ...
    link_pattern = re.compile(
        r'<a[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*class="c-products__link"[^>]*>',
        re.DOTALL
    )
    # ... and class before href
    link_pattern2 = re.compile(
        r'<a[^>]*class="c-products__link"[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*>',
        re.DOTALL
    )
    all_links = link_pattern.findall(html) + link_pattern2.findall(html)
    seen_urls = set()
    for link_url in all_links:
        if link_url in seen_urls:
            continue
        seen_urls.add(link_url)
        # Grab the card block that follows this link.
        escaped_url = re.escape(link_url)
        context_match = re.search(
            escaped_url + r'(.*?)</div>\s*</div>',
            html, re.DOTALL
        )
        if not context_match:
            continue
        block = context_match.group(1)
        # Ensure full URL
        url = link_url
        if not url.startswith("http"):
            url = BASE_URL + url
        # Skip advertisement cards (marker sits shortly before the link).
        ad_check_start = max(0, context_match.start() - 500)
        ad_block = html[ad_check_start:context_match.start()]
        if "advertisment" in ad_block or "advertisement" in ad_block:
            continue
        # Title: <h2 class="c-products__title">prodej bytu 3+kk 79 m2</h2>
        title_match = re.search(r'class="c-products__title"[^>]*>(.*?)</h2>', block, re.DOTALL)
        title = re.sub(r'<[^>]+>', '', title_match.group(1)).strip().lower() if title_match else ""
        # Price: <p class="c-products__price"><strong>12 950 000 Kč</strong></p>
        price_match = re.search(r'c-products__price[^>]*>.*?<strong>(.*?)</strong>', block, re.DOTALL)
        price_text = re.sub(r'<[^>]+>', '', price_match.group(1)).strip() if price_match else ""
        # Address: <p class="c-products__info">Klečkova, Praha 5 - Stodůlky</p>
        info_match = re.search(r'class="c-products__info"[^>]*>(.*?)</p>', block, re.DOTALL)
        info = re.sub(r'<[^>]+>', '', info_match.group(1)).strip() if info_match else ""
        # Disposition ("3+kk") and area ("79 m2") parsed out of the title.
        disp_match = re.search(r'(\d\+(?:kk|\d))', title)
        area_match = re.search(r'(\d+)\s*m[²2]', title)
        disposition = disp_match.group(1) if disp_match else None
        area = int(area_match.group(1)) if area_match else None
        if not disposition and ("6 a" in title or "6+" in title):
            disposition = "6+"
        # Price text "12 950 000 Kč" -> 12950000; "na vyžádání" -> 0.
        price = 0
        if price_text and "vyžádání" not in price_text.lower():
            price_clean = re.sub(r'[^\d]', '', price_text)
            if price_clean:
                price = int(price_clean)
        # 24-hex listing id from the URL (fall back to the URL itself).
        id_match = re.search(r'/([a-f0-9]{24})/?', url)
        listing_id = id_match.group(1) if id_match else url
        results.append({
            "id": listing_id,
            "url": url,
            "disposition": DISPOSITION_MAP.get(disposition, disposition or "?"),
            "area": area,
            "price": price,
            "locality": info,
        })
    return results
def parse_detail(html: str) -> dict:
    """
    Parse an iDNES detail page for GPS, floor, construction and ownership.

    Returns a dict with any of the keys "lat", "lon", "floor",
    "construction" (lowercased) and "ownership" that could be extracted;
    missing data simply leaves the key out.
    """
    detail = {}
    # 1. GPS comes from a dataLayer.push({...}) JS call that contains
    #    "listing_lat" / "listing_lon".
    dl_match = re.search(
        r'dataLayer\.push\(\s*(\{[^}]+?"listing_lat"[^}]+?\})\s*\)',
        html, re.DOTALL
    )
    if dl_match:
        # The JS object is only JSON-like; rather than parsing it fully,
        # pull the two coordinates out with targeted regexes.
        js_obj = dl_match.group(1)
        try:
            lat_match = re.search(r'"listing_lat"\s*:\s*([\d.]+)', js_obj)
            lon_match = re.search(r'"listing_lon"\s*:\s*([\d.]+)', js_obj)
            if lat_match:
                detail["lat"] = float(lat_match.group(1))
            if lon_match:
                detail["lon"] = float(lon_match.group(1))
        except (ValueError, AttributeError):
            pass
    # 2. Attribute table: <dt>Label</dt><dd>Value</dd> pairs hold floor,
    #    construction and ownership.
    dt_dd_pairs = re.findall(
        r'<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>',
        html, re.DOTALL
    )
    for dt, dd in dt_dd_pairs:
        dt_clean = re.sub(r'<[^>]+>', '', dt).strip().lower()
        dd_clean = re.sub(r'<[^>]+>', '', dd).strip()
        if "podlaží" in dt_clean or "podlazi" in dt_clean or "patro" in dt_clean:
            # Values look like "2. patro (3. NP)" or "3. podlaží z celkem 5".
            # Prefer the explicit "X. NP" form when present.
            np_match = re.search(r'(\d+)\.\s*NP', dd_clean)
            if np_match:
                detail["floor"] = int(np_match.group(1))
            else:
                # Fall back to the first number — iDNES appears to use NP
                # numbering directly; TODO confirm against live pages.
                patro_match = re.search(r'(\d+)', dd_clean)
                if patro_match:
                    detail["floor"] = int(patro_match.group(1))
        if "konstrukce" in dt_clean or "stavba" in dt_clean:
            detail["construction"] = dd_clean.lower()
        if "vlastnictví" in dt_clean or "vlastnictvi" in dt_clean:
            detail["ownership"] = dd_clean
    return detail
def format_price(price: int) -> str:
    """Render *price* with a space between each 3-digit group, e.g. 13 500 000."""
    text = str(price)
    # Slice 3-char groups from the right, then join them left-to-right.
    # NOTE(review): the original appended an empty string — possibly a lost
    # " Kč" suffix; behavior (no suffix) is preserved here.
    groups = [text[max(i - 3, 0):i] for i in range(len(text), 0, -3)]
    return " ".join(reversed(groups))
def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
    """
    Load previously scraped listings as a cache keyed by hash_id (string).

    Returns an empty dict when the file is missing or unreadable, so the
    scraper simply re-fetches everything.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        return {str(e["hash_id"]): e for e in data if "hash_id" in e}
    except (json.JSONDecodeError, KeyError, TypeError, AttributeError, OSError):
        # Original caught only JSONDecodeError/KeyError; a non-list cache
        # (TypeError/AttributeError) or a read failure (OSError) crashed.
        return {}
def scrape():
    """
    Scrape Reality iDNES sale listings for Prague and filter them.

    Pipeline: (1) page through the filtered listing URL, (2) pre-filter on
    price/area/disposition from list data, (3) fetch detail pages for
    GPS/floor/construction and drop panel buildings, housing estates and
    low floors.  Cached results are reused when the price is unchanged.

    Returns a list of normalized result dicts.
    """
    cache = load_cache()
    print("=" * 60)
    print("Stahuji inzeráty z Reality iDNES")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA}")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)
    # Step 1: Fetch listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = {}  # id -> listing dict
    page = 0
    total = None
    while True:
        url = build_list_url(page)
        print(f" Strana {page + 1} ...")
        html = fetch_url(url)
        if total is None:
            total = parse_total_count(html)
            total_pages = math.ceil(total / PER_PAGE) if total > 0 else 1
            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")
        listings = parse_listings(html)
        if not listings:
            break
        for item in listings:
            lid = item["id"]
            if lid not in all_listings:
                all_listings[lid] = item
        page += 1
        if total and page >= math.ceil(total / PER_PAGE):
            break
        time.sleep(1.0)  # be polite to the server
    print(f"\n Staženo: {len(all_listings)} unikátních inzerátů")
    # Step 2: Pre-filter by price and area from list data
    pre_filtered = []
    excluded_price = 0
    excluded_area = 0
    excluded_disp = 0
    for item in all_listings.values():
        if item["price"] <= 0 or item["price"] > MAX_PRICE:
            excluded_price += 1
            continue
        # Unknown area (None) is kept; only a known-too-small area excludes
        if item["area"] is not None and item["area"] < MIN_AREA:
            excluded_area += 1
            continue
        if item["disposition"] == "?":
            excluded_disp += 1
            continue
        pre_filtered.append(item)
    print(f"\nPo předfiltraci:")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Zbývá: {len(pre_filtered)}")
    # Step 3: Fetch details for GPS, floor, construction
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_no_gps = 0
    excluded_detail = 0
    cache_hits = 0
    for i, item in enumerate(pre_filtered):
        # Check cache — if hash_id exists and price unchanged, reuse
        cached = cache.get(str(item["id"]))
        if cached and cached.get("price") == item["price"]:
            cache_hits += 1
            results.append(cached)
            continue
        url = item["url"]
        time.sleep(0.4)  # throttle detail requests
        try:
            html = fetch_url(url)
        except Exception as e:
            print(f" Warning: detail failed for {item['id']}: {e}")
            excluded_detail += 1
            continue
        detail = parse_detail(html)
        # Must have GPS
        if not detail.get("lat") or not detail.get("lon"):
            excluded_no_gps += 1
            continue
        # Check construction — exclude panel buildings
        construction = detail.get("construction", "")
        if "panel" in construction:
            excluded_panel += 1
            print(f" ✗ Vyloučen {item['id'][:12]}...: panel ({construction})")
            continue
        # Check for housing estates (sídliště) mentioned in construction
        if "sídliště" in construction or "sidliste" in construction:
            excluded_panel += 1
            print(f" ✗ Vyloučen {item['id'][:12]}...: sídliště")
            continue
        # Check floor (None = unknown, kept)
        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue
        # Map the free-text construction value to a Czech label
        building_type = "neuvedeno"
        if construction:
            if "cihlo" in construction or "cihla" in construction:
                building_type = "Cihlová"
            elif "smíšen" in construction or "smisen" in construction:
                building_type = "Smíšená"
            elif "skelet" in construction:
                building_type = "Skeletová"
            elif "dřevo" in construction or "drevo" in construction:
                building_type = "Dřevostavba"
            elif "mont" in construction:
                building_type = "Montovaná"
            else:
                building_type = construction.capitalize()
        result = {
            "hash_id": item["id"],
            "name": f"Prodej bytu {item['disposition']} {item.get('area', '?')}",
            "price": item["price"],
            "price_formatted": format_price(item["price"]),
            "locality": item["locality"],
            "lat": detail["lat"],
            "lon": detail["lon"],
            "disposition": item["disposition"],
            "floor": floor,
            "area": item["area"],
            "building_type": building_type,
            "ownership": detail.get("ownership", "neuvedeno"),
            "url": item["url"],
            "source": "idnes",
            "image": "",
        }
        results.append(result)
        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")
    print(f"\n{'=' * 60}")
    print(f"Výsledky Reality iDNES:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")
    return results
if __name__ == "__main__":
    # CLI entry point: scrape and persist results as pretty-printed JSON.
    start = time.time()
    estates = scrape()
    if estates:
        json_path = Path("byty_idnes.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z Reality iDNES neodpovídají kritériím :(")

306
scrape_psn.py Normal file
View File

@@ -0,0 +1,306 @@
#!/usr/bin/env python3
"""
PSN.cz scraper.
Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií.
Výstup: byty_psn.json
"""
from __future__ import annotations
import json
import re
import subprocess
import time
from pathlib import Path
# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 14_000_000   # price cap in CZK
MIN_AREA = 69            # minimum area — presumably m²; TODO confirm units
MIN_FLOOR = 2            # minimum floor (NP = above-ground floor numbering)
# Flat layouts (dispositions) to keep
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
# Browser User-Agent passed to curl
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
BASE_URL = "https://psn.cz"
# Known Prague project slugs with GPS (from research)
PRAGUE_PROJECTS = [
    {"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
    {"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
    {"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
    {"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
    {"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
    {"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
    {"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
    {"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
    {"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
    {"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
    {"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
    {"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
    {"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
    {"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
    {"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
    {"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
]
def fetch_url(url: str) -> str:
    """Fetch URL via curl (urllib SSL too old for Cloudflare)."""
    cmd = [
        "curl", "-s", "-L", "--max-time", "30",
        "-H", f"User-Agent: {UA}",
        "-H", "Accept: text/html",
        url,
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    # Non-zero exit means curl itself failed (timeout, DNS, TLS, ...).
    if proc.returncode == 0:
        return proc.stdout
    raise RuntimeError(f"curl failed ({proc.returncode}): {proc.stderr[:200]}")
def extract_units_from_html(html: str) -> list[dict]:
    """Extract unit JSON objects from raw HTML with escaped quotes.

    The page embeds RSC data whose JSON quotes are escaped (\\" sequences).
    After unescaping, each unit is located by its '"title":"Byt' marker and
    the surrounding JSON object is decoded by walking backwards to its
    opening brace.

    Args:
        html: raw page HTML containing the escaped JSON payload.

    Returns:
        All decoded unit dicts that carry a 'price_czk' key.
    """
    # Step 1: Unescape the double-backslash-quotes to regular quotes
    cleaned = html.replace('\\"', '"')
    # Step 2: Find each unit by looking for "title":"Byt and walking back to {
    units: list[dict] = []
    decoder = json.JSONDecoder()
    for m in re.finditer(r'"title":"Byt', cleaned):
        pos = m.start()
        # Walk backwards (bounded ~3000-char window) to the '{' that opens
        # the object containing this "title" key; nested objects that close
        # before `pos` are skipped via the depth counter.
        # NOTE: stop is max(pos - 3001, -1) so index 0 is still visited when
        # the object starts at the very beginning of the payload — range()
        # excludes its stop value (the old `max(pos - 3000, 0)` missed it).
        depth = 0
        for i in range(pos - 1, max(pos - 3001, -1), -1):
            ch = cleaned[i]
            if ch == '}':
                depth += 1
            elif ch == '{':
                if depth == 0:
                    try:
                        obj, _ = decoder.raw_decode(cleaned, i)
                        if isinstance(obj, dict) and 'price_czk' in obj:
                            units.append(obj)
                    except (json.JSONDecodeError, ValueError):
                        # Not a valid JSON object at this brace — skip marker.
                        pass
                    break
                depth -= 1
    return units
def format_price(price: int) -> str:
    """Render an integer price with spaces as thousands separators."""
    # Format-spec grouping emits ',' — swap it for the Czech thousands space.
    return f"{price:,}".replace(",", " ") + ""
def scrape():
    """Scrape PSN.cz Prague projects and return units passing all filters.

    Two phases:
      1. Download — page through every project in PRAGUE_PROJECTS (network
         via fetch_url + extract_units_from_html), tag each unit with its
         project, then deduplicate by unit slug.
      2. Filter — keep only free (unsold) apartments with a wanted
         disposition, price <= MAX_PRICE, area >= MIN_AREA, floor >=
         MIN_FLOOR and non-panel construction.

    Returns:
        list[dict]: normalized records (hash_id, name, price, GPS, url, ...)
        ready to be dumped to byty_psn.json.
    """
    print("=" * 60)
    print("Stahuji inzeráty z PSN.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA}")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
    print("=" * 60)
    # Fetch units from each Prague project
    all_units = []
    for proj in PRAGUE_PROJECTS:
        page = 1
        project_units = []
        while True:
            url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
            print(f"  {proj['name']} — strana {page} ...")
            # Be polite to the server between page fetches.
            time.sleep(0.5)
            try:
                html = fetch_url(url)
            except Exception as e:
                # Network/curl failure: give up on this project, keep others.
                print(f"    Chyba: {e}")
                break
            units = extract_units_from_html(html)
            if not units:
                if page == 1:
                    print(f"    → 0 jednotek")
                break
            # Add project info to each unit
            for unit in units:
                # Fall back to the project's GPS when the unit has none.
                if not unit.get("latitude") or not unit.get("longitude"):
                    unit["latitude"] = proj["lat"]
                    unit["longitude"] = proj["lon"]
                unit["_project_name"] = proj["name"]
                unit["_project_slug"] = proj["slug"]
            project_units.extend(units)
            if page == 1:
                print(f"    → {len(units)} jednotek na stránce")
            # Check if there might be more pages
            # If we got fewer than expected or same units, stop
            if len(units) < 10:
                break
            page += 1
            if page > 10:  # Safety limit
                break
        all_units.extend(project_units)
    # Deduplicate by slug
    seen_slugs = set()
    unique_units = []
    for u in all_units:
        slug = u.get("slug", "")
        if slug and slug not in seen_slugs:
            seen_slugs.add(slug)
            unique_units.append(u)
        elif not slug:
            # Units without a slug cannot be deduplicated — keep them all.
            unique_units.append(u)
    print(f"\n  Staženo celkem: {len(unique_units)} unikátních jednotek")
    # Filter
    print(f"\nFiltrování...")
    results = []
    excluded_sold = 0
    excluded_type = 0
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_floor = 0
    excluded_panel = 0
    for unit in unique_units:
        # Only free units
        is_free = unit.get("is_free", False)
        is_sold = unit.get("is_sold", False)
        if is_sold or not is_free:
            excluded_sold += 1
            continue
        # Only apartments
        category = str(unit.get("category", "")).lower()
        if "byt" not in category and "ateliér" not in category:
            excluded_type += 1
            continue
        # Disposition
        disp = unit.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue
        # Price — a discounted (action) price counts as the price.
        price = unit.get("price_czk") or unit.get("action_price_czk") or 0
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            continue
        # Area
        area = unit.get("total_area") or unit.get("floor_area") or 0
        if area < MIN_AREA:
            excluded_area += 1
            continue
        # Floor — may be a plain number or free text; take the first integer.
        floor_str = str(unit.get("floor", ""))
        floor = None
        if floor_str:
            try:
                floor = int(floor_str)
            except ValueError:
                floor_match = re.search(r'(-?\d+)', floor_str)
                if floor_match:
                    floor = int(floor_match.group(1))
        # Unknown floor is NOT excluded — only a known floor below the limit.
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue
        # Construction — check for panel
        build_type = str(unit.get("build_type", "")).lower()
        if "panel" in build_type:
            excluded_panel += 1
            print(f"  ✗ Vyloučen: panel ({build_type})")
            continue
        # Build construction label
        building_type = "neuvedeno"
        if build_type and build_type != "nevybráno":
            if "cihlo" in build_type or "cihla" in build_type:
                building_type = "Cihlová"
            elif "skelet" in build_type:
                building_type = "Skeletová"
            else:
                building_type = build_type.capitalize()
        lat = unit.get("latitude", 0)
        lon = unit.get("longitude", 0)
        slug = unit.get("slug", "")
        project_slug = unit.get("_project_slug", "")
        detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}"
        # Normalized record shared with the other scrapers (source tags differ).
        result = {
            "hash_id": unit.get("id", slug),
            "name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}",
            "price": int(price),
            "price_formatted": format_price(int(price)),
            "locality": f"{unit.get('street', unit.get('_project_name', ''))}, Praha",
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": building_type,
            "ownership": unit.get("ownership", "neuvedeno") or "neuvedeno",
            "url": detail_url,
            "source": "psn",
            "image": "",
        }
        results.append(result)
    print(f"\n{'=' * 60}")
    print(f"Výsledky PSN:")
    print(f"  Celkem jednotek: {len(unique_units)}")
    print(f"  Vyloučeno (prodáno): {excluded_sold}")
    print(f"  Vyloučeno (typ): {excluded_type}")
    print(f"  Vyloučeno (dispozice): {excluded_disp}")
    print(f"  Vyloučeno (cena): {excluded_price}")
    print(f"  Vyloučeno (plocha): {excluded_area}")
    print(f"  Vyloučeno (patro): {excluded_floor}")
    print(f"  Vyloučeno (panel): {excluded_panel}")
    print(f"  ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")
    return results
if __name__ == "__main__":
start = time.time()
estates = scrape()
if estates:
json_path = Path("byty_psn.json")
json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8",
)
elapsed = time.time() - start
print(f"\n✓ Data uložena: {json_path.resolve()}")
print(f"⏱ Celkový čas: {elapsed:.0f} s")
else:
print("\nŽádné byty z PSN neodpovídají kritériím :(")

311
scrape_realingo.py Normal file
View File

@@ -0,0 +1,311 @@
#!/usr/bin/env python3
"""
Realingo.cz scraper.
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
Výstup: byty_realingo.json
"""
from __future__ import annotations
import json
import math
import re
import time
import urllib.request
from pathlib import Path
# ── Konfigurace (sdílená se Sreality scraperem) ─────────────────────────────
MAX_PRICE = 13_500_000
MIN_AREA = 69
MIN_FLOOR = 2
PER_PAGE = 40 # Realingo vrací 40 na stránku
# Kategorie které chceme (dispozice 3+kk a větší)
WANTED_CATEGORIES = {
"FLAT3_KK", "FLAT31", # 3+kk, 3+1
"FLAT4_KK", "FLAT41", # 4+kk, 4+1
"FLAT5_KK", "FLAT51", # 5+kk, 5+1
"FLAT6", # 6+
"OTHERS_FLAT", # atypické — zkontrolujeme plochu
}
# Mapování category → label
CATEGORY_LABELS = {
"FLAT1_KK": "1+kk", "FLAT11": "1+1",
"FLAT2_KK": "2+kk", "FLAT21": "2+1",
"FLAT3_KK": "3+kk", "FLAT31": "3+1",
"FLAT4_KK": "4+kk", "FLAT41": "4+1",
"FLAT5_KK": "5+kk", "FLAT51": "5+1",
"FLAT6": "6+",
"OTHERS_FLAT": "Atypický",
}
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
"Accept": "text/html,application/xhtml+xml",
}
BASE_URL = "https://www.realingo.cz"
def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
    """Fetch a page of Prague listings. Returns (items, total_count).

    Parses the Next.js `__NEXT_DATA__` JSON blob embedded in the page and
    pulls the offer list out of the serialized store.

    Args:
        page: 1-based page index; page 1 has no suffix in the URL.

    Returns:
        (items, total_count), or ([], 0) when the JSON blob is missing
        (layout change or error page).
    """
    if page == 1:
        url = f"{BASE_URL}/prodej_byty/praha/"
    else:
        url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/"
    req = urllib.request.Request(url, headers=HEADERS)
    # Context manager closes the HTTP response even if read() raises
    # (the previous version leaked the connection).
    with urllib.request.urlopen(req, timeout=30) as resp:
        html = resp.read().decode("utf-8")
    match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html, re.DOTALL
    )
    if not match:
        return [], 0
    data = json.loads(match.group(1))
    offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
    return offer_list["data"], offer_list["total"]
def fetch_detail(listing_url: str) -> dict | None:
    """Fetch detail page for a listing to get floor, building type, etc.

    The detail data lives in the `__NEXT_DATA__` blob under
    store.offer.details, keyed by offer id.

    Args:
        listing_url: site-relative listing path (appended to BASE_URL).

    Returns:
        The first (only) detail entry, or None when the page cannot be
        fetched or parsed — callers count those as excluded.
    """
    try:
        url = f"{BASE_URL}{listing_url}"
        req = urllib.request.Request(url, headers=HEADERS)
        # Close the HTTP response deterministically (was leaked before).
        with urllib.request.urlopen(req, timeout=30) as resp:
            html = resp.read().decode("utf-8")
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            return None
        data = json.loads(match.group(1))
        details = data["props"]["pageProps"]["store"]["offer"]["details"]
        # Get first (only) detail entry
        for detail_data in details.values():
            return detail_data
    except Exception as e:
        print(f"    Warning: detail fetch failed for {listing_url}: {e}")
    # Empty details dict, or an exception above: explicit None (was implicit).
    return None
def format_price(price: int) -> str:
    """Render an integer price with spaces as thousands separators."""
    # Format-spec grouping emits ',' — swap it for the Czech thousands space.
    return f"{price:,}".replace(",", " ") + ""
def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
"""Load previously scraped data as cache keyed by hash_id."""
path = Path(json_path)
if not path.exists():
return {}
try:
data = json.loads(path.read_text(encoding="utf-8"))
return {e["hash_id"]: e for e in data if "hash_id" in e}
except (json.JSONDecodeError, KeyError):
return {}
def scrape():
    """Scrape Realingo.cz Prague sale listings and return filtered records.

    Three phases:
      1. Page through the listing index (fetch_listing_page).
      2. Pre-filter cheaply on data already present in the index:
         category/disposition, price, area, GPS presence.
      3. Fetch each surviving listing's detail page (fetch_detail) for
         floor / building type / ownership and apply the remaining filters;
         the JSON output of the previous run acts as a cache so unchanged
         listings skip the detail request.

    Returns:
        list[dict]: normalized records ready for byty_realingo.json.
    """
    cache = load_cache()
    print("=" * 60)
    print("Stahuji inzeráty z Realingo.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA}")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)
    # Step 1: Fetch all listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = []
    page = 1
    total = None
    while True:
        print(f"  Strana {page} ...")
        items, total_count = fetch_listing_page(page)
        # First response tells us the total; derive the page count from it.
        if total is None:
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            print(f"  → Celkem {total} inzerátů, {total_pages} stran")
        if not items:
            break
        all_listings.extend(items)
        page += 1
        if page > total_pages:
            break
        # Be polite to the server between page fetches.
        time.sleep(0.5)
    print(f"\n  Staženo: {len(all_listings)} inzerátů")
    # Step 2: Pre-filter by category, price, area from listing data
    pre_filtered = []
    excluded_category = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0
    for item in all_listings:
        cat = item.get("category", "")
        if cat not in WANTED_CATEGORIES:
            excluded_category += 1
            continue
        price = item.get("price", {}).get("total", 0) or 0
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            continue
        # Unknown area is allowed through — only a known-too-small one fails.
        area = item.get("area", {}).get("main")
        if area is not None and area < MIN_AREA:
            excluded_area += 1
            continue
        loc = item.get("location", {})
        if not loc.get("latitude") or not loc.get("longitude"):
            excluded_no_gps += 1
            continue
        pre_filtered.append(item)
    print(f"\nPo předfiltraci:")
    print(f"  Vyloučeno (dispozice): {excluded_category}")
    print(f"  Vyloučeno (cena): {excluded_price}")
    print(f"  Vyloučeno (plocha): {excluded_area}")
    print(f"  Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f"  Zbývá: {len(pre_filtered)}")
    # Step 3: Fetch details for remaining listings (floor, building type)
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0
    for i, item in enumerate(pre_filtered):
        # Check cache — if hash_id exists and price unchanged, reuse
        item_id = int(item["id"])
        item_price = item.get("price", {}).get("total", 0) or 0
        cached = cache.get(item_id)
        if cached and cached.get("price") == item_price:
            cache_hits += 1
            results.append(cached)
            continue
        # Throttle detail requests.
        time.sleep(0.3)
        detail_data = fetch_detail(item["url"])
        if not detail_data:
            excluded_detail += 1
            continue
        # The detail payload shape varies — try the nested path first.
        detail = detail_data.get("offer", {}).get("detail", {})
        if not detail and "detail" in detail_data:
            detail = detail_data["detail"]
        # Check building type — exclude panel
        building_type = detail.get("buildingType", "")
        if building_type == "PANEL":
            excluded_panel += 1
            print(f"  ✗ Vyloučen #{item['id']}: panel")
            continue
        # Check building position — exclude sídliště
        building_position = detail.get("buildingPosition", "")
        if building_position and "ESTATE" in str(building_position).upper():
            excluded_panel += 1
            print(f"  ✗ Vyloučen #{item['id']}: sídliště")
            continue
        # Check floor — unknown floor is allowed through.
        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue
        # Map building type
        bt_map = {
            "BRICK": "Cihlová",
            "PANEL": "Panelová",
            "WOOD": "Dřevostavba",
            "STEEL": "Ocelová",
            "MIXED": "Smíšená",
            "MONTAGE": "Montovaná",
        }
        ownership_map = {
            "PRIVATE": "Osobní",
            "COOPERATIVE": "Družstevní",
            "STATE": "Státní/obecní",
        }
        cat = item.get("category", "")
        loc = item.get("location", {})
        # Normalized record shared with the other scrapers (source tags differ).
        result = {
            "hash_id": int(item["id"]),
            "name": f"Prodej bytu {CATEGORY_LABELS.get(cat, '?')} {item.get('area', {}).get('main', '?')}",
            "price": item.get("price", {}).get("total", 0),
            "price_formatted": format_price(item.get("price", {}).get("total", 0)),
            "locality": loc.get("address", "Praha"),
            "lat": loc["latitude"],
            "lon": loc["longitude"],
            "disposition": CATEGORY_LABELS.get(cat, "?"),
            "floor": floor,
            "area": item.get("area", {}).get("main"),
            "building_type": bt_map.get(building_type, building_type or "neuvedeno"),
            "ownership": ownership_map.get(detail.get("ownership", ""), detail.get("ownership") or "neuvedeno"),
            "url": f"{BASE_URL}{item['url']}",
            "source": "realingo",
            "image": "",
        }
        results.append(result)
        # Progress indicator every 20 detail fetches.
        if (i + 1) % 20 == 0:
            print(f"  Zpracováno {i + 1}/{len(pre_filtered)} ...")
    print(f"\n{'=' * 60}")
    print(f"Výsledky Realingo:")
    print(f"  Předfiltrováno: {len(pre_filtered)}")
    print(f"  Z cache (přeskočeno): {cache_hits}")
    print(f"  Vyloučeno (panel/síd): {excluded_panel}")
    print(f"  Vyloučeno (patro): {excluded_floor}")
    print(f"  Vyloučeno (bez detailu): {excluded_detail}")
    print(f"  ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")
    return results
if __name__ == "__main__":
start = time.time()
estates = scrape()
if estates:
json_path = Path("byty_realingo.json")
json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8",
)
elapsed = time.time() - start
print(f"\n✓ Data uložena: {json_path.resolve()}")
print(f"⏱ Celkový čas: {elapsed:.0f} s")
else:
print("\nŽádné byty z Realinga neodpovídají kritériím :(")