maru-hleda-byt/scrape_cityhome.py

#!/usr/bin/env python3
"""
CityHome (city-home.cz) scraper.
Stáhne byty na prodej v Praze z projektů CityHome/SATPO.
Výstup: byty_cityhome.json
"""
from __future__ import annotations

import json
import re
import time
import urllib.error
import urllib.request
from pathlib import Path

# ── Configuration ───────────────────────────────────────────────────────────

# Upper price bound used by scrape(); printed with the " Kč" suffix via format_price().
MAX_PRICE = 14_000_000
# Lower area bound used by scrape(); printed in the banner as m².
MIN_AREA = 69
# Lowest accepted floor number; scrape() keeps listings whose floor is unknown.
MIN_FLOOR = 2

# Layout labels that scrape() accepts; compared against the row's data-dispozition value.
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}

# HTTP request headers that fetch_url() attaches to every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}

# Site root: prepended to relative detail links and used to build the page URLs.
BASE_URL = "https://www.city-home.cz"


def fetch_url(url: str) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    Up to three attempts are made. When the first or second attempt fails
    with a connection error, a timeout or a ``URLError``, the error is
    reported and the function waits 2 s (then 4 s) before trying again.

    Args:
        url: Absolute URL to download; sent with the module-level ``HEADERS``.

    Returns:
        The decoded response body.

    Raises:
        urllib.error.URLError, ConnectionError, TimeoutError: The error of
            the third attempt is re-raised unchanged.
        UnicodeDecodeError: If the body is not valid UTF-8 (not retried).
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            req = urllib.request.Request(url, headers=HEADERS)
            # The context manager closes the connection even when read() or
            # decode() raises, so failed attempts do not leak sockets.
            with urllib.request.urlopen(req, timeout=30) as resp:
                return resp.read().decode("utf-8")
        except (ConnectionError, TimeoutError, urllib.error.URLError) as e:
            if attempt == max_attempts:
                raise
            # Report first so the message is visible during the back-off wait.
            print(f"    Retry {attempt}: {e}")
            time.sleep(attempt * 2)


def format_price(price: int) -> str:
    """Format *price* with space-separated thousands and a " Kč" suffix.

    The sign of a negative number stays attached to its first digit group.

    >>> format_price(14000000)
    '14 000 000 Kč'
    >>> format_price(-123456)
    '-123 456 Kč'
    """
    # The "," format option groups digits by three; swap its comma for a space.
    return f"{price:,}".replace(",", " ") + " Kč"


def _strip_tags(fragment: str) -> str:
    """Return *fragment* without HTML tags and without surrounding whitespace."""
    return re.sub(r'<[^>]+>', '', fragment).strip()


def parse_filter_page(html: str) -> list[dict]:
    """Parse every unit row of the filter page into a listing dict.

    A unit row is a ``<tr>`` whose opening tag carries a ``data-cena``
    attribute. Each ``data-*`` attribute is looked up on its own, so the
    order of attributes inside the tag does not matter. Rows whose
    ``data-cena`` value is not a plain integer are skipped.

    Args:
        html: Markup of the filter page.

    Returns:
        One dict per accepted row with the keys ``price`` (int), ``area``
        (float, ``0`` when missing), ``unittype`` (int, ``0`` when missing),
        ``free`` (``"yes"``/``"no"``, ``"no"`` when missing), ``project_id``,
        ``transaction``, ``disposition``, ``location`` (strings, ``""`` when
        missing), ``url`` (first link of the row, made absolute with
        ``BASE_URL``), ``unit_name`` (text of that link), ``floor`` (int or
        ``None``) and ``project_name`` (``""`` when no cell qualifies).
    """
    # Substrings that mark a cell as a floor, area or price rather than a name.
    non_name_markers = ("NP", "PP", "m²", "Kč", "EUR", "CZK")
    listings = []

    for match in re.finditer(r'<tr\s+([^>]*data-cena="[^"]*"[^>]*)>(.*?)</tr>', html, re.DOTALL):
        attrs_str = match.group(1)
        row_content = match.group(2)

        cena = re.search(r'data-cena="(\d+)"', attrs_str)
        if not cena:
            continue

        plocha = re.search(r'data-plocha="([\d.]+)"', attrs_str)
        unittype = re.search(r'data-unittype="(\d+)"', attrs_str)
        free = re.search(r'data-free="(yes|no)"', attrs_str)
        project = re.search(r'data-project="(\d+)"', attrs_str)
        transaction = re.search(r'data-transaction="([^"]*)"', attrs_str)
        dispozition = re.search(r'data-dispozition="([^"]*)"', attrs_str)
        location = re.search(r'data-location="([^"]*)"', attrs_str)

        # The first link in the row supplies the detail URL and the unit name.
        link_match = re.search(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', row_content, re.DOTALL)
        detail_url = link_match.group(1).strip() if link_match else ""
        unit_name = _strip_tags(link_match.group(2)) if link_match else ""

        if detail_url and not detail_url.startswith("http"):
            detail_url = BASE_URL + detail_url

        cell_texts = [
            _strip_tags(cell)
            for cell in re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
        ]

        # Floor: "N.NP" gives N, "N.PP" gives -N (underground). When several
        # cells match, the last one wins.
        floor = None
        for cell_text in cell_texts:
            np_match = re.search(r'(\d+)\.\s*NP', cell_text)
            if np_match:
                floor = int(np_match.group(1))
                continue
            pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
            if pp_match:
                floor = -int(pp_match.group(1))

        # Project name: the first cell longer than three characters that is
        # neither the unit name, nor purely numeric, nor a floor/area/price cell.
        project_name = ""
        for cell_text in cell_texts:
            if len(cell_text) <= 3 or cell_text == unit_name:
                continue
            if re.match(r'^[\d\s.,]+$', cell_text):
                continue
            if any(marker in cell_text for marker in non_name_markers):
                continue
            project_name = cell_text
            break

        listings.append({
            "price": int(cena.group(1)),
            "area": float(plocha.group(1)) if plocha else 0,
            "unittype": int(unittype.group(1)) if unittype else 0,
            "free": free.group(1) if free else "no",
            "project_id": project.group(1) if project else "",
            "transaction": transaction.group(1) if transaction else "",
            "disposition": dispozition.group(1) if dispozition else "",
            "location": location.group(1) if location else "",
            "url": detail_url,
            "unit_name": unit_name,
            "floor": floor,
            "project_name": project_name,
        })

    return listings


def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
    """Collect marker coordinates from a locality page, keyed by project name.

    The page's JavaScript holds entries shaped like
    ``['...<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name']``. The
    heading text (stripped) becomes the key and the two following quoted
    numbers become a ``(lat, lon)`` float pair. If a name occurs more than
    once, the coordinates of its last occurrence are kept.
    """
    marker_re = re.compile(
        r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'"
    )
    return {
        heading.strip(): (float(latitude), float(longitude))
        for heading, latitude, longitude in marker_re.findall(html)
    }


def _project_slug(url: str) -> str:
    """Return the project slug found in a detail URL, or "" when there is none.

    Example path: /projekty/zateckych-14/nabidka-nemovitosti/byt-a31
    """
    slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
    return slug_match.group(1) if slug_match else ""


def scrape() -> list[dict]:
    """Download the CityHome unit list and return the units that pass the filters.

    Runs in three phases and prints progress plus a summary to stdout:

    1. Fetch the filter page and parse every unit row.
    2. For each project slug found in the detail URLs, fetch the project's
       locality page and take the first coordinates found there. A failure
       for one project is reported and does not stop the run.
    3. Keep units that are free, are apartments (``unittype == 2``), are for
       sale, have a wanted disposition, cost at most ``MAX_PRICE``, measure
       at least ``MIN_AREA``, sit on floor ``MIN_FLOOR`` or higher (units
       with an unknown floor are kept) and whose project has coordinates.

    Returns:
        A list of result dicts (``hash_id``, ``name``, ``price``,
        ``price_formatted``, ``locality``, ``lat``, ``lon``, ``disposition``,
        ``floor``, ``area``, ``building_type``, ``ownership``, ``url``,
        ``source``, ``image``); empty when nothing matches.

    Raises:
        Whatever ``fetch_url`` raises if the filter page cannot be downloaded.
    """
    print("=" * 60)
    print("Stahuji inzeráty z CityHome (city-home.cz)")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print("=" * 60)

    # Phase 1: fetch the main filter page.
    print("\nFáze 1: Stahování seznamu bytů...")
    html = fetch_url(f"{BASE_URL}/filtr-nemovitosti1")
    all_listings = parse_filter_page(html)
    print(f"  Nalezeno: {len(all_listings)} jednotek")

    # Phase 2: collect the unique project slugs and fetch coordinates for each.
    print("\nFáze 2: Stahování GPS souřadnic projektů...")
    project_slugs = {_project_slug(listing.get("url", "")) for listing in all_listings}
    project_slugs.discard("")  # listings whose URL carries no slug

    project_gps = {}
    for slug in sorted(project_slugs):
        time.sleep(0.5)  # pause between consecutive requests
        try:
            loc_html = fetch_url(f"{BASE_URL}/projekty/{slug}/lokalita")
            gps = extract_project_gps(loc_html)
        except Exception as e:
            # Best effort: one broken locality page must not abort the run.
            print(f"  ✗ {slug}: chyba ({e})")
            continue

        if gps:
            # Take the first entry (the project itself).
            lat, lon = next(iter(gps.values()))
            project_gps[slug] = (lat, lon)
            print(f"  ✓ {slug}: {lat}, {lon}")
        else:
            print(f"  ✗ {slug}: GPS nenalezeno")

    # Phase 3: filter the listings.
    print("\nFáze 3: Filtrování...")
    results = []
    excluded_sold = 0
    excluded_type = 0
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_floor = 0
    excluded_no_gps = 0

    for listing in all_listings:
        # Only available units.
        if listing["free"] != "yes":
            excluded_sold += 1
            continue

        # Only apartments (unittype=2).
        if listing["unittype"] != 2:
            excluded_type += 1
            continue

        # Only sales; counted together with the unit-type exclusions.
        if listing["transaction"] != "prodej":
            excluded_type += 1
            continue

        disp = listing["disposition"]
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue

        price = listing["price"]
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            continue

        area = listing["area"]
        if area < MIN_AREA:
            excluded_area += 1
            continue

        # A listing whose floor could not be detected is kept.
        floor = listing["floor"]
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue

        # Coordinates come from the listing's project.
        url = listing.get("url", "")
        slug = _project_slug(url)
        gps = project_gps.get(slug)
        if not gps:
            excluded_no_gps += 1
            continue

        lat, lon = gps

        results.append({
            "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
            "name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
            "price": price,
            "price_formatted": format_price(price),
            "locality": f"{listing['project_name']}, Praha",
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": "Cihlová",  # CityHome renovates brick buildings
            "ownership": "neuvedeno",
            "url": url,
            "source": "cityhome",
            "image": "",
        })

    print(f"\n{'=' * 60}")
    print("Výsledky CityHome:")
    print(f"  Celkem jednotek:       {len(all_listings)}")
    print(f"  Vyloučeno (prodáno):   {excluded_sold}")
    print(f"  Vyloučeno (typ):       {excluded_type}")
    print(f"  Vyloučeno (dispozice): {excluded_disp}")
    print(f"  Vyloučeno (cena):      {excluded_price}")
    print(f"  Vyloučeno (plocha):    {excluded_area}")
    print(f"  Vyloučeno (patro):     {excluded_floor}")
    print(f"  Vyloučeno (bez GPS):   {excluded_no_gps}")
    print(f"  ✓ Vyhovující byty:    {len(results)}")
    print(f"{'=' * 60}")

    return results


if __name__ == "__main__":
    started_at = time.time()
    matches = scrape()

    if not matches:
        print("\nŽádné byty z CityHome neodpovídají kritériím :(")
    else:
        # Write the matches as indented UTF-8 JSON, keeping non-ASCII text readable.
        output_file = Path("byty_cityhome.json")
        payload = json.dumps(matches, ensure_ascii=False, indent=2)
        output_file.write_text(payload, encoding="utf-8")

        duration = time.time() - started_at
        print(f"\n✓ Data uložena: {output_file.resolve()}")
        print(f"⏱  Celkový čas: {duration:.0f} s")