Files
maru-hleda-byt/scrape_idnes.py
Marie Michalova b8d4d44164 Rewrite PSN + CityHome scrapers, add price/m² map coloring, ratings system, and status dashboard
- Rewrite PSN scraper to use /api/units-list endpoint (single API call, no HTML parsing)
- Fix CityHome scraper: GPS from multiple URL patterns, address from table cells, no 404 retries
- Color map markers by price/m² instead of disposition (blue→green→orange→red scale)
- Add persistent rating system (favorite/reject) with Flask ratings server and localStorage fallback
- Rejected markers show original color at reduced opacity with 🚫 SVG overlay
- Favorite markers shown as star icons with gold pulse animation
- Add "new today" marker logic (scraped_at == today) with larger pulsing green outline
- Add filter panel with floor, price, hide-rejected controls and ☰/✕ toggle buttons
- Add generate_status.py for scraper run statistics and status.html dashboard
- Add scraped_at field to all scrapers for freshness tracking
- Update run_all.sh with log capture and status generation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 15:15:25 +01:00

515 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Reality iDNES scraper.

Downloads apartments for sale in Prague and filters them by the criteria
configured below. Output: byty_idnes.json
"""
from __future__ import annotations
import argparse
from datetime import datetime
import json
import logging
import math
import re
import time
import urllib.request
import urllib.parse
from html.parser import HTMLParser  # NOTE(review): appears unused in this file — confirm before removing
from pathlib import Path
logger = logging.getLogger(__name__)
# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 13_500_000  # maximum price in CZK
MIN_AREA = 69  # minimum usable area in m²
MIN_FLOOR = 2  # minimum floor (NP = above-ground floor number)
PER_PAGE = 26 # iDNES returns 26 listings per page
# Disposition codes for the s-qc[subtypeFlat] query parameter
DISPOSITION_CODES = "3k|31|4k|41|5k|51|6k"
# Maps the disposition found in the listing title to a display label
DISPOSITION_MAP = {
"3+kk": "3+kk", "3+1": "3+1",
"4+kk": "4+kk", "4+1": "4+1",
"5+kk": "5+kk", "5+1": "5+1",
"6+kk": "6+", "6+1": "6+",
"6 a více": "6+",
}
# Browser-like headers; Accept-Encoding=identity avoids gzip decoding
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "cs,en;q=0.9",
"Accept-Encoding": "identity",
"Connection": "keep-alive",
}
BASE_URL = "https://reality.idnes.cz"
MAX_RETRIES = 5
def fetch_url(url: str) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    Retries up to MAX_RETRIES times on connection-level failures with a
    linearly growing back-off (3, 6, 9, 12 s).

    Raises:
        The last connection error when all attempts fail.
    """
    for attempt in range(MAX_RETRIES):
        try:
            logger.debug(f"HTTP GET request (attempt {attempt + 1}/{MAX_RETRIES}): {url}")
            logger.debug(f"Headers: {HEADERS}")
            req = urllib.request.Request(url, headers=HEADERS)
            # Use a context manager so the socket is closed even if
            # read()/decode() raises (the original leaked the response).
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = resp.read()
                logger.debug(f"HTTP response: status={resp.status}, size={len(data)} bytes")
            return data.decode("utf-8")
        except (ConnectionResetError, ConnectionError, urllib.error.URLError,
                OSError) as e:
            if attempt < MAX_RETRIES - 1:
                wait = (attempt + 1) * 3  # 3, 6, 9, 12s
                logger.warning(f"Connection error (retry {attempt + 1}/{MAX_RETRIES} after {wait}s): {e}")
                time.sleep(wait)
            else:
                logger.error(f"HTTP request failed after {MAX_RETRIES} attempts: {e}", exc_info=True)
                raise
def build_list_url(page: int = 0) -> str:
    """Return the search URL for the given zero-based listing *page*.

    All filter criteria (price cap, dispositions, minimum area) are baked
    into the path and query string from the module-level configuration.
    """
    query = urllib.parse.urlencode({
        "s-qc[subtypeFlat]": DISPOSITION_CODES,
        "s-qc[usableAreaMin]": str(MIN_AREA),
    })
    url = f"{BASE_URL}/s/prodej/byty/cena-do-{MAX_PRICE}/praha/?{query}"
    # Page 0 is the implicit first page; only later pages carry the param.
    return f"{url}&page={page}" if page > 0 else url
def parse_total_count(html: str) -> int:
    """Return the advertised total listing count, or 0 when not found.

    The count appears as e.g. "720 inzerátů"; thousands may be separated
    by regular or non-breaking spaces.
    """
    found = re.search(r'(\d[\d\s]*)\s*inzerát', html)
    if not found:
        return 0
    digits = found.group(1).replace(" ", "").replace("\xa0", "")
    return int(digits)
def parse_listings(html: str) -> list[dict]:
    """Parse listing cards from a search-results page using regexes.

    Returns a list of dicts with keys: id, url, disposition, area, price,
    locality. Price is 0 when hidden ("cena na vyžádání"); area may be
    None and disposition "?" when they cannot be read from the title.
    """
    # NOTE: the original also ran a c-products__item findall whose result
    # was never used — that dead regex pass has been removed.
    results = []
    # Anchor attribute order varies, so match href-then-class and
    # class-then-href separately.
    link_pattern = re.compile(
        r'<a[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*class="c-products__link"[^>]*>',
        re.DOTALL
    )
    link_pattern2 = re.compile(
        r'<a[^>]*class="c-products__link"[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*>',
        re.DOTALL
    )
    all_links = link_pattern.findall(html) + link_pattern2.findall(html)
    seen_urls = set()
    for link_url in all_links:
        if link_url in seen_urls:
            continue
        seen_urls.add(link_url)
        # Use the markup following the link as the card's parsing context.
        escaped_url = re.escape(link_url)
        context_match = re.search(
            escaped_url + r'(.*?)</div>\s*</div>',
            html, re.DOTALL
        )
        if not context_match:
            continue
        block = context_match.group(1)
        # Ensure an absolute URL
        url = link_url
        if not url.startswith("http"):
            url = BASE_URL + url
        # Skip sponsored cards: the ad marker appears shortly before the
        # link. Both the site's misspelling and the correct spelling are
        # checked deliberately.
        ad_check_start = max(0, context_match.start() - 500)
        ad_block = html[ad_check_start:context_match.start()]
        if "advertisment" in ad_block or "advertisement" in ad_block:
            continue
        # Title: <h2 class="c-products__title">prodej bytu 3+kk 79 m2</h2>
        title_match = re.search(r'class="c-products__title"[^>]*>(.*?)</h2>', block, re.DOTALL)
        title = re.sub(r'<[^>]+>', '', title_match.group(1)).strip().lower() if title_match else ""
        # Price: <p class="c-products__price"><strong>12 950 000 Kč</strong></p>
        price_match = re.search(r'c-products__price[^>]*>.*?<strong>(.*?)</strong>', block, re.DOTALL)
        price_text = re.sub(r'<[^>]+>', '', price_match.group(1)).strip() if price_match else ""
        # Address: <p class="c-products__info">Klečkova, Praha 5 - Stodůlky</p>
        info_match = re.search(r'class="c-products__info"[^>]*>(.*?)</p>', block, re.DOTALL)
        info = re.sub(r'<[^>]+>', '', info_match.group(1)).strip() if info_match else ""
        # Disposition ("3+kk", "4+1", ...) and area from the title text
        disp_match = re.search(r'(\d\+(?:kk|\d))', title)
        area_match = re.search(r'(\d+)\s*m[²2]', title)
        disposition = disp_match.group(1) if disp_match else None
        area = int(area_match.group(1)) if area_match else None
        if not disposition and ("6 a" in title or "6+" in title):
            disposition = "6+"
        # Price digits; "na vyžádání" (on request) stays 0
        price = 0
        if price_text and "vyžádání" not in price_text.lower():
            price_clean = re.sub(r'[^\d]', '', price_text)
            if price_clean:
                price = int(price_clean)
        # Listing ID is the 24-hex-char segment of the detail URL;
        # fall back to the whole URL when absent.
        id_match = re.search(r'/([a-f0-9]{24})/?', url)
        listing_id = id_match.group(1) if id_match else url
        results.append({
            "id": listing_id,
            "url": url,
            "disposition": DISPOSITION_MAP.get(disposition, disposition or "?"),
            "area": area,
            "price": price,
            "locality": info,
        })
    return results
def parse_detail(html: str) -> dict:
    """Extract lat/lon, floor, construction and ownership from a detail page.

    Returns a dict with any of the keys "lat", "lon", "floor",
    "construction", "ownership" that could be parsed.
    """
    detail: dict = {}
    # GPS lives in a dataLayer.push({...}) object containing listing_lat.
    dl = re.search(
        r'dataLayer\.push\(\s*(\{[^}]+?"listing_lat"[^}]+?\})\s*\)',
        html, re.DOTALL
    )
    if dl:
        payload = dl.group(1)
        try:
            # Pull the two coordinates straight out of the JS object text.
            for js_key, out_key in (("listing_lat", "lat"), ("listing_lon", "lon")):
                coord = re.search(rf'"{js_key}"\s*:\s*([\d.]+)', payload)
                if coord:
                    detail[out_key] = float(coord.group(1))
        except (ValueError, AttributeError):
            pass
    # Definition-list rows: <dt>Label</dt><dd>Value</dd>
    for raw_dt, raw_dd in re.findall(
            r'<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>', html, re.DOTALL):
        label = re.sub(r'<[^>]+>', '', raw_dt).strip().lower()
        value = re.sub(r'<[^>]+>', '', raw_dd).strip()
        if "podlaží" in label or "podlazi" in label or "patro" in label:
            # Prefer an explicit "X. NP" (above-ground floor) number,
            # otherwise take the first number in the cell.
            np_num = re.search(r'(\d+)\.\s*NP', value)
            if np_num:
                detail["floor"] = int(np_num.group(1))
            else:
                any_num = re.search(r'(\d+)', value)
                if any_num:
                    detail["floor"] = int(any_num.group(1))
        if "konstrukce" in label or "stavba" in label:
            detail["construction"] = value.lower()
        if "vlastnictví" in label or "vlastnictvi" in label:
            detail["ownership"] = value
    return detail
def format_price(price: int) -> str:
    """Format a non-negative integer price with space-separated thousands.

    >>> format_price(13500000)
    '13 500 000'

    Replaces the hand-rolled 3-digit slicing loop (which also carried a
    dead `+ ""` at the end) with the locale-independent format-spec
    thousands separator.
    """
    return f"{price:,}".replace(",", " ")
def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
    """Load the previous run's output as a cache keyed by hash_id.

    Returns an empty dict when the file is missing or cannot be parsed,
    so a first run (or a corrupted file) simply means no cache hits.
    """
    cache_file = Path(json_path)
    if not cache_file.exists():
        return {}
    try:
        entries = json.loads(cache_file.read_text(encoding="utf-8"))
        return {str(entry["hash_id"]): entry for entry in entries if "hash_id" in entry}
    except (json.JSONDecodeError, KeyError):
        return {}
def scrape(max_pages: int | None = None, max_properties: int | None = None):
    """Run the full scrape pipeline and return matching apartments.

    Pipeline: (1) fetch listing pages, (2) pre-filter by price/area/
    disposition using list-page data only, (3) fetch detail pages for
    GPS, floor and construction, and apply the remaining filters.

    Args:
        max_pages: optional cap on listing pages to fetch (None = all).
        max_properties: optional cap on detail pages to fetch (None = all).

    Returns:
        List of result dicts, one per apartment passing every filter.
        Unchanged cached entries are reused without a detail fetch.
    """
    cache = load_cache()
    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z Reality iDNES")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA}")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info(f"Region: Praha")
    if cache:
        logger.info(f"Cache: {len(cache)} bytů z minulého běhu")
    if max_pages:
        logger.info(f"Max. stran: {max_pages}")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)
    # Step 1: Fetch listing pages until exhausted or capped
    logger.info("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = {}  # id -> listing dict (dedupes across pages)
    page = 0
    total = None
    while True:
        if max_pages and page >= max_pages:
            logger.debug(f"Max pages limit reached: {max_pages}")
            break
        url = build_list_url(page)
        logger.info(f"Strana {page + 1} ...")
        html = fetch_url(url)
        if total is None:
            # First page reveals the total count -> expected page count
            total = parse_total_count(html)
            total_pages = math.ceil(total / PER_PAGE) if total > 0 else 1
            logger.info(f"→ Celkem {total} inzerátů, ~{total_pages} stran")
        listings = parse_listings(html)
        logger.debug(f"Page {page}: found {len(listings)} listings")
        if not listings:
            logger.debug(f"No listings found on page {page}, stopping")
            break
        for item in listings:
            lid = item["id"]
            if lid not in all_listings:
                all_listings[lid] = item
        page += 1
        if total and page >= math.ceil(total / PER_PAGE):
            break
        time.sleep(1.0)  # be polite between page fetches
    logger.info(f"\nStaženo: {len(all_listings)} unikátních inzerátů")
    # Step 2: Pre-filter by price and area from list data
    # (avoids detail fetches for obviously unsuitable listings)
    pre_filtered = []
    excluded_price = 0
    excluded_area = 0
    excluded_disp = 0
    for item in all_listings.values():
        item_id = item["id"]
        # price == 0 means "price on request" — excluded as well
        if item["price"] <= 0 or item["price"] > MAX_PRICE:
            excluded_price += 1
            logger.debug(f"Filter: id={item_id} - excluded (price {item['price']})")
            continue
        if item["area"] is not None and item["area"] < MIN_AREA:
            excluded_area += 1
            logger.debug(f"Filter: id={item_id} - excluded (area {item['area']} m²)")
            continue
        if item["disposition"] == "?":
            excluded_disp += 1
            logger.debug(f"Filter: id={item_id} - excluded (unknown disposition)")
            continue
        pre_filtered.append(item)
    logger.info(f"\nPo předfiltraci:")
    logger.info(f" Vyloučeno (cena): {excluded_price}")
    logger.info(f" Vyloučeno (plocha): {excluded_area}")
    logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
    logger.info(f" Zbývá: {len(pre_filtered)}")
    # Step 3: Fetch details for GPS, floor, construction
    logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_no_gps = 0
    excluded_detail = 0
    cache_hits = 0
    properties_fetched = 0
    for i, item in enumerate(pre_filtered):
        if max_properties and properties_fetched >= max_properties:
            logger.debug(f"Max properties limit reached: {max_properties}")
            break
        # Check cache — if hash_id exists and price unchanged, reuse
        cached = cache.get(str(item["id"]))
        if cached and cached.get("price") == item["price"]:
            cache_hits += 1
            logger.debug(f"Cache hit for id={item['id']}")
            results.append(cached)
            continue
        url = item["url"]
        time.sleep(0.4)  # throttle detail-page requests
        try:
            html = fetch_url(url)
        except Exception as e:
            # Best-effort: a failed detail page only drops that listing
            excluded_detail += 1
            logger.warning(f"Detail failed for id={item['id']}: {e}")
            continue
        detail = parse_detail(html)
        logger.debug(f"Detail parsed for id={item['id']}: lat={detail.get('lat')}, lon={detail.get('lon')}, floor={detail.get('floor')}")
        # Must have GPS (the map view needs coordinates)
        if not detail.get("lat") or not detail.get("lon"):
            excluded_no_gps += 1
            logger.debug(f"Filter: id={item['id']} - excluded (no GPS)")
            continue
        # Check construction — exclude panel buildings
        construction = detail.get("construction", "")
        if "panel" in construction:
            excluded_panel += 1
            logger.debug(f"Filter: id={item['id']} - excluded (panel construction)")
            logger.info(f"✗ Vyloučen {item['id'][:12]}...: panel ({construction})")
            continue
        # Check for "sídliště" (housing estate) in the construction text
        if "sídliště" in construction or "sidliste" in construction:
            excluded_panel += 1
            logger.debug(f"Filter: id={item['id']} - excluded (housing estate)")
            logger.info(f"✗ Vyloučen {item['id'][:12]}...: sídliště")
            continue
        # Check floor (only when the detail page stated one)
        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            logger.debug(f"Filter: id={item['id']} - excluded (floor {floor})")
            continue
        # Map raw construction text to a Czech display label
        building_type = "neuvedeno"
        if construction:
            if "cihlo" in construction or "cihla" in construction:
                building_type = "Cihlová"
            elif "smíšen" in construction or "smisen" in construction:
                building_type = "Smíšená"
            elif "skelet" in construction:
                building_type = "Skeletová"
            elif "dřevo" in construction or "drevo" in construction:
                building_type = "Dřevostavba"
            elif "mont" in construction:
                building_type = "Montovaná"
            else:
                building_type = construction.capitalize()
        result = {
            "hash_id": item["id"],
            "name": f"Prodej bytu {item['disposition']} {item.get('area', '?')}",
            "price": item["price"],
            "price_formatted": format_price(item["price"]),
            "locality": item["locality"],
            "lat": detail["lat"],
            "lon": detail["lon"],
            "disposition": item["disposition"],
            "floor": floor,
            "area": item["area"],
            "building_type": building_type,
            "ownership": detail.get("ownership", "neuvedeno"),
            "url": item["url"],
            "source": "idnes",
            "image": "",
            "scraped_at": datetime.now().strftime("%Y-%m-%d"),  # freshness tag
        }
        results.append(result)
        properties_fetched += 1
        if (i + 1) % 20 == 0:
            logger.info(f"Zpracováno {i + 1}/{len(pre_filtered)} ...")
    logger.info(f"\n{'=' * 60}")
    logger.info(f"Výsledky Reality iDNES:")
    logger.info(f" Předfiltrováno: {len(pre_filtered)}")
    logger.info(f" Z cache (přeskočeno): {cache_hits}")
    logger.info(f" Vyloučeno (panel/síd): {excluded_panel}")
    logger.info(f" Vyloučeno (patro): {excluded_floor}")
    logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    logger.info(f" Vyloučeno (bez detailu): {excluded_detail}")
    logger.info(f" ✓ Vyhovující byty: {len(results)}")
    logger.info(f"{'=' * 60}")
    return results
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Scrape apartments from Reality iDNES")
parser.add_argument("--max-pages", type=int, default=None,
help="Maximum number of listing pages to scrape")
parser.add_argument("--max-properties", type=int, default=None,
help="Maximum number of properties to fetch details for")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)")
args = parser.parse_args()
# Configure logging
logging.basicConfig(
level=getattr(logging, args.log_level),
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
handlers=[logging.StreamHandler()]
)
start = time.time()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
if estates:
json_path = Path("byty_idnes.json")
json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8",
)
elapsed = time.time() - start
logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
else:
logger.info("\nŽádné byty z Reality iDNES neodpovídají kritériím :(")