Upload files to "/"

v1 scrapery
2026-02-13 16:11:28 +00:00
parent 82d1f94104
commit 846d0bd9f2
5 changed files with 1760 additions and 0 deletions
--- a/scrape_idnes.py
+++ b/scrape_idnes.py
@@ -0,0 +1,464 @@
+#!/usr/bin/env python3
+"""
+Reality iDNES scraper.
+Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
+Výstup: byty_idnes.json
+"""
+from __future__ import annotations
+
+import json
+import math
+import re
+import time
+import urllib.request
+import urllib.parse
+from html.parser import HTMLParser
+from pathlib import Path
+
+# ── Configuration ───────────────────────────────────────────────────────────
+
+# Upper price limit in Kč; embedded in the search URL and re-checked in the
+# pre-filter.
+MAX_PRICE = 13_500_000
+# Minimum usable area in m²; sent as a search filter and re-checked locally.
+MIN_AREA = 69
+# Lowest accepted floor number; compared with the floor parsed from the
+# detail page.
+MIN_FLOOR = 2
+PER_PAGE = 26  # iDNES returns 26 listings per page
+
+# Layout (disposition) codes for the s-qc[subtypeFlat] query parameter
+DISPOSITION_CODES = "3k|31|4k|41|5k|51|6k"
+
+# Maps a layout string parsed from the listing title to the output label;
+# every 6-room variant collapses into "6+".
+DISPOSITION_MAP = {
+    "3+kk": "3+kk", "3+1": "3+1",
+    "4+kk": "4+kk", "4+1": "4+1",
+    "5+kk": "5+kk", "5+1": "5+1",
+    "6+kk": "6+", "6+1": "6+",
+    "6 a více": "6+",
+}
+
+# Request headers sent with every fetch. "Accept-Encoding: identity" asks for
+# an uncompressed body, because fetch_url decodes the raw bytes directly.
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language": "cs,en;q=0.9",
+    "Accept-Encoding": "identity",
+    "Connection": "keep-alive",
+}
+
+# Prefix for relative detail links and root of the search URL.
+BASE_URL = "https://reality.idnes.cz"
+
+# Maximum number of attempts per URL in fetch_url.
+MAX_RETRIES = 5
+
+
+def fetch_url(url: str) -> str:
+    """Fetch *url* and return the response body decoded as UTF-8 text.
+
+    Network-level failures and retryable HTTP statuses are retried up to
+    ``MAX_RETRIES`` times with a linearly growing pause (3 s, 6 s, ...).
+    HTTP 4xx responses other than 408/429 are raised immediately, because
+    repeating the same request cannot succeed.
+
+    Raises:
+        urllib.error.HTTPError: on a non-retryable 4xx status, or when the
+            last attempt ends with an HTTP error.
+        OSError: when the last attempt fails at the network level
+            (``URLError`` and the ``Connection*`` errors are subclasses).
+        RuntimeError: if ``MAX_RETRIES`` is smaller than 1.
+    """
+    from urllib.error import HTTPError
+
+    for attempt in range(MAX_RETRIES):
+        try:
+            req = urllib.request.Request(url, headers=HEADERS)
+            # The context manager closes the connection even if read() fails.
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                data = resp.read()
+            # Tolerate a stray invalid byte instead of aborting the whole run.
+            return data.decode("utf-8", errors="replace")
+        except OSError as e:
+            # URLError, HTTPError and ConnectionError all derive from OSError.
+            permanent = (
+                isinstance(e, HTTPError)
+                and 400 <= e.code < 500
+                and e.code not in (408, 429)
+            )
+            if permanent or attempt >= MAX_RETRIES - 1:
+                raise
+            wait = (attempt + 1) * 3  # 3, 6, 9, 12s
+            print(f"    Retry {attempt + 1}/{MAX_RETRIES} (wait {wait}s): {e}")
+            time.sleep(wait)
+    # Only reachable when the loop body never ran.
+    raise RuntimeError("MAX_RETRIES must be at least 1")
+
+
+def build_list_url(page: int = 0) -> str:
+    """Return the search-results URL for the given page index.
+
+    The price cap is part of the path; the layout codes and the minimum
+    usable area go into the query string. A ``page`` parameter is appended
+    only when *page* is greater than zero.
+    """
+    query = {
+        "s-qc[subtypeFlat]": DISPOSITION_CODES,
+        "s-qc[usableAreaMin]": str(MIN_AREA),
+    }
+    pieces = [
+        f"{BASE_URL}/s/prodej/byty/cena-do-{MAX_PRICE}/praha/",
+        "?",
+        urllib.parse.urlencode(query),
+    ]
+    if page > 0:
+        pieces.append(f"&page={page}")
+    return "".join(pieces)
+
+
+def parse_total_count(html: str) -> int:
+    """Return the total number of listings announced on a results page.
+
+    Looks for the first number directly followed by the word stem
+    "inzerát" (e.g. "720 inzerátů"). Digit groups may be separated by any
+    Unicode whitespace or by the ``&nbsp;`` entity; all separators are
+    removed before conversion. Returns 0 when no such number is found.
+    """
+    separator = r'(?:\s|&nbsp;)'
+    match = re.search(
+        rf'(\d(?:{separator}*\d)*){separator}*inzerát',
+        html,
+    )
+    if not match:
+        return 0
+    # Strip every separator kind, not just " " and U+00A0, so int() cannot
+    # fail on e.g. a narrow no-break space inside the number.
+    return int(re.sub(separator, '', match.group(1)))
+
+
+def parse_listings(html: str) -> list[dict]:
+    """Parse listing cards from a search-results page using regexes.
+
+    Every ``<a class="c-products__link">`` anchor whose href contains
+    ``/detail/`` is treated as one listing. The card content is the text
+    that follows the first occurrence of that href in the page, up to the
+    first ``</div></div>`` pair. Cards whose preceding 500 characters
+    contain an advertisement marker are skipped.
+
+    Returns:
+        A list of dicts with keys ``id``, ``url``, ``disposition``, ``area``
+        (int or None), ``price`` (int; 0 when the price text is missing or
+        contains "vyžádání") and ``locality``.
+    """
+    results = []
+
+    # The href may precede or follow the class attribute, so two patterns
+    # are used; href-first hits are listed before class-first hits.
+    href_first = re.compile(
+        r'<a[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*class="c-products__link"[^>]*>',
+        re.DOTALL
+    )
+    class_first = re.compile(
+        r'<a[^>]*class="c-products__link"[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*>',
+        re.DOTALL
+    )
+    all_links = href_first.findall(html) + class_first.findall(html)
+    seen_urls = set()
+
+    for link_url in all_links:
+        if link_url in seen_urls:
+            continue
+        seen_urls.add(link_url)
+
+        # Card body: everything after the href up to the first "</div></div>".
+        context_match = re.search(
+            re.escape(link_url) + r'(.*?)</div>\s*</div>',
+            html, re.DOTALL
+        )
+        if not context_match:
+            continue
+
+        block = context_match.group(1)
+
+        # Detail links may be relative; make them absolute.
+        url = link_url
+        if not url.startswith("http"):
+            url = BASE_URL + url
+
+        # Skip ads: look for the marker shortly before the link. Both the
+        # misspelled and the correctly spelled class name are checked.
+        ad_check_start = max(0, context_match.start() - 500)
+        ad_block = html[ad_check_start:context_match.start()]
+        if "advertisment" in ad_block or "advertisement" in ad_block:
+            continue
+
+        # Title: <h2 class="c-products__title">prodej bytu 3+kk 79 m2</h2>
+        title_match = re.search(r'class="c-products__title"[^>]*>(.*?)</h2>', block, re.DOTALL)
+        title = _strip_tags(title_match.group(1)).lower() if title_match else ""
+
+        # Price: <p class="c-products__price"><strong>12 950 000 Kč</strong></p>
+        price_match = re.search(r'c-products__price[^>]*>.*?<strong>(.*?)</strong>', block, re.DOTALL)
+        price_text = _strip_tags(price_match.group(1)) if price_match else ""
+
+        # Address: <p class="c-products__info">Klečkova, Praha 5 - Stodůlky</p>
+        info_match = re.search(r'class="c-products__info"[^>]*>(.*?)</p>', block, re.DOTALL)
+        info = _strip_tags(info_match.group(1)) if info_match else ""
+
+        # Layout and area are both embedded in the title text.
+        disp_match = re.search(r'(\d\+(?:kk|\d))', title)
+        area_match = re.search(r'(\d+)\s*m[²2]', title)
+
+        disposition = disp_match.group(1) if disp_match else None
+        area = int(area_match.group(1)) if area_match else None
+
+        # Titles such as "6 a více" carry no "N+x" token.
+        if not disposition and ("6 a" in title or "6+" in title):
+            disposition = "6+"
+
+        # Price stays 0 when it is missing or only available on request.
+        price = 0
+        if price_text and "vyžádání" not in price_text.lower():
+            price_clean = re.sub(r'[^\d]', '', price_text)
+            if price_clean:
+                price = int(price_clean)
+
+        # The listing id is the 24-hex-digit path segment; fall back to the URL.
+        id_match = re.search(r'/([a-f0-9]{24})/?', url)
+        listing_id = id_match.group(1) if id_match else url
+
+        results.append({
+            "id": listing_id,
+            "url": url,
+            "disposition": DISPOSITION_MAP.get(disposition, disposition or "?"),
+            "area": area,
+            "price": price,
+            "locality": info,
+        })
+
+    return results
+
+
+def _strip_tags(fragment: str) -> str:
+    """Remove HTML tags from *fragment* and trim surrounding whitespace."""
+    return re.sub(r'<[^>]+>', '', fragment).strip()
+
+
+def parse_detail(html: str) -> dict:
+    """Extract GPS, floor, construction and ownership from a detail page.
+
+    Returns a dict that contains only the keys that could be parsed:
+    ``lat``/``lon`` (float), ``floor`` (int), ``construction`` (lower-cased
+    text) and ``ownership`` (text as shown on the page).
+    """
+    detail = {}
+
+    # 1. GPS comes from the object passed to dataLayer.push(). The object is
+    #    not parsed as JSON; the two fields are picked out by regex. Values
+    #    may be bare numbers or quoted strings, optionally negative.
+    dl_match = re.search(
+        r'dataLayer\.push\(\s*(\{[^}]+?"listing_lat"[^}]+?\})\s*\)',
+        html, re.DOTALL
+    )
+    if dl_match:
+        js_obj = dl_match.group(1)
+        for key, field in (("lat", "listing_lat"), ("lon", "listing_lon")):
+            coord = re.search(rf'"{field}"\s*:\s*"?(-?[\d.]+)', js_obj)
+            if not coord:
+                continue
+            try:
+                detail[key] = float(coord.group(1))
+            except ValueError:
+                # Malformed number such as "1.2.3": leave this key unset so
+                # the caller treats the listing as having no GPS.
+                pass
+
+    # 2. Floor, construction and ownership come from <dt>Label</dt><dd>Value</dd>
+    #    pairs; a later pair with a matching label overwrites an earlier one.
+    dt_dd_pairs = re.findall(
+        r'<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>',
+        html, re.DOTALL
+    )
+
+    for dt, dd in dt_dd_pairs:
+        dt_clean = re.sub(r'<[^>]+>', '', dt).strip().lower()
+        dd_clean = re.sub(r'<[^>]+>', '', dd).strip()
+
+        if "podlaží" in dt_clean or "podlazi" in dt_clean or "patro" in dt_clean:
+            # Values look like "2. patro (3. NP)" or "3. podlaží z celkem 5".
+            # Prefer the explicit NP number when present.
+            np_match = re.search(r'(\d+)\.\s*NP', dd_clean)
+            if np_match:
+                detail["floor"] = int(np_match.group(1))
+            else:
+                # Otherwise take the first number as-is.
+                # NOTE(review): a value without any digit leaves "floor"
+                # unset — confirm how the site renders the ground floor.
+                patro_match = re.search(r'(\d+)', dd_clean)
+                if patro_match:
+                    detail["floor"] = int(patro_match.group(1))
+
+        if "konstrukce" in dt_clean or "stavba" in dt_clean:
+            detail["construction"] = dd_clean.lower()
+
+        if "vlastnictví" in dt_clean or "vlastnictvi" in dt_clean:
+            detail["ownership"] = dd_clean
+
+    return detail
+
+
+def format_price(price: int) -> str:
+    """Format *price* with spaces between thousands and a " Kč" suffix.
+
+    Example: ``13500000`` -> ``"13 500 000 Kč"``. Negative values keep the
+    sign attached to the first digit group.
+    """
+    # The format spec groups with commas; swap them for spaces.
+    return f"{price:,}".replace(",", " ") + " Kč"
+
+
+def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
+    """Load the previous run's output as a cache keyed by ``hash_id``.
+
+    Returns an empty dict when the file is missing, unreadable, not valid
+    JSON, or not a JSON array. Array entries that are not objects or lack
+    a ``hash_id`` key are skipped.
+    """
+    path = Path(json_path)
+    if not path.exists():
+        return {}
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+        return {}
+    if not isinstance(data, list):
+        return {}
+    return {
+        str(entry["hash_id"]): entry
+        for entry in data
+        if isinstance(entry, dict) and "hash_id" in entry
+    }
+
+
+def scrape():
+    """Run the full scrape and return the list of matching flats.
+
+    Phase 1 pages through the search results and collects unique listings,
+    which are then pre-filtered by price, area and layout using list data
+    only. Phase 2 fetches each remaining detail page (unless an entry with
+    the same id and an unchanged price exists in the cache file) and drops
+    listings without GPS, with panel construction, or with a known floor
+    below ``MIN_FLOOR``.
+
+    Returns:
+        A list of result dicts ready to be dumped to JSON; cached entries
+        are reused as-is.
+    """
+    cache = load_cache()
+
+    print("=" * 60)
+    print("Stahuji inzeráty z Reality iDNES")
+    print(f"Cena: do {format_price(MAX_PRICE)}")
+    print(f"Min. plocha: {MIN_AREA} m²")
+    print(f"Patro: od {MIN_FLOOR}. NP")
+    print("Region: Praha")
+    if cache:
+        print(f"Cache: {len(cache)} bytů z minulého běhu")
+    print("=" * 60)
+
+    # Step 1: Fetch listing pages
+    print("\nFáze 1: Stahování seznamu inzerátů...")
+    all_listings = {}  # id -> listing dict
+    page = 0
+    total = None
+
+    while True:
+        url = build_list_url(page)
+        print(f"  Strana {page + 1} ...")
+        html = fetch_url(url)
+
+        # The total is read once, from the first page.
+        if total is None:
+            total = parse_total_count(html)
+            total_pages = math.ceil(total / PER_PAGE) if total > 0 else 1
+            print(f"  → Celkem {total} inzerátů, ~{total_pages} stran")
+
+        listings = parse_listings(html)
+
+        if not listings:
+            break
+
+        new_on_page = 0
+        for item in listings:
+            lid = item["id"]
+            if lid not in all_listings:
+                all_listings[lid] = item
+                new_on_page += 1
+
+        # Without a known total the only other stop signal is an empty page.
+        # If the site keeps serving the same page for out-of-range indexes,
+        # that never happens, so also stop once a page adds nothing new.
+        if not total and new_on_page == 0:
+            break
+
+        page += 1
+        if total and page >= math.ceil(total / PER_PAGE):
+            break
+        time.sleep(1.0)  # be polite between list pages
+
+    print(f"\n  Staženo: {len(all_listings)} unikátních inzerátů")
+
+    # Step 2: Pre-filter by price, area and layout from list data
+    pre_filtered = []
+    excluded_price = 0
+    excluded_area = 0
+    excluded_disp = 0
+
+    for item in all_listings.values():
+        # Price 0 means "unknown / on request" and is excluded as well.
+        if item["price"] <= 0 or item["price"] > MAX_PRICE:
+            excluded_price += 1
+            continue
+
+        # Listings with an unknown area are kept.
+        if item["area"] is not None and item["area"] < MIN_AREA:
+            excluded_area += 1
+            continue
+
+        if item["disposition"] == "?":
+            excluded_disp += 1
+            continue
+
+        pre_filtered.append(item)
+
+    print("\nPo předfiltraci:")
+    print(f"  Vyloučeno (cena):      {excluded_price}")
+    print(f"  Vyloučeno (plocha):    {excluded_area}")
+    print(f"  Vyloučeno (dispozice): {excluded_disp}")
+    print(f"  Zbývá:                 {len(pre_filtered)}")
+
+    # Step 3: Fetch details for GPS, floor, construction
+    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
+    results = []
+    excluded_panel = 0
+    excluded_floor = 0
+    excluded_no_gps = 0
+    excluded_detail = 0
+    cache_hits = 0
+
+    for i, item in enumerate(pre_filtered):
+        # Reuse the cached entry when the id is known and the price unchanged.
+        cached = cache.get(str(item["id"]))
+        if cached and cached.get("price") == item["price"]:
+            cache_hits += 1
+            results.append(cached)
+            continue
+
+        url = item["url"]
+        time.sleep(0.4)  # throttle detail requests
+
+        # Best effort: a single failed detail page must not abort the run.
+        try:
+            html = fetch_url(url)
+        except Exception as e:
+            print(f"    Warning: detail failed for {item['id']}: {e}")
+            excluded_detail += 1
+            continue
+
+        detail = parse_detail(html)
+
+        # Must have GPS
+        if not detail.get("lat") or not detail.get("lon"):
+            excluded_no_gps += 1
+            continue
+
+        # Check construction — exclude panel buildings
+        construction = detail.get("construction", "")
+        if "panel" in construction:
+            excluded_panel += 1
+            print(f"  ✗ Vyloučen {item['id'][:12]}...: panel ({construction})")
+            continue
+
+        # A housing-estate mention in the construction text counts as panel too
+        if "sídliště" in construction or "sidliste" in construction:
+            excluded_panel += 1
+            print(f"  ✗ Vyloučen {item['id'][:12]}...: sídliště")
+            continue
+
+        # Check floor; listings with an unknown floor are kept
+        floor = detail.get("floor")
+        if floor is not None and floor < MIN_FLOOR:
+            excluded_floor += 1
+            continue
+
+        # Map the construction text to a Czech label
+        building_type = "neuvedeno"
+        if construction:
+            if "cihlo" in construction or "cihla" in construction:
+                building_type = "Cihlová"
+            elif "smíšen" in construction or "smisen" in construction:
+                building_type = "Smíšená"
+            elif "skelet" in construction:
+                building_type = "Skeletová"
+            elif "dřevo" in construction or "drevo" in construction:
+                building_type = "Dřevostavba"
+            elif "mont" in construction:
+                building_type = "Montovaná"
+            else:
+                building_type = construction.capitalize()
+
+        # The "area" key always exists but may be None, so dict.get() with a
+        # default would never produce the "?" placeholder.
+        area_label = item["area"] if item["area"] is not None else "?"
+
+        result = {
+            "hash_id": item["id"],
+            "name": f"Prodej bytu {item['disposition']} {area_label} m²",
+            "price": item["price"],
+            "price_formatted": format_price(item["price"]),
+            "locality": item["locality"],
+            "lat": detail["lat"],
+            "lon": detail["lon"],
+            "disposition": item["disposition"],
+            "floor": floor,
+            "area": item["area"],
+            "building_type": building_type,
+            "ownership": detail.get("ownership", "neuvedeno"),
+            "url": item["url"],
+            "source": "idnes",
+            "image": "",
+        }
+        results.append(result)
+
+        if (i + 1) % 20 == 0:
+            print(f"  Zpracováno {i + 1}/{len(pre_filtered)} ...")
+
+    print(f"\n{'=' * 60}")
+    print("Výsledky Reality iDNES:")
+    print(f"  Předfiltrováno:        {len(pre_filtered)}")
+    print(f"  Z cache (přeskočeno): {cache_hits}")
+    print(f"  Vyloučeno (panel/síd): {excluded_panel}")
+    print(f"  Vyloučeno (patro):     {excluded_floor}")
+    print(f"  Vyloučeno (bez GPS):   {excluded_no_gps}")
+    print(f"  Vyloučeno (bez detailu): {excluded_detail}")
+    print(f"  ✓ Vyhovující byty:    {len(results)}")
+    print(f"{'=' * 60}")
+
+    return results
+
+
+if __name__ == "__main__":
+    # Time the whole run; the duration is reported only when data was saved.
+    started_at = time.time()
+    found = scrape()
+
+    if not found:
+        print("\nŽádné byty z Reality iDNES neodpovídají kritériím :(")
+    else:
+        # Same file that load_cache() reads by default on the next run.
+        out_file = Path("byty_idnes.json")
+        payload = json.dumps(found, ensure_ascii=False, indent=2)
+        out_file.write_text(payload, encoding="utf-8")
+        duration = time.time() - started_at
+        print(f"\n✓ Data uložena: {out_file.resolve()}")
+        print(f"⏱  Celkový čas: {duration:.0f} s")