Upload files to "/"
v1 scrapery
This commit is contained in:
328
scrape_cityhome.py
Normal file
328
scrape_cityhome.py
Normal file
@@ -0,0 +1,328 @@
|
||||
#!/usr/bin/env python3
"""
CityHome (city-home.cz) scraper.

Downloads flats for sale in Prague from CityHome/SATPO projects.

Output: byty_cityhome.json
"""
from __future__ import annotations

import json
import re
import time
import urllib.request
from pathlib import Path

# ── Configuration ───────────────────────────────────────────────────────────

MAX_PRICE = 14_000_000  # maximum price in CZK (listings above this are dropped)
MIN_AREA = 69  # minimum floor area in m²
MIN_FLOOR = 2  # minimum above-ground floor ("NP"); underground ("PP") parses as negative

# Accepted flat layouts (Czech disposition codes: rooms + kitchen/kitchenette).
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}

# Browser-like request headers sent with every fetch (see fetch_url);
# presumably needed so the site serves the normal HTML pages — confirm.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}

BASE_URL = "https://www.city-home.cz"
||||
def fetch_url(url: str, headers: dict | None = None) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    Retries up to 3 times on transient network errors with linear backoff
    (2 s, then 4 s); the final failure is re-raised to the caller.

    Args:
        url: Absolute URL to fetch.
        headers: Optional request headers; defaults to the module-level
            browser-like HEADERS when omitted (backward compatible).

    Returns:
        The response body as a UTF-8 decoded string.

    Raises:
        ConnectionError / urllib.error.URLError: when all 3 attempts fail.
    """
    # Explicit import: relying on `import urllib.request` to bind
    # urllib.error as a side effect is fragile.
    import urllib.error

    request_headers = HEADERS if headers is None else headers

    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers=request_headers)
            # Context manager closes the connection even if read/decode
            # fails (the original leaked the response object).
            with urllib.request.urlopen(req, timeout=30) as resp:
                return resp.read().decode("utf-8")
        # ConnectionResetError is a subclass of ConnectionError, so the
        # original's explicit mention of it was redundant.
        except (ConnectionError, urllib.error.URLError) as e:
            if attempt < 2:
                print(f" Retry {attempt + 1}: {e}")
                time.sleep((attempt + 1) * 2)
            else:
                raise
||||
def format_price(price: int) -> str:
    """Format a non-negative price as a Czech-style string.

    Example: 14_000_000 -> "14 000 000 Kč".

    Args:
        price: Price in CZK.

    Returns:
        The price with space-separated thousands groups and a "Kč" suffix.
    """
    # The ',' format spec does the thousands grouping; swap commas for
    # the space separator used in Czech number formatting. This replaces
    # the original hand-rolled slicing loop.
    return f"{price:,}".replace(",", " ") + " Kč"
||||
def parse_filter_page(html: str) -> list[dict]:
    """Parse all listing rows from the filter page.

    Each unit is a <tr> carrying data-* attributes (price, area, unit
    type, availability, project, transaction, disposition, location).
    Attribute order on the site is not guaranteed, so the attribute
    string is captured whole and each attribute is extracted separately.

    The original contained two dead earlier attempts (an unused compiled
    `row_pattern` and a findall loop ending in `pass`) plus an unused
    `floor_text` local — all removed; the working finditer pass is kept
    unchanged.

    Args:
        html: Full HTML of the filter page.

    Returns:
        One dict per row that has a data-cena (price) attribute.
    """
    listings = []

    # Find each <tr> tag with a data-cena attribute; group(1) is the raw
    # attribute string, group(2) the row body.
    for match in re.finditer(r'<tr\s+([^>]*data-cena="[^"]*"[^>]*)>(.*?)</tr>', html, re.DOTALL):
        attrs_str = match.group(1)
        row_content = match.group(2)

        # Extract all data attributes independently of their order.
        cena = re.search(r'data-cena="(\d+)"', attrs_str)
        plocha = re.search(r'data-plocha="([\d.]+)"', attrs_str)
        unittype = re.search(r'data-unittype="(\d+)"', attrs_str)
        free = re.search(r'data-free="(yes|no)"', attrs_str)
        project = re.search(r'data-project="(\d+)"', attrs_str)
        transaction = re.search(r'data-transaction="([^"]*)"', attrs_str)
        dispozition = re.search(r'data-dispozition="([^"]*)"', attrs_str)
        location = re.search(r'data-location="([^"]*)"', attrs_str)

        # A row without a price is not a listing row.
        if not cena:
            continue

        # Extract detail URL and unit name from the first link in the row.
        link_match = re.search(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', row_content, re.DOTALL)
        detail_url = link_match.group(1).strip() if link_match else ""
        unit_name = re.sub(r'<[^>]+>', '', link_match.group(2)).strip() if link_match else ""

        # Site links are relative; make them absolute.
        if detail_url and not detail_url.startswith("http"):
            detail_url = BASE_URL + detail_url

        # Extract floor from cells — "3.NP" = 3rd floor, "2.PP" = 2nd basement.
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
        floor = None
        project_name = ""

        for cell in cells:
            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
            np_match = re.search(r'(\d+)\.\s*NP', cell_text)
            pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
            if np_match:
                floor = int(np_match.group(1))
            elif pp_match:
                floor = -int(pp_match.group(1))  # Underground

        # Project name — first textual cell that is not a bare number,
        # a floor, an area, a price, or the unit name itself.
        for cell in cells:
            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
            if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "m²" not in cell_text and "Kč" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text:
                if len(cell_text) > 3 and cell_text != unit_name:
                    project_name = cell_text
                    break

        listing = {
            "price": int(cena.group(1)),
            "area": float(plocha.group(1)) if plocha else 0,
            "unittype": int(unittype.group(1)) if unittype else 0,
            "free": free.group(1) if free else "no",
            "project_id": project.group(1) if project else "",
            "transaction": transaction.group(1) if transaction else "",
            "disposition": dispozition.group(1) if dispozition else "",
            "location": location.group(1) if location else "",
            "url": detail_url,
            "unit_name": unit_name,
            "floor": floor,
            "project_name": project_name,
        }
        listings.append(listing)

    return listings
||||
def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
    """Extract project GPS coordinates from a locality page.

    The coordinates live inside a JS array literal of the shape
    ['<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name'].

    Args:
        html: Raw HTML/JS of the locality page.

    Returns:
        Mapping of project name -> (lat, lon); empty when no marker
        literal is present. Duplicate names keep the last occurrence.
    """
    marker_re = re.compile(
        r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'"
    )
    return {
        hit.group(1).strip(): (float(hit.group(2)), float(hit.group(3)))
        for hit in marker_re.finditer(html)
    }
||||
def scrape():
    """Run the full CityHome scrape: download, geocode and filter units.

    Three phases:
      1. Fetch the filter page and parse every unit row.
      2. Derive project slugs from detail URLs and fetch GPS coordinates
         from each project's /lokalita page (best effort, errors logged).
      3. Filter to available apartments for sale matching the configured
         disposition / price / area / floor criteria, dropping units
         whose project has no GPS fix.

    Returns:
        A list of result dicts (one per matching flat) ready to be
        serialized to byty_cityhome.json.
    """
    print("=" * 60)
    print("Stahuji inzeráty z CityHome (city-home.cz)")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print("=" * 60)

    # Step 1: Fetch the main filter page
    print("\nFáze 1: Stahování seznamu bytů...")
    html = fetch_url(f"{BASE_URL}/filtr-nemovitosti1")
    all_listings = parse_filter_page(html)
    print(f" Nalezeno: {len(all_listings)} jednotek")

    # Step 2: Collect unique project slugs from detail URLs to fetch GPS
    print("\nFáze 2: Stahování GPS souřadnic projektů...")
    project_slugs = set()
    for listing in all_listings:
        url = listing.get("url", "")
        # e.g. /projekty/zateckych-14/nabidka-nemovitosti/byt-a31
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        if slug_match:
            project_slugs.add(slug_match.group(1))

    # Fetch GPS for each project from locality pages.
    # A failed/missing lookup is logged, not fatal — the affected units
    # are later counted under excluded_no_gps.
    project_gps = {}
    for slug in sorted(project_slugs):
        time.sleep(0.5)  # polite delay between requests
        try:
            locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
            loc_html = fetch_url(locality_url)
            gps = extract_project_gps(loc_html)
            if gps:
                # Take first entry (the project itself)
                first_name, (lat, lon) = next(iter(gps.items()))
                project_gps[slug] = (lat, lon)
                print(f" ✓ {slug}: {lat}, {lon}")
            else:
                print(f" ✗ {slug}: GPS nenalezeno")
        except Exception as e:
            print(f" ✗ {slug}: chyba ({e})")

    # Step 3: Filter listings, tallying every exclusion reason for the
    # summary printed below.
    print(f"\nFáze 3: Filtrování...")
    results = []
    excluded_sold = 0
    excluded_type = 0
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_floor = 0
    excluded_no_gps = 0

    for listing in all_listings:
        # Only available units
        if listing["free"] != "yes":
            excluded_sold += 1
            continue

        # Only apartments (unittype=2)
        if listing["unittype"] != 2:
            excluded_type += 1
            continue

        # Only sales (shares the "type" counter with the unittype check)
        if listing["transaction"] != "prodej":
            excluded_type += 1
            continue

        # Disposition
        disp = listing["disposition"]
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue

        # Price (zero/negative means price not listed)
        price = listing["price"]
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            continue

        # Area
        area = listing["area"]
        if area < MIN_AREA:
            excluded_area += 1
            continue

        # Floor — unknown floor (None) is deliberately allowed through
        floor = listing["floor"]
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue

        # GPS from project
        url = listing.get("url", "")
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        slug = slug_match.group(1) if slug_match else ""
        gps = project_gps.get(slug)

        if not gps:
            excluded_no_gps += 1
            continue

        lat, lon = gps

        result = {
            "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
            "name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
            "price": price,
            "price_formatted": format_price(price),
            "locality": f"{listing['project_name']}, Praha",
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": "Cihlová",  # CityHome renovates brick buildings
            "ownership": "neuvedeno",
            "url": url,
            "source": "cityhome",
            "image": "",
        }
        results.append(result)

    print(f"\n{'=' * 60}")
    print(f"Výsledky CityHome:")
    print(f" Celkem jednotek: {len(all_listings)}")
    print(f" Vyloučeno (prodáno): {excluded_sold}")
    print(f" Vyloučeno (typ): {excluded_type}")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results
||||
if __name__ == "__main__":
    # Script entry point: run the scrape and persist results as JSON.
    started_at = time.time()
    found = scrape()

    if not found:
        print("\nŽádné byty z CityHome neodpovídají kritériím :(")
    else:
        json_path = Path("byty_cityhome.json")
        payload = json.dumps(found, ensure_ascii=False, indent=2)
        json_path.write_text(payload, encoding="utf-8")
        elapsed = time.time() - started_at
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
Reference in New Issue
Block a user