#!/usr/bin/env python3
"""CityHome (city-home.cz) scraper.

Downloads apartments for sale in Prague from CityHome/SATPO projects.
Output: byty_cityhome.json
"""
from __future__ import annotations

import argparse
import json
import logging
import re
import time
import urllib.error
import urllib.request
from pathlib import Path

logger = logging.getLogger(__name__)

# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 14_000_000  # maximum price in CZK
MIN_AREA = 69           # minimum floor area in m²
MIN_FLOOR = 2           # minimum above-ground floor ("NP")
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}

BASE_URL = "https://www.city-home.cz"


def fetch_url(url: str) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    Retries up to 3 times on connection errors with linear backoff
    (2 s, 4 s); re-raises the last error when all attempts fail.
    """
    for attempt in range(3):
        try:
            logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}")
            logger.debug(f"Headers: {HEADERS}")
            req = urllib.request.Request(url, headers=HEADERS)
            # Context manager closes the connection (the original leaked it).
            with urllib.request.urlopen(req, timeout=30) as resp:
                html = resp.read().decode("utf-8")
                logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
            return html
        except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
            if attempt < 2:
                wait = (attempt + 1) * 2
                logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}")
                time.sleep(wait)
            else:
                logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True)
                raise


def format_price(price: int) -> str:
    """Format an integer CZK amount with space-separated thousands.

    e.g. 14000000 -> "14 000 000 Kč".
    """
    # f-string grouping + comma→space gives the Czech thousands convention.
    return f"{price:,}".replace(",", " ") + " Kč"


def parse_filter_page(html: str) -> list[dict]:
    """Parse all listing rows from the filter page.

    Each unit is a <tr> carrying data-* attributes (price, area, unit type,
    availability, project, transaction, disposition, location); the cells
    hold the detail link, the floor ("3.NP" / "2.PP") and the project name.

    Returns a list of dicts with normalized listing fields; rows without a
    data-cena attribute are skipped.

    NOTE(review): the HTML tag text in the patterns below was reconstructed
    (it was lost in the file history) — verify against the live page markup.
    """
    listings = []
    # Find each <tr> tag with a data-cena attribute, capturing the full
    # attribute string and the row body.
    for match in re.finditer(r'<tr([^>]*data-cena="[^"]*"[^>]*)>(.*?)</tr>', html, re.DOTALL):
        attrs_str = match.group(1)
        row_content = match.group(2)

        # Extract all data-* attributes from the <tr> tag.
        cena = re.search(r'data-cena="(\d+)"', attrs_str)
        plocha = re.search(r'data-plocha="([\d.]+)"', attrs_str)
        unittype = re.search(r'data-unittype="(\d+)"', attrs_str)
        free = re.search(r'data-free="(yes|no)"', attrs_str)
        project = re.search(r'data-project="(\d+)"', attrs_str)
        transaction = re.search(r'data-transaction="([^"]*)"', attrs_str)
        dispozition = re.search(r'data-dispozition="([^"]*)"', attrs_str)
        location = re.search(r'data-location="([^"]*)"', attrs_str)
        if not cena:
            continue

        # Detail URL and unit name come from the first cell's <a> element.
        link_match = re.search(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', row_content, re.DOTALL)
        detail_url = link_match.group(1).strip() if link_match else ""
        unit_name = re.sub(r'<[^>]+>', '', link_match.group(2)).strip() if link_match else ""
        if detail_url and not detail_url.startswith("http"):
            detail_url = BASE_URL + detail_url

        # Floor: "3.NP" (above ground) or "2.PP" (underground, negative).
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
        floor = None
        project_name = ""
        for cell in cells:
            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
            np_match = re.search(r'(\d+)\.\s*NP', cell_text)
            pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
            if np_match:
                floor = int(np_match.group(1))
            elif pp_match:
                floor = -int(pp_match.group(1))  # underground

        # Project name: first textual cell that is not a number/price/floor.
        for cell in cells:
            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
            if (cell_text
                    and not re.match(r'^[\d\s.,]+$', cell_text)
                    and "NP" not in cell_text and "PP" not in cell_text
                    and "m²" not in cell_text and "Kč" not in cell_text
                    and "EUR" not in cell_text and "CZK" not in cell_text):
                if len(cell_text) > 3 and cell_text != unit_name:
                    project_name = cell_text
                    break

        listings.append({
            "price": int(cena.group(1)),
            "area": float(plocha.group(1)) if plocha else 0,
            "unittype": int(unittype.group(1)) if unittype else 0,
            "free": free.group(1) if free else "no",
            "project_id": project.group(1) if project else "",
            "transaction": transaction.group(1) if transaction else "",
            "disposition": dispozition.group(1) if dispozition else "",
            "location": location.group(1) if location else "",
            "url": detail_url,
            "unit_name": unit_name,
            "floor": floor,
            "project_name": project_name,
        })
    return listings


def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
    """Extract GPS coordinates for projects from locality pages.

    The page embeds a JS array of map markers shaped roughly as
    ['<markup>Project Name</markup>…', 'LAT', 'LON', '1', 'Name'].

    Returns {project name: (lat, lon)}.
    """
    # NOTE(review): the exact markup inside the first array element was lost
    # in the file history; this matches the last ">text<" pair before the
    # closing quote — confirm against the live page.
    gps_data = {}
    marker_re = r"\['[^']*>([^<]+)<[^']*',\s*'([\d.]+)',\s*'([\d.]+)'"
    for match in re.finditer(marker_re, html):
        name = match.group(1).strip()
        lat = float(match.group(2))
        lon = float(match.group(3))
        gps_data[name] = (lat, lon)
    return gps_data


def scrape(max_pages: int | None = None, max_properties: int | None = None):
    """Scrape CityHome and return listings matching the filter criteria.

    *max_pages* is accepted for CLI compatibility but unused (the site
    serves a single filter page); *max_properties* caps the result count.
    Returns a list of result dicts ready for JSON serialization.
    """
    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA} m²")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # Step 1: Fetch the main filter page.
    logger.info("\nFáze 1: Stahování seznamu bytů...")
    html = fetch_url(f"{BASE_URL}/filtr-nemovitosti1")
    all_listings = parse_filter_page(html)
    logger.info(f"Nalezeno: {len(all_listings)} jednotek")

    # Step 2: Collect unique project slugs from detail URLs to fetch GPS.
    logger.info("\nFáze 2: Stahování GPS souřadnic projektů...")
    project_slugs = set()
    for listing in all_listings:
        url = listing.get("url", "")
        # e.g. /projekty/zateckych-14/nabidka-nemovitosti/byt-a31
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        if slug_match:
            project_slugs.add(slug_match.group(1))

    # Fetch GPS for each project from its locality page.
    project_gps = {}
    for slug in sorted(project_slugs):
        time.sleep(0.5)  # be polite to the server
        try:
            locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
            logger.debug(f"Fetching project GPS: {locality_url}")
            loc_html = fetch_url(locality_url)
            gps = extract_project_gps(loc_html)
            if gps:
                # Take the first entry (the project itself).
                lat, lon = next(iter(gps.values()))
                project_gps[slug] = (lat, lon)
                logger.info(f"✓ {slug}: {lat}, {lon}")
            else:
                logger.info(f"✗ {slug}: GPS nenalezeno")
        except Exception as e:
            # Best-effort: a failed locality page only excludes that project.
            logger.warning(f"Error fetching GPS for {slug}: {e}", exc_info=True)
            logger.info(f"✗ {slug}: chyba ({e})")

    # Step 3: Filter listings.
    logger.info("\nFáze 3: Filtrování...")
    results = []
    excluded_sold = 0
    excluded_type = 0
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_floor = 0
    excluded_no_gps = 0
    properties_fetched = 0
    for listing in all_listings:
        if max_properties and properties_fetched >= max_properties:
            logger.debug(f"Max properties limit reached: {max_properties}")
            break
        unit_name = listing.get("unit_name", "unknown")
        # Only available units.
        if listing["free"] != "yes":
            excluded_sold += 1
            logger.debug(f"Filter: {unit_name} - excluded (not free)")
            continue
        # Only apartments (unittype=2).
        if listing["unittype"] != 2:
            excluded_type += 1
            logger.debug(f"Filter: {unit_name} - excluded (not apartment, unittype={listing['unittype']})")
            continue
        # Only sales.
        if listing["transaction"] != "prodej":
            excluded_type += 1
            logger.debug(f"Filter: {unit_name} - excluded (not sale, transaction={listing['transaction']})")
            continue
        # Disposition.
        disp = listing["disposition"]
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            logger.debug(f"Filter: {unit_name} - excluded (disposition {disp})")
            continue
        # Price.
        price = listing["price"]
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            logger.debug(f"Filter: {unit_name} - excluded (price {price})")
            continue
        # Area.
        area = listing["area"]
        if area < MIN_AREA:
            excluded_area += 1
            logger.debug(f"Filter: {unit_name} - excluded (area {area} m²)")
            continue
        # Floor (unknown floor passes through).
        floor = listing["floor"]
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            logger.debug(f"Filter: {unit_name} - excluded (floor {floor})")
            continue
        # GPS from project.
        url = listing.get("url", "")
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        slug = slug_match.group(1) if slug_match else ""
        gps = project_gps.get(slug)
        if not gps:
            excluded_no_gps += 1
            logger.debug(f"Filter: {unit_name} - excluded (no GPS for project {slug})")
            continue
        lat, lon = gps
        results.append({
            "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
            "name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
            "price": price,
            "price_formatted": format_price(price),
            "locality": f"{listing['project_name']}, Praha",
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": "Cihlová",  # CityHome renovates brick buildings
            "ownership": "neuvedeno",
            "url": url,
            "source": "cityhome",
            "image": "",
        })
        properties_fetched += 1

    logger.info(f"\n{'=' * 60}")
    logger.info("Výsledky CityHome:")
    logger.info(f" Celkem jednotek: {len(all_listings)}")
    logger.info(f" Vyloučeno (prodáno): {excluded_sold}")
    logger.info(f" Vyloučeno (typ): {excluded_type}")
    logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
    logger.info(f" Vyloučeno (cena): {excluded_price}")
    logger.info(f" Vyloučeno (plocha): {excluded_area}")
    logger.info(f" Vyloučeno (patro): {excluded_floor}")
    logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    logger.info(f" ✓ Vyhovující byty: {len(results)}")
    logger.info(f"{'=' * 60}")
    return results


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape apartments from CityHome")
    parser.add_argument("--max-pages", type=int, default=None,
                        help="Maximum number of listing pages to scrape (not applicable for CityHome)")
    parser.add_argument("--max-properties", type=int, default=None,
                        help="Maximum number of properties to include in results")
    parser.add_argument("--log-level", type=str, default="INFO",
                        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging level (default: INFO)")
    args = parser.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()],
    )

    start = time.time()
    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
    if estates:
        json_path = Path("byty_cityhome.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
        logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        logger.info("\nŽádné byty z CityHome neodpovídají kritériím :(")