#!/usr/bin/env python3 """ CityHome (city-home.cz) scraper. Stáhne byty na prodej v Praze z projektů CityHome/SATPO. Výstup: byty_cityhome.json """ from __future__ import annotations import argparse import json import logging import re import time import urllib.request from datetime import datetime from pathlib import Path logger = logging.getLogger(__name__) # ── Konfigurace ───────────────────────────────────────────────────────────── MAX_PRICE = 14_000_000 MIN_AREA = 69 MIN_FLOOR = 2 WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"} HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Accept": "text/html,application/xhtml+xml", "Accept-Language": "cs,en;q=0.9", } BASE_URL = "https://www.city-home.cz" def fetch_url(url: str, retries: int = 3) -> str: """Fetch URL and return HTML string. Raises HTTPError on 4xx/5xx.""" for attempt in range(retries): try: logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}") req = urllib.request.Request(url, headers=HEADERS) resp = urllib.request.urlopen(req, timeout=30) html = resp.read().decode("utf-8") logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") return html except urllib.error.HTTPError: # Don't retry on HTTP errors (404, 403, etc.) — re-raise immediately raise except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e: if attempt < retries - 1: wait = (attempt + 1) * 2 logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}") time.sleep(wait) else: logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True) raise def format_price(price: int) -> str: s = str(price) parts = [] while s: parts.append(s[-3:]) s = s[:-3] return " ".join(reversed(parts)) + " Kč" def parse_filter_page(html: str) -> list[dict]: """Parse all listing rows from the filter page.""" listings = [] # Find all with data-cena attribute row_pattern = re.compile( r']*' r'data-cena="(\d+)"[^>]*' r'data-plocha="([\d.]+)"[^>]*' r'data-unittype="(\d+)"[^>]*' r'data-free="(yes|no)"[^>]*' r'data-project="(\d+)"[^>]*' r'data-transaction="([^"]*)"[^>]*' r'data-dispozition="([^"]*)"[^>]*' r'data-location="([^"]*)"[^>]*' r'>(.*?)', re.DOTALL ) # Also try with different attribute order rows = re.findall(r']*data-cena="[^"]*"[^>]*>(.*?)', html, re.DOTALL) for row_html in rows: # Extract data attributes from the surrounding tr_match = re.search( r']*data-cena="([^"]*)"[^>]*data-plocha="([^"]*)"[^>]*' r'data-unittype="([^"]*)"[^>]*data-free="([^"]*)"[^>]*' r'data-project="([^"]*)"[^>]*data-transaction="([^"]*)"[^>]*' r'data-dispozition="([^"]*)"[^>]*data-location="([^"]*)"', html ) # More flexible: search around each row pass # Better approach: find each tr tag with all its attributes for match in re.finditer(r']*data-cena="[^"]*"[^>]*)>(.*?)', html, re.DOTALL): attrs_str = match.group(1) row_content = match.group(2) # Extract all data attributes cena = re.search(r'data-cena="(\d+)"', attrs_str) plocha = re.search(r'data-plocha="([\d.]+)"', attrs_str) unittype = re.search(r'data-unittype="(\d+)"', attrs_str) free = re.search(r'data-free="(yes|no)"', attrs_str) project = re.search(r'data-project="(\d+)"', attrs_str) transaction = re.search(r'data-transaction="([^"]*)"', attrs_str) dispozition = re.search(r'data-dispozition="([^"]*)"', attrs_str) location = re.search(r'data-location="([^"]*)"', attrs_str) if not cena: continue # Extract detail URL and unit name from first cell link_match = re.search(r']*href="([^"]*)"[^>]*>(.*?)', row_content, re.DOTALL) detail_url = link_match.group(1).strip() if link_match else "" unit_name = re.sub(r'<[^>]+>', '', link_match.group(2)).strip() if link_match else "" if detail_url and not detail_url.startswith("http"): detail_url = BASE_URL + detail_url # Parse table cells: [unit_name, unit_type_label, address, floor, disposition, area, transaction, price] cells = re.findall(r']*>(.*?)', row_content, re.DOTALL) cell_texts = [re.sub(r'<[^>]+>', '', c).strip() for c in cells] # Cell[2] = address (e.g. "Žateckých 14"), cell[3] = floor (e.g. "3.NP") project_address = cell_texts[2] if len(cell_texts) > 2 else "" floor = None if len(cell_texts) > 3: np_match = re.search(r'(\d+)\.\s*NP', cell_texts[3]) pp_match = re.search(r'(\d+)\.\s*PP', cell_texts[3]) if np_match: floor = int(np_match.group(1)) elif pp_match: floor = -int(pp_match.group(1)) listing = { "price": int(cena.group(1)), "area": float(plocha.group(1)) if plocha else 0, "unittype": int(unittype.group(1)) if unittype else 0, "free": free.group(1) if free else "no", "project_id": project.group(1) if project else "", "transaction": transaction.group(1) if transaction else "", "disposition": dispozition.group(1) if dispozition else "", "url": detail_url, "unit_name": unit_name, "floor": floor, "project_address": project_address, } listings.append(listing) return listings def get_lokalita_urls(slug: str) -> list[str]: """Return candidate lokalita URLs to try in order.""" return [ f"{BASE_URL}/projekty/{slug}/lokalita", f"{BASE_URL}/bytove-domy/{slug}/lokalita", f"{BASE_URL}/bytove-domy/{slug}/lokalita1", ] def extract_project_gps(html: str) -> tuple[float, float] | None: """Extract project GPS from lokalita page JS variable. The page contains: var locations = [['

Name

...', 'LAT', 'LNG', 'CATEGORY', 'Label'], ...] Category '1' = the project's own marker. Some projects have two cat-1 entries (data error); in that case we pick the one whose name contains a digit and is not a transit landmark. """ block = re.search(r'var locations\s*=\s*\[(.*?)\];', html, re.DOTALL) if not block: return None entries = re.findall( r"'

(.*?)

.*?',\s*'([\d.]+)',\s*'([\d.]+)',\s*'1'", block.group(0), re.DOTALL, ) if not entries: return None if len(entries) == 1: return float(entries[0][1]), float(entries[0][2]) # Multiple cat-1 entries: pick the real project marker transit_re = re.compile(r'nádraží|park|metro|tramvaj|autobus|zastávka', re.IGNORECASE) for name, lat, lng in entries: if re.search(r'\d', name) and not transit_re.search(name): return float(lat), float(lng) # Fallback: first entry return float(entries[0][1]), float(entries[0][2]) def scrape(max_pages: int | None = None, max_properties: int | None = None): logger.info("=" * 60) logger.info("Stahuji inzeráty z CityHome (city-home.cz)") logger.info(f"Cena: do {format_price(MAX_PRICE)}") logger.info(f"Min. plocha: {MIN_AREA} m²") logger.info(f"Patro: od {MIN_FLOOR}. NP") if max_properties: logger.info(f"Max. bytů: {max_properties}") logger.info("=" * 60) # Step 1: Fetch the main filter page logger.info("\nFáze 1: Stahování seznamu bytů...") html = fetch_url(f"{BASE_URL}/filtr-nemovitosti1") all_listings = parse_filter_page(html) logger.info(f"Nalezeno: {len(all_listings)} jednotek") # Step 2: Collect unique project slugs from detail URLs to fetch GPS logger.info("\nFáze 2: Stahování GPS souřadnic projektů...") project_slugs = set() for listing in all_listings: url = listing.get("url", "") # /projekty/zateckych-14/nabidka-nemovitosti/byt-a31 slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url) if slug_match: project_slugs.add(slug_match.group(1)) # Fetch GPS for each project from locality pages project_gps = {} for slug in sorted(project_slugs): time.sleep(0.3) gps = None for url in get_lokalita_urls(slug): try: logger.debug(f"Fetching project GPS: {url}") loc_html = fetch_url(url) gps = extract_project_gps(loc_html) if gps: break except Exception as e: logger.debug(f"GPS fetch failed for {url}: {e}") continue if gps: project_gps[slug] = gps logger.info(f"✓ {slug}: {gps[0]}, {gps[1]}") else: logger.info(f"✗ {slug}: GPS nenalezeno") # Step 3: Filter listings logger.info(f"\nFáze 3: Filtrování...") results = [] excluded_sold = 0 excluded_type = 0 excluded_disp = 0 excluded_price = 0 excluded_area = 0 excluded_floor = 0 excluded_no_gps = 0 properties_fetched = 0 for listing in all_listings: if max_properties and properties_fetched >= max_properties: logger.debug(f"Max properties limit reached: {max_properties}") break unit_name = listing.get("unit_name", "unknown") # Only available units if listing["free"] != "yes": excluded_sold += 1 logger.debug(f"Filter: {unit_name} - excluded (not free)") continue # Only apartments (unittype=2) if listing["unittype"] != 2: excluded_type += 1 logger.debug(f"Filter: {unit_name} - excluded (not apartment, unittype={listing['unittype']})") continue # Only sales if listing["transaction"] != "prodej": excluded_type += 1 logger.debug(f"Filter: {unit_name} - excluded (not sale, transaction={listing['transaction']})") continue # Disposition disp = listing["disposition"] if disp not in WANTED_DISPOSITIONS: excluded_disp += 1 logger.debug(f"Filter: {unit_name} - excluded (disposition {disp})") continue # Price price = listing["price"] if price <= 0 or price > MAX_PRICE: excluded_price += 1 logger.debug(f"Filter: {unit_name} - excluded (price {price})") continue # Area area = listing["area"] if area < MIN_AREA: excluded_area += 1 logger.debug(f"Filter: {unit_name} - excluded (area {area} m²)") continue # Floor floor = listing["floor"] if floor is not None and floor < MIN_FLOOR: excluded_floor += 1 logger.debug(f"Filter: {unit_name} - excluded (floor {floor})") continue # GPS from project url = listing.get("url", "") slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url) slug = slug_match.group(1) if slug_match else "" gps = project_gps.get(slug) if not gps: excluded_no_gps += 1 logger.debug(f"Filter: {unit_name} - excluded (no GPS for project {slug})") continue lat, lon = gps # locality: use project address from cell (e.g. "Žateckých 14") + city from GPS lookup project_address = listing.get("project_address", "") # derive city from slug (GPS lookup key) city_map = { "karlinske-namesti-5": "Praha 8", "melnicka-12": "Praha 7", "na-vaclavce-34": "Praha 5", "nad-kajetankou-12": "Praha 6", "vosmikovych-3": "Praha 9", "zateckych-14": "Praha 2", } city_str = city_map.get(slug, "Praha") locality_str = f"{project_address}, {city_str}" if project_address else city_str result = { "hash_id": f"cityhome_{slug}_{listing['unit_name']}", "name": f"Prodej bytu {disp}, {int(area)} m² — {project_address}", "price": price, "price_formatted": format_price(price), "locality": locality_str, "lat": lat, "lon": lon, "disposition": disp, "floor": floor, "area": float(area), "building_type": "Cihlová", # CityHome renovuje cihlové domy "ownership": "neuvedeno", "url": url, "source": "cityhome", "image": "", "scraped_at": datetime.now().strftime("%Y-%m-%d"), } results.append(result) properties_fetched += 1 logger.info(f"\n{'=' * 60}") logger.info(f"Výsledky CityHome:") logger.info(f" Celkem jednotek: {len(all_listings)}") logger.info(f" Vyloučeno (prodáno): {excluded_sold}") logger.info(f" Vyloučeno (typ): {excluded_type}") logger.info(f" Vyloučeno (dispozice): {excluded_disp}") logger.info(f" Vyloučeno (cena): {excluded_price}") logger.info(f" Vyloučeno (plocha): {excluded_area}") logger.info(f" Vyloučeno (patro): {excluded_floor}") logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}") logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f"{'=' * 60}") return results if __name__ == "__main__": parser = argparse.ArgumentParser(description="Scrape apartments from CityHome") parser.add_argument("--max-pages", type=int, default=None, help="Maximum number of listing pages to scrape (not applicable for CityHome)") parser.add_argument("--max-properties", type=int, default=None, help="Maximum number of properties to include in results") parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], help="Logging level (default: INFO)") args = parser.parse_args() # Configure logging logging.basicConfig( level=getattr(logging, args.log_level), format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s", handlers=[logging.StreamHandler()] ) start = time.time() estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) if estates: json_path = Path("byty_cityhome.json") json_path.write_text( json.dumps(estates, ensure_ascii=False, indent=2), encoding="utf-8", ) elapsed = time.time() - start logger.info(f"\n✓ Data uložena: {json_path.resolve()}") logger.info(f"⏱ Celkový čas: {elapsed:.0f} s") else: logger.info("\nŽádné byty z CityHome neodpovídají kritériím :(")