#!/usr/bin/env python3
"""
CityHome (city-home.cz) scraper.
Stáhne byty na prodej v Praze z projektů CityHome/SATPO.
Výstup: byty_cityhome.json
"""
from __future__ import annotations

import argparse
import json
import logging
import re
import time
import urllib.error
import urllib.request
from pathlib import Path
logger = logging.getLogger(__name__)
# ── Konfigurace ─────────────────────────────────────────────────────────────
MAX_PRICE = 14_000_000
MIN_AREA = 69
MIN_FLOOR = 2
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml",
"Accept-Language": "cs,en;q=0.9",
}
BASE_URL = "https://www.city-home.cz"
def fetch_url(url: str) -> str:
"""Fetch URL and return HTML string."""
for attempt in range(3):
try:
logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}")
logger.debug(f"Headers: {HEADERS}")
req = urllib.request.Request(url, headers=HEADERS)
resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
return html
except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
if attempt < 2:
wait = (attempt + 1) * 2
logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}")
time.sleep(wait)
else:
logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True)
raise
def format_price(price: int) -> str:
s = str(price)
parts = []
while s:
parts.append(s[-3:])
s = s[:-3]
return " ".join(reversed(parts)) + " Kč"
def parse_filter_page(html: str) -> list[dict]:
"""Parse all listing rows from the filter page."""
listings = []
# Find all
with data-cena attribute
row_pattern = re.compile(
r'
]*'
r'data-cena="(\d+)"[^>]*'
r'data-plocha="([\d.]+)"[^>]*'
r'data-unittype="(\d+)"[^>]*'
r'data-free="(yes|no)"[^>]*'
r'data-project="(\d+)"[^>]*'
r'data-transaction="([^"]*)"[^>]*'
r'data-dispozition="([^"]*)"[^>]*'
r'data-location="([^"]*)"[^>]*'
r'>(.*?)
',
re.DOTALL
)
# Also try with different attribute order
rows = re.findall(r']*data-cena="[^"]*"[^>]*>(.*?)
', html, re.DOTALL)
for row_html in rows:
# Extract data attributes from the surrounding
tr_match = re.search(
r'
]*data-cena="([^"]*)"[^>]*data-plocha="([^"]*)"[^>]*'
r'data-unittype="([^"]*)"[^>]*data-free="([^"]*)"[^>]*'
r'data-project="([^"]*)"[^>]*data-transaction="([^"]*)"[^>]*'
r'data-dispozition="([^"]*)"[^>]*data-location="([^"]*)"',
html
)
# More flexible: search around each row
pass
# Better approach: find each tr tag with all its attributes
for match in re.finditer(r'
]*data-cena="[^"]*"[^>]*)>(.*?)
', html, re.DOTALL):
attrs_str = match.group(1)
row_content = match.group(2)
# Extract all data attributes
cena = re.search(r'data-cena="(\d+)"', attrs_str)
plocha = re.search(r'data-plocha="([\d.]+)"', attrs_str)
unittype = re.search(r'data-unittype="(\d+)"', attrs_str)
free = re.search(r'data-free="(yes|no)"', attrs_str)
project = re.search(r'data-project="(\d+)"', attrs_str)
transaction = re.search(r'data-transaction="([^"]*)"', attrs_str)
dispozition = re.search(r'data-dispozition="([^"]*)"', attrs_str)
location = re.search(r'data-location="([^"]*)"', attrs_str)
if not cena:
continue
# Extract detail URL and unit name from first cell
link_match = re.search(r']*href="([^"]*)"[^>]*>(.*?)', row_content, re.DOTALL)
detail_url = link_match.group(1).strip() if link_match else ""
unit_name = re.sub(r'<[^>]+>', '', link_match.group(2)).strip() if link_match else ""
if detail_url and not detail_url.startswith("http"):
detail_url = BASE_URL + detail_url
# Extract floor from cells — look for pattern like "3.NP" or "2.PP"
cells = re.findall(r']*>(.*?) | ', row_content, re.DOTALL)
floor = None
floor_text = ""
project_name = ""
for cell in cells:
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
# Floor pattern
np_match = re.search(r'(\d+)\.\s*NP', cell_text)
pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
if np_match:
floor = int(np_match.group(1))
floor_text = cell_text
elif pp_match:
floor = -int(pp_match.group(1)) # Underground
floor_text = cell_text
# Extract project name — usually in a cell that's not a number/price/floor
for cell in cells:
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "m²" not in cell_text and "Kč" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text:
if len(cell_text) > 3 and cell_text != unit_name:
project_name = cell_text
break
listing = {
"price": int(cena.group(1)),
"area": float(plocha.group(1)) if plocha else 0,
"unittype": int(unittype.group(1)) if unittype else 0,
"free": free.group(1) if free else "no",
"project_id": project.group(1) if project else "",
"transaction": transaction.group(1) if transaction else "",
"disposition": dispozition.group(1) if dispozition else "",
"location": location.group(1) if location else "",
"url": detail_url,
"unit_name": unit_name,
"floor": floor,
"project_name": project_name,
}
listings.append(listing)
return listings
def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
"""Extract GPS coordinates for projects from locality pages."""
# Pattern in JS: ['Project Name
...', 'LAT', 'LON', '1', 'Name']
gps_data = {}
for match in re.finditer(r"\['[^']*([^<]+)
[^']*',\s*'([\d.]+)',\s*'([\d.]+)'", html):
name = match.group(1).strip()
lat = float(match.group(2))
lon = float(match.group(3))
gps_data[name] = (lat, lon)
return gps_data
def scrape(max_pages: int | None = None, max_properties: int | None = None):
logger.info("=" * 60)
logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
logger.info(f"Min. plocha: {MIN_AREA} m²")
logger.info(f"Patro: od {MIN_FLOOR}. NP")
if max_properties:
logger.info(f"Max. bytů: {max_properties}")
logger.info("=" * 60)
# Step 1: Fetch the main filter page
logger.info("\nFáze 1: Stahování seznamu bytů...")
html = fetch_url(f"{BASE_URL}/filtr-nemovitosti1")
all_listings = parse_filter_page(html)
logger.info(f"Nalezeno: {len(all_listings)} jednotek")
# Step 2: Collect unique project slugs from detail URLs to fetch GPS
logger.info("\nFáze 2: Stahování GPS souřadnic projektů...")
project_slugs = set()
for listing in all_listings:
url = listing.get("url", "")
# /projekty/zateckych-14/nabidka-nemovitosti/byt-a31
slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
if slug_match:
project_slugs.add(slug_match.group(1))
# Fetch GPS for each project from locality pages
project_gps = {}
for slug in sorted(project_slugs):
time.sleep(0.5)
try:
locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
logger.debug(f"Fetching project GPS: {locality_url}")
loc_html = fetch_url(locality_url)
gps = extract_project_gps(loc_html)
if gps:
# Take first entry (the project itself)
first_name, (lat, lon) = next(iter(gps.items()))
project_gps[slug] = (lat, lon)
logger.info(f"✓ {slug}: {lat}, {lon}")
else:
logger.info(f"✗ {slug}: GPS nenalezeno")
except Exception as e:
logger.warning(f"Error fetching GPS for {slug}: {e}", exc_info=True)
logger.info(f"✗ {slug}: chyba ({e})")
# Step 3: Filter listings
logger.info(f"\nFáze 3: Filtrování...")
results = []
excluded_sold = 0
excluded_type = 0
excluded_disp = 0
excluded_price = 0
excluded_area = 0
excluded_floor = 0
excluded_no_gps = 0
properties_fetched = 0
for listing in all_listings:
if max_properties and properties_fetched >= max_properties:
logger.debug(f"Max properties limit reached: {max_properties}")
break
unit_name = listing.get("unit_name", "unknown")
# Only available units
if listing["free"] != "yes":
excluded_sold += 1
logger.debug(f"Filter: {unit_name} - excluded (not free)")
continue
# Only apartments (unittype=2)
if listing["unittype"] != 2:
excluded_type += 1
logger.debug(f"Filter: {unit_name} - excluded (not apartment, unittype={listing['unittype']})")
continue
# Only sales
if listing["transaction"] != "prodej":
excluded_type += 1
logger.debug(f"Filter: {unit_name} - excluded (not sale, transaction={listing['transaction']})")
continue
# Disposition
disp = listing["disposition"]
if disp not in WANTED_DISPOSITIONS:
excluded_disp += 1
logger.debug(f"Filter: {unit_name} - excluded (disposition {disp})")
continue
# Price
price = listing["price"]
if price <= 0 or price > MAX_PRICE:
excluded_price += 1
logger.debug(f"Filter: {unit_name} - excluded (price {price})")
continue
# Area
area = listing["area"]
if area < MIN_AREA:
excluded_area += 1
logger.debug(f"Filter: {unit_name} - excluded (area {area} m²)")
continue
# Floor
floor = listing["floor"]
if floor is not None and floor < MIN_FLOOR:
excluded_floor += 1
logger.debug(f"Filter: {unit_name} - excluded (floor {floor})")
continue
# GPS from project
url = listing.get("url", "")
slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
slug = slug_match.group(1) if slug_match else ""
gps = project_gps.get(slug)
if not gps:
excluded_no_gps += 1
logger.debug(f"Filter: {unit_name} - excluded (no GPS for project {slug})")
continue
lat, lon = gps
result = {
"hash_id": f"cityhome_{slug}_{listing['unit_name']}",
"name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
"price": price,
"price_formatted": format_price(price),
"locality": f"{listing['project_name']}, Praha",
"lat": lat,
"lon": lon,
"disposition": disp,
"floor": floor,
"area": area,
"building_type": "Cihlová", # CityHome renovuje cihlové domy
"ownership": "neuvedeno",
"url": url,
"source": "cityhome",
"image": "",
}
results.append(result)
properties_fetched += 1
logger.info(f"\n{'=' * 60}")
logger.info(f"Výsledky CityHome:")
logger.info(f" Celkem jednotek: {len(all_listings)}")
logger.info(f" Vyloučeno (prodáno): {excluded_sold}")
logger.info(f" Vyloučeno (typ): {excluded_type}")
logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
logger.info(f" Vyloučeno (cena): {excluded_price}")
logger.info(f" Vyloučeno (plocha): {excluded_area}")
logger.info(f" Vyloučeno (patro): {excluded_floor}")
logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}")
logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}")
return results
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Scrape apartments from CityHome")
parser.add_argument("--max-pages", type=int, default=None,
help="Maximum number of listing pages to scrape (not applicable for CityHome)")
parser.add_argument("--max-properties", type=int, default=None,
help="Maximum number of properties to include in results")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)")
args = parser.parse_args()
# Configure logging
logging.basicConfig(
level=getattr(logging, args.log_level),
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
handlers=[logging.StreamHandler()]
)
start = time.time()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
if estates:
json_path = Path("byty_cityhome.json")
json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8",
)
elapsed = time.time() - start
logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
else:
logger.info("\nŽádné byty z CityHome neodpovídají kritériím :(")