- Rewrite PSN scraper to use /api/units-list endpoint (single API call, no HTML parsing) - Fix CityHome scraper: GPS from multiple URL patterns, address from table cells, no 404 retries - Color map markers by price/m² instead of disposition (blue→green→orange→red scale) - Add persistent rating system (favorite/reject) with Flask ratings server and localStorage fallback - Rejected markers show original color at reduced opacity with 🚫 SVG overlay - Favorite markers shown as ⭐ star icons with gold pulse animation - Add "new today" marker logic (scraped_at == today) with larger pulsing green outline - Add filter panel with floor, price, hide-rejected controls and ☰/✕ toggle buttons - Add generate_status.py for scraper run statistics and status.html dashboard - Add scraped_at field to all scrapers for freshness tracking - Update run_all.sh with log capture and status generation Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
411 lines
15 KiB
Python
411 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
CityHome (city-home.cz) scraper.
|
|
Stáhne byty na prodej v Praze z projektů CityHome/SATPO.
|
|
Výstup: byty_cityhome.json
|
|
"""
|
|
from __future__ import annotations

import argparse
import json
import logging
import re
import time
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ── Konfigurace ─────────────────────────────────────────────────────────────
|
|
|
|
MAX_PRICE = 14_000_000
|
|
MIN_AREA = 69
|
|
MIN_FLOOR = 2
|
|
|
|
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
|
|
|
|
HEADERS = {
|
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
"Accept": "text/html,application/xhtml+xml",
|
|
"Accept-Language": "cs,en;q=0.9",
|
|
}
|
|
|
|
BASE_URL = "https://www.city-home.cz"
|
|
|
|
|
|
def fetch_url(url: str, retries: int = 3) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    HTTP status errors (4xx/5xx) are raised immediately without retrying —
    a 404 on a candidate URL is a definitive answer, not a transient fault.
    Connection-level failures are retried with linear backoff (2s, 4s, ...).

    Args:
        url: Absolute URL to fetch.
        retries: Number of attempts for transient connection failures.

    Raises:
        urllib.error.HTTPError: On any 4xx/5xx response (never retried).
        urllib.error.URLError, ConnectionError: When all retries fail.
    """
    for attempt in range(retries):
        try:
            logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
            req = urllib.request.Request(url, headers=HEADERS)
            # Use the response as a context manager so the underlying socket
            # is closed deterministically (the original leaked it until GC).
            with urllib.request.urlopen(req, timeout=30) as resp:
                html = resp.read().decode("utf-8")
                logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
            return html
        except urllib.error.HTTPError:
            # Don't retry on HTTP errors (404, 403, etc.) — re-raise immediately.
            # NOTE: HTTPError subclasses URLError, so this clause must come first.
            raise
        except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
            if attempt < retries - 1:
                wait = (attempt + 1) * 2
                logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
                time.sleep(wait)
            else:
                logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
                raise
|
|
|
|
|
|
def format_price(price: int) -> str:
    """Format an integer CZK amount with space-separated thousands, e.g. '9 500 000 Kč'."""
    digits = str(price)
    # Slice the string right-to-left in chunks of three, then join left-to-right.
    groups = [digits[max(i - 3, 0):i] for i in range(len(digits), 0, -3)]
    return " ".join(reversed(groups)) + " Kč"
|
|
|
|
|
|
def parse_filter_page(html: str) -> list[dict]:
    """Parse all unit rows from the CityHome filter page.

    Each unit is rendered as a ``<tr>`` whose ``data-*`` attributes carry the
    structured fields (price, area, unit type, availability, ...) and whose
    table cells carry the detail link, street address and floor label.

    Args:
        html: Raw HTML of the ``/filtr-nemovitosti1`` page.

    Returns:
        One dict per unit row with keys: ``price``, ``area``, ``unittype``,
        ``free``, ``project_id``, ``transaction``, ``disposition``, ``url``,
        ``unit_name``, ``floor``, ``project_address``. Rows without a numeric
        ``data-cena`` attribute are skipped.
    """
    listings: list[dict] = []

    # Capture each <tr>'s full attribute string plus its inner HTML in one
    # pass. Attribute order on the page is not guaranteed, so each data-*
    # value is pulled out of the attribute string individually below.
    # (Two earlier parsing attempts — a rigid fixed-order pattern and a
    # per-row whole-page re.search — were dead code and have been removed.)
    for match in re.finditer(r'<tr\s+([^>]*data-cena="[^"]*"[^>]*)>(.*?)</tr>', html, re.DOTALL):
        attrs_str = match.group(1)
        row_content = match.group(2)

        # Extract the individual data attributes.
        cena = re.search(r'data-cena="(\d+)"', attrs_str)
        plocha = re.search(r'data-plocha="([\d.]+)"', attrs_str)
        unittype = re.search(r'data-unittype="(\d+)"', attrs_str)
        free = re.search(r'data-free="(yes|no)"', attrs_str)
        project = re.search(r'data-project="(\d+)"', attrs_str)
        transaction = re.search(r'data-transaction="([^"]*)"', attrs_str)
        dispozition = re.search(r'data-dispozition="([^"]*)"', attrs_str)

        # A row without a numeric price is not a unit row.
        if not cena:
            continue

        # Detail URL and unit name come from the first <a> in the row;
        # the link text may contain markup, so tags are stripped.
        link_match = re.search(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', row_content, re.DOTALL)
        detail_url = link_match.group(1).strip() if link_match else ""
        unit_name = re.sub(r'<[^>]+>', '', link_match.group(2)).strip() if link_match else ""

        if detail_url and not detail_url.startswith("http"):
            detail_url = BASE_URL + detail_url

        # Table cells: [unit_name, unit_type_label, address, floor,
        # disposition, area, transaction, price]
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
        cell_texts = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]

        # Cell[2] = address (e.g. "Žateckých 14"), cell[3] = floor (e.g. "3.NP")
        project_address = cell_texts[2] if len(cell_texts) > 2 else ""

        # Floor label: "N.NP" = Nth above-ground floor (positive),
        # "N.PP" = Nth basement level (negative). None when unparsable.
        floor = None
        if len(cell_texts) > 3:
            np_match = re.search(r'(\d+)\.\s*NP', cell_texts[3])
            pp_match = re.search(r'(\d+)\.\s*PP', cell_texts[3])
            if np_match:
                floor = int(np_match.group(1))
            elif pp_match:
                floor = -int(pp_match.group(1))

        listings.append({
            "price": int(cena.group(1)),
            "area": float(plocha.group(1)) if plocha else 0,
            "unittype": int(unittype.group(1)) if unittype else 0,
            "free": free.group(1) if free else "no",
            "project_id": project.group(1) if project else "",
            "transaction": transaction.group(1) if transaction else "",
            "disposition": dispozition.group(1) if dispozition else "",
            "url": detail_url,
            "unit_name": unit_name,
            "floor": floor,
            "project_address": project_address,
        })

    return listings
|
|
|
|
|
|
def get_lokalita_urls(slug: str) -> list[str]:
    """Return candidate 'lokalita' page URLs for *slug*, in the order to try."""
    # Projects live under two different path prefixes, and one project uses
    # a "lokalita1" suffix — try each pattern until one responds.
    candidate_paths = (
        f"/projekty/{slug}/lokalita",
        f"/bytove-domy/{slug}/lokalita",
        f"/bytove-domy/{slug}/lokalita1",
    )
    return [BASE_URL + path for path in candidate_paths]
|
|
|
|
|
|
def extract_project_gps(html: str) -> tuple[float, float] | None:
|
|
"""Extract project GPS from lokalita page JS variable.
|
|
|
|
The page contains: var locations = [['<h4>Name</h4>...', 'LAT', 'LNG', 'CATEGORY', 'Label'], ...]
|
|
Category '1' = the project's own marker. Some projects have two cat-1 entries (data error);
|
|
in that case we pick the one whose name contains a digit and is not a transit landmark.
|
|
"""
|
|
block = re.search(r'var locations\s*=\s*\[(.*?)\];', html, re.DOTALL)
|
|
if not block:
|
|
return None
|
|
|
|
entries = re.findall(
|
|
r"'<h4>(.*?)</h4>.*?',\s*'([\d.]+)',\s*'([\d.]+)',\s*'1'",
|
|
block.group(0),
|
|
re.DOTALL,
|
|
)
|
|
if not entries:
|
|
return None
|
|
|
|
if len(entries) == 1:
|
|
return float(entries[0][1]), float(entries[0][2])
|
|
|
|
# Multiple cat-1 entries: pick the real project marker
|
|
transit_re = re.compile(r'nádraží|park|metro|tramvaj|autobus|zastávka', re.IGNORECASE)
|
|
for name, lat, lng in entries:
|
|
if re.search(r'\d', name) and not transit_re.search(name):
|
|
return float(lat), float(lng)
|
|
|
|
# Fallback: first entry
|
|
return float(entries[0][1]), float(entries[0][2])
|
|
|
|
|
|
def scrape(max_pages: int | None = None, max_properties: int | None = None):
    """Scrape available CityHome apartments and return those matching the filters.

    Pipeline:
      1. Fetch the single filter page that lists every unit.
      2. Resolve GPS coordinates per project from its "lokalita" page.
      3. Keep only free apartments for sale that match the disposition,
         price, area and floor criteria, attaching GPS + locality metadata.

    Args:
        max_pages: Unused for CityHome (single-page source); accepted for
            CLI symmetry with the other scrapers.
        max_properties: Optional cap on the number of returned listings.

    Returns:
        list[dict]: One JSON-ready record per matching apartment.
    """
    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA} m²")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # Step 1: Fetch the main filter page (all units come from this one page)
    logger.info("\nFáze 1: Stahování seznamu bytů...")
    html = fetch_url(f"{BASE_URL}/filtr-nemovitosti1")
    all_listings = parse_filter_page(html)
    logger.info(f"Nalezeno: {len(all_listings)} jednotek")

    # Step 2: Collect unique project slugs from detail URLs to fetch GPS
    logger.info("\nFáze 2: Stahování GPS souřadnic projektů...")
    project_slugs = set()
    for listing in all_listings:
        url = listing.get("url", "")
        # e.g. /projekty/zateckych-14/nabidka-nemovitosti/byt-a31
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        if slug_match:
            project_slugs.add(slug_match.group(1))

    # Fetch GPS for each project from its locality page(s)
    project_gps = {}
    for slug in sorted(project_slugs):
        time.sleep(0.3)  # polite throttle between project requests
        gps = None
        for url in get_lokalita_urls(slug):
            try:
                logger.debug(f"Fetching project GPS: {url}")
                loc_html = fetch_url(url)
                gps = extract_project_gps(loc_html)
                if gps:
                    break
            except Exception as e:
                # Best effort: a failure on one URL pattern just means we
                # move on to the next candidate.
                logger.debug(f"GPS fetch failed for {url}: {e}")
                continue

        if gps:
            project_gps[slug] = gps
            logger.info(f"✓ {slug}: {gps[0]}, {gps[1]}")
        else:
            logger.info(f"✗ {slug}: GPS nenalezeno")

    # slug → city district. The filter page only carries the street address,
    # so the district is looked up by project slug. Hoisted above the
    # per-listing loop (it was previously rebuilt on every iteration).
    city_map = {
        "karlinske-namesti-5": "Praha 8",
        "melnicka-12": "Praha 7",
        "na-vaclavce-34": "Praha 5",
        "nad-kajetankou-12": "Praha 6",
        "vosmikovych-3": "Praha 9",
        "zateckych-14": "Praha 2",
    }

    # Step 3: Filter listings
    logger.info("\nFáze 3: Filtrování...")
    results = []
    excluded_sold = 0
    excluded_type = 0
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_floor = 0
    excluded_no_gps = 0
    properties_fetched = 0

    for listing in all_listings:
        if max_properties and properties_fetched >= max_properties:
            logger.debug(f"Max properties limit reached: {max_properties}")
            break
        unit_name = listing.get("unit_name", "unknown")

        # Only available units
        if listing["free"] != "yes":
            excluded_sold += 1
            logger.debug(f"Filter: {unit_name} - excluded (not free)")
            continue

        # Only apartments (unittype=2)
        if listing["unittype"] != 2:
            excluded_type += 1
            logger.debug(f"Filter: {unit_name} - excluded (not apartment, unittype={listing['unittype']})")
            continue

        # Only sales
        if listing["transaction"] != "prodej":
            excluded_type += 1
            logger.debug(f"Filter: {unit_name} - excluded (not sale, transaction={listing['transaction']})")
            continue

        # Disposition
        disp = listing["disposition"]
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            logger.debug(f"Filter: {unit_name} - excluded (disposition {disp})")
            continue

        # Price (zero/negative means "price on request" — skip those too)
        price = listing["price"]
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            logger.debug(f"Filter: {unit_name} - excluded (price {price})")
            continue

        # Area
        area = listing["area"]
        if area < MIN_AREA:
            excluded_area += 1
            logger.debug(f"Filter: {unit_name} - excluded (area {area} m²)")
            continue

        # Floor (None = unknown floor, which is allowed through)
        floor = listing["floor"]
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            logger.debug(f"Filter: {unit_name} - excluded (floor {floor})")
            continue

        # GPS comes from the unit's project, keyed by slug from its URL
        url = listing.get("url", "")
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        slug = slug_match.group(1) if slug_match else ""
        gps = project_gps.get(slug)

        if not gps:
            excluded_no_gps += 1
            logger.debug(f"Filter: {unit_name} - excluded (no GPS for project {slug})")
            continue

        lat, lon = gps

        # locality: project street address from the table cell (e.g.
        # "Žateckých 14") + district derived from the slug
        project_address = listing.get("project_address", "")
        city_str = city_map.get(slug, "Praha")
        locality_str = f"{project_address}, {city_str}" if project_address else city_str

        results.append({
            "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
            "name": f"Prodej bytu {disp}, {int(area)} m² — {project_address}",
            "price": price,
            "price_formatted": format_price(price),
            "locality": locality_str,
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": float(area),
            "building_type": "Cihlová",  # CityHome renovates brick buildings
            "ownership": "neuvedeno",
            "url": url,
            "source": "cityhome",
            "image": "",
            "scraped_at": datetime.now().strftime("%Y-%m-%d"),
        })
        properties_fetched += 1

    logger.info(f"\n{'=' * 60}")
    logger.info("Výsledky CityHome:")
    logger.info(f"  Celkem jednotek:        {len(all_listings)}")
    logger.info(f"  Vyloučeno (prodáno):    {excluded_sold}")
    logger.info(f"  Vyloučeno (typ):        {excluded_type}")
    logger.info(f"  Vyloučeno (dispozice):  {excluded_disp}")
    logger.info(f"  Vyloučeno (cena):       {excluded_price}")
    logger.info(f"  Vyloučeno (plocha):     {excluded_area}")
    logger.info(f"  Vyloučeno (patro):      {excluded_floor}")
    logger.info(f"  Vyloučeno (bez GPS):    {excluded_no_gps}")
    logger.info(f"  ✓ Vyhovující byty:      {len(results)}")
    logger.info(f"{'=' * 60}")

    return results
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI: all three flags are optional; --max-pages is accepted only for
    # symmetry with the other scrapers (CityHome is a single-page source).
    cli = argparse.ArgumentParser(description="Scrape apartments from CityHome")
    cli.add_argument("--max-pages", type=int, default=None,
                     help="Maximum number of listing pages to scrape (not applicable for CityHome)")
    cli.add_argument("--max-properties", type=int, default=None,
                     help="Maximum number of properties to include in results")
    cli.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                     help="Logging level (default: INFO)")
    opts = cli.parse_args()

    # Root logger: level from the CLI, output to stderr.
    logging.basicConfig(
        level=getattr(logging, opts.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()],
    )

    started_at = time.time()
    flats = scrape(max_pages=opts.max_pages, max_properties=opts.max_properties)

    if flats:
        out_path = Path("byty_cityhome.json")
        out_path.write_text(
            json.dumps(flats, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        duration = time.time() - started_at
        logger.info(f"\n✓ Data uložena: {out_path.resolve()}")
        logger.info(f"⏱ Celkový čas: {duration:.0f} s")
    else:
        logger.info("\nŽádné byty z CityHome neodpovídají kritériím :(")
|