Rewrite PSN + CityHome scrapers, add price/m² map coloring, ratings system, and status dashboard

- Rewrite PSN scraper to use /api/units-list endpoint (single API call, no HTML parsing)
- Fix CityHome scraper: GPS from multiple URL patterns, address from table cells, no 404 retries
- Color map markers by price/m² instead of disposition (blue→green→orange→red scale; see the bucketing sketch after this list)
- Add persistent rating system (favorite/reject) with Flask ratings server and localStorage fallback (see the server sketch after this list)
- Rejected markers show original color at reduced opacity with 🚫 SVG overlay
- Favorite markers shown as ⭐ star icons with gold pulse animation
- Add "new today" marker logic (scraped_at == today) with larger pulsing green outline
- Add filter panel with floor, price, hide-rejected controls and ☰/✕ toggle buttons
- Add generate_status.py for scraper run statistics and status.html dashboard
- Add scraped_at field to all scrapers for freshness tracking
- Update run_all.sh with log capture and status generation
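
The price/m² coloring is easiest to see as code. A minimal Python sketch of the bucketing behind the blue→green→orange→red scale; the thresholds here are invented for illustration and are not the values the map code actually uses:

```python
def price_per_m2_color(price: int, area: float) -> str:
    """Map price/m² onto the blue→green→orange→red scale.

    Thresholds are illustrative CZK/m² breakpoints for Prague flats,
    NOT the breakpoints the real map code uses.
    """
    ppm2 = price / area
    if ppm2 < 120_000:
        return "#2a7fff"  # blue: below-market
    if ppm2 < 150_000:
        return "#2ecc71"  # green: average
    if ppm2 < 180_000:
        return "#ff9f1c"  # orange: expensive
    return "#e74c3c"      # red: top of the market
```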
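Likewise for the ratings persistence, a hypothetical sketch of the Flask side (route names, payload shape, and the ratings.json location are all assumptions, not taken from the actual ratings server). The front end would PUT a rating keyed by hash_id and fall back to localStorage when this server is unreachable:

```python
import json
from pathlib import Path

from flask import Flask, jsonify, request

app = Flask(__name__)
RATINGS_FILE = Path("ratings.json")  # assumed storage location

def load_ratings() -> dict:
    return json.loads(RATINGS_FILE.read_text()) if RATINGS_FILE.exists() else {}

@app.route("/ratings", methods=["GET"])
def get_ratings():
    # Map page loads all ratings once, then renders the 🚫/⭐ markers
    return jsonify(load_ratings())

@app.route("/ratings/<hash_id>", methods=["PUT"])
def set_rating(hash_id: str):
    # Assumed payload: {"rating": "favorite" | "reject" | null}
    ratings = load_ratings()
    ratings[hash_id] = request.get_json().get("rating")
    RATINGS_FILE.write_text(json.dumps(ratings, ensure_ascii=False, indent=2))
    return jsonify({"ok": True})
```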

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Date:   2026-02-18 15:15:25 +01:00
Parent: c6089f0da9
Commit: b8d4d44164

13 changed files with 1922 additions and 395 deletions


@@ -12,6 +12,7 @@ import logging
 import re
 import time
 import urllib.request
+from datetime import datetime
 from pathlib import Path

 logger = logging.getLogger(__name__)
@@ -33,24 +34,26 @@ HEADERS = {
 BASE_URL = "https://www.city-home.cz"

-def fetch_url(url: str) -> str:
-    """Fetch URL and return HTML string."""
-    for attempt in range(3):
+def fetch_url(url: str, retries: int = 3) -> str:
+    """Fetch URL and return HTML string. Raises HTTPError on 4xx/5xx."""
+    for attempt in range(retries):
         try:
-            logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}")
-            logger.debug(f"Headers: {HEADERS}")
+            logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
             req = urllib.request.Request(url, headers=HEADERS)
             resp = urllib.request.urlopen(req, timeout=30)
             html = resp.read().decode("utf-8")
             logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
             return html
+        except urllib.error.HTTPError:
+            # Don't retry on HTTP errors (404, 403, etc.) — re-raise immediately
+            raise
         except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
-            if attempt < 2:
+            if attempt < retries - 1:
                 wait = (attempt + 1) * 2
-                logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}")
+                logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
                 time.sleep(wait)
             else:
-                logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True)
+                logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
                 raise
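
The practical effect of the hunk above: HTTP status errors now fail fast, while connection-level errors still get the linear backoff. A minimal caller sketch (the slug in the URL is made up), using fetch_url as defined above:

```python
import urllib.error

try:
    html = fetch_url("https://www.city-home.cz/projekty/some-made-up-slug/lokalita")
except urllib.error.HTTPError as e:
    # 404/403 etc. are raised on the first attempt, with no pointless retries
    print(f"Hard failure: HTTP {e.code}")
except urllib.error.URLError as e:
    # Connection errors only get here after 3 attempts with 2s/4s backoff
    print(f"Network failure after retries: {e.reason}")
```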
@@ -124,31 +127,21 @@ def parse_filter_page(html: str) -> list[dict]:
         if detail_url and not detail_url.startswith("http"):
             detail_url = BASE_URL + detail_url

-        # Extract floor from cells — look for pattern like "3.NP" or "2.PP"
+        # Parse table cells: [unit_name, unit_type_label, address, floor, disposition, area, transaction, price]
         cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
-        floor = None
-        floor_text = ""
-        project_name = ""
-        for cell in cells:
-            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
-            # Floor pattern
-            np_match = re.search(r'(\d+)\.\s*NP', cell_text)
-            pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
+        cell_texts = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]
+
+        # Cell[2] = address (e.g. "Žateckých 14"), cell[3] = floor (e.g. "3.NP")
+        project_address = cell_texts[2] if len(cell_texts) > 2 else ""
+        floor = None
+        if len(cell_texts) > 3:
+            np_match = re.search(r'(\d+)\.\s*NP', cell_texts[3])
+            pp_match = re.search(r'(\d+)\.\s*PP', cell_texts[3])
             if np_match:
                 floor = int(np_match.group(1))
-                floor_text = cell_text
             elif pp_match:
-                floor = -int(pp_match.group(1))  # Underground
-                floor_text = cell_text
-
-        # Extract project name — usually in a cell that's not a number/price/floor
-        for cell in cells:
-            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
-            if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "Kč" not in cell_text and "€" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text:
-                if len(cell_text) > 3 and cell_text != unit_name:
-                    project_name = cell_text
-                    break
+                floor = -int(pp_match.group(1))

         listing = {
             "price": int(cena.group(1)),
@@ -158,27 +151,55 @@ def parse_filter_page(html: str) -> list[dict]:
             "project_id": project.group(1) if project else "",
             "transaction": transaction.group(1) if transaction else "",
             "disposition": dispozition.group(1) if dispozition else "",
             "location": location.group(1) if location else "",
             "url": detail_url,
             "unit_name": unit_name,
             "floor": floor,
-            "project_name": project_name,
+            "project_address": project_address,
         }
         listings.append(listing)

     return listings


-def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
-    """Extract GPS coordinates for projects from locality pages."""
-    # Pattern in JS: ['<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name']
-    gps_data = {}
-    for match in re.finditer(r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'", html):
-        name = match.group(1).strip()
-        lat = float(match.group(2))
-        lon = float(match.group(3))
-        gps_data[name] = (lat, lon)
-    return gps_data
+def get_lokalita_urls(slug: str) -> list[str]:
+    """Return candidate lokalita URLs to try in order."""
+    return [
+        f"{BASE_URL}/projekty/{slug}/lokalita",
+        f"{BASE_URL}/bytove-domy/{slug}/lokalita",
+        f"{BASE_URL}/bytove-domy/{slug}/lokalita1",
+    ]
+
+
+def extract_project_gps(html: str) -> tuple[float, float] | None:
+    """Extract project GPS from lokalita page JS variable.
+
+    The page contains: var locations = [['<h4>Name</h4>...', 'LAT', 'LNG', 'CATEGORY', 'Label'], ...]
+    Category '1' = the project's own marker. Some projects have two cat-1 entries (data error);
+    in that case we pick the one whose name contains a digit and is not a transit landmark.
+    """
+    block = re.search(r'var locations\s*=\s*\[(.*?)\];', html, re.DOTALL)
+    if not block:
+        return None
+    entries = re.findall(
+        r"'<h4>(.*?)</h4>.*?',\s*'([\d.]+)',\s*'([\d.]+)',\s*'1'",
+        block.group(0),
+        re.DOTALL,
+    )
+    if not entries:
+        return None
+    if len(entries) == 1:
+        return float(entries[0][1]), float(entries[0][2])
+    # Multiple cat-1 entries: pick the real project marker
+    transit_re = re.compile(r'nádraží|park|metro|tramvaj|autobus|zastávka', re.IGNORECASE)
+    for name, lat, lng in entries:
+        if re.search(r'\d', name) and not transit_re.search(name):
+            return float(lat), float(lng)
+    # Fallback: first entry
+    return float(entries[0][1]), float(entries[0][2])


 def scrape(max_pages: int | None = None, max_properties: int | None = None):
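
To make the category handling above concrete, here is a fabricated lokalita-page snippet (markup and coordinates invented for illustration) run through the new parser:

```python
# Fabricated input mimicking the embedded JS blob; not a real page.
sample = """
var locations = [
    ['<h4>Žateckých 14</h4><p>byty</p>', '50.0636', '14.4311', '1', 'Žateckých 14'],
    ['<h4>Park Folimanka</h4>', '50.0668', '14.4265', '2', 'Park'],
];
"""
print(extract_project_gps(sample))  # (50.0636, 14.4311): only the category-1 entry counts
```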
@@ -210,22 +231,24 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
     # Fetch GPS for each project from locality pages
     project_gps = {}
     for slug in sorted(project_slugs):
-        time.sleep(0.5)
-        try:
-            locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
-            logger.debug(f"Fetching project GPS: {locality_url}")
-            loc_html = fetch_url(locality_url)
-            gps = extract_project_gps(loc_html)
-            if gps:
-                # Take first entry (the project itself)
-                first_name, (lat, lon) = next(iter(gps.items()))
-                project_gps[slug] = (lat, lon)
-                logger.info(f"  {slug}: {lat}, {lon}")
-            else:
-                logger.info(f"  {slug}: GPS nenalezeno")
-        except Exception as e:
-            logger.warning(f"Error fetching GPS for {slug}: {e}", exc_info=True)
-            logger.info(f"  {slug}: chyba ({e})")
+        time.sleep(0.3)
+        gps = None
+        for url in get_lokalita_urls(slug):
+            try:
+                logger.debug(f"Fetching project GPS: {url}")
+                loc_html = fetch_url(url)
+                gps = extract_project_gps(loc_html)
+                if gps:
+                    break
+            except Exception as e:
+                logger.debug(f"GPS fetch failed for {url}: {e}")
+                continue
+        if gps:
+            project_gps[slug] = gps
+            logger.info(f"  {slug}: {gps[0]}, {gps[1]}")
+        else:
+            logger.info(f"  {slug}: GPS nenalezeno")

     # Step 3: Filter listings
     logger.info(f"\nFáze 3: Filtrování...")
@@ -303,22 +326,37 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
             lat, lon = gps

+            # locality: use project address from cell (e.g. "Žateckých 14") + city from GPS lookup
+            project_address = listing.get("project_address", "")
+            # derive city from slug (GPS lookup key)
+            city_map = {
+                "karlinske-namesti-5": "Praha 8",
+                "melnicka-12": "Praha 7",
+                "na-vaclavce-34": "Praha 5",
+                "nad-kajetankou-12": "Praha 6",
+                "vosmikovych-3": "Praha 9",
+                "zateckych-14": "Praha 2",
+            }
+            city_str = city_map.get(slug, "Praha")
+            locality_str = f"{project_address}, {city_str}" if project_address else city_str
+
             result = {
                 "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
-                "name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
+                "name": f"Prodej bytu {disp}, {int(area)} m² — {project_address}",
                 "price": price,
                 "price_formatted": format_price(price),
-                "locality": f"{listing['project_name']}, Praha",
+                "locality": locality_str,
                 "lat": lat,
                 "lon": lon,
                 "disposition": disp,
                 "floor": floor,
-                "area": area,
+                "area": float(area),
                 "building_type": "Cihlová",  # CityHome renovuje cihlové domy
                 "ownership": "neuvedeno",
                 "url": url,
                 "source": "cityhome",
                 "image": "",
+                "scraped_at": datetime.now().strftime("%Y-%m-%d"),
             }
             results.append(result)
             properties_fetched += 1
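
The scraped_at stamp added above is what the "new today" marker logic from the commit message keys on. The actual check lives in the map front end; this Python sketch just restates the rule (the helper name is illustrative):

```python
from datetime import datetime

def is_new_today(listing: dict) -> bool:
    # "New today" per the commit message: scraped_at equals the current date
    return listing.get("scraped_at") == datetime.now().strftime("%Y-%m-%d")
```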