Rewrite PSN + CityHome scrapers, add price/m² map coloring, ratings system, and status dashboard
- Rewrite PSN scraper to use /api/units-list endpoint (single API call, no HTML parsing)
- Fix CityHome scraper: GPS from multiple URL patterns, address from table cells, no 404 retries
- Color map markers by price/m² instead of disposition (blue→green→orange→red scale)
- Add persistent rating system (favorite/reject) with Flask ratings server and localStorage fallback
- Rejected markers show original color at reduced opacity with 🚫 SVG overlay
- Favorite markers shown as ⭐ star icons with gold pulse animation
- Add "new today" marker logic (scraped_at == today) with larger pulsing green outline
- Add filter panel with floor, price, hide-rejected controls and ☰/✕ toggle buttons
- Add generate_status.py for scraper run statistics and status.html dashboard
- Add scraped_at field to all scrapers for freshness tracking
- Update run_all.sh with log capture and status generation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ import logging
|
||||
import re
|
||||
import time
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -33,24 +34,26 @@ HEADERS = {
|
||||
BASE_URL = "https://www.city-home.cz"
|
||||
|
||||
|
||||
def fetch_url(url: str, retries: int = 3) -> str:
    """Fetch a URL and return the response body as a UTF-8 string.

    Transient connection failures are retried with linear backoff
    (2s, 4s, ...); HTTP status errors (404, 403, 5xx, ...) are NOT
    retried and propagate to the caller immediately.

    Args:
        url: Absolute URL to fetch.
        retries: Maximum number of attempts for transient failures.

    Returns:
        The decoded response body.

    Raises:
        urllib.error.HTTPError: On any 4xx/5xx response (no retry).
        urllib.error.URLError: If all retry attempts fail.
        ConnectionError: If all retry attempts fail.
    """
    for attempt in range(retries):
        try:
            logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
            req = urllib.request.Request(url, headers=HEADERS)
            resp = urllib.request.urlopen(req, timeout=30)
            html = resp.read().decode("utf-8")
            logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
            return html
        except urllib.error.HTTPError:
            # Don't retry on HTTP errors (404, 403, etc.) — re-raise immediately
            raise
        except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
            if attempt < retries - 1:
                # Linear backoff before the next attempt: 2s, 4s, ...
                wait = (attempt + 1) * 2
                logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
                time.sleep(wait)
            else:
                logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
                raise
|
||||
|
||||
|
||||
@@ -124,31 +127,21 @@ def parse_filter_page(html: str) -> list[dict]:
|
||||
if detail_url and not detail_url.startswith("http"):
|
||||
detail_url = BASE_URL + detail_url
|
||||
|
||||
# Extract floor from cells — look for pattern like "3.NP" or "2.PP"
|
||||
# Parse table cells: [unit_name, unit_type_label, address, floor, disposition, area, transaction, price]
|
||||
cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
|
||||
floor = None
|
||||
floor_text = ""
|
||||
project_name = ""
|
||||
cell_texts = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]
|
||||
|
||||
for cell in cells:
|
||||
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
|
||||
# Floor pattern
|
||||
np_match = re.search(r'(\d+)\.\s*NP', cell_text)
|
||||
pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
|
||||
# Cell[2] = address (e.g. "Žateckých 14"), cell[3] = floor (e.g. "3.NP")
|
||||
project_address = cell_texts[2] if len(cell_texts) > 2 else ""
|
||||
|
||||
floor = None
|
||||
if len(cell_texts) > 3:
|
||||
np_match = re.search(r'(\d+)\.\s*NP', cell_texts[3])
|
||||
pp_match = re.search(r'(\d+)\.\s*PP', cell_texts[3])
|
||||
if np_match:
|
||||
floor = int(np_match.group(1))
|
||||
floor_text = cell_text
|
||||
elif pp_match:
|
||||
floor = -int(pp_match.group(1)) # Underground
|
||||
floor_text = cell_text
|
||||
|
||||
# Extract project name — usually in a cell that's not a number/price/floor
|
||||
for cell in cells:
|
||||
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
|
||||
if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "m²" not in cell_text and "Kč" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text:
|
||||
if len(cell_text) > 3 and cell_text != unit_name:
|
||||
project_name = cell_text
|
||||
break
|
||||
floor = -int(pp_match.group(1))
|
||||
|
||||
listing = {
|
||||
"price": int(cena.group(1)),
|
||||
@@ -158,27 +151,55 @@ def parse_filter_page(html: str) -> list[dict]:
|
||||
"project_id": project.group(1) if project else "",
|
||||
"transaction": transaction.group(1) if transaction else "",
|
||||
"disposition": dispozition.group(1) if dispozition else "",
|
||||
"location": location.group(1) if location else "",
|
||||
"url": detail_url,
|
||||
"unit_name": unit_name,
|
||||
"floor": floor,
|
||||
"project_name": project_name,
|
||||
"project_address": project_address,
|
||||
}
|
||||
listings.append(listing)
|
||||
|
||||
return listings
|
||||
|
||||
|
||||
def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
|
||||
"""Extract GPS coordinates for projects from locality pages."""
|
||||
# Pattern in JS: ['<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name']
|
||||
gps_data = {}
|
||||
for match in re.finditer(r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'", html):
|
||||
name = match.group(1).strip()
|
||||
lat = float(match.group(2))
|
||||
lon = float(match.group(3))
|
||||
gps_data[name] = (lat, lon)
|
||||
return gps_data
|
||||
def get_lokalita_urls(slug: str) -> list[str]:
|
||||
"""Return candidate lokalita URLs to try in order."""
|
||||
return [
|
||||
f"{BASE_URL}/projekty/{slug}/lokalita",
|
||||
f"{BASE_URL}/bytove-domy/{slug}/lokalita",
|
||||
f"{BASE_URL}/bytove-domy/{slug}/lokalita1",
|
||||
]
|
||||
|
||||
|
||||
def extract_project_gps(html: str) -> tuple[float, float] | None:
|
||||
"""Extract project GPS from lokalita page JS variable.
|
||||
|
||||
The page contains: var locations = [['<h4>Name</h4>...', 'LAT', 'LNG', 'CATEGORY', 'Label'], ...]
|
||||
Category '1' = the project's own marker. Some projects have two cat-1 entries (data error);
|
||||
in that case we pick the one whose name contains a digit and is not a transit landmark.
|
||||
"""
|
||||
block = re.search(r'var locations\s*=\s*\[(.*?)\];', html, re.DOTALL)
|
||||
if not block:
|
||||
return None
|
||||
|
||||
entries = re.findall(
|
||||
r"'<h4>(.*?)</h4>.*?',\s*'([\d.]+)',\s*'([\d.]+)',\s*'1'",
|
||||
block.group(0),
|
||||
re.DOTALL,
|
||||
)
|
||||
if not entries:
|
||||
return None
|
||||
|
||||
if len(entries) == 1:
|
||||
return float(entries[0][1]), float(entries[0][2])
|
||||
|
||||
# Multiple cat-1 entries: pick the real project marker
|
||||
transit_re = re.compile(r'nádraží|park|metro|tramvaj|autobus|zastávka', re.IGNORECASE)
|
||||
for name, lat, lng in entries:
|
||||
if re.search(r'\d', name) and not transit_re.search(name):
|
||||
return float(lat), float(lng)
|
||||
|
||||
# Fallback: first entry
|
||||
return float(entries[0][1]), float(entries[0][2])
|
||||
|
||||
|
||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
@@ -210,22 +231,24 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
# Fetch GPS for each project from locality pages
|
||||
project_gps = {}
|
||||
for slug in sorted(project_slugs):
|
||||
time.sleep(0.5)
|
||||
try:
|
||||
locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
|
||||
logger.debug(f"Fetching project GPS: {locality_url}")
|
||||
loc_html = fetch_url(locality_url)
|
||||
gps = extract_project_gps(loc_html)
|
||||
if gps:
|
||||
# Take first entry (the project itself)
|
||||
first_name, (lat, lon) = next(iter(gps.items()))
|
||||
project_gps[slug] = (lat, lon)
|
||||
logger.info(f"✓ {slug}: {lat}, {lon}")
|
||||
else:
|
||||
logger.info(f"✗ {slug}: GPS nenalezeno")
|
||||
except Exception as e:
|
||||
logger.warning(f"Error fetching GPS for {slug}: {e}", exc_info=True)
|
||||
logger.info(f"✗ {slug}: chyba ({e})")
|
||||
time.sleep(0.3)
|
||||
gps = None
|
||||
for url in get_lokalita_urls(slug):
|
||||
try:
|
||||
logger.debug(f"Fetching project GPS: {url}")
|
||||
loc_html = fetch_url(url)
|
||||
gps = extract_project_gps(loc_html)
|
||||
if gps:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.debug(f"GPS fetch failed for {url}: {e}")
|
||||
continue
|
||||
|
||||
if gps:
|
||||
project_gps[slug] = gps
|
||||
logger.info(f"✓ {slug}: {gps[0]}, {gps[1]}")
|
||||
else:
|
||||
logger.info(f"✗ {slug}: GPS nenalezeno")
|
||||
|
||||
# Step 3: Filter listings
|
||||
logger.info(f"\nFáze 3: Filtrování...")
|
||||
@@ -303,22 +326,37 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
|
||||
lat, lon = gps
|
||||
|
||||
# locality: use project address from cell (e.g. "Žateckých 14") + city from GPS lookup
|
||||
project_address = listing.get("project_address", "")
|
||||
# derive city from slug (GPS lookup key)
|
||||
city_map = {
|
||||
"karlinske-namesti-5": "Praha 8",
|
||||
"melnicka-12": "Praha 7",
|
||||
"na-vaclavce-34": "Praha 5",
|
||||
"nad-kajetankou-12": "Praha 6",
|
||||
"vosmikovych-3": "Praha 9",
|
||||
"zateckych-14": "Praha 2",
|
||||
}
|
||||
city_str = city_map.get(slug, "Praha")
|
||||
locality_str = f"{project_address}, {city_str}" if project_address else city_str
|
||||
|
||||
result = {
|
||||
"hash_id": f"cityhome_{slug}_{listing['unit_name']}",
|
||||
"name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
|
||||
"name": f"Prodej bytu {disp}, {int(area)} m² — {project_address}",
|
||||
"price": price,
|
||||
"price_formatted": format_price(price),
|
||||
"locality": f"{listing['project_name']}, Praha",
|
||||
"locality": locality_str,
|
||||
"lat": lat,
|
||||
"lon": lon,
|
||||
"disposition": disp,
|
||||
"floor": floor,
|
||||
"area": area,
|
||||
"area": float(area),
|
||||
"building_type": "Cihlová", # CityHome renovuje cihlové domy
|
||||
"ownership": "neuvedeno",
|
||||
"url": url,
|
||||
"source": "cityhome",
|
||||
"image": "",
|
||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
Reference in New Issue
Block a user