Rewrite PSN + CityHome scrapers, add price/m² map coloring, ratings system, and status dashboard
- Rewrite PSN scraper to use /api/units-list endpoint (single API call, no HTML parsing)
- Fix CityHome scraper: GPS from multiple URL patterns, address from table cells, no 404 retries
- Color map markers by price/m² instead of disposition (blue→green→orange→red scale)
- Add persistent rating system (favorite/reject) with Flask ratings server and localStorage fallback
- Rejected markers show original color at reduced opacity with 🚫 SVG overlay
- Favorite markers shown as ⭐ star icons with gold pulse animation
- Add "new today" marker logic (scraped_at == today) with larger pulsing green outline
- Add filter panel with floor, price, hide-rejected controls and ☰/✕ toggle buttons
- Add generate_status.py for scraper run statistics and status.html dashboard
- Add scraped_at field to all scrapers for freshness tracking
- Update run_all.sh with log capture and status generation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ import logging
|
||||
import re
|
||||
import time
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -33,24 +34,26 @@ HEADERS = {
|
||||
BASE_URL = "https://www.city-home.cz"
|
||||
|
||||
|
||||
def fetch_url(url: str, retries: int = 3) -> str:
    """Fetch a URL and return the response body as a UTF-8 string.

    Transient connection failures are retried with linear backoff
    (2s, 4s, ...); HTTP status errors (404, 403, 5xx, ...) are NOT
    retried and propagate to the caller immediately.

    Args:
        url: Absolute URL to fetch.
        retries: Maximum number of attempts for transient failures.

    Returns:
        The decoded response body.

    Raises:
        urllib.error.HTTPError: On any 4xx/5xx response (no retry).
        urllib.error.URLError: If all retry attempts fail.
        ConnectionError: If all retry attempts fail.
    """
    for attempt in range(retries):
        try:
            logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
            req = urllib.request.Request(url, headers=HEADERS)
            resp = urllib.request.urlopen(req, timeout=30)
            html = resp.read().decode("utf-8")
            logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
            return html
        except urllib.error.HTTPError:
            # Don't retry on HTTP errors (404, 403, etc.) — re-raise immediately
            raise
        except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
            if attempt < retries - 1:
                # Linear backoff before the next attempt: 2s, 4s, ...
                wait = (attempt + 1) * 2
                logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
                time.sleep(wait)
            else:
                logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
                raise
|
||||
|
||||
|
||||
@@ -124,31 +127,21 @@ def parse_filter_page(html: str) -> list[dict]:
|
||||
if detail_url and not detail_url.startswith("http"):
|
||||
detail_url = BASE_URL + detail_url
|
||||
|
||||
# Extract floor from cells — look for pattern like "3.NP" or "2.PP"
|
||||
# Parse table cells: [unit_name, unit_type_label, address, floor, disposition, area, transaction, price]
|
||||
cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
|
||||
floor = None
|
||||
floor_text = ""
|
||||
project_name = ""
|
||||
cell_texts = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]
|
||||
|
||||
for cell in cells:
|
||||
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
|
||||
# Floor pattern
|
||||
np_match = re.search(r'(\d+)\.\s*NP', cell_text)
|
||||
pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
|
||||
# Cell[2] = address (e.g. "Žateckých 14"), cell[3] = floor (e.g. "3.NP")
|
||||
project_address = cell_texts[2] if len(cell_texts) > 2 else ""
|
||||
|
||||
floor = None
|
||||
if len(cell_texts) > 3:
|
||||
np_match = re.search(r'(\d+)\.\s*NP', cell_texts[3])
|
||||
pp_match = re.search(r'(\d+)\.\s*PP', cell_texts[3])
|
||||
if np_match:
|
||||
floor = int(np_match.group(1))
|
||||
floor_text = cell_text
|
||||
elif pp_match:
|
||||
floor = -int(pp_match.group(1)) # Underground
|
||||
floor_text = cell_text
|
||||
|
||||
# Extract project name — usually in a cell that's not a number/price/floor
|
||||
for cell in cells:
|
||||
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
|
||||
if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "m²" not in cell_text and "Kč" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text:
|
||||
if len(cell_text) > 3 and cell_text != unit_name:
|
||||
project_name = cell_text
|
||||
break
|
||||
floor = -int(pp_match.group(1))
|
||||
|
||||
listing = {
|
||||
"price": int(cena.group(1)),
|
||||
@@ -158,27 +151,55 @@ def parse_filter_page(html: str) -> list[dict]:
|
||||
"project_id": project.group(1) if project else "",
|
||||
"transaction": transaction.group(1) if transaction else "",
|
||||
"disposition": dispozition.group(1) if dispozition else "",
|
||||
"location": location.group(1) if location else "",
|
||||
"url": detail_url,
|
||||
"unit_name": unit_name,
|
||||
"floor": floor,
|
||||
"project_name": project_name,
|
||||
"project_address": project_address,
|
||||
}
|
||||
listings.append(listing)
|
||||
|
||||
return listings
|
||||
|
||||
|
||||
def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
|
||||
"""Extract GPS coordinates for projects from locality pages."""
|
||||
# Pattern in JS: ['<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name']
|
||||
gps_data = {}
|
||||
for match in re.finditer(r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'", html):
|
||||
name = match.group(1).strip()
|
||||
lat = float(match.group(2))
|
||||
lon = float(match.group(3))
|
||||
gps_data[name] = (lat, lon)
|
||||
return gps_data
|
||||
def get_lokalita_urls(slug: str) -> list[str]:
|
||||
"""Return candidate lokalita URLs to try in order."""
|
||||
return [
|
||||
f"{BASE_URL}/projekty/{slug}/lokalita",
|
||||
f"{BASE_URL}/bytove-domy/{slug}/lokalita",
|
||||
f"{BASE_URL}/bytove-domy/{slug}/lokalita1",
|
||||
]
|
||||
|
||||
|
||||
def extract_project_gps(html: str) -> tuple[float, float] | None:
|
||||
"""Extract project GPS from lokalita page JS variable.
|
||||
|
||||
The page contains: var locations = [['<h4>Name</h4>...', 'LAT', 'LNG', 'CATEGORY', 'Label'], ...]
|
||||
Category '1' = the project's own marker. Some projects have two cat-1 entries (data error);
|
||||
in that case we pick the one whose name contains a digit and is not a transit landmark.
|
||||
"""
|
||||
block = re.search(r'var locations\s*=\s*\[(.*?)\];', html, re.DOTALL)
|
||||
if not block:
|
||||
return None
|
||||
|
||||
entries = re.findall(
|
||||
r"'<h4>(.*?)</h4>.*?',\s*'([\d.]+)',\s*'([\d.]+)',\s*'1'",
|
||||
block.group(0),
|
||||
re.DOTALL,
|
||||
)
|
||||
if not entries:
|
||||
return None
|
||||
|
||||
if len(entries) == 1:
|
||||
return float(entries[0][1]), float(entries[0][2])
|
||||
|
||||
# Multiple cat-1 entries: pick the real project marker
|
||||
transit_re = re.compile(r'nádraží|park|metro|tramvaj|autobus|zastávka', re.IGNORECASE)
|
||||
for name, lat, lng in entries:
|
||||
if re.search(r'\d', name) and not transit_re.search(name):
|
||||
return float(lat), float(lng)
|
||||
|
||||
# Fallback: first entry
|
||||
return float(entries[0][1]), float(entries[0][2])
|
||||
|
||||
|
||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
@@ -210,22 +231,24 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
# Fetch GPS for each project from locality pages
|
||||
project_gps = {}
|
||||
for slug in sorted(project_slugs):
|
||||
time.sleep(0.5)
|
||||
try:
|
||||
locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
|
||||
logger.debug(f"Fetching project GPS: {locality_url}")
|
||||
loc_html = fetch_url(locality_url)
|
||||
gps = extract_project_gps(loc_html)
|
||||
if gps:
|
||||
# Take first entry (the project itself)
|
||||
first_name, (lat, lon) = next(iter(gps.items()))
|
||||
project_gps[slug] = (lat, lon)
|
||||
logger.info(f"✓ {slug}: {lat}, {lon}")
|
||||
else:
|
||||
logger.info(f"✗ {slug}: GPS nenalezeno")
|
||||
except Exception as e:
|
||||
logger.warning(f"Error fetching GPS for {slug}: {e}", exc_info=True)
|
||||
logger.info(f"✗ {slug}: chyba ({e})")
|
||||
time.sleep(0.3)
|
||||
gps = None
|
||||
for url in get_lokalita_urls(slug):
|
||||
try:
|
||||
logger.debug(f"Fetching project GPS: {url}")
|
||||
loc_html = fetch_url(url)
|
||||
gps = extract_project_gps(loc_html)
|
||||
if gps:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.debug(f"GPS fetch failed for {url}: {e}")
|
||||
continue
|
||||
|
||||
if gps:
|
||||
project_gps[slug] = gps
|
||||
logger.info(f"✓ {slug}: {gps[0]}, {gps[1]}")
|
||||
else:
|
||||
logger.info(f"✗ {slug}: GPS nenalezeno")
|
||||
|
||||
# Step 3: Filter listings
|
||||
logger.info(f"\nFáze 3: Filtrování...")
|
||||
@@ -303,22 +326,37 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
|
||||
lat, lon = gps
|
||||
|
||||
# locality: use project address from cell (e.g. "Žateckých 14") + city from GPS lookup
|
||||
project_address = listing.get("project_address", "")
|
||||
# derive city from slug (GPS lookup key)
|
||||
city_map = {
|
||||
"karlinske-namesti-5": "Praha 8",
|
||||
"melnicka-12": "Praha 7",
|
||||
"na-vaclavce-34": "Praha 5",
|
||||
"nad-kajetankou-12": "Praha 6",
|
||||
"vosmikovych-3": "Praha 9",
|
||||
"zateckych-14": "Praha 2",
|
||||
}
|
||||
city_str = city_map.get(slug, "Praha")
|
||||
locality_str = f"{project_address}, {city_str}" if project_address else city_str
|
||||
|
||||
result = {
|
||||
"hash_id": f"cityhome_{slug}_{listing['unit_name']}",
|
||||
"name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
|
||||
"name": f"Prodej bytu {disp}, {int(area)} m² — {project_address}",
|
||||
"price": price,
|
||||
"price_formatted": format_price(price),
|
||||
"locality": f"{listing['project_name']}, Praha",
|
||||
"locality": locality_str,
|
||||
"lat": lat,
|
||||
"lon": lon,
|
||||
"disposition": disp,
|
||||
"floor": floor,
|
||||
"area": area,
|
||||
"area": float(area),
|
||||
"building_type": "Cihlová", # CityHome renovuje cihlové domy
|
||||
"ownership": "neuvedeno",
|
||||
"url": url,
|
||||
"source": "cityhome",
|
||||
"image": "",
|
||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
Reference in New Issue
Block a user