From 27a7834eb6180a4c229e816c1274bd643853fba1 Mon Sep 17 00:00:00 2001
From: Jan Novak <jan.novak@livesport.eu>
Date: Fri, 27 Feb 2026 10:36:37 +0100
Subject: [PATCH 1/2] Reliability improvements: retry logic, validation,
 ratings sync

- Add 3-attempt retry with linear backoff (2 s, then 4 s) to Sreality,
  Realingo, Bezrealitky, and PSN scrapers (CityHome and iDNES already
  had it)
- Add shared validate_listing() in scraper_stats.py; all 6 scrapers now
  validate GPS bounds, price, area, and required fields before output
- Wire ratings to server /api/ratings on page load (merge with
  localStorage) and save (async POST); ratings now persist across
  browsers and devices
- Namespace JS hash IDs as {source}_{id} to prevent rating collisions
  between listings from different portals with the same numeric ID
- Replace manual Czech diacritic table with unicodedata.normalize()
  in merge_and_map.py for correct deduplication of all edge cases
- Correct README schedule docs: every 4 hours, not twice daily

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md             |  6 +--
 merge_and_map.py      | 11 ++----
 scrape_and_map.py     | 65 +++++++++++++++++++++++--------
 scrape_bezrealitky.py | 91 ++++++++++++++++++++++++-------------------
 scrape_cityhome.py    |  4 +-
 scrape_idnes.py       |  4 +-
 scrape_psn.py         | 36 ++++++++++-------
 scrape_realingo.py    | 67 +++++++++++++++++--------------
 scraper_stats.py      | 42 +++++++++++++++++++-
 9 files changed, 212 insertions(+), 114 deletions(-)

diff --git a/README.md b/README.md
index cd7e52f..850b6c6 100644
--- a/README.md
+++ b/README.md
@@ -151,7 +151,7 @@ The project includes a Docker setup for unattended operation with a cron-based s
 │  PID 1: python3 -m http.server :8080    │
 │         serves /app/data/               │
 │                                         │
-│  crond:  runs run_all.sh at 06:00/18:00 │
+│  crond:  runs run_all.sh every 4 hours  │
 │          Europe/Prague timezone          │
 │                                         │
 │  /app/        -- scripts (.py, .sh)     │
@@ -160,7 +160,7 @@ The project includes a Docker setup for unattended operation with a cron-based s
 └─────────────────────────────────────────┘
 ```
 
-On startup, the HTTP server starts immediately. The initial scrape runs in the background. Subsequent cron runs update data in-place twice daily at 06:00 and 18:00 CET/CEST.
+On startup, the HTTP server starts immediately. The initial scrape runs in the background. Subsequent cron runs update data in-place every 4 hours.
 
 ### Quick start
 
@@ -208,7 +208,7 @@ Validation targets run scrapers with `--max-pages 1 --max-properties 10` for a f
 ├── build/
 │   ├── Dockerfile          # Container image definition (python:3.13-alpine)
 │   ├── entrypoint.sh       # Container entrypoint (HTTP server + cron + initial scrape)
-│   ├── crontab             # Cron schedule (06:00 and 18:00 CET)
+│   ├── crontab             # Cron schedule (every 4 hours)
 │   └── CONTAINER.md        # Container-specific documentation
 └── .gitignore              # Ignores byty_*.json, __pycache__, .vscode
 ```
diff --git a/merge_and_map.py b/merge_and_map.py
index 74fceaa..1eb9406 100644
--- a/merge_and_map.py
+++ b/merge_and_map.py
@@ -9,6 +9,7 @@ from __future__ import annotations
 
 import json
 import re
+import unicodedata
 from pathlib import Path
 
 from scrape_and_map import generate_map, format_price
@@ -19,14 +20,8 @@ def normalize_street(locality: str) -> str:
     # "Studentská, Praha 6 - Dejvice" → "studentska"
     # "Rýnská, Praha" → "rynska"
     street = locality.split(",")[0].strip().lower()
-    # Remove diacritics (simple Czech)
-    replacements = {
-        "á": "a", "č": "c", "ď": "d", "é": "e", "ě": "e",
-        "í": "i", "ň": "n", "ó": "o", "ř": "r", "š": "s",
-        "ť": "t", "ú": "u", "ů": "u", "ý": "y", "ž": "z",
-    }
-    for src, dst in replacements.items():
-        street = street.replace(src, dst)
+    # Remove diacritics using Unicode decomposition (handles all Czech characters)
+    street = unicodedata.normalize("NFKD", street).encode("ascii", "ignore").decode("ascii")
     # Remove non-alphanumeric
     street = re.sub(r"[^a-z0-9]", "", street)
     return street
diff --git a/scrape_and_map.py b/scrape_and_map.py
index c129bb8..0b49717 100644
--- a/scrape_and_map.py
+++ b/scrape_and_map.py
@@ -15,7 +15,7 @@ import urllib.request
 import urllib.parse
 from datetime import datetime, timedelta
 from pathlib import Path
-from scraper_stats import write_stats
+from scraper_stats import write_stats, validate_listing
 
 STATS_FILE = "stats_sreality.json"
 
@@ -45,19 +45,26 @@ HEADERS = {
 
 
 def api_get(url: str) -> dict:
-    """Fetch JSON from Sreality API."""
-    logger.debug(f"HTTP GET request: {url}")
-    logger.debug(f"Headers: {HEADERS}")
-    req = urllib.request.Request(url, headers=HEADERS)
-    try:
-        with urllib.request.urlopen(req, timeout=30) as resp:
-            response_data = resp.read().decode("utf-8")
-            logger.debug(f"HTTP response: status={resp.status}, size={len(response_data)} bytes")
-            logger.debug(f"Response preview: {response_data[:200]}")
-            return json.loads(response_data)
-    except (urllib.error.URLError, ConnectionError, OSError) as e:
-        logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
-        raise
+    """Fetch JSON from Sreality API with retry."""
+    for attempt in range(3):
+        logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}")
+        req = urllib.request.Request(url, headers=HEADERS)
+        try:
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                response_data = resp.read().decode("utf-8")
+                logger.debug(f"HTTP response: status={resp.status}, size={len(response_data)} bytes")
+                logger.debug(f"Response preview: {response_data[:200]}")
+                return json.loads(response_data)
+        except urllib.error.HTTPError:
+            raise
+        except (urllib.error.URLError, ConnectionError, OSError) as e:
+            if attempt < 2:
+                wait = (attempt + 1) * 2
+                logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}")
+                time.sleep(wait)
+            else:
+                logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True)
+                raise
 
 
 def build_list_url(disposition: int, page: int = 1) -> str:
@@ -356,6 +363,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
             "first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
             "last_changed": datetime.now().strftime("%Y-%m-%d"),
         }
+        if not validate_listing(result, "sreality"):
+            continue
         results.append(result)
         details_fetched += 1
 
@@ -476,7 +485,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
         source_label = source_labels.get(source, source)
         source_color = source_colors.get(source, "#999")
 
-        hash_id = e.get("hash_id", "")
+        hash_id = f"{source}_{e.get('hash_id', '')}"
 
         first_seen = e.get("first_seen", "")
         last_changed = e.get("last_changed", "")
@@ -864,6 +873,11 @@ function loadRatings() {{
 
 function saveRatings(ratings) {{
   localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings));
+  fetch('/api/ratings', {{
+    method: 'POST',
+    headers: {{'Content-Type': 'application/json'}},
+    body: JSON.stringify(ratings)
+  }}).catch(function() {{}});
 }}
 
 function addRejectStrike(marker) {{
@@ -1167,8 +1181,25 @@ function applyFilters() {{
   document.getElementById('visible-count').textContent = visible;
 }}
 
-// Initialize ratings on load
-restoreRatings();
+// Initialize ratings: load from server, merge with localStorage, then restore
+function initRatings() {{
+  var local = loadRatings();
+  fetch('/api/ratings')
+    .then(function(r) {{ return r.ok ? r.json() : null; }})
+    .then(function(server) {{
+      if (server && typeof server === 'object') {{
+        var merged = Object.assign({{}}, local, server);
+        localStorage.setItem(RATINGS_KEY, JSON.stringify(merged));
+      }}
+      restoreRatings();
+      updateRatingCounts();
+    }})
+    .catch(function() {{
+      restoreRatings();
+      updateRatingCounts();
+    }});
+}}
+initRatings();
 
 // ── Panel toggle ──────────────────────────────────────────────
 function togglePanel() {{
diff --git a/scrape_bezrealitky.py b/scrape_bezrealitky.py
index bd0761e..4bb3dd4 100644
--- a/scrape_bezrealitky.py
+++ b/scrape_bezrealitky.py
@@ -15,7 +15,7 @@ import re
 import time
 import urllib.request
 from pathlib import Path
-from scraper_stats import write_stats
+from scraper_stats import write_stats, validate_listing
 
 STATS_FILE = "stats_bezrealitky.json"
 
@@ -71,62 +71,71 @@ HEADERS = {
 BASE_URL = "https://www.bezrealitky.cz"
 
 
+def fetch_url(url: str, retries: int = 3) -> str:
+    """Fetch `url` and return its body decoded as UTF-8; connection errors are retried, up to `retries` attempts in total."""
+    for attempt in range(retries):
+        try:
+            logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
+            req = urllib.request.Request(url, headers=HEADERS)
+            with urllib.request.urlopen(req, timeout=30) as resp:  # closed even if read()/decode() raises
+                html = resp.read().decode("utf-8")
+                logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
+            return html
+        except urllib.error.HTTPError:  # subclass of URLError: caught first so HTTP status errors are not retried
+            raise
+        except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e:
+            if attempt < retries - 1:
+                wait = (attempt + 1) * 2  # linear backoff: 2s, 4s, ...
+                logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
+                time.sleep(wait)
+            else:
+                logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
+                raise  # last attempt: propagate the original exception to the caller
+
+
 def fetch_page(page: int) -> tuple[list[dict], int]:
     """
     Fetch a listing page from Bezrealitky.
     Returns (list of advert dicts from Apollo cache, total count).
     """
     url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
-    logger.debug(f"HTTP GET request: {url}")
-    logger.debug(f"Headers: {HEADERS}")
-    req = urllib.request.Request(url, headers=HEADERS)
-    try:
-        resp = urllib.request.urlopen(req, timeout=30)
-        html = resp.read().decode("utf-8")
-        logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
+    html = fetch_url(url)
 
-        match = re.search(
-            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
-            html, re.DOTALL
-        )
-        if not match:
-            logger.debug("No __NEXT_DATA__ script found in HTML")
-            return [], 0
+    match = re.search(
+        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
+        html, re.DOTALL
+    )
+    if not match:
+        logger.debug("No __NEXT_DATA__ script found in HTML")
+        return [], 0
 
-        data = json.loads(match.group(1))
-        cache = data["props"]["pageProps"]["apolloCache"]
+    data = json.loads(match.group(1))
+    cache = data["props"]["pageProps"]["apolloCache"]
 
-        # Extract adverts from cache
-        adverts = []
-        for key, val in cache.items():
-            if key.startswith("Advert:") and isinstance(val, dict) and val.get("__typename") == "Advert":
-                adverts.append(val)
+    # Extract adverts from cache
+    adverts = []
+    for key, val in cache.items():
+        if key.startswith("Advert:") and isinstance(val, dict) and val.get("__typename") == "Advert":
+            adverts.append(val)
 
-        # Get total count from ROOT_QUERY
-        total = 0
-        root = cache.get("ROOT_QUERY", {})
-        for key, val in root.items():
-            if "listAdverts" in key and isinstance(val, dict):
-                tc = val.get("totalCount")
-                if tc and tc > total:
-                    total = tc
+    # Get total count from ROOT_QUERY
+    total = 0
+    root = cache.get("ROOT_QUERY", {})
+    for key, val in root.items():
+        if "listAdverts" in key and isinstance(val, dict):
+            tc = val.get("totalCount")
+            if tc and tc > total:
+                total = tc
 
-        logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}")
-        return adverts, total
-    except (urllib.error.URLError, ConnectionError, OSError) as e:
-        logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
-        raise
+    logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}")
+    return adverts, total
 
 
 def fetch_detail(uri: str) -> dict | None:
     """Fetch detail page for a listing."""
     try:
         url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
-        logger.debug(f"HTTP GET request: {url}")
-        req = urllib.request.Request(url, headers=HEADERS)
-        resp = urllib.request.urlopen(req, timeout=30)
-        html = resp.read().decode("utf-8")
-        logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
+        html = fetch_url(url)
 
         match = re.search(
             r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
@@ -365,6 +374,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
             "first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
             "last_changed": datetime.now().strftime("%Y-%m-%d"),
         }
+        if not validate_listing(result, "bezrealitky"):
+            continue
         results.append(result)
         properties_fetched += 1
 
diff --git a/scrape_cityhome.py b/scrape_cityhome.py
index dc2da92..d39f735 100644
--- a/scrape_cityhome.py
+++ b/scrape_cityhome.py
@@ -14,7 +14,7 @@ import time
 import urllib.request
 from datetime import datetime
 from pathlib import Path
-from scraper_stats import write_stats
+from scraper_stats import write_stats, validate_listing
 
 STATS_FILE = "stats_cityhome.json"
 
@@ -375,6 +375,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
             "first_seen": _prev_cache.get(f"cityhome_{slug}_{listing['unit_name']}", {}).get("first_seen", datetime.now().strftime("%Y-%m-%d")),
             "last_changed": datetime.now().strftime("%Y-%m-%d") if _prev_cache.get(f"cityhome_{slug}_{listing['unit_name']}", {}).get("price") != price else _prev_cache[f"cityhome_{slug}_{listing['unit_name']}"].get("last_changed", datetime.now().strftime("%Y-%m-%d")),
         }
+        if not validate_listing(result, "cityhome"):
+            continue
         results.append(result)
         properties_fetched += 1
 
diff --git a/scrape_idnes.py b/scrape_idnes.py
index 88f17ff..b788acc 100644
--- a/scrape_idnes.py
+++ b/scrape_idnes.py
@@ -16,7 +16,7 @@ import time
 import urllib.request
 import urllib.parse
 from pathlib import Path
-from scraper_stats import write_stats
+from scraper_stats import write_stats, validate_listing
 
 STATS_FILE = "stats_idnes.json"
 
@@ -467,6 +467,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
             "first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
             "last_changed": datetime.now().strftime("%Y-%m-%d"),
         }
+        if not validate_listing(result, "idnes"):
+            continue
         results.append(result)
         properties_fetched += 1
 
diff --git a/scrape_psn.py b/scrape_psn.py
index 71cde1a..b4ce6f8 100644
--- a/scrape_psn.py
+++ b/scrape_psn.py
@@ -15,7 +15,7 @@ import time
 from datetime import datetime
 from pathlib import Path
 from urllib.parse import urlencode
-from scraper_stats import write_stats
+from scraper_stats import write_stats, validate_listing
 
 STATS_FILE = "stats_psn.json"
 
@@ -38,19 +38,25 @@ BASE_URL = "https://psn.cz"
 UNITS_API = f"{BASE_URL}/api/units-list"
 
 
-def fetch_json(url: str) -> dict:
-    """Fetch JSON via curl (urllib SSL may fail on Cloudflare)."""
-    logger.debug(f"HTTP GET: {url}")
-    result = subprocess.run(
-        ["curl", "-s", "-L", "--max-time", "30",
-         "-H", f"User-Agent: {UA}",
-         "-H", "Accept: application/json",
-         url],
-        capture_output=True, text=True, timeout=60
-    )
-    if result.returncode != 0:
-        raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
-    return json.loads(result.stdout)
+def fetch_json(url: str, retries: int = 3) -> dict:
+    """Fetch JSON via curl (urllib SSL may fail on Cloudflare) with retry."""
+    for attempt in range(retries):
+        logger.debug(f"HTTP GET (attempt {attempt + 1}/{retries}): {url}")
+        result = subprocess.run(
+            ["curl", "-s", "-L", "--max-time", "30",
+             "-H", f"User-Agent: {UA}",
+             "-H", "Accept: application/json",
+             url],
+            capture_output=True, text=True, timeout=60
+        )
+        if result.returncode == 0:
+            return json.loads(result.stdout)
+        if attempt < retries - 1:
+            wait = (attempt + 1) * 2
+            logger.warning(f"curl failed (retry {attempt + 1}/{retries} after {wait}s): {result.stderr[:200]}")
+            time.sleep(wait)
+        else:
+            raise RuntimeError(f"curl failed after {retries} attempts ({result.returncode}): {result.stderr[:200]}")
 
 
 def fix_gps(lat, lng):
@@ -255,6 +261,8 @@ def scrape(max_properties: int | None = None):
             "first_seen": _prev_cache.get(str(unit_id), {}).get("first_seen", datetime.now().strftime("%Y-%m-%d")),
             "last_changed": datetime.now().strftime("%Y-%m-%d") if _prev_cache.get(str(unit_id), {}).get("price") != int(price) else _prev_cache[str(unit_id)].get("last_changed", datetime.now().strftime("%Y-%m-%d")),
         }
+        if not validate_listing(result, "psn"):
+            continue
         results.append(result)
         properties_fetched += 1
 
diff --git a/scrape_realingo.py b/scrape_realingo.py
index 096d78d..8dd3d84 100644
--- a/scrape_realingo.py
+++ b/scrape_realingo.py
@@ -15,7 +15,7 @@ import re
 import time
 import urllib.request
 from pathlib import Path
-from scraper_stats import write_stats
+from scraper_stats import write_stats, validate_listing
 
 STATS_FILE = "stats_realingo.json"
 
@@ -56,6 +56,28 @@ HEADERS = {
 BASE_URL = "https://www.realingo.cz"
 
 
+def fetch_url(url: str, retries: int = 3) -> str:
+    """Fetch `url` and return its body decoded as UTF-8; connection errors are retried, up to `retries` attempts in total."""
+    for attempt in range(retries):
+        try:
+            logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
+            req = urllib.request.Request(url, headers=HEADERS)
+            with urllib.request.urlopen(req, timeout=30) as resp:  # closed even if read()/decode() raises
+                html = resp.read().decode("utf-8")
+                logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
+            return html
+        except urllib.error.HTTPError:  # subclass of URLError: caught first so HTTP status errors are not retried
+            raise
+        except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e:
+            if attempt < retries - 1:
+                wait = (attempt + 1) * 2  # linear backoff: 2s, 4s, ...
+                logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
+                time.sleep(wait)
+            else:
+                logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
+                raise  # last attempt: propagate the original exception to the caller
+
+
 def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
     """Fetch a page of Prague listings. Returns (items, total_count)."""
     if page == 1:
@@ -63,41 +85,26 @@ def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
     else:
         url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/"
 
-    logger.debug(f"HTTP GET request: {url}")
-    logger.debug(f"Headers: {HEADERS}")
-    req = urllib.request.Request(url, headers=HEADERS)
-    try:
-        resp = urllib.request.urlopen(req, timeout=30)
-        html = resp.read().decode("utf-8")
-        logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
+    html = fetch_url(url)
+    match = re.search(
+        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
+        html, re.DOTALL
+    )
+    if not match:
+        logger.debug("No __NEXT_DATA__ script found in HTML")
+        return [], 0
 
-        match = re.search(
-            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
-            html, re.DOTALL
-        )
-        if not match:
-            logger.debug("No __NEXT_DATA__ script found in HTML")
-            return [], 0
-
-        data = json.loads(match.group(1))
-        offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
-        logger.debug(f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}")
-        return offer_list["data"], offer_list["total"]
-    except (urllib.error.URLError, ConnectionError, OSError) as e:
-        logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
-        raise
+    data = json.loads(match.group(1))
+    offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
+    logger.debug(f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}")
+    return offer_list["data"], offer_list["total"]
 
 
 def fetch_detail(listing_url: str) -> dict | None:
     """Fetch detail page for a listing to get floor, building type, etc."""
     try:
         url = f"{BASE_URL}{listing_url}"
-        logger.debug(f"HTTP GET request: {url}")
-        req = urllib.request.Request(url, headers=HEADERS)
-        resp = urllib.request.urlopen(req, timeout=30)
-        html = resp.read().decode("utf-8")
-        logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
-
+        html = fetch_url(url)
         match = re.search(
             r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
             html, re.DOTALL
@@ -324,6 +331,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
             "first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
             "last_changed": datetime.now().strftime("%Y-%m-%d"),
         }
+        if not validate_listing(result, "realingo"):
+            continue
         results.append(result)
         properties_fetched += 1
 
diff --git a/scraper_stats.py b/scraper_stats.py
index b605533..c3d234f 100644
--- a/scraper_stats.py
+++ b/scraper_stats.py
@@ -1,13 +1,53 @@
-"""Shared utility for writing per-scraper run statistics to JSON."""
+"""Shared utilities for scraper run statistics and listing validation."""
 from __future__ import annotations
 
 import json
+import logging
 import os
 from pathlib import Path
 
 HERE = Path(__file__).parent
 DATA_DIR = Path(os.environ.get("DATA_DIR", HERE))
 
+_val_log = logging.getLogger(__name__)
+
+_REQUIRED_FIELDS = ("hash_id", "price", "locality", "lat", "lon", "url", "source")
+
+
+def validate_listing(listing: dict, context: str = "") -> bool:
+    """
+    Check a listing dict before it is written to the output JSON.
+    Returns True when every check passes; otherwise logs one warning and returns False.
+    Checks: required fields present, price > 0, lat in 47-52 and lon in 12-19, optional area > 0.
+    """
+    prefix = f"[{context}] " if context else ""
+    # Every required field must be present and neither None nor an empty string.
+    for field in _REQUIRED_FIELDS:
+        val = listing.get(field)
+        if val is None or val == "":
+            _val_log.warning(f"{prefix}Skipping listing — missing field '{field}': {listing.get('hash_id', '?')}")
+            return False
+    # bool is a subclass of int, and NaN/inf slip past a plain `<= 0` test, so reject them explicitly.
+    price = listing.get("price")
+    if isinstance(price, bool) or not isinstance(price, (int, float)) or not (0 < price < float("inf")):
+        _val_log.warning(f"{prefix}Skipping listing — invalid price={price!r}: {listing.get('hash_id', '?')}")
+        return False
+    # Coordinates must be numeric; NaN fails the range comparison below and is rejected there.
+    lat, lon = listing.get("lat"), listing.get("lon")
+    if not isinstance(lat, (int, float)) or not isinstance(lon, (int, float)):
+        _val_log.warning(f"{prefix}Skipping listing — non-numeric GPS lat={lat!r} lon={lon!r}: {listing.get('hash_id', '?')}")
+        return False
+    if not (47.0 <= lat <= 52.0) or not (12.0 <= lon <= 19.0):
+        _val_log.warning(f"{prefix}Skipping listing — GPS outside Czech Republic lat={lat} lon={lon}: {listing.get('hash_id', '?')}")
+        return False
+    # Area is optional, but when present it must be a positive, finite, non-bool number.
+    area = listing.get("area")
+    if area is not None and (isinstance(area, bool) or not isinstance(area, (int, float)) or not (0 < area < float("inf"))):
+        _val_log.warning(f"{prefix}Skipping listing — invalid area={area!r}: {listing.get('hash_id', '?')}")
+        return False
+    # All checks passed.
+    return True
+
 
 def write_stats(filename: str, stats: dict) -> None:
     """Write scraper run stats dict to the data directory."""
-- 
2.49.1


From fd3991f8d6b29bb782ab61cc6812a897bbc82768 Mon Sep 17 00:00:00 2001
From: Jan Novak <jan.novak@livesport.eu>
Date: Fri, 27 Feb 2026 10:44:08 +0100
Subject: [PATCH 2/2] Remove regen_map.py references from Dockerfile and README

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md        | 5 -----
 build/Dockerfile | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 850b6c6..f80a326 100644
--- a/README.md
+++ b/README.md
@@ -83,10 +83,6 @@ Merges all `byty_*.json` files into `byty_merged.json` and generates `mapa_bytu.
 
 **Deduplication logic:** Two listings are considered duplicates if they share the same normalized street name + price + area. PSN and CityHome have priority during dedup (loaded first), so their listings are kept over duplicates from other portals.
 
-### `regen_map.py`
-
-Regenerates the map from existing `byty_sreality.json` data without re-scraping. Fetches missing area values from the Sreality API, fixes URLs, and re-applies the area filter. Useful for tweaking map output after data has already been collected.
-
 ## Interactive map (`mapa_bytu.html`)
 
 The generated map is a standalone HTML file using Leaflet.js with CARTO basemap tiles. Features:
@@ -201,7 +197,6 @@ Validation targets run scrapers with `--max-pages 1 --max-properties 10` for a f
 ├── scrape_psn.py           # PSN scraper
 ├── scrape_cityhome.py      # CityHome scraper
 ├── merge_and_map.py        # Merge all sources + generate final map
-├── regen_map.py            # Regenerate map from cached Sreality data
 ├── run_all.sh              # Orchestrator script (runs all scrapers + merge)
 ├── mapa_bytu.html          # Generated interactive map (output)
 ├── Makefile                # Docker management + validation shortcuts
diff --git a/build/Dockerfile b/build/Dockerfile
index 541f268..b5a2b91 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -11,7 +11,7 @@ WORKDIR /app
 
 COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \
      scrape_idnes.py scrape_psn.py scrape_cityhome.py \
-     merge_and_map.py regen_map.py generate_status.py scraper_stats.py \
+     merge_and_map.py generate_status.py scraper_stats.py \
      run_all.sh server.py ./
 
 COPY build/crontab /etc/crontabs/root
-- 
2.49.1