From 27a7834eb6180a4c229e816c1274bd643853fba1 Mon Sep 17 00:00:00 2001 From: Jan Novak Date: Fri, 27 Feb 2026 10:36:37 +0100 Subject: [PATCH 1/2] Reliability improvements: retry logic, validation, ratings sync - Add 3-attempt retry with linear backoff (2 s, then 4 s) to Sreality, Realingo, Bezrealitky, and PSN scrapers (CityHome and iDNES already had it) - Add shared validate_listing() in scraper_stats.py; all 6 scrapers now validate GPS bounds, price, area, and required fields before output - Wire ratings to server /api/ratings on page load (merge with localStorage) and save (async POST); ratings now persist across browsers and devices - Namespace JS hash IDs as {source}_{id} to prevent rating collisions between listings from different portals with the same numeric ID - Replace manual Czech diacritic table with unicodedata.normalize() in merge_and_map.py for correct deduplication of all edge cases - Correct README schedule docs: every 4 hours, not twice daily Co-Authored-By: Claude Sonnet 4.6 --- README.md | 6 +-- merge_and_map.py | 11 ++---- scrape_and_map.py | 65 +++++++++++++++++++++++-------- scrape_bezrealitky.py | 91 ++++++++++++++++++++++++------------------- scrape_cityhome.py | 4 +- scrape_idnes.py | 4 +- scrape_psn.py | 36 ++++++++++------- scrape_realingo.py | 67 +++++++++++++++++-------------- scraper_stats.py | 42 +++++++++++++++++++- 9 files changed, 212 insertions(+), 114 deletions(-) diff --git a/README.md b/README.md index cd7e52f..850b6c6 100644 --- a/README.md +++ b/README.md @@ -151,7 +151,7 @@ The project includes a Docker setup for unattended operation with a cron-based s │ PID 1: python3 -m http.server :8080 │ │ serves /app/data/ │ │ │ -│ crond: runs run_all.sh at 06:00/18:00 │ +│ crond: runs run_all.sh every 4 hours │ │ Europe/Prague timezone │ │ │ │ /app/ -- scripts (.py, .sh) │ @@ -160,7 +160,7 @@ The project includes a Docker setup for unattended operation with a cron-based s └─────────────────────────────────────────┘ ``` -On startup, the HTTP 
server starts immediately. The initial scrape runs in the background. Subsequent cron runs update data in-place twice daily at 06:00 and 18:00 CET/CEST. +On startup, the HTTP server starts immediately. The initial scrape runs in the background. Subsequent cron runs update data in-place every 4 hours. ### Quick start @@ -208,7 +208,7 @@ Validation targets run scrapers with `--max-pages 1 --max-properties 10` for a f ├── build/ │ ├── Dockerfile # Container image definition (python:3.13-alpine) │ ├── entrypoint.sh # Container entrypoint (HTTP server + cron + initial scrape) -│ ├── crontab # Cron schedule (06:00 and 18:00 CET) +│ ├── crontab # Cron schedule (every 4 hours) │ └── CONTAINER.md # Container-specific documentation └── .gitignore # Ignores byty_*.json, __pycache__, .vscode ``` diff --git a/merge_and_map.py b/merge_and_map.py index 74fceaa..1eb9406 100644 --- a/merge_and_map.py +++ b/merge_and_map.py @@ -9,6 +9,7 @@ from __future__ import annotations import json import re +import unicodedata from pathlib import Path from scrape_and_map import generate_map, format_price @@ -19,14 +20,8 @@ def normalize_street(locality: str) -> str: # "Studentská, Praha 6 - Dejvice" → "studentska" # "Rýnská, Praha" → "rynska" street = locality.split(",")[0].strip().lower() - # Remove diacritics (simple Czech) - replacements = { - "á": "a", "č": "c", "ď": "d", "é": "e", "ě": "e", - "í": "i", "ň": "n", "ó": "o", "ř": "r", "š": "s", - "ť": "t", "ú": "u", "ů": "u", "ý": "y", "ž": "z", - } - for src, dst in replacements.items(): - street = street.replace(src, dst) + # Remove diacritics using Unicode decomposition (handles all Czech characters) + street = unicodedata.normalize("NFKD", street).encode("ascii", "ignore").decode("ascii") # Remove non-alphanumeric street = re.sub(r"[^a-z0-9]", "", street) return street diff --git a/scrape_and_map.py b/scrape_and_map.py index c129bb8..0b49717 100644 --- a/scrape_and_map.py +++ b/scrape_and_map.py @@ -15,7 +15,7 @@ import urllib.request 
import urllib.parse from datetime import datetime, timedelta from pathlib import Path -from scraper_stats import write_stats +from scraper_stats import write_stats, validate_listing STATS_FILE = "stats_sreality.json" @@ -45,19 +45,26 @@ HEADERS = { def api_get(url: str) -> dict: - """Fetch JSON from Sreality API.""" - logger.debug(f"HTTP GET request: {url}") - logger.debug(f"Headers: {HEADERS}") - req = urllib.request.Request(url, headers=HEADERS) - try: - with urllib.request.urlopen(req, timeout=30) as resp: - response_data = resp.read().decode("utf-8") - logger.debug(f"HTTP response: status={resp.status}, size={len(response_data)} bytes") - logger.debug(f"Response preview: {response_data[:200]}") - return json.loads(response_data) - except (urllib.error.URLError, ConnectionError, OSError) as e: - logger.error(f"HTTP request failed for {url}: {e}", exc_info=True) - raise + """Fetch JSON from Sreality API with retry.""" + for attempt in range(3): + logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}") + req = urllib.request.Request(url, headers=HEADERS) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + response_data = resp.read().decode("utf-8") + logger.debug(f"HTTP response: status={resp.status}, size={len(response_data)} bytes") + logger.debug(f"Response preview: {response_data[:200]}") + return json.loads(response_data) + except urllib.error.HTTPError: + raise + except (urllib.error.URLError, ConnectionError, OSError) as e: + if attempt < 2: + wait = (attempt + 1) * 2 + logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}") + time.sleep(wait) + else: + logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True) + raise def build_list_url(disposition: int, page: int = 1) -> str: @@ -356,6 +363,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None): "first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else 
datetime.now().strftime("%Y-%m-%d"), "last_changed": datetime.now().strftime("%Y-%m-%d"), } + if not validate_listing(result, "sreality"): + continue results.append(result) details_fetched += 1 @@ -476,7 +485,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): source_label = source_labels.get(source, source) source_color = source_colors.get(source, "#999") - hash_id = e.get("hash_id", "") + hash_id = f"{source}_{e.get('hash_id', '')}" first_seen = e.get("first_seen", "") last_changed = e.get("last_changed", "") @@ -864,6 +873,11 @@ function loadRatings() {{ function saveRatings(ratings) {{ localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings)); + fetch('/api/ratings', {{ + method: 'POST', + headers: {{'Content-Type': 'application/json'}}, + body: JSON.stringify(ratings) + }}).catch(function() {{}}); }} function addRejectStrike(marker) {{ @@ -1167,8 +1181,25 @@ function applyFilters() {{ document.getElementById('visible-count').textContent = visible; }} -// Initialize ratings on load -restoreRatings(); +// Initialize ratings: load from server, merge with localStorage, then restore +function initRatings() {{ + var local = loadRatings(); + fetch('/api/ratings') + .then(function(r) {{ return r.ok ? 
r.json() : null; }}) + .then(function(server) {{ + if (server && typeof server === 'object') {{ + var merged = Object.assign({{}}, local, server); + localStorage.setItem(RATINGS_KEY, JSON.stringify(merged)); + }} + restoreRatings(); + updateRatingCounts(); + }}) + .catch(function() {{ + restoreRatings(); + updateRatingCounts(); + }}); +}} +initRatings(); // ── Panel toggle ────────────────────────────────────────────── function togglePanel() {{ diff --git a/scrape_bezrealitky.py b/scrape_bezrealitky.py index bd0761e..4bb3dd4 100644 --- a/scrape_bezrealitky.py +++ b/scrape_bezrealitky.py @@ -15,7 +15,7 @@ import re import time import urllib.request from pathlib import Path -from scraper_stats import write_stats +from scraper_stats import write_stats, validate_listing STATS_FILE = "stats_bezrealitky.json" @@ -71,62 +71,71 @@ HEADERS = { BASE_URL = "https://www.bezrealitky.cz" +def fetch_url(url: str, retries: int = 3) -> str: + """Fetch URL and return HTML string with retry on transient errors.""" + for attempt in range(retries): + try: + logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}") + req = urllib.request.Request(url, headers=HEADERS) + resp = urllib.request.urlopen(req, timeout=30) + html = resp.read().decode("utf-8") + logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") + return html + except urllib.error.HTTPError: + raise + except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e: + if attempt < retries - 1: + wait = (attempt + 1) * 2 + logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}") + time.sleep(wait) + else: + logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True) + raise + + def fetch_page(page: int) -> tuple[list[dict], int]: """ Fetch a listing page from Bezrealitky. Returns (list of advert dicts from Apollo cache, total count). 
""" url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}" - logger.debug(f"HTTP GET request: {url}") - logger.debug(f"Headers: {HEADERS}") - req = urllib.request.Request(url, headers=HEADERS) - try: - resp = urllib.request.urlopen(req, timeout=30) - html = resp.read().decode("utf-8") - logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") + html = fetch_url(url) - match = re.search( - r'', - html, re.DOTALL - ) - if not match: - logger.debug("No __NEXT_DATA__ script found in HTML") - return [], 0 + match = re.search( + r'', + html, re.DOTALL + ) + if not match: + logger.debug("No __NEXT_DATA__ script found in HTML") + return [], 0 - data = json.loads(match.group(1)) - cache = data["props"]["pageProps"]["apolloCache"] + data = json.loads(match.group(1)) + cache = data["props"]["pageProps"]["apolloCache"] - # Extract adverts from cache - adverts = [] - for key, val in cache.items(): - if key.startswith("Advert:") and isinstance(val, dict) and val.get("__typename") == "Advert": - adverts.append(val) + # Extract adverts from cache + adverts = [] + for key, val in cache.items(): + if key.startswith("Advert:") and isinstance(val, dict) and val.get("__typename") == "Advert": + adverts.append(val) - # Get total count from ROOT_QUERY - total = 0 - root = cache.get("ROOT_QUERY", {}) - for key, val in root.items(): - if "listAdverts" in key and isinstance(val, dict): - tc = val.get("totalCount") - if tc and tc > total: - total = tc + # Get total count from ROOT_QUERY + total = 0 + root = cache.get("ROOT_QUERY", {}) + for key, val in root.items(): + if "listAdverts" in key and isinstance(val, dict): + tc = val.get("totalCount") + if tc and tc > total: + total = tc - logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}") - return adverts, total - except (urllib.error.URLError, ConnectionError, OSError) as e: - logger.error(f"HTTP request failed for {url}: {e}", exc_info=True) - raise + logger.debug(f"Page {page}: found 
{len(adverts)} adverts, total={total}") + return adverts, total def fetch_detail(uri: str) -> dict | None: """Fetch detail page for a listing.""" try: url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}" - logger.debug(f"HTTP GET request: {url}") - req = urllib.request.Request(url, headers=HEADERS) - resp = urllib.request.urlopen(req, timeout=30) - html = resp.read().decode("utf-8") - logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") + html = fetch_url(url) match = re.search( r'', @@ -365,6 +374,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None): "first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"), "last_changed": datetime.now().strftime("%Y-%m-%d"), } + if not validate_listing(result, "bezrealitky"): + continue results.append(result) properties_fetched += 1 diff --git a/scrape_cityhome.py b/scrape_cityhome.py index dc2da92..d39f735 100644 --- a/scrape_cityhome.py +++ b/scrape_cityhome.py @@ -14,7 +14,7 @@ import time import urllib.request from datetime import datetime from pathlib import Path -from scraper_stats import write_stats +from scraper_stats import write_stats, validate_listing STATS_FILE = "stats_cityhome.json" @@ -375,6 +375,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None): "first_seen": _prev_cache.get(f"cityhome_{slug}_{listing['unit_name']}", {}).get("first_seen", datetime.now().strftime("%Y-%m-%d")), "last_changed": datetime.now().strftime("%Y-%m-%d") if _prev_cache.get(f"cityhome_{slug}_{listing['unit_name']}", {}).get("price") != price else _prev_cache[f"cityhome_{slug}_{listing['unit_name']}"].get("last_changed", datetime.now().strftime("%Y-%m-%d")), } + if not validate_listing(result, "cityhome"): + continue results.append(result) properties_fetched += 1 diff --git a/scrape_idnes.py b/scrape_idnes.py index 88f17ff..b788acc 100644 --- a/scrape_idnes.py +++ b/scrape_idnes.py @@ -16,7 +16,7 
@@ import time import urllib.request import urllib.parse from pathlib import Path -from scraper_stats import write_stats +from scraper_stats import write_stats, validate_listing STATS_FILE = "stats_idnes.json" @@ -467,6 +467,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None): "first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"), "last_changed": datetime.now().strftime("%Y-%m-%d"), } + if not validate_listing(result, "idnes"): + continue results.append(result) properties_fetched += 1 diff --git a/scrape_psn.py b/scrape_psn.py index 71cde1a..b4ce6f8 100644 --- a/scrape_psn.py +++ b/scrape_psn.py @@ -15,7 +15,7 @@ import time from datetime import datetime from pathlib import Path from urllib.parse import urlencode -from scraper_stats import write_stats +from scraper_stats import write_stats, validate_listing STATS_FILE = "stats_psn.json" @@ -38,19 +38,25 @@ BASE_URL = "https://psn.cz" UNITS_API = f"{BASE_URL}/api/units-list" -def fetch_json(url: str) -> dict: - """Fetch JSON via curl (urllib SSL may fail on Cloudflare).""" - logger.debug(f"HTTP GET: {url}") - result = subprocess.run( - ["curl", "-s", "-L", "--max-time", "30", - "-H", f"User-Agent: {UA}", - "-H", "Accept: application/json", - url], - capture_output=True, text=True, timeout=60 - ) - if result.returncode != 0: - raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}") - return json.loads(result.stdout) +def fetch_json(url: str, retries: int = 3) -> dict: + """Fetch JSON via curl (urllib SSL may fail on Cloudflare) with retry.""" + for attempt in range(retries): + logger.debug(f"HTTP GET (attempt {attempt + 1}/{retries}): {url}") + result = subprocess.run( + ["curl", "-s", "-L", "--max-time", "30", + "-H", f"User-Agent: {UA}", + "-H", "Accept: application/json", + url], + capture_output=True, text=True, timeout=60 + ) + if result.returncode == 0: + return json.loads(result.stdout) 
+ if attempt < retries - 1: + wait = (attempt + 1) * 2 + logger.warning(f"curl failed (retry {attempt + 1}/{retries} after {wait}s): {result.stderr[:200]}") + time.sleep(wait) + else: + raise RuntimeError(f"curl failed after {retries} attempts ({result.returncode}): {result.stderr[:200]}") def fix_gps(lat, lng): @@ -255,6 +261,8 @@ def scrape(max_properties: int | None = None): "first_seen": _prev_cache.get(str(unit_id), {}).get("first_seen", datetime.now().strftime("%Y-%m-%d")), "last_changed": datetime.now().strftime("%Y-%m-%d") if _prev_cache.get(str(unit_id), {}).get("price") != int(price) else _prev_cache[str(unit_id)].get("last_changed", datetime.now().strftime("%Y-%m-%d")), } + if not validate_listing(result, "psn"): + continue results.append(result) properties_fetched += 1 diff --git a/scrape_realingo.py b/scrape_realingo.py index 096d78d..8dd3d84 100644 --- a/scrape_realingo.py +++ b/scrape_realingo.py @@ -15,7 +15,7 @@ import re import time import urllib.request from pathlib import Path -from scraper_stats import write_stats +from scraper_stats import write_stats, validate_listing STATS_FILE = "stats_realingo.json" @@ -56,6 +56,28 @@ HEADERS = { BASE_URL = "https://www.realingo.cz" +def fetch_url(url: str, retries: int = 3) -> str: + """Fetch URL and return HTML string with retry on transient errors.""" + for attempt in range(retries): + try: + logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}") + req = urllib.request.Request(url, headers=HEADERS) + resp = urllib.request.urlopen(req, timeout=30) + html = resp.read().decode("utf-8") + logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") + return html + except urllib.error.HTTPError: + raise + except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e: + if attempt < retries - 1: + wait = (attempt + 1) * 2 + logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}") + time.sleep(wait) + else: + 
logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True) + raise + + def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]: """Fetch a page of Prague listings. Returns (items, total_count).""" if page == 1: @@ -63,41 +85,26 @@ def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]: else: url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/" - logger.debug(f"HTTP GET request: {url}") - logger.debug(f"Headers: {HEADERS}") - req = urllib.request.Request(url, headers=HEADERS) - try: - resp = urllib.request.urlopen(req, timeout=30) - html = resp.read().decode("utf-8") - logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") + html = fetch_url(url) + match = re.search( + r'', + html, re.DOTALL + ) + if not match: + logger.debug("No __NEXT_DATA__ script found in HTML") + return [], 0 - match = re.search( - r'', - html, re.DOTALL - ) - if not match: - logger.debug("No __NEXT_DATA__ script found in HTML") - return [], 0 - - data = json.loads(match.group(1)) - offer_list = data["props"]["pageProps"]["store"]["offer"]["list"] - logger.debug(f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}") - return offer_list["data"], offer_list["total"] - except (urllib.error.URLError, ConnectionError, OSError) as e: - logger.error(f"HTTP request failed for {url}: {e}", exc_info=True) - raise + data = json.loads(match.group(1)) + offer_list = data["props"]["pageProps"]["store"]["offer"]["list"] + logger.debug(f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}") + return offer_list["data"], offer_list["total"] def fetch_detail(listing_url: str) -> dict | None: """Fetch detail page for a listing to get floor, building type, etc.""" try: url = f"{BASE_URL}{listing_url}" - logger.debug(f"HTTP GET request: {url}") - req = urllib.request.Request(url, headers=HEADERS) - resp = urllib.request.urlopen(req, timeout=30) - html = resp.read().decode("utf-8") - 
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") - + html = fetch_url(url) match = re.search( r'', html, re.DOTALL @@ -324,6 +331,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None): "first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"), "last_changed": datetime.now().strftime("%Y-%m-%d"), } + if not validate_listing(result, "realingo"): + continue results.append(result) properties_fetched += 1 diff --git a/scraper_stats.py b/scraper_stats.py index b605533..c3d234f 100644 --- a/scraper_stats.py +++ b/scraper_stats.py @@ -1,13 +1,53 @@ -"""Shared utility for writing per-scraper run statistics to JSON.""" +"""Shared utilities for scraper run statistics and listing validation.""" from __future__ import annotations import json +import logging import os from pathlib import Path HERE = Path(__file__).parent DATA_DIR = Path(os.environ.get("DATA_DIR", HERE)) +_val_log = logging.getLogger(__name__) + +_REQUIRED_FIELDS = ("hash_id", "price", "locality", "lat", "lon", "url", "source") + + +def validate_listing(listing: dict, context: str = "") -> bool: + """ + Validate a listing dict before it is written to the output JSON. + Returns True if valid, False if the listing should be skipped. + Logs a warning for each invalid listing. 
+ """ + prefix = f"[{context}] " if context else "" + + for field in _REQUIRED_FIELDS: + val = listing.get(field) + if val is None or val == "": + _val_log.warning(f"{prefix}Skipping listing — missing field '{field}': {listing.get('hash_id', '?')}") + return False + + price = listing.get("price") + if not isinstance(price, (int, float)) or price <= 0: + _val_log.warning(f"{prefix}Skipping listing — invalid price={price!r}: {listing.get('hash_id', '?')}") + return False + + lat, lon = listing.get("lat"), listing.get("lon") + if not isinstance(lat, (int, float)) or not isinstance(lon, (int, float)): + _val_log.warning(f"{prefix}Skipping listing — non-numeric GPS lat={lat!r} lon={lon!r}: {listing.get('hash_id', '?')}") + return False + if not (47.0 <= lat <= 52.0) or not (12.0 <= lon <= 19.0): + _val_log.warning(f"{prefix}Skipping listing — GPS outside Czech Republic lat={lat} lon={lon}: {listing.get('hash_id', '?')}") + return False + + area = listing.get("area") + if area is not None and (not isinstance(area, (int, float)) or area <= 0): + _val_log.warning(f"{prefix}Skipping listing — invalid area={area!r}: {listing.get('hash_id', '?')}") + return False + + return True + def write_stats(filename: str, stats: dict) -> None: """Write scraper run stats dict to the data directory.""" -- 2.49.1 From fd3991f8d6b29bb782ab61cc6812a897bbc82768 Mon Sep 17 00:00:00 2001 From: Jan Novak Date: Fri, 27 Feb 2026 10:44:08 +0100 Subject: [PATCH 2/2] Remove regen_map.py references from Dockerfile and README Co-Authored-By: Claude Sonnet 4.6 --- README.md | 5 ----- build/Dockerfile | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/README.md b/README.md index 850b6c6..f80a326 100644 --- a/README.md +++ b/README.md @@ -83,10 +83,6 @@ Merges all `byty_*.json` files into `byty_merged.json` and generates `mapa_bytu. **Deduplication logic:** Two listings are considered duplicates if they share the same normalized street name + price + area. 
PSN and CityHome have priority during dedup (loaded first), so their listings are kept over duplicates from other portals. -### `regen_map.py` - -Regenerates the map from existing `byty_sreality.json` data without re-scraping. Fetches missing area values from the Sreality API, fixes URLs, and re-applies the area filter. Useful for tweaking map output after data has already been collected. - ## Interactive map (`mapa_bytu.html`) The generated map is a standalone HTML file using Leaflet.js with CARTO basemap tiles. Features: @@ -201,7 +197,6 @@ Validation targets run scrapers with `--max-pages 1 --max-properties 10` for a f ├── scrape_psn.py # PSN scraper ├── scrape_cityhome.py # CityHome scraper ├── merge_and_map.py # Merge all sources + generate final map -├── regen_map.py # Regenerate map from cached Sreality data ├── run_all.sh # Orchestrator script (runs all scrapers + merge) ├── mapa_bytu.html # Generated interactive map (output) ├── Makefile # Docker management + validation shortcuts diff --git a/build/Dockerfile b/build/Dockerfile index 541f268..b5a2b91 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -11,7 +11,7 @@ WORKDIR /app COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \ scrape_idnes.py scrape_psn.py scrape_cityhome.py \ - merge_and_map.py regen_map.py generate_status.py scraper_stats.py \ + merge_and_map.py generate_status.py scraper_stats.py \ run_all.sh server.py ./ COPY build/crontab /etc/crontabs/root -- 2.49.1