1 Commits

Author SHA1 Message Date
7d3021efbf Remove tracked generated/data files and fix map link on status page
- Remove byty_*.json, mapa_bytu.html, .DS_Store and settings.local.json from git tracking
  (already in .gitignore, files kept locally)
- Fix "Otevřít mapu" link on scraper status page: / → /mapa_bytu.html

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 18:50:16 +01:00
12 changed files with 272 additions and 392 deletions

View File

@@ -83,6 +83,10 @@ Merges all `byty_*.json` files into `byty_merged.json` and generates `mapa_bytu.
**Deduplication logic:** Two listings are considered duplicates if they share the same normalized street name + price + area. PSN and CityHome have priority during dedup (loaded first), so their listings are kept over duplicates from other portals. **Deduplication logic:** Two listings are considered duplicates if they share the same normalized street name + price + area. PSN and CityHome have priority during dedup (loaded first), so their listings are kept over duplicates from other portals.
### `regen_map.py`
Regenerates the map from existing `byty_sreality.json` data without re-scraping. Fetches missing area values from the Sreality API, fixes URLs, and re-applies the area filter. Useful for tweaking map output after data has already been collected.
## Interactive map (`mapa_bytu.html`) ## Interactive map (`mapa_bytu.html`)
The generated map is a standalone HTML file using Leaflet.js with CARTO basemap tiles. Features: The generated map is a standalone HTML file using Leaflet.js with CARTO basemap tiles. Features:
@@ -147,7 +151,7 @@ The project includes a Docker setup for unattended operation with a cron-based s
│ PID 1: python3 -m http.server :8080 │ │ PID 1: python3 -m http.server :8080 │
│ serves /app/data/ │ │ serves /app/data/ │
│ │ │ │
│ crond: runs run_all.sh every 4 hours │ crond: runs run_all.sh at 06:00/18:00
│ Europe/Prague timezone │ │ Europe/Prague timezone │
│ │ │ │
│ /app/ -- scripts (.py, .sh) │ │ /app/ -- scripts (.py, .sh) │
@@ -156,7 +160,7 @@ The project includes a Docker setup for unattended operation with a cron-based s
└─────────────────────────────────────────┘ └─────────────────────────────────────────┘
``` ```
On startup, the HTTP server starts immediately. The initial scrape runs in the background. Subsequent cron runs update data in-place every 4 hours. On startup, the HTTP server starts immediately. The initial scrape runs in the background. Subsequent cron runs update data in-place twice daily at 06:00 and 18:00 CET/CEST.
### Quick start ### Quick start
@@ -197,13 +201,14 @@ Validation targets run scrapers with `--max-pages 1 --max-properties 10` for a f
├── scrape_psn.py # PSN scraper ├── scrape_psn.py # PSN scraper
├── scrape_cityhome.py # CityHome scraper ├── scrape_cityhome.py # CityHome scraper
├── merge_and_map.py # Merge all sources + generate final map ├── merge_and_map.py # Merge all sources + generate final map
├── regen_map.py # Regenerate map from cached Sreality data
├── run_all.sh # Orchestrator script (runs all scrapers + merge) ├── run_all.sh # Orchestrator script (runs all scrapers + merge)
├── mapa_bytu.html # Generated interactive map (output) ├── mapa_bytu.html # Generated interactive map (output)
├── Makefile # Docker management + validation shortcuts ├── Makefile # Docker management + validation shortcuts
├── build/ ├── build/
│ ├── Dockerfile # Container image definition (python:3.13-alpine) │ ├── Dockerfile # Container image definition (python:3.13-alpine)
│ ├── entrypoint.sh # Container entrypoint (HTTP server + cron + initial scrape) │ ├── entrypoint.sh # Container entrypoint (HTTP server + cron + initial scrape)
│ ├── crontab # Cron schedule (every 4 hours) │ ├── crontab # Cron schedule (06:00 and 18:00 CET)
│ └── CONTAINER.md # Container-specific documentation │ └── CONTAINER.md # Container-specific documentation
└── .gitignore # Ignores byty_*.json, __pycache__, .vscode └── .gitignore # Ignores byty_*.json, __pycache__, .vscode
``` ```

View File

@@ -11,7 +11,7 @@ WORKDIR /app
COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \ COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \
scrape_idnes.py scrape_psn.py scrape_cityhome.py \ scrape_idnes.py scrape_psn.py scrape_cityhome.py \
merge_and_map.py generate_status.py scraper_stats.py \ merge_and_map.py regen_map.py generate_status.py scraper_stats.py \
run_all.sh server.py ./ run_all.sh server.py ./
COPY build/crontab /etc/crontabs/root COPY build/crontab /etc/crontabs/root

View File

@@ -9,7 +9,6 @@ from __future__ import annotations
import json import json
import re import re
import unicodedata
from pathlib import Path from pathlib import Path
from scrape_and_map import generate_map, format_price from scrape_and_map import generate_map, format_price
@@ -20,8 +19,14 @@ def normalize_street(locality: str) -> str:
# "Studentská, Praha 6 - Dejvice" → "studentska" # "Studentská, Praha 6 - Dejvice" → "studentska"
# "Rýnská, Praha" → "rynska" # "Rýnská, Praha" → "rynska"
street = locality.split(",")[0].strip().lower() street = locality.split(",")[0].strip().lower()
# Remove diacritics using Unicode decomposition (handles all Czech characters) # Remove diacritics (simple Czech)
street = unicodedata.normalize("NFKD", street).encode("ascii", "ignore").decode("ascii") replacements = {
"á": "a", "č": "c", "ď": "d", "é": "e", "ě": "e",
"í": "i", "ň": "n", "ó": "o", "ř": "r", "š": "s",
"ť": "t", "ú": "u", "ů": "u", "ý": "y", "ž": "z",
}
for src, dst in replacements.items():
street = street.replace(src, dst)
# Remove non-alphanumeric # Remove non-alphanumeric
street = re.sub(r"[^a-z0-9]", "", street) street = re.sub(r"[^a-z0-9]", "", street)
return street return street
@@ -74,10 +79,6 @@ def main():
if key in seen_keys: if key in seen_keys:
dupes += 1 dupes += 1
existing = seen_keys[key] existing = seen_keys[key]
# Preserve earliest first_seen across sources
dup_fs = e.get("first_seen", "")
if dup_fs and (not existing.get("first_seen") or dup_fs < existing["first_seen"]):
existing["first_seen"] = dup_fs
# Log it # Log it
print(f" Duplikát: {e['locality']} | {format_price(e['price'])} | {e.get('area', '?')}" print(f" Duplikát: {e['locality']} | {format_price(e['price'])} | {e.get('area', '?')}"
f"({e.get('source', '?')} vs {existing.get('source', '?')})") f"({e.get('source', '?')} vs {existing.get('source', '?')})")

114
regen_map.py Normal file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/env python3
"""
Přegeneruje mapu z již stažených dat (byty_sreality.json).
Doplní chybějící plochy ze Sreality API, opraví URL, aplikuje filtry.
"""
from __future__ import annotations
import json
import time
import urllib.request
from pathlib import Path
from scrape_and_map import (
generate_map, format_price, MIN_AREA, HEADERS, DETAIL_API
)
def api_get(url: str) -> dict:
req = urllib.request.Request(url, headers=HEADERS)
with urllib.request.urlopen(req, timeout=30) as resp:
return json.loads(resp.read().decode("utf-8"))
def fix_sreality_url(estate: dict) -> str:
"""Fix the Sreality URL to include disposition segment (only if missing)."""
disp = estate.get("disposition", "")
slug_map = {
"1+kk": "1+kk", "1+1": "1+1", "2+kk": "2+kk", "2+1": "2+1",
"3+kk": "3+kk", "3+1": "3+1", "4+kk": "4+kk", "4+1": "4+1",
"5+kk": "5+kk", "5+1": "5+1", "6+": "6-a-vice", "Atypický": "atypicky",
}
slug = slug_map.get(disp, "byt")
old_url = estate.get("url", "")
parts = old_url.split("/")
try:
byt_idx = parts.index("byt")
# Only insert if disposition slug is not already there
if byt_idx + 1 < len(parts) and parts[byt_idx + 1] == slug:
return old_url # already correct
parts.insert(byt_idx + 1, slug)
return "/".join(parts)
except ValueError:
return old_url
def fetch_area(hash_id: int) -> int | None:
"""Fetch area from detail API."""
try:
url = DETAIL_API.format(hash_id)
detail = api_get(url)
for item in detail.get("items", []):
name = item.get("name", "")
if "žitná ploch" in name or "zitna ploch" in name.lower():
return int(item["value"])
except Exception:
pass
return None
def main():
json_path = Path("byty_sreality.json")
if not json_path.exists():
print("Soubor byty_sreality.json nenalezen. Nejprve spusť scrape_and_map.py")
return
estates = json.loads(json_path.read_text(encoding="utf-8"))
print(f"Načteno {len(estates)} bytů z byty_sreality.json")
# Step 1: Fetch missing areas
missing_area = [e for e in estates if e.get("area") is None]
print(f"Doplňuji plochu u {len(missing_area)} bytů...")
for i, e in enumerate(missing_area):
time.sleep(0.3)
area = fetch_area(e["hash_id"])
if area is not None:
e["area"] = area
if (i + 1) % 50 == 0:
print(f" {i + 1}/{len(missing_area)} ...")
# Count results
with_area = sum(1 for e in estates if e.get("area") is not None)
print(f"Plocha doplněna: {with_area}/{len(estates)}")
# Step 2: Fix URLs
for e in estates:
e["url"] = fix_sreality_url(e)
# Step 3: Filter by min area
filtered = []
excluded = 0
for e in estates:
area = e.get("area")
if area is not None and area < MIN_AREA:
excluded += 1
continue
filtered.append(e)
print(f"Vyloučeno (< {MIN_AREA} m²): {excluded}")
print(f"Zbývá: {len(filtered)} bytů")
# Save updated data
filtered_path = Path("byty_sreality.json")
filtered_path.write_text(
json.dumps(filtered, ensure_ascii=False, indent=2),
encoding="utf-8",
)
# Generate map
generate_map(filtered)
if __name__ == "__main__":
main()

View File

@@ -84,6 +84,9 @@ exec > >(tee -a "$LOG_FILE") 2>&1
step "Sreality" step "Sreality"
python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); } python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); }
step "Realingo"
python3 scrape_realingo.py $SCRAPER_ARGS || { echo -e "${RED}✗ Realingo selhalo${NC}"; FAILED=$((FAILED + 1)); }
step "Bezrealitky" step "Bezrealitky"
python3 scrape_bezrealitky.py $SCRAPER_ARGS || { echo -e "${RED}✗ Bezrealitky selhalo${NC}"; FAILED=$((FAILED + 1)); } python3 scrape_bezrealitky.py $SCRAPER_ARGS || { echo -e "${RED}✗ Bezrealitky selhalo${NC}"; FAILED=$((FAILED + 1)); }
@@ -98,9 +101,6 @@ PID_CH=$!
wait $PID_PSN || { echo -e "${RED}✗ PSN selhalo${NC}"; FAILED=$((FAILED + 1)); } wait $PID_PSN || { echo -e "${RED}✗ PSN selhalo${NC}"; FAILED=$((FAILED + 1)); }
wait $PID_CH || { echo -e "${RED}✗ CityHome selhalo${NC}"; FAILED=$((FAILED + 1)); } wait $PID_CH || { echo -e "${RED}✗ CityHome selhalo${NC}"; FAILED=$((FAILED + 1)); }
step "Realingo"
python3 scrape_realingo.py $SCRAPER_ARGS || { echo -e "${RED}✗ Realingo selhalo${NC}"; FAILED=$((FAILED + 1)); }
# ── Sloučení + mapa ────────────────────────────────────────── # ── Sloučení + mapa ──────────────────────────────────────────
step "Sloučení dat a generování mapy" step "Sloučení dat a generování mapy"

View File

@@ -13,9 +13,9 @@ import math
import time import time
import urllib.request import urllib.request
import urllib.parse import urllib.parse
from datetime import datetime, timedelta from datetime import datetime
from pathlib import Path from pathlib import Path
from scraper_stats import write_stats, validate_listing from scraper_stats import write_stats
STATS_FILE = "stats_sreality.json" STATS_FILE = "stats_sreality.json"
@@ -45,9 +45,9 @@ HEADERS = {
def api_get(url: str) -> dict: def api_get(url: str) -> dict:
"""Fetch JSON from Sreality API with retry.""" """Fetch JSON from Sreality API."""
for attempt in range(3): logger.debug(f"HTTP GET request: {url}")
logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}") logger.debug(f"Headers: {HEADERS}")
req = urllib.request.Request(url, headers=HEADERS) req = urllib.request.Request(url, headers=HEADERS)
try: try:
with urllib.request.urlopen(req, timeout=30) as resp: with urllib.request.urlopen(req, timeout=30) as resp:
@@ -55,15 +55,8 @@ def api_get(url: str) -> dict:
logger.debug(f"HTTP response: status={resp.status}, size={len(response_data)} bytes") logger.debug(f"HTTP response: status={resp.status}, size={len(response_data)} bytes")
logger.debug(f"Response preview: {response_data[:200]}") logger.debug(f"Response preview: {response_data[:200]}")
return json.loads(response_data) return json.loads(response_data)
except urllib.error.HTTPError:
raise
except (urllib.error.URLError, ConnectionError, OSError) as e: except (urllib.error.URLError, ConnectionError, OSError) as e:
if attempt < 2: logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
wait = (attempt + 1) * 2
logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}")
time.sleep(wait)
else:
logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True)
raise raise
@@ -360,11 +353,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"url": sreality_url(hash_id, seo), "url": sreality_url(hash_id, seo),
"image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""), "image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""),
"scraped_at": datetime.now().strftime("%Y-%m-%d"), "scraped_at": datetime.now().strftime("%Y-%m-%d"),
"first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
"last_changed": datetime.now().strftime("%Y-%m-%d"),
} }
if not validate_listing(result, "sreality"):
continue
results.append(result) results.append(result)
details_fetched += 1 details_fetched += 1
@@ -440,30 +429,18 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
] ]
for bcolor, blabel in bands: for bcolor, blabel in bands:
price_legend_items += ( price_legend_items += (
f'<div class="price-band" data-color="{bcolor}" onclick="toggleColorFilter(\'{bcolor}\')" ' f'<div style="display:flex;align-items:center;gap:6px;margin:2px 0;">'
f'style="display:flex;align-items:center;gap:6px;margin:2px 0;padding:2px 4px;'
f'border-radius:4px;border:2px solid transparent;">'
f'<span style="width:14px;height:14px;border-radius:50%;background:{bcolor};' f'<span style="width:14px;height:14px;border-radius:50%;background:{bcolor};'
f'display:inline-block;border:2px solid white;box-shadow:0 1px 3px rgba(0,0,0,0.3);flex-shrink:0;"></span>' f'display:inline-block;border:2px solid white;box-shadow:0 1px 3px rgba(0,0,0,0.3);flex-shrink:0;"></span>'
f'<span>{blabel}</span></div>' f'<span>{blabel}</span></div>'
) )
price_legend_items += (
'<div id="price-filter-reset" style="display:none;margin:3px 0 0 4px;">'
'<a href="#" onclick="resetColorFilter();return false;" '
'style="font-size:11px;color:#1976D2;text-decoration:none;">✕ Zobrazit všechny ceny</a>'
'</div>'
)
# New marker indicator — bigger dot, no extra border # New marker indicator — bigger dot, no extra border
price_legend_items += ( price_legend_items += (
'<div style="display:flex;align-items:center;gap:6px;margin:6px 0 0 0;' '<div style="display:flex;align-items:center;gap:6px;margin:6px 0 0 0;'
'padding-top:6px;border-top:1px solid #eee;">' 'padding-top:6px;border-top:1px solid #eee;">'
'<span style="display:inline-flex;align-items:center;gap:3px;flex-shrink:0;">' '<span style="width:18px;height:18px;border-radius:50%;background:#66BB6A;'
'<span style="width:14px;height:14px;border-radius:50%;background:#66BB6A;' 'display:inline-block;box-shadow:0 1px 4px rgba(0,0,0,0.35);flex-shrink:0;"></span>'
'display:inline-block;box-shadow:0 1px 3px rgba(0,0,0,0.3);"></span>' '<span>Nové (z dnešního scrapu) — větší</span></div>'
'<span style="font-size:8px;font-weight:700;background:#FFD600;color:#333;'
'padding:1px 3px;border-radius:2px;">NEW</span>'
'</span>'
'<span>Nové (≤ 1 den)</span></div>'
) )
markers_js = "" markers_js = ""
@@ -485,32 +462,18 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
source_label = source_labels.get(source, source) source_label = source_labels.get(source, source)
source_color = source_colors.get(source, "#999") source_color = source_colors.get(source, "#999")
hash_id = f"{source}_{e.get('hash_id', '')}" hash_id = e.get("hash_id", "")
first_seen = e.get("first_seen", "") scraped_at = e.get("scraped_at", "")
last_changed = e.get("last_changed", "") is_new = scraped_at == datetime.now().strftime("%Y-%m-%d")
today = datetime.now().strftime("%Y-%m-%d")
yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
is_new = first_seen in (today, yesterday)
new_badge = ( new_badge = (
'<span style="margin-left:6px;font-size:11px;background:#FFD600;color:#333;' '<span style="margin-left:6px;font-size:11px;background:#FFD600;color:#333;'
'padding:1px 6px;border-radius:3px;font-weight:bold;">NOVÉ</span>' 'padding:1px 6px;border-radius:3px;font-weight:bold;">NOVÉ</span>'
if is_new else "" if is_new else ""
) )
date_parts = []
if first_seen:
date_parts.append(f'Přidáno: {first_seen}')
if last_changed and last_changed != first_seen:
date_parts.append(f'Změněno: {last_changed}')
date_row = (
f'<span style="font-size:11px;color:#888;">{"&nbsp;·&nbsp;".join(date_parts)}</span><br>'
if date_parts else ""
)
popup = ( popup = (
f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}" data-first-seen="{first_seen}" data-last-changed="{last_changed}">' f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}">'
f'<b style="font-size:14px;">{format_price(e["price"])}</b>' f'<b style="font-size:14px;">{format_price(e["price"])}</b>'
f'<span style="margin-left:8px;font-size:11px;background:{source_color};color:white;' f'<span style="margin-left:8px;font-size:11px;background:{source_color};color:white;'
f'padding:1px 6px;border-radius:3px;">{source_label}</span>{new_badge}<br>' f'padding:1px 6px;border-radius:3px;">{source_label}</span>{new_badge}<br>'
@@ -518,9 +481,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
f'{floor_note}<br><br>' f'{floor_note}<br><br>'
f'<b>{e["locality"]}</b><br>' f'<b>{e["locality"]}</b><br>'
f'Stavba: {building_text}<br>' f'Stavba: {building_text}<br>'
f'Vlastnictví: {ownership_text}<br>' f'Vlastnictví: {ownership_text}<br><br>'
f'{date_row}'
f'<br>'
f'<a href="{e["url"]}" target="_blank" ' f'<a href="{e["url"]}" target="_blank" '
f'style="color:{source_color};text-decoration:none;font-weight:bold;">' f'style="color:{source_color};text-decoration:none;font-weight:bold;">'
f'→ Otevřít na {source_label}</a>' f'→ Otevřít na {source_label}</a>'
@@ -552,7 +513,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
else: else:
marker_fn = "addMarker" marker_fn = "addMarker"
markers_js += ( markers_js += (
f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}', '{first_seen}', '{last_changed}');\n" f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}');\n"
) )
# Build legend — price per m² bands + disposition counts # Build legend — price per m² bands + disposition counts
@@ -618,12 +579,12 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
.heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }} .heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }}
.heart-icon-rej {{ opacity: 0.4 !important; filter: grayscale(1); }} .heart-icon-rej {{ opacity: 0.4 !important; filter: grayscale(1); }}
.reject-overlay {{ background: none !important; border: none !important; pointer-events: none !important; }} .reject-overlay {{ background: none !important; border: none !important; pointer-events: none !important; }}
.new-badge-icon {{ background: none !important; border: none !important; pointer-events: none !important; }} @keyframes pulse-new {{
.new-badge {{ 0% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
font-size: 9px; font-weight: 700; color: #333; background: #FFD600; 50% {{ stroke-opacity: 0.4; stroke-width: 6px; r: 12; }}
padding: 1px 4px; border-radius: 3px; white-space: nowrap; 100% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
box-shadow: 0 1px 3px rgba(0,0,0,0.3); letter-spacing: 0.5px;
}} }}
.marker-new {{ animation: pulse-new 2s ease-in-out infinite; }}
.info-panel {{ .info-panel {{
position: absolute; top: 10px; right: 10px; z-index: 1000; position: absolute; top: 10px; right: 10px; z-index: 1000;
background: white; padding: 16px; border-radius: 10px; background: white; padding: 16px; border-radius: 10px;
@@ -656,10 +617,6 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
.info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }} .info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }}
.filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }} .filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }}
.filter-section label {{ display: flex; align-items: center; gap: 6px; margin: 3px 0; cursor: pointer; }} .filter-section label {{ display: flex; align-items: center; gap: 6px; margin: 3px 0; cursor: pointer; }}
.price-band {{ cursor: pointer; transition: background 0.12s; }}
.price-band:hover {{ background: #f0f0f0; }}
.price-band.active {{ border-color: #333 !important; background: #e8f0fe; }}
.price-band.dimmed {{ opacity: 0.35; }}
.filter-section input[type="checkbox"] {{ accent-color: #1976D2; }} .filter-section input[type="checkbox"] {{ accent-color: #1976D2; }}
#floor-filter {{ margin-top: 8px; }} #floor-filter {{ margin-top: 8px; }}
#floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }} #floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }}
@@ -698,23 +655,11 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
</div> </div>
<div style="margin-top:6px;"> <div style="margin-top:6px;">
<label>Max cena: <label>Max cena:
<input type="number" id="max-price" value="13500000" max="14000000" step="500000" <select id="max-price" onchange="applyFilters()">
style="width:130px;padding:2px 4px;border:1px solid #ccc;border-radius:3px;" <option value="13500000">13 500 000 Kč</option>
onchange="applyFilters()" onkeyup="applyFilters()"> Kč <option value="12000000">12 000 000 Kč</option>
</label> <option value="10000000">10 000 000 Kč</option>
</div> <option value="8000000">8 000 000 Kč</option>
<div style="margin-top:6px;">
<label>Přidáno / změněno:
<select id="days-filter" onchange="applyFilters()" style="width:100%;padding:4px;border-radius:4px;border:1px solid #ccc;">
<option value="0">Vše</option>
<option value="1">za 1 den</option>
<option value="2">za 2 dny</option>
<option value="3">za 3 dny</option>
<option value="4">za 4 dny</option>
<option value="5">za 5 dní</option>
<option value="7">za 7 dní</option>
<option value="14">za 14 dní</option>
<option value="30">za 30 dní</option>
</select> </select>
</label> </label>
</div> </div>
@@ -748,39 +693,9 @@ L.tileLayer('https://{{s}}.basemaps.cartocdn.com/light_only_labels/{{z}}/{{x}}/{
pane: 'shadowPane', pane: 'shadowPane',
}}).addTo(map); }}).addTo(map);
var selectedColors = [];
function toggleColorFilter(color) {{
var idx = selectedColors.indexOf(color);
if (idx >= 0) selectedColors.splice(idx, 1);
else selectedColors.push(color);
document.querySelectorAll('.price-band').forEach(function(el) {{
var c = el.getAttribute('data-color');
if (selectedColors.length === 0) {{
el.classList.remove('active', 'dimmed');
}} else if (selectedColors.indexOf(c) >= 0) {{
el.classList.add('active'); el.classList.remove('dimmed');
}} else {{
el.classList.add('dimmed'); el.classList.remove('active');
}}
}});
document.getElementById('price-filter-reset').style.display =
selectedColors.length > 0 ? 'block' : 'none';
applyFilters();
}}
function resetColorFilter() {{
selectedColors = [];
document.querySelectorAll('.price-band').forEach(function(el) {{
el.classList.remove('active', 'dimmed');
}});
document.getElementById('price-filter-reset').style.display = 'none';
applyFilters();
}}
var allMarkers = []; var allMarkers = [];
function addMarker(lat, lon, color, popup, hashId, firstSeen, lastChanged) {{ function addMarker(lat, lon, color, popup, hashId) {{
var marker = L.circleMarker([lat, lon], {{ var marker = L.circleMarker([lat, lon], {{
radius: 8, radius: 8,
fillColor: color, fillColor: color,
@@ -789,35 +704,26 @@ function addMarker(lat, lon, color, popup, hashId, firstSeen, lastChanged) {{
opacity: 1, opacity: 1,
fillOpacity: 0.85, fillOpacity: 0.85,
}}).bindPopup(popup); }}).bindPopup(popup);
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, firstSeen: firstSeen || '', lastChanged: lastChanged || '' }}; marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId }};
allMarkers.push(marker); allMarkers.push(marker);
marker.addTo(map); marker.addTo(map);
}} }}
function addNewMarker(lat, lon, color, popup, hashId, firstSeen, lastChanged) {{ function addNewMarker(lat, lon, color, popup, hashId) {{
var marker = L.circleMarker([lat, lon], {{ var marker = L.circleMarker([lat, lon], {{
radius: 8, radius: 12,
fillColor: color, fillColor: color,
color: '#fff', color: color,
weight: 2, weight: 4,
opacity: 1, opacity: 0.35,
fillOpacity: 0.85, fillOpacity: 0.95,
}}).bindPopup(popup); }}).bindPopup(popup);
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isNew: true, firstSeen: firstSeen || '', lastChanged: lastChanged || '' }}; marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isNew: true }};
allMarkers.push(marker); allMarkers.push(marker);
marker.addTo(map); marker.addTo(map);
var badge = L.marker([lat, lon], {{ marker.on('add', function() {{
icon: L.divIcon({{ if (marker._path) marker._path.classList.add('marker-new');
className: 'new-badge-icon',
html: '<span class="new-badge">NEW</span>',
iconSize: [32, 14],
iconAnchor: [-6, 7],
}}),
interactive: false,
pane: 'markerPane',
}}); }});
badge.addTo(map);
marker._newBadge = badge;
}} }}
function heartIcon(color) {{ function heartIcon(color) {{
@@ -850,11 +756,11 @@ function starIcon() {{
}}); }});
}} }}
function addHeartMarker(lat, lon, color, popup, hashId, firstSeen, lastChanged) {{ function addHeartMarker(lat, lon, color, popup, hashId) {{
var marker = L.marker([lat, lon], {{ var marker = L.marker([lat, lon], {{
icon: heartIcon(color), icon: heartIcon(color),
}}).bindPopup(popup); }}).bindPopup(popup);
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isHeart: true, firstSeen: firstSeen || '', lastChanged: lastChanged || '' }}; marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isHeart: true }};
allMarkers.push(marker); allMarkers.push(marker);
marker.addTo(map); marker.addTo(map);
}} }}
@@ -873,11 +779,6 @@ function loadRatings() {{
function saveRatings(ratings) {{ function saveRatings(ratings) {{
localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings)); localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings));
fetch('/api/ratings', {{
method: 'POST',
headers: {{'Content-Type': 'application/json'}},
body: JSON.stringify(ratings)
}}).catch(function() {{}});
}} }}
function addRejectStrike(marker) {{ function addRejectStrike(marker) {{
@@ -925,7 +826,6 @@ function applyMarkerStyle(marker, status) {{
}} else {{ }} else {{
if (status === 'fav') {{ if (status === 'fav') {{
removeRejectStrike(marker); removeRejectStrike(marker);
if (marker._newBadge && map.hasLayer(marker._newBadge)) map.removeLayer(marker._newBadge);
if (!marker._data._origCircle) marker._data._origCircle = true; if (!marker._data._origCircle) marker._data._origCircle = true;
var popup = marker.getPopup(); var popup = marker.getPopup();
var popupContent = popup ? popup.getContent() : ''; var popupContent = popup ? popup.getContent() : '';
@@ -949,7 +849,6 @@ function applyMarkerStyle(marker, status) {{
}} }}
// Add strikethrough line over the marker // Add strikethrough line over the marker
addRejectStrike(marker); addRejectStrike(marker);
if (marker._newBadge && map.hasLayer(marker._newBadge)) map.removeLayer(marker._newBadge);
}} else {{ }} else {{
if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{ if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{
revertToCircle(marker, {{ radius: 8, fillColor: marker._data.color, color: '#fff', weight: 2, fillOpacity: 0.85 }}); revertToCircle(marker, {{ radius: 8, fillColor: marker._data.color, color: '#fff', weight: 2, fillOpacity: 0.85 }});
@@ -962,7 +861,6 @@ function applyMarkerStyle(marker, status) {{
}} }}
if (marker._path) marker._path.classList.remove('marker-rejected'); if (marker._path) marker._path.classList.remove('marker-rejected');
removeRejectStrike(marker); removeRejectStrike(marker);
if (marker._newBadge && !map.hasLayer(marker._newBadge)) marker._newBadge.addTo(map);
}} }}
}} }}
}} }}
@@ -1118,21 +1016,11 @@ map.on('popupopen', function(e) {{
// ── Filters ──────────────────────────────────────────────────── // ── Filters ────────────────────────────────────────────────────
function applyFilters() {{ function applyFilters() {{
var minFloor = parseInt(document.getElementById('min-floor').value); var minFloor = parseInt(document.getElementById('min-floor').value);
var maxPriceEl = document.getElementById('max-price'); var maxPrice = parseInt(document.getElementById('max-price').value);
var maxPrice = parseInt(maxPriceEl.value) || 14000000;
if (maxPrice > 14000000) {{ maxPrice = 14000000; maxPriceEl.value = 14000000; }}
var hideRejected = document.getElementById('hide-rejected').checked; var hideRejected = document.getElementById('hide-rejected').checked;
var daysFilter = parseInt(document.getElementById('days-filter').value) || 0;
var ratings = loadRatings(); var ratings = loadRatings();
var visible = 0; var visible = 0;
var cutoff = null;
if (daysFilter > 0) {{
cutoff = new Date();
cutoff.setDate(cutoff.getDate() - daysFilter);
cutoff.setHours(0, 0, 0, 0);
}}
allMarkers.forEach(function(m) {{ allMarkers.forEach(function(m) {{
var popup = m.getPopup().getContent(); var popup = m.getPopup().getContent();
var floorMatch = popup.match(/(\\d+)\\. NP/); var floorMatch = popup.match(/(\\d+)\\. NP/);
@@ -1145,14 +1033,6 @@ function applyFilters() {{
if (floor !== null && floor < minFloor) show = false; if (floor !== null && floor < minFloor) show = false;
if (price > maxPrice) show = false; if (price > maxPrice) show = false;
if (cutoff) {{
var fs = m._data.firstSeen ? new Date(m._data.firstSeen) : null;
var lc = m._data.lastChanged ? new Date(m._data.lastChanged) : null;
if (!((fs && fs >= cutoff) || (lc && lc >= cutoff))) show = false;
}}
if (selectedColors.length > 0 && selectedColors.indexOf(m._data.color) < 0) show = false;
var r = ratings[m._data.hashId]; var r = ratings[m._data.hashId];
if (hideRejected && r && r.status === 'reject') show = false; if (hideRejected && r && r.status === 'reject') show = false;
@@ -1161,12 +1041,10 @@ function applyFilters() {{
visible++; visible++;
// Show strike line if rejected and visible // Show strike line if rejected and visible
if (m._rejectStrike && !map.hasLayer(m._rejectStrike)) m._rejectStrike.addTo(map); if (m._rejectStrike && !map.hasLayer(m._rejectStrike)) m._rejectStrike.addTo(map);
if (m._newBadge && !map.hasLayer(m._newBadge)) m._newBadge.addTo(map);
}} else {{ }} else {{
if (map.hasLayer(m)) map.removeLayer(m); if (map.hasLayer(m)) map.removeLayer(m);
// Hide strike line when marker hidden // Hide strike line when marker hidden
if (m._rejectStrike && map.hasLayer(m._rejectStrike)) map.removeLayer(m._rejectStrike); if (m._rejectStrike && map.hasLayer(m._rejectStrike)) map.removeLayer(m._rejectStrike);
if (m._newBadge && map.hasLayer(m._newBadge)) map.removeLayer(m._newBadge);
}} }}
}}); }});
@@ -1181,25 +1059,8 @@ function applyFilters() {{
document.getElementById('visible-count').textContent = visible; document.getElementById('visible-count').textContent = visible;
}} }}
// Initialize ratings: load from server, merge with localStorage, then restore // Initialize ratings on load
function initRatings() {{ restoreRatings();
var local = loadRatings();
fetch('/api/ratings')
.then(function(r) {{ return r.ok ? r.json() : null; }})
.then(function(server) {{
if (server && typeof server === 'object') {{
var merged = Object.assign({{}}, local, server);
localStorage.setItem(RATINGS_KEY, JSON.stringify(merged));
}}
restoreRatings();
updateRatingCounts();
}})
.catch(function() {{
restoreRatings();
updateRatingCounts();
}});
}}
initRatings();
// ── Panel toggle ────────────────────────────────────────────── // ── Panel toggle ──────────────────────────────────────────────
function togglePanel() {{ function togglePanel() {{

View File

@@ -15,7 +15,7 @@ import re
import time import time
import urllib.request import urllib.request
from pathlib import Path from pathlib import Path
from scraper_stats import write_stats, validate_listing from scraper_stats import write_stats
STATS_FILE = "stats_bezrealitky.json" STATS_FILE = "stats_bezrealitky.json"
@@ -71,35 +71,19 @@ HEADERS = {
BASE_URL = "https://www.bezrealitky.cz" BASE_URL = "https://www.bezrealitky.cz"
def fetch_url(url: str, retries: int = 3) -> str:
"""Fetch URL and return HTML string with retry on transient errors."""
for attempt in range(retries):
try:
logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
req = urllib.request.Request(url, headers=HEADERS)
resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
return html
except urllib.error.HTTPError:
raise
except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e:
if attempt < retries - 1:
wait = (attempt + 1) * 2
logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
time.sleep(wait)
else:
logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
raise
def fetch_page(page: int) -> tuple[list[dict], int]: def fetch_page(page: int) -> tuple[list[dict], int]:
""" """
Fetch a listing page from Bezrealitky. Fetch a listing page from Bezrealitky.
Returns (list of advert dicts from Apollo cache, total count). Returns (list of advert dicts from Apollo cache, total count).
""" """
url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}" url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
html = fetch_url(url) logger.debug(f"HTTP GET request: {url}")
logger.debug(f"Headers: {HEADERS}")
req = urllib.request.Request(url, headers=HEADERS)
try:
resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
match = re.search( match = re.search(
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
@@ -129,13 +113,20 @@ def fetch_page(page: int) -> tuple[list[dict], int]:
logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}") logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}")
return adverts, total return adverts, total
except (urllib.error.URLError, ConnectionError, OSError) as e:
logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
raise
def fetch_detail(uri: str) -> dict | None: def fetch_detail(uri: str) -> dict | None:
"""Fetch detail page for a listing.""" """Fetch detail page for a listing."""
try: try:
url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}" url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
html = fetch_url(url) logger.debug(f"HTTP GET request: {url}")
req = urllib.request.Request(url, headers=HEADERS)
resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
match = re.search( match = re.search(
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
@@ -371,11 +362,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"source": "bezrealitky", "source": "bezrealitky",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"), "scraped_at": datetime.now().strftime("%Y-%m-%d"),
"first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
"last_changed": datetime.now().strftime("%Y-%m-%d"),
} }
if not validate_listing(result, "bezrealitky"):
continue
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -14,7 +14,7 @@ import time
import urllib.request import urllib.request
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from scraper_stats import write_stats, validate_listing from scraper_stats import write_stats
STATS_FILE = "stats_cityhome.json" STATS_FILE = "stats_cityhome.json"
@@ -255,16 +255,6 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
else: else:
logger.info(f"{slug}: GPS nenalezeno") logger.info(f"{slug}: GPS nenalezeno")
# Load previous output for first_seen/last_changed tracking
_prev_cache: dict[str, dict] = {}
_prev_path = Path("byty_cityhome.json")
if _prev_path.exists():
try:
for _item in json.loads(_prev_path.read_text(encoding="utf-8")):
_prev_cache[str(_item["hash_id"])] = _item
except Exception:
pass
# Step 3: Filter listings # Step 3: Filter listings
logger.info(f"\nFáze 3: Filtrování...") logger.info(f"\nFáze 3: Filtrování...")
results = [] results = []
@@ -372,11 +362,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"source": "cityhome", "source": "cityhome",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"), "scraped_at": datetime.now().strftime("%Y-%m-%d"),
"first_seen": _prev_cache.get(f"cityhome_{slug}_{listing['unit_name']}", {}).get("first_seen", datetime.now().strftime("%Y-%m-%d")),
"last_changed": datetime.now().strftime("%Y-%m-%d") if _prev_cache.get(f"cityhome_{slug}_{listing['unit_name']}", {}).get("price") != price else _prev_cache[f"cityhome_{slug}_{listing['unit_name']}"].get("last_changed", datetime.now().strftime("%Y-%m-%d")),
} }
if not validate_listing(result, "cityhome"):
continue
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -15,8 +15,9 @@ import re
import time import time
import urllib.request import urllib.request
import urllib.parse import urllib.parse
from html.parser import HTMLParser
from pathlib import Path from pathlib import Path
from scraper_stats import write_stats, validate_listing from scraper_stats import write_stats
STATS_FILE = "stats_idnes.json" STATS_FILE = "stats_idnes.json"
@@ -464,11 +465,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"source": "idnes", "source": "idnes",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"), "scraped_at": datetime.now().strftime("%Y-%m-%d"),
"first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
"last_changed": datetime.now().strftime("%Y-%m-%d"),
} }
if not validate_listing(result, "idnes"):
continue
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -15,7 +15,7 @@ import time
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from urllib.parse import urlencode from urllib.parse import urlencode
from scraper_stats import write_stats, validate_listing from scraper_stats import write_stats
STATS_FILE = "stats_psn.json" STATS_FILE = "stats_psn.json"
@@ -38,10 +38,9 @@ BASE_URL = "https://psn.cz"
UNITS_API = f"{BASE_URL}/api/units-list" UNITS_API = f"{BASE_URL}/api/units-list"
def fetch_json(url: str, retries: int = 3) -> dict: def fetch_json(url: str) -> dict:
"""Fetch JSON via curl (urllib SSL may fail on Cloudflare) with retry.""" """Fetch JSON via curl (urllib SSL may fail on Cloudflare)."""
for attempt in range(retries): logger.debug(f"HTTP GET: {url}")
logger.debug(f"HTTP GET (attempt {attempt + 1}/{retries}): {url}")
result = subprocess.run( result = subprocess.run(
["curl", "-s", "-L", "--max-time", "30", ["curl", "-s", "-L", "--max-time", "30",
"-H", f"User-Agent: {UA}", "-H", f"User-Agent: {UA}",
@@ -49,14 +48,9 @@ def fetch_json(url: str, retries: int = 3) -> dict:
url], url],
capture_output=True, text=True, timeout=60 capture_output=True, text=True, timeout=60
) )
if result.returncode == 0: if result.returncode != 0:
raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
return json.loads(result.stdout) return json.loads(result.stdout)
if attempt < retries - 1:
wait = (attempt + 1) * 2
logger.warning(f"curl failed (retry {attempt + 1}/{retries} after {wait}s): {result.stderr[:200]}")
time.sleep(wait)
else:
raise RuntimeError(f"curl failed after {retries} attempts ({result.returncode}): {result.stderr[:200]}")
def fix_gps(lat, lng): def fix_gps(lat, lng):
@@ -118,16 +112,6 @@ def scrape(max_properties: int | None = None):
all_units = data.get("units", {}).get("data", []) all_units = data.get("units", {}).get("data", [])
logger.info(f"Staženo jednotek celkem: {len(all_units)}") logger.info(f"Staženo jednotek celkem: {len(all_units)}")
# Load previous output for first_seen/last_changed tracking
_prev_cache: dict[str, dict] = {}
_prev_path = Path("byty_psn.json")
if _prev_path.exists():
try:
for _item in json.loads(_prev_path.read_text(encoding="utf-8")):
_prev_cache[str(_item["hash_id"])] = _item
except Exception:
pass
# Filtrování # Filtrování
results = [] results = []
excluded = { excluded = {
@@ -258,11 +242,7 @@ def scrape(max_properties: int | None = None):
"source": "psn", "source": "psn",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"), "scraped_at": datetime.now().strftime("%Y-%m-%d"),
"first_seen": _prev_cache.get(str(unit_id), {}).get("first_seen", datetime.now().strftime("%Y-%m-%d")),
"last_changed": datetime.now().strftime("%Y-%m-%d") if _prev_cache.get(str(unit_id), {}).get("price") != int(price) else _prev_cache[str(unit_id)].get("last_changed", datetime.now().strftime("%Y-%m-%d")),
} }
if not validate_listing(result, "psn"):
continue
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -15,7 +15,7 @@ import re
import time import time
import urllib.request import urllib.request
from pathlib import Path from pathlib import Path
from scraper_stats import write_stats, validate_listing from scraper_stats import write_stats
STATS_FILE = "stats_realingo.json" STATS_FILE = "stats_realingo.json"
@@ -56,28 +56,6 @@ HEADERS = {
BASE_URL = "https://www.realingo.cz" BASE_URL = "https://www.realingo.cz"
def fetch_url(url: str, retries: int = 3) -> str:
"""Fetch URL and return HTML string with retry on transient errors."""
for attempt in range(retries):
try:
logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
req = urllib.request.Request(url, headers=HEADERS)
resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
return html
except urllib.error.HTTPError:
raise
except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e:
if attempt < retries - 1:
wait = (attempt + 1) * 2
logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
time.sleep(wait)
else:
logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
raise
def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]: def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
"""Fetch a page of Prague listings. Returns (items, total_count).""" """Fetch a page of Prague listings. Returns (items, total_count)."""
if page == 1: if page == 1:
@@ -85,7 +63,14 @@ def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
else: else:
url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/" url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/"
html = fetch_url(url) logger.debug(f"HTTP GET request: {url}")
logger.debug(f"Headers: {HEADERS}")
req = urllib.request.Request(url, headers=HEADERS)
try:
resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
match = re.search( match = re.search(
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
html, re.DOTALL html, re.DOTALL
@@ -98,13 +83,21 @@ def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
offer_list = data["props"]["pageProps"]["store"]["offer"]["list"] offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
logger.debug(f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}") logger.debug(f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}")
return offer_list["data"], offer_list["total"] return offer_list["data"], offer_list["total"]
except (urllib.error.URLError, ConnectionError, OSError) as e:
logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
raise
def fetch_detail(listing_url: str) -> dict | None: def fetch_detail(listing_url: str) -> dict | None:
"""Fetch detail page for a listing to get floor, building type, etc.""" """Fetch detail page for a listing to get floor, building type, etc."""
try: try:
url = f"{BASE_URL}{listing_url}" url = f"{BASE_URL}{listing_url}"
html = fetch_url(url) logger.debug(f"HTTP GET request: {url}")
req = urllib.request.Request(url, headers=HEADERS)
resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
match = re.search( match = re.search(
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
html, re.DOTALL html, re.DOTALL
@@ -328,11 +321,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"source": "realingo", "source": "realingo",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"), "scraped_at": datetime.now().strftime("%Y-%m-%d"),
"first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
"last_changed": datetime.now().strftime("%Y-%m-%d"),
} }
if not validate_listing(result, "realingo"):
continue
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -1,53 +1,13 @@
"""Shared utilities for scraper run statistics and listing validation.""" """Shared utility for writing per-scraper run statistics to JSON."""
from __future__ import annotations from __future__ import annotations
import json import json
import logging
import os import os
from pathlib import Path from pathlib import Path
HERE = Path(__file__).parent HERE = Path(__file__).parent
DATA_DIR = Path(os.environ.get("DATA_DIR", HERE)) DATA_DIR = Path(os.environ.get("DATA_DIR", HERE))
_val_log = logging.getLogger(__name__)
_REQUIRED_FIELDS = ("hash_id", "price", "locality", "lat", "lon", "url", "source")
def validate_listing(listing: dict, context: str = "") -> bool:
"""
Validate a listing dict before it is written to the output JSON.
Returns True if valid, False if the listing should be skipped.
Logs a warning for each invalid listing.
"""
prefix = f"[{context}] " if context else ""
for field in _REQUIRED_FIELDS:
val = listing.get(field)
if val is None or val == "":
_val_log.warning(f"{prefix}Skipping listing — missing field '{field}': {listing.get('hash_id', '?')}")
return False
price = listing.get("price")
if not isinstance(price, (int, float)) or price <= 0:
_val_log.warning(f"{prefix}Skipping listing — invalid price={price!r}: {listing.get('hash_id', '?')}")
return False
lat, lon = listing.get("lat"), listing.get("lon")
if not isinstance(lat, (int, float)) or not isinstance(lon, (int, float)):
_val_log.warning(f"{prefix}Skipping listing — non-numeric GPS lat={lat!r} lon={lon!r}: {listing.get('hash_id', '?')}")
return False
if not (47.0 <= lat <= 52.0) or not (12.0 <= lon <= 19.0):
_val_log.warning(f"{prefix}Skipping listing — GPS outside Czech Republic lat={lat} lon={lon}: {listing.get('hash_id', '?')}")
return False
area = listing.get("area")
if area is not None and (not isinstance(area, (int, float)) or area <= 0):
_val_log.warning(f"{prefix}Skipping listing — invalid area={area!r}: {listing.get('hash_id', '?')}")
return False
return True
def write_stats(filename: str, stats: dict) -> None: def write_stats(filename: str, stats: dict) -> None:
"""Write scraper run stats dict to the data directory.""" """Write scraper run stats dict to the data directory."""