Some checks failed
Build and Push / build (push) Failing after 4s
- Add 3-attempt retry with exponential backoff to Sreality, Realingo,
Bezrealitky, and PSN scrapers (CityHome and iDNES already had it)
- Add shared validate_listing() in scraper_stats.py; all 6 scrapers now
validate GPS bounds, price, area, and required fields before output
- Wire ratings to server /api/ratings on page load (merge with
localStorage) and save (async POST); ratings now persist across
browsers and devices
- Namespace JS hash IDs as {source}_{id} to prevent rating collisions
between listings from different portals with the same numeric ID
- Replace manual Czech diacritic table with unicodedata.normalize()
in merge_and_map.py for correct deduplication of all edge cases
- Correct README schedule docs: every 4 hours, not twice daily
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
56 lines
2.1 KiB
Python
56 lines
2.1 KiB
Python
"""Shared utilities for scraper run statistics and listing validation."""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Directory containing this module; used as the fallback data directory.
HERE = Path(__file__).parent

# Where stats JSON files are written; override via the DATA_DIR env var.
DATA_DIR = Path(os.environ.get("DATA_DIR", HERE))
|
|
|
|
_val_log = logging.getLogger(__name__)

# Fields every listing must carry (non-empty) before it may be written out.
_REQUIRED_FIELDS = ("hash_id", "price", "locality", "lat", "lon", "url", "source")


def _is_number(val) -> bool:
    """Return True for real int/float values.

    bool is explicitly excluded: it subclasses int, so a plain
    isinstance(val, (int, float)) would accept True/False as numbers.
    """
    return isinstance(val, (int, float)) and not isinstance(val, bool)


def validate_listing(listing: dict, context: str = "") -> bool:
    """
    Validate a listing dict before it is written to the output JSON.

    Checks, in order:
      * every field in _REQUIRED_FIELDS is present and non-empty,
      * price is a positive number (bool rejected),
      * lat/lon are numeric and inside a generous Czech bounding box,
      * area, when present, is a positive number (bool rejected).

    Args:
        listing: Raw listing dict produced by a scraper.
        context: Optional scraper name used to prefix log messages.

    Returns:
        True if valid, False if the listing should be skipped.
        Logs a warning for each invalid listing.
    """
    prefix = f"[{context}] " if context else ""
    # Hoist the identifier used in every warning ('?' when even it is missing).
    hid = listing.get("hash_id", "?")

    for field in _REQUIRED_FIELDS:
        val = listing.get(field)
        if val is None or val == "":
            # Lazy %-args: no formatting cost when warnings are filtered out.
            _val_log.warning("%sSkipping listing — missing field '%s': %s", prefix, field, hid)
            return False

    price = listing.get("price")
    if not _is_number(price) or price <= 0:
        _val_log.warning("%sSkipping listing — invalid price=%r: %s", prefix, price, hid)
        return False

    lat, lon = listing.get("lat"), listing.get("lon")
    if not _is_number(lat) or not _is_number(lon):
        _val_log.warning("%sSkipping listing — non-numeric GPS lat=%r lon=%r: %s", prefix, lat, lon, hid)
        return False
    # Bounding box generously covering the Czech Republic (WGS84 degrees).
    if not (47.0 <= lat <= 52.0) or not (12.0 <= lon <= 19.0):
        _val_log.warning("%sSkipping listing — GPS outside Czech Republic lat=%s lon=%s: %s", prefix, lat, lon, hid)
        return False

    area = listing.get("area")
    # Area is optional, but when present it must be a positive number.
    if area is not None and (not _is_number(area) or area <= 0):
        _val_log.warning("%sSkipping listing — invalid area=%r: %s", prefix, area, hid)
        return False

    return True
|
|
|
|
|
|
def write_stats(filename: str, stats: dict) -> None:
    """Persist a scraper run's stats as pretty-printed UTF-8 JSON in DATA_DIR."""
    # Serialize first, then write in one shot via pathlib.
    payload = json.dumps(stats, ensure_ascii=False, indent=2)
    target = DATA_DIR / filename
    target.write_text(payload, encoding="utf-8")