Reliability improvements: retry logic, validation, ratings sync
Some checks failed
Build and Push / build (push) Failing after 4s
- Add 3-attempt retry with exponential backoff to Sreality, Realingo,
Bezrealitky, and PSN scrapers (CityHome and iDNES already had it)
- Add shared validate_listing() in scraper_stats.py; all 6 scrapers now
validate GPS bounds, price, area, and required fields before output
- Wire ratings to server /api/ratings on page load (merge with
localStorage) and save (async POST); ratings now persist across
browsers and devices
- Namespace JS hash IDs as {source}_{id} to prevent rating collisions
between listings from different portals with the same numeric ID
- Replace manual Czech diacritic table with unicodedata.normalize()
in merge_and_map.py for correct deduplication of all edge cases
- Correct README schedule docs: every 4 hours, not twice daily
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -15,7 +15,7 @@ import urllib.request
|
||||
import urllib.parse
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from scraper_stats import write_stats
|
||||
from scraper_stats import write_stats, validate_listing
|
||||
|
||||
STATS_FILE = "stats_sreality.json"
|
||||
|
||||
@@ -45,19 +45,26 @@ HEADERS = {
|
||||
|
||||
|
||||
def api_get(url: str, retries: int = 3) -> dict:
    """Fetch JSON from the Sreality API, retrying transient connection errors.

    Args:
        url: Fully built API endpoint URL to fetch.
        retries: Total number of attempts before giving up (default 3,
            preserving the previous hard-coded behavior).

    Returns:
        The decoded JSON response body as a dict.

    Raises:
        urllib.error.HTTPError: Immediately on an HTTP status error —
            deliberately NOT retried, since repeating the same request on
            a 4xx is pointless.
        urllib.error.URLError: After all attempts fail at the transport
            level (also ConnectionError / OSError).
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # The request object is loop-invariant; build it once outside the loop.
    req = urllib.request.Request(url, headers=HEADERS)
    for attempt in range(retries):
        logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                response_data = resp.read().decode("utf-8")
                logger.debug(f"HTTP response: status={resp.status}, size={len(response_data)} bytes")
                logger.debug(f"Response preview: {response_data[:200]}")
                return json.loads(response_data)
        except urllib.error.HTTPError:
            # HTTPError is a subclass of URLError; catch it first so HTTP
            # status errors propagate without entering the retry path below.
            raise
        except (urllib.error.URLError, ConnectionError, OSError) as e:
            if attempt < retries - 1:
                # Backoff between attempts: 2s, 4s, ... (matches the
                # previous schedule for the default 3 attempts).
                wait = (attempt + 1) * 2
                logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
                time.sleep(wait)
            else:
                logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
                raise
|
||||
|
||||
|
||||
def build_list_url(disposition: int, page: int = 1) -> str:
|
||||
@@ -356,6 +363,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
"first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
|
||||
"last_changed": datetime.now().strftime("%Y-%m-%d"),
|
||||
}
|
||||
if not validate_listing(result, "sreality"):
|
||||
continue
|
||||
results.append(result)
|
||||
details_fetched += 1
|
||||
|
||||
@@ -476,7 +485,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
||||
source_label = source_labels.get(source, source)
|
||||
source_color = source_colors.get(source, "#999")
|
||||
|
||||
hash_id = e.get("hash_id", "")
|
||||
hash_id = f"{source}_{e.get('hash_id', '')}"
|
||||
|
||||
first_seen = e.get("first_seen", "")
|
||||
last_changed = e.get("last_changed", "")
|
||||
@@ -864,6 +873,11 @@ function loadRatings() {{
|
||||
|
||||
function saveRatings(ratings) {{
|
||||
localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings));
|
||||
fetch('/api/ratings', {{
|
||||
method: 'POST',
|
||||
headers: {{'Content-Type': 'application/json'}},
|
||||
body: JSON.stringify(ratings)
|
||||
}}).catch(function() {{}});
|
||||
}}
|
||||
|
||||
function addRejectStrike(marker) {{
|
||||
@@ -1167,8 +1181,25 @@ function applyFilters() {{
|
||||
document.getElementById('visible-count').textContent = visible;
|
||||
}}
|
||||
|
||||
// Initialize ratings on load
|
||||
restoreRatings();
|
||||
// Initialize ratings: load from server, merge with localStorage, then restore
|
||||
function initRatings() {{
|
||||
var local = loadRatings();
|
||||
fetch('/api/ratings')
|
||||
.then(function(r) {{ return r.ok ? r.json() : null; }})
|
||||
.then(function(server) {{
|
||||
if (server && typeof server === 'object') {{
|
||||
var merged = Object.assign({{}}, local, server);
|
||||
localStorage.setItem(RATINGS_KEY, JSON.stringify(merged));
|
||||
}}
|
||||
restoreRatings();
|
||||
updateRatingCounts();
|
||||
}})
|
||||
.catch(function() {{
|
||||
restoreRatings();
|
||||
updateRatingCounts();
|
||||
}});
|
||||
}}
|
||||
initRatings();
|
||||
|
||||
// ── Panel toggle ──────────────────────────────────────────────
|
||||
function togglePanel() {{
|
||||
|
||||
Reference in New Issue
Block a user