Add first_seen/last_updated timestamps to track property freshness
Each property record now carries two date fields:
- first_seen: date a scraper first saw the listing (preserved across runs)
- last_updated: date of the most recent scrape that included it
All 6 scrapers (Sreality, Realingo, Bezrealitky, iDNES, PSN, CityHome)
set these fields during scraping. Cached results preserve first_seen and
refresh last_updated. PSN and CityHome gain a load_previous() helper to
track first_seen across runs (they lacked caching before).
The merge script keeps the earliest first_seen and latest last_updated
when deduplicating listings across sources.
The HTML map now shows dates in popups ("Přidáno: DD.MM.YYYY"), displays
a green "NOVÉ" badge on newly discovered listings, and adds a "Přidáno"
dropdown filter (24h / 3 days / 7 days / 14 days) for spotting new listings.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -13,6 +13,7 @@ import math
|
||||
import re
|
||||
import time
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -238,10 +239,14 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
# Check cache — if hash_id exists and price unchanged, reuse
|
||||
item_id = int(item["id"])
|
||||
item_price = item.get("price", {}).get("total", 0) or 0
|
||||
today = datetime.now().strftime("%Y-%m-%d")
|
||||
cached = cache.get(item_id)
|
||||
if cached and cached.get("price") == item_price:
|
||||
cache_hits += 1
|
||||
logger.debug(f"Cache hit for id={item_id}")
|
||||
cached["last_updated"] = today
|
||||
if "first_seen" not in cached:
|
||||
cached["first_seen"] = today
|
||||
results.append(cached)
|
||||
continue
|
||||
|
||||
@@ -298,6 +303,11 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
cat = item.get("category", "")
|
||||
loc = item.get("location", {})
|
||||
|
||||
# Preserve first_seen from cache if this is a price-changed re-fetch
|
||||
first_seen = today
|
||||
if cached and "first_seen" in cached:
|
||||
first_seen = cached["first_seen"]
|
||||
|
||||
result = {
|
||||
"hash_id": int(item["id"]),
|
||||
"name": f"Prodej bytu {CATEGORY_LABELS.get(cat, '?')} {item.get('area', {}).get('main', '?')} m²",
|
||||
@@ -314,6 +324,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
"url": f"{BASE_URL}{item['url']}",
|
||||
"source": "realingo",
|
||||
"image": "",
|
||||
"first_seen": first_seen,
|
||||
"last_updated": today,
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
Reference in New Issue
Block a user