Add first_seen/last_updated timestamps to track property freshness
Each property record now carries two date fields:
- first_seen: date the listing first appeared (preserved across runs)
- last_updated: date of the most recent scrape that included it
All 6 scrapers (Sreality, Realingo, Bezrealitky, iDNES, PSN, CityHome)
set these fields during scraping. Cached results preserve first_seen and
refresh last_updated. PSN and CityHome gain a load_previous() helper to
track first_seen across runs (they lacked caching before).
The merge script keeps the earliest first_seen and latest last_updated
when deduplicating listings across sources.
The HTML map now shows dates in popups ("Přidáno: DD.MM.YYYY"), displays
a green "NOVÉ" badge on newly discovered listings, and adds a "Přidáno"
dropdown filter (24h / 3 days / 7 days / 14 days) for spotting new ones.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ import logging
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -109,7 +110,20 @@ def format_price(price: int) -> str:
|
||||
return " ".join(reversed(parts)) + " Kč"
|
||||
|
||||
|
||||
def load_previous(json_path: str = "byty_psn.json") -> dict[str, str]:
    """Load first_seen dates from the previous scrape run.

    Args:
        json_path: Path to the JSON file written by the last run.

    Returns:
        Mapping of hash_id (as str) -> first_seen date string; "" when the
        previous record carried no first_seen. An empty dict is returned
        when the file is missing, unreadable, or malformed, so a fresh run
        simply treats every listing as newly seen.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        # Entries without a hash_id are skipped; non-dict entries raise
        # TypeError below and are treated the same as a corrupt file.
        return {
            str(entry["hash_id"]): entry.get("first_seen", "")
            for entry in data
            if "hash_id" in entry
        }
    except (OSError, json.JSONDecodeError, TypeError, KeyError):
        # Corrupt or unreadable cache: start fresh rather than crash —
        # OSError covers read failures after the exists() check,
        # TypeError covers malformed (non-dict) entries.
        return {}
|
||||
|
||||
|
||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
previous_first_seen = load_previous()
|
||||
logger.info("=" * 60)
|
||||
logger.info("Stahuji inzeráty z PSN.cz")
|
||||
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
||||
@@ -282,8 +296,12 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
project_slug = unit.get("_project_slug", "")
|
||||
detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}"
|
||||
|
||||
today = datetime.now().strftime("%Y-%m-%d")
|
||||
hash_id = unit.get("id", slug)
|
||||
first_seen = previous_first_seen.get(str(hash_id), "") or today
|
||||
|
||||
result = {
|
||||
"hash_id": unit.get("id", slug),
|
||||
"hash_id": hash_id,
|
||||
"name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}",
|
||||
"price": int(price),
|
||||
"price_formatted": format_price(int(price)),
|
||||
@@ -298,6 +316,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
"url": detail_url,
|
||||
"source": "psn",
|
||||
"image": "",
|
||||
"first_seen": first_seen,
|
||||
"last_updated": today,
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
Reference in New Issue
Block a user