2 Commits
main ... 0.01

Author SHA1 Message Date
Jan Novak
a1212c6312 Tag Docker images with both git tag and latest
All checks were successful
Build and Push / build (push) Successful in 8s
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 21:05:51 +01:00
Jan Novak
0b95c847c4 Add first_seen/last_updated timestamps to track property freshness
Each property record now carries two date fields:
- first_seen: date the listing first appeared (preserved across runs)
- last_updated: date of the most recent scrape that included it

All 6 scrapers (Sreality, Realingo, Bezrealitky, iDNES, PSN, CityHome)
set these fields during scraping. Cached results preserve first_seen and
refresh last_updated. PSN and CityHome gain a load_previous() helper to
track first_seen across runs (they lacked caching before).

The merge script keeps the earliest first_seen and latest last_updated
when deduplicating listings across sources.

The HTML map now shows dates in popups ("Přidáno: DD.MM.YYYY"), displays
a green "NOVÉ" badge on newly discovered listings, and adds a "Přidáno"
dropdown filter (24h / 3 days / 7 days / 14 days) for spotting new ones.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 21:03:08 +01:00
16 changed files with 1927 additions and 13358 deletions

View File

@@ -30,6 +30,7 @@ jobs:
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
TAG=${{ inputs.tag }} TAG=${{ inputs.tag }}
fi fi
IMAGE=gitea.home.hrajfrisbee.cz/${{ github.repository }}:$TAG REPO=gitea.home.hrajfrisbee.cz/${{ github.repository }}
docker build -f build/Dockerfile -t $IMAGE . docker build -f build/Dockerfile -t $REPO:$TAG -t $REPO:latest .
docker push $IMAGE docker push $REPO:$TAG
docker push $REPO:latest

View File

@@ -10,7 +10,7 @@ WORKDIR /app
COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \ COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \
scrape_idnes.py scrape_psn.py scrape_cityhome.py \ scrape_idnes.py scrape_psn.py scrape_cityhome.py \
merge_and_map.py regen_map.py run_all.sh ratings_server.py ./ merge_and_map.py regen_map.py run_all.sh ./
COPY build/crontab /etc/crontabs/root COPY build/crontab /etc/crontabs/root
COPY build/entrypoint.sh /entrypoint.sh COPY build/entrypoint.sh /entrypoint.sh
@@ -18,7 +18,7 @@ RUN chmod +x /entrypoint.sh run_all.sh
RUN mkdir -p /app/data RUN mkdir -p /app/data
EXPOSE 8080 8081 EXPOSE 8080
HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \ HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \
CMD wget -q -O /dev/null http://localhost:8080/ || exit 1 CMD wget -q -O /dev/null http://localhost:8080/ || exit 1

View File

@@ -6,7 +6,7 @@ DATA_DIR="/app/data"
# Create symlinks so scripts (which write to /app/) persist data to the volume # Create symlinks so scripts (which write to /app/) persist data to the volume
for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \ for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \
byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \ byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \
mapa_bytu.html ratings.json; do mapa_bytu.html; do
# Remove real file if it exists (e.g. baked into image) # Remove real file if it exists (e.g. baked into image)
[ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f" [ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f"
ln -sf "$DATA_DIR/$f" "/app/$f" ln -sf "$DATA_DIR/$f" "/app/$f"
@@ -18,8 +18,5 @@ crond -b -l 2
echo "[entrypoint] Starting initial scrape in background..." echo "[entrypoint] Starting initial scrape in background..."
bash /app/run_all.sh & bash /app/run_all.sh &
echo "[entrypoint] Starting ratings API server on port 8081..."
DATA_DIR="$DATA_DIR" python3 /app/ratings_server.py &
echo "[entrypoint] Starting HTTP server on port 8080..." echo "[entrypoint] Starting HTTP server on port 8080..."
exec python3 -m http.server 8080 --directory "$DATA_DIR" exec python3 -m http.server 8080 --directory "$DATA_DIR"

File diff suppressed because it is too large Load Diff

View File

@@ -1,202 +0,0 @@
#!/usr/bin/env python3
"""Generate status.json from scraper JSON outputs and run log."""
from __future__ import annotations
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
# Directory containing this script; all scraper JSON outputs live beside it.
HERE = Path(__file__).parent

# Human-readable source name -> JSON file written by that source's scraper.
SOURCE_FILES = {
    "Sreality": "byty_sreality.json",
    "Realingo": "byty_realingo.json",
    "Bezrealitky": "byty_bezrealitky.json",
    "iDNES": "byty_idnes.json",
    "PSN": "byty_psn.json",
    "CityHome": "byty_cityhome.json",
}

# Deduplicated listing file produced by the merge step.
MERGED_FILE = "byty_merged.json"
def count_source(path: Path) -> dict:
    """Summarize one scraper output file.

    Returns ``{"accepted": N, "updated_at": ISO-mtime}`` on success, or
    ``{"accepted": 0, "error": reason}`` when the file is missing or
    cannot be read/parsed.
    """
    if not path.exists():
        return {"accepted": 0, "error": "soubor nenalezen"}
    try:
        listings = json.loads(path.read_text(encoding="utf-8"))
        stamp = datetime.fromtimestamp(path.stat().st_mtime).isoformat(
            timespec="seconds"
        )
    except Exception as e:  # malformed JSON, permission error, ...
        return {"accepted": 0, "error": str(e)}
    return {"accepted": len(listings), "updated_at": stamp}
def parse_log(log_path: str) -> dict[str, dict]:
    """Parse scraper run log and extract per-source statistics.

    Scrapers log summary lines like:
        ✓ Vyhovující byty: 12
        Vyloučeno (prodáno): 5
        Staženo stránek: 3
        Staženo inzerátů: 48
        Celkem bytů v cache: 120
    and section headers like:
        [2/6] Realingo

    Returns a mapping of source name (as in SOURCE_FILES) to a stats dict;
    returns {} when the log is absent or contains no recognizable sections.
    """
    if not log_path or not os.path.exists(log_path):
        return {}
    with open(log_path, encoding="utf-8") as f:
        content = f.read()
    # Split into per-source sections by the [N/6] step header.
    # Each section header looks like "[2/6] Realingo\n----..." — the dashes
    # underline requires at least one '-' on the following line.
    section_pattern = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE)
    sections_found = list(section_pattern.finditer(content))
    if not sections_found:
        return {}
    stats = {}
    for i, match in enumerate(sections_found):
        step_name = match.group(2).strip()
        # A section's text runs from the end of its header to the start of
        # the next header (or end of file for the last section).
        start = match.end()
        end = sections_found[i + 1].start() if i + 1 < len(sections_found) else len(content)
        section_text = content[start:end]
        # Identify which sources this section covers; a combined step such
        # as "PSN + CityHome" matches (and therefore covers) both names.
        source_names = []
        for name in SOURCE_FILES:
            if name.lower() in step_name.lower():
                source_names.append(name)
        if not source_names:
            continue

        # Closure over the current section_text; called immediately below,
        # so the late-binding of section_text is not an issue here.
        def extract(pattern: str) -> Optional[int]:
            m = re.search(pattern, section_text)
            return int(m.group(1)) if m else None

        # Summary lines present in all/most scrapers (None when absent).
        accepted = extract(r'Vyhovující byty[:\s]+(\d+)')
        fetched = extract(r'Staženo inzerátů[:\s]+(\d+)')
        pages = extract(r'Staženo stránek[:\s]+(\d+)')
        cached = extract(r'Celkem bytů v cache[:\s]+(\d+)')
        cache_hits = extract(r'Cache hit[:\s]+(\d+)')
        # Rejection reasons — collect all "Vyloučeno (<reason>): N" lines
        # into a reason -> count dict.
        excluded = {}
        for m in re.finditer(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)', section_text):
            excluded[m.group(1)] = int(m.group(2))
        # Fall back to a bare "Vyloučeno: N" style total when no per-reason
        # lines were found (e.g. PSN-style output).
        total_excluded = sum(excluded.values()) if excluded else extract(r'Vyloučen\w*[:\s]+(\d+)')
        # Build the entry with only the keys that were actually found.
        entry = {}
        if accepted is not None:
            entry["accepted"] = accepted
        if fetched is not None:
            entry["fetched"] = fetched
        if pages is not None:
            entry["pages"] = pages
        if cached is not None:
            entry["cached"] = cached
        if cache_hits is not None:
            entry["cache_hits"] = cache_hits
        if excluded:
            entry["excluded"] = excluded
        elif total_excluded is not None:
            entry["excluded_total"] = total_excluded
        # NOTE: for a combined section ("PSN + CityHome") every covered
        # source maps to the SAME entry dict object.
        for name in source_names:
            stats[name] = entry
    return stats
def main():
    """Collect per-source stats, write status.json, and print a summary.

    Optional positional CLI arguments:
      1. start_time   ISO timestamp of when the scrape run started
      2. duration_sec total run duration in seconds (must parse as int)
      3. log_path     combined scraper log to mine for extra statistics
    """
    start_time = None
    duration_sec = None
    if len(sys.argv) >= 3:
        start_time = sys.argv[1]
        try:
            duration_sec = int(sys.argv[2])
        except ValueError:
            # Non-numeric duration: keep None rather than abort.
            pass
    if not start_time:
        start_time = datetime.now().isoformat(timespec="seconds")
    log_path = sys.argv[3] if len(sys.argv) >= 4 else None
    log_stats = parse_log(log_path)
    sources = []
    for name, filename in SOURCE_FILES.items():
        path = HERE / filename
        info = count_source(path)
        info["name"] = name
        # Merge per-source stats parsed from the run log, when present.
        ls = log_stats.get(name, {})
        for k in ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total"):
            if k in ls:
                info[k] = ls[k]
        # Override accepted from log if available (log is authoritative for latest run)
        if "accepted" in ls:
            info["accepted"] = ls["accepted"]
        sources.append(info)
    # Total accepted across all sources, before deduplication.
    total_accepted = sum(s.get("accepted", 0) for s in sources)
    # Merged / deduplicated count from the merge step's output file.
    merged_path = HERE / MERGED_FILE
    deduplicated = 0
    if merged_path.exists():
        try:
            merged = json.loads(merged_path.read_text(encoding="utf-8"))
            deduplicated = len(merged)
        except Exception:
            # Unreadable merged file: report 0 rather than crash.
            pass
    duplicates_removed = total_accepted - deduplicated if deduplicated else 0
    status = {
        "status": "done",
        "timestamp": start_time,
        "duration_sec": duration_sec,
        "total_accepted": total_accepted,
        "deduplicated": deduplicated,
        "duplicates_removed": duplicates_removed,
        "sources": sources,
    }
    out = HERE / "status.json"
    out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8")
    # Console summary (Czech, matching the scrapers' output language).
    print(f"Status uložen: {out}")
    print(f"  Celkem bytů (před dedup): {total_accepted}")
    print(f"  Po deduplikaci: {deduplicated}")
    if duplicates_removed:
        print(f"  Odstraněno duplikátů: {duplicates_removed}")
    for s in sources:
        acc = s.get("accepted", 0)
        err = s.get("error", "")
        exc = s.get("excluded", {})
        exc_total = sum(exc.values()) if exc else s.get("excluded_total", 0)
        parts = [f"{s['name']:12s}: {acc} bytů"]
        if exc_total:
            parts.append(f"({exc_total} vyloučeno)")
        if err:
            parts.append(f"[CHYBA: {err}]")
        print("  " + " ".join(parts))


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -79,6 +79,19 @@ def main():
if key in seen_keys: if key in seen_keys:
dupes += 1 dupes += 1
existing = seen_keys[key] existing = seen_keys[key]
# Merge timestamps: keep earliest first_seen, latest last_updated
e_first = e.get("first_seen", "")
ex_first = existing.get("first_seen", "")
if e_first and ex_first:
existing["first_seen"] = min(e_first, ex_first)
elif e_first:
existing["first_seen"] = e_first
e_updated = e.get("last_updated", "")
ex_updated = existing.get("last_updated", "")
if e_updated and ex_updated:
existing["last_updated"] = max(e_updated, ex_updated)
elif e_updated:
existing["last_updated"] = e_updated
# Log it # Log it
print(f" Duplikát: {e['locality']} | {format_price(e['price'])} | {e.get('area', '?')}" print(f" Duplikát: {e['locality']} | {format_price(e['price'])} | {e.get('area', '?')}"
f"({e.get('source', '?')} vs {existing.get('source', '?')})") f"({e.get('source', '?')} vs {existing.get('source', '?')})")

View File

@@ -1,116 +0,0 @@
#!/usr/bin/env python3
"""
Minimal HTTP API server for persisting apartment ratings.
GET /api/ratings → returns ratings.json contents
POST /api/ratings → saves entire ratings object
GET /api/ratings/export → same as GET, but with download header
Ratings file: /app/data/ratings.json (or ./ratings.json locally)
"""
import json
import logging
import os
import sys
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
# Port the ratings API listens on (overridable via env for local runs).
PORT = int(os.environ.get("RATINGS_PORT", 8081))
# Where ratings.json lives: /app/data in the container, cwd locally.
DATA_DIR = Path(os.environ.get("DATA_DIR", "."))
RATINGS_FILE = DATA_DIR / "ratings.json"

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [ratings] %(levelname)s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
)
log = logging.getLogger(__name__)
def load_ratings() -> dict:
    """Read ratings from RATINGS_FILE; return {} when missing or unreadable."""
    try:
        if RATINGS_FILE.exists():
            return json.loads(RATINGS_FILE.read_text(encoding="utf-8"))
    except Exception as exc:
        # Corrupt or unreadable file: log and fall through to the empty default.
        log.error("Failed to load ratings: %s", exc)
    return {}
def save_ratings(data: dict) -> None:
    """Persist the full ratings object to RATINGS_FILE as pretty UTF-8 JSON."""
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    RATINGS_FILE.write_text(payload, encoding="utf-8")
class RatingsHandler(BaseHTTPRequestHandler):
    """HTTP handler for the ratings API.

    Routes:
      GET  /api/ratings         -> current ratings object as JSON
      GET  /api/ratings/export  -> same, with a download header
      POST /api/ratings         -> replace the entire ratings object

    All responses carry permissive CORS headers so the map page (served on
    a different port) can call the API from the browser.
    """

    def log_message(self, format, *args):
        # Suppress default HTTP access log (we use our own logger).
        pass

    def _send_json(self, status: int, body: dict, extra_headers=None):
        """Serialize *body* as JSON and send it with status + CORS headers.

        extra_headers: optional dict of additional header name -> value pairs
        (e.g. Content-Disposition for the export endpoint).
        """
        payload = json.dumps(body, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        if extra_headers:
            for k, v in extra_headers.items():
                self.send_header(k, v)
        self.end_headers()
        self.wfile.write(payload)

    def do_OPTIONS(self):
        """CORS preflight: 204 with the allow-* headers and no body."""
        self.send_response(204)
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()

    def do_GET(self):
        if self.path in ("/api/ratings", "/api/ratings/export"):
            ratings = load_ratings()
            extra = None
            if self.path == "/api/ratings/export":
                extra = {"Content-Disposition": 'attachment; filename="ratings.json"'}
            # FIX: format string was "GET %s%d ratings", which fused the path
            # and the count into e.g. "GET /api/ratings123 ratings"; add the
            # separator to match the POST log line style.
            log.info("GET %s → %d ratings", self.path, len(ratings))
            self._send_json(200, ratings, extra)
        else:
            self._send_json(404, {"error": "not found"})

    def do_POST(self):
        if self.path == "/api/ratings":
            length = int(self.headers.get("Content-Length", 0))
            if length == 0:
                self._send_json(400, {"error": "empty body"})
                return
            try:
                raw = self.rfile.read(length)
                data = json.loads(raw.decode("utf-8"))
            except Exception as e:
                log.warning("Bad request body: %s", e)
                self._send_json(400, {"error": "invalid JSON"})
                return
            # The API replaces the whole object; only a JSON object is valid.
            if not isinstance(data, dict):
                self._send_json(400, {"error": "expected JSON object"})
                return
            save_ratings(data)
            log.info("POST /api/ratings → saved %d ratings", len(data))
            self._send_json(200, {"ok": True, "count": len(data)})
        else:
            self._send_json(404, {"error": "not found"})
if __name__ == "__main__":
log.info("Ratings server starting on port %d, data dir: %s", PORT, DATA_DIR)
log.info("Ratings file: %s", RATINGS_FILE)
server = HTTPServer(("0.0.0.0", PORT), RatingsHandler)
try:
server.serve_forever()
except KeyboardInterrupt:
log.info("Stopped.")
sys.exit(0)

View File

@@ -16,12 +16,6 @@ NC='\033[0m'
TOTAL=6 TOTAL=6
CURRENT=0 CURRENT=0
FAILED=0 FAILED=0
START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S")
START_EPOCH=$(date +%s)
LOG_FILE="$(pwd)/scrape_run.log"
# Mark status as running
echo '{"status":"running"}' > status.json
show_help() { show_help() {
echo "Usage: ./run_all.sh [OPTIONS]" echo "Usage: ./run_all.sh [OPTIONS]"
@@ -69,8 +63,6 @@ step() {
} }
# ── Scrapery (paralelně kde to jde) ───────────────────────── # ── Scrapery (paralelně kde to jde) ─────────────────────────
# Tee all output to log file for status generation
exec > >(tee -a "$LOG_FILE") 2>&1
step "Sreality" step "Sreality"
python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); } python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); }
@@ -99,12 +91,6 @@ python3 merge_and_map.py || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((F
# ── Otevření mapy ──────────────────────────────────────────── # ── Otevření mapy ────────────────────────────────────────────
# ── Generování statusu ─────────────────────────────────────
END_EPOCH=$(date +%s)
DURATION=$((END_EPOCH - START_EPOCH))
python3 generate_status.py "$START_TIME" "$DURATION" "$LOG_FILE"
echo "" echo ""
echo "============================================================" echo "============================================================"
if [ $FAILED -eq 0 ]; then if [ $FAILED -eq 0 ]; then

View File

@@ -272,9 +272,13 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
# Check cache — if hash_id exists and price unchanged, reuse # Check cache — if hash_id exists and price unchanged, reuse
cached = cache.get(hash_id) cached = cache.get(hash_id)
today = datetime.now().strftime("%Y-%m-%d")
if cached and cached.get("price") == estate.get("price", 0): if cached and cached.get("price") == estate.get("price", 0):
cache_hits += 1 cache_hits += 1
logger.debug(f"Cache hit for hash_id={hash_id}") logger.debug(f"Cache hit for hash_id={hash_id}")
cached["last_updated"] = today
if "first_seen" not in cached:
cached["first_seen"] = today
results.append(cached) results.append(cached)
continue continue
@@ -332,6 +336,11 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
disp_cb = estate.get("_disposition_cb") or estate.get("seo", {}).get("category_sub_cb") disp_cb = estate.get("_disposition_cb") or estate.get("seo", {}).get("category_sub_cb")
seo = estate.get("seo", {}) seo = estate.get("seo", {})
# Preserve first_seen from cache if this is a price-changed re-fetch
first_seen = today
if cached and "first_seen" in cached:
first_seen = cached["first_seen"]
result = { result = {
"hash_id": hash_id, "hash_id": hash_id,
"name": estate.get("name", ""), "name": estate.get("name", ""),
@@ -347,7 +356,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"ownership": ownership, "ownership": ownership,
"url": sreality_url(hash_id, seo), "url": sreality_url(hash_id, seo),
"image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""), "image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""),
"scraped_at": datetime.now().strftime("%Y-%m-%d"), "first_seen": first_seen,
"last_updated": today,
} }
results.append(result) results.append(result)
details_fetched += 1 details_fetched += 1
@@ -374,58 +384,26 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
"""Generate an interactive Leaflet.js HTML map.""" """Generate an interactive Leaflet.js HTML map."""
# Color by price per m² — cool blue→warm red scale, no yellow # Color by disposition
# Thresholds based on Prague market distribution (p25=120k, p50=144k, p75=162k) color_map = {
price_color_scale = [ "3+kk": "#2196F3", # blue
(110_000, "#1565C0"), # < 110k/m² → deep blue (levné) "3+1": "#4CAF50", # green
(130_000, "#42A5F5"), # 110130k → light blue "4+kk": "#FF9800", # orange
(150_000, "#66BB6A"), # 130150k → green (střed) "4+1": "#F44336", # red
(165_000, "#EF6C00"), # 150165k → dark orange "5+kk": "#9C27B0", # purple
(float("inf"), "#C62828"), # > 165k → dark red (drahé) "5+1": "#795548", # brown
] "6+": "#607D8B", # grey-blue
}
def price_color(estate: dict) -> str: def fmt_date(d):
price = estate.get("price") or 0 """Format ISO date (YYYY-MM-DD) to Czech format (DD.MM.YYYY)."""
area = estate.get("area") or 0 if d and len(d) == 10:
if not area: return f"{d[8:10]}.{d[5:7]}.{d[:4]}"
return "#9E9E9E" return ""
ppm2 = price / area
for threshold, color in price_color_scale:
if ppm2 < threshold:
return color
return "#E53935"
# Legend bands for info panel (built once)
price_legend_items = (
'<div style="margin-bottom:4px;font-size:12px;color:#555;font-weight:600;">Cena / m²:</div>'
)
bands = [
("#1565C0", "< 110 000 Kč/m²"),
("#42A5F5", "110 130 000 Kč/m²"),
("#66BB6A", "130 150 000 Kč/m²"),
("#EF6C00", "150 165 000 Kč/m²"),
("#C62828", "> 165 000 Kč/m²"),
("#9E9E9E", "cena/plocha neuvedena"),
]
for bcolor, blabel in bands:
price_legend_items += (
f'<div style="display:flex;align-items:center;gap:6px;margin:2px 0;">'
f'<span style="width:14px;height:14px;border-radius:50%;background:{bcolor};'
f'display:inline-block;border:2px solid white;box-shadow:0 1px 3px rgba(0,0,0,0.3);flex-shrink:0;"></span>'
f'<span>{blabel}</span></div>'
)
# New marker indicator — bigger dot, no extra border
price_legend_items += (
'<div style="display:flex;align-items:center;gap:6px;margin:6px 0 0 0;'
'padding-top:6px;border-top:1px solid #eee;">'
'<span style="width:18px;height:18px;border-radius:50%;background:#66BB6A;'
'display:inline-block;box-shadow:0 1px 4px rgba(0,0,0,0.35);flex-shrink:0;"></span>'
'<span>Nové (z dnešního scrapu) — větší</span></div>'
)
markers_js = "" markers_js = ""
for e in estates: for e in estates:
color = price_color(e) color = color_map.get(e["disposition"], "#999999")
floor_text = f'{e["floor"]}. NP' if e["floor"] else "neuvedeno" floor_text = f'{e["floor"]}. NP' if e["floor"] else "neuvedeno"
area_text = f'{e["area"]}' if e["area"] else "neuvedeno" area_text = f'{e["area"]}' if e["area"] else "neuvedeno"
building_text = e["building_type"] or "neuvedeno" building_text = e["building_type"] or "neuvedeno"
@@ -443,15 +421,31 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
source_color = source_colors.get(source, "#999") source_color = source_colors.get(source, "#999")
hash_id = e.get("hash_id", "") hash_id = e.get("hash_id", "")
first_seen = e.get("first_seen", "")
last_updated = e.get("last_updated", "")
scraped_at = e.get("scraped_at", "") first_seen_fmt = fmt_date(first_seen)
is_new = scraped_at == datetime.now().strftime("%Y-%m-%d") last_updated_fmt = fmt_date(last_updated)
# "NOVÉ" badge if first_seen equals latest scrape date
new_badge = ""
if first_seen and first_seen == last_updated:
new_badge = ( new_badge = (
'<span style="margin-left:6px;font-size:11px;background:#FFD600;color:#333;' '<span style="margin-left:6px;font-size:10px;background:#4CAF50;color:white;'
'padding:1px 6px;border-radius:3px;font-weight:bold;">NOVÉ</span>' 'padding:1px 5px;border-radius:3px;font-weight:bold;">NOVÉ</span>'
if is_new else ""
) )
# Date info line
date_line = ""
if first_seen_fmt:
date_line = (
f'<div style="margin-top:4px;font-size:11px;color:#888;">'
f'Přidáno: {first_seen_fmt}'
)
if last_updated_fmt and last_updated != first_seen:
date_line += f' · Aktualizace: {last_updated_fmt}'
date_line += '</div>'
popup = ( popup = (
f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}">' f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}">'
f'<b style="font-size:14px;">{format_price(e["price"])}</b>' f'<b style="font-size:14px;">{format_price(e["price"])}</b>'
@@ -461,7 +455,8 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
f'{floor_note}<br><br>' f'{floor_note}<br><br>'
f'<b>{e["locality"]}</b><br>' f'<b>{e["locality"]}</b><br>'
f'Stavba: {building_text}<br>' f'Stavba: {building_text}<br>'
f'Vlastnictví: {ownership_text}<br><br>' f'Vlastnictví: {ownership_text}'
f'{date_line}<br>'
f'<a href="{e["url"]}" target="_blank" ' f'<a href="{e["url"]}" target="_blank" '
f'style="color:{source_color};text-decoration:none;font-weight:bold;">' f'style="color:{source_color};text-decoration:none;font-weight:bold;">'
f'→ Otevřít na {source_label}</a>' f'→ Otevřít na {source_label}</a>'
@@ -485,32 +480,26 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
popup = popup.replace("'", "\\'").replace("\n", "") popup = popup.replace("'", "\\'").replace("\n", "")
is_fav = source in ("psn", "cityhome") is_fav = source in ("psn", "cityhome")
marker_fn = "addHeartMarker" if is_fav else "addMarker"
if is_fav:
marker_fn = "addHeartMarker"
elif is_new:
marker_fn = "addNewMarker"
else:
marker_fn = "addMarker"
markers_js += ( markers_js += (
f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}');\n" f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}', '{first_seen}');\n"
) )
# Build legend — price per m² bands + disposition counts # Build legend
legend_items = price_legend_items legend_items = ""
# Disposition counts below the color legend
disp_counts = {} disp_counts = {}
for e in estates: for e in estates:
d = e["disposition"] d = e["disposition"]
disp_counts[d] = disp_counts.get(d, 0) + 1 disp_counts[d] = disp_counts.get(d, 0) + 1
disp_order = ["3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+"] for disp, color in color_map.items():
disp_summary = ", ".join( count = disp_counts.get(disp, 0)
f"{d} ({disp_counts[d]})" for d in disp_order if d in disp_counts if count > 0:
)
legend_items += ( legend_items += (
f'<div style="margin-top:8px;padding-top:6px;border-top:1px solid #eee;' f'<div style="display:flex;align-items:center;gap:6px;margin:3px 0;">'
f'font-size:12px;color:#666;">{disp_summary}</div>' f'<span style="width:14px;height:14px;border-radius:50%;'
f'background:{color};display:inline-block;border:2px solid white;'
f'box-shadow:0 1px 3px rgba(0,0,0,0.3);"></span>'
f'<span>{disp} ({count})</span></div>'
) )
# Heart marker legend for PSN/CityHome # Heart marker legend for PSN/CityHome
@@ -546,7 +535,6 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
body {{ font-family: system-ui, -apple-system, sans-serif; }} body {{ font-family: system-ui, -apple-system, sans-serif; }}
#map {{ width: 100%; height: 100vh; }} #map {{ width: 100%; height: 100vh; }}
.heart-icon {{ background: none !important; border: none !important; }} .heart-icon {{ background: none !important; border: none !important; }}
.star-icon {{ background: none !important; border: none !important; }}
.rate-btn:hover {{ background: #f0f0f0 !important; }} .rate-btn:hover {{ background: #f0f0f0 !important; }}
.rate-btn.active-fav {{ background: #FFF9C4 !important; border-color: #FFC107 !important; }} .rate-btn.active-fav {{ background: #FFF9C4 !important; border-color: #FFC107 !important; }}
.rate-btn.active-rej {{ background: #FFEBEE !important; border-color: #F44336 !important; }} .rate-btn.active-rej {{ background: #FFEBEE !important; border-color: #F44336 !important; }}
@@ -557,42 +545,13 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
}} }}
.marker-favorite {{ animation: pulse-glow 2s ease-in-out infinite; border-radius: 50%; }} .marker-favorite {{ animation: pulse-glow 2s ease-in-out infinite; border-radius: 50%; }}
.heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }} .heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }}
.heart-icon-rej {{ opacity: 0.4 !important; filter: grayscale(1); }} .heart-icon-rej {{ opacity: 0.2 !important; }}
.reject-overlay {{ background: none !important; border: none !important; pointer-events: none !important; }}
@keyframes pulse-new {{
0% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
50% {{ stroke-opacity: 0.4; stroke-width: 6px; r: 12; }}
100% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
}}
.marker-new {{ animation: pulse-new 2s ease-in-out infinite; }}
.info-panel {{ .info-panel {{
position: absolute; top: 10px; right: 10px; z-index: 1000; position: absolute; top: 10px; right: 10px; z-index: 1000;
background: white; padding: 16px; border-radius: 10px; background: white; padding: 16px; border-radius: 10px;
box-shadow: 0 2px 12px rgba(0,0,0,0.15); max-width: 260px; box-shadow: 0 2px 12px rgba(0,0,0,0.15); max-width: 260px;
font-size: 13px; line-height: 1.5; font-size: 13px; line-height: 1.5;
transition: transform 0.3s ease, opacity 0.3s ease;
}} }}
.info-panel.collapsed {{
transform: translateX(calc(100% + 20px));
opacity: 0; pointer-events: none;
}}
.panel-open-btn {{
position: absolute; top: 10px; right: 10px; z-index: 1001;
width: 40px; height: 40px; border-radius: 8px;
background: white; border: none; cursor: pointer;
box-shadow: 0 2px 12px rgba(0,0,0,0.15);
font-size: 20px; display: flex; align-items: center; justify-content: center;
transition: opacity 0.3s ease;
}}
.panel-open-btn.hidden {{ opacity: 0; pointer-events: none; }}
.panel-close-btn {{
position: absolute; top: 8px; right: 8px;
width: 28px; height: 28px; border-radius: 6px;
background: none; border: 1px solid #ddd; cursor: pointer;
font-size: 16px; display: flex; align-items: center; justify-content: center;
color: #888;
}}
.panel-close-btn:hover {{ background: #f0f0f0; color: #333; }}
.info-panel h2 {{ font-size: 16px; margin-bottom: 8px; }} .info-panel h2 {{ font-size: 16px; margin-bottom: 8px; }}
.info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }} .info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }}
.filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }} .filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }}
@@ -600,26 +559,18 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
.filter-section input[type="checkbox"] {{ accent-color: #1976D2; }} .filter-section input[type="checkbox"] {{ accent-color: #1976D2; }}
#floor-filter {{ margin-top: 8px; }} #floor-filter {{ margin-top: 8px; }}
#floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }} #floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }}
.status-link {{ display: block; margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; text-align: center; }}
.status-link a {{ color: #1976D2; text-decoration: none; font-size: 12px; }}
@media (max-width: 600px) {{
.info-panel {{ max-width: calc(100vw - 60px); right: 10px; }}
.info-panel.collapsed {{ transform: translateX(calc(100% + 20px)); }}
.panel-close-btn {{ top: 6px; right: 6px; }}
}}
</style> </style>
</head> </head>
<body> <body>
<div id="map"></div> <div id="map"></div>
<button class="panel-open-btn hidden" id="panel-open-btn" onclick="togglePanel()">☰</button> <div class="info-panel">
<div class="info-panel" id="info-panel">
<button class="panel-close-btn" id="panel-close-btn" onclick="togglePanel()">✕</button>
<h2>Byty v Praze</h2> <h2>Byty v Praze</h2>
<div class="stats"> <div class="stats">
<div>Celkem: <b id="visible-count">{len(estates)}</b> bytů</div> <div>Celkem: <b id="visible-count">{len(estates)}</b> bytů</div>
<div>Cena: {min_price}{max_price}</div> <div>Cena: {min_price}{max_price}</div>
<div>Průměr: {avg_price}</div> <div>Průměr: {avg_price}</div>
</div> </div>
<div><b>Dispozice:</b></div>
{legend_items} {legend_items}
<div class="filter-section"> <div class="filter-section">
<b>Filtry:</b> <b>Filtry:</b>
@@ -643,6 +594,17 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
</select> </select>
</label> </label>
</div> </div>
<div style="margin-top:6px;">
<label>Přidáno:
<select id="first-seen-filter" onchange="applyFilters()">
<option value="all">Vše</option>
<option value="1">Posledních 24h</option>
<option value="3">Poslední 3 dny</option>
<option value="7">Poslední týden</option>
<option value="14">Posledních 14 dní</option>
</select>
</label>
</div>
</div> </div>
<div class="filter-section"> <div class="filter-section">
<div id="rating-counts" style="margin-bottom:6px;font-size:12px;color:#666;"> <div id="rating-counts" style="margin-bottom:6px;font-size:12px;color:#666;">
@@ -653,7 +615,6 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
Skrýt zamítnuté Skrýt zamítnuté
</label> </label>
</div> </div>
<div class="status-link"><a href="status.html">Scraper status</a></div>
</div> </div>
<script> <script>
@@ -675,7 +636,7 @@ L.tileLayer('https://{{s}}.basemaps.cartocdn.com/light_only_labels/{{z}}/{{x}}/{
var allMarkers = []; var allMarkers = [];
function addMarker(lat, lon, color, popup, hashId) {{ function addMarker(lat, lon, color, popup, hashId, firstSeen) {{
var marker = L.circleMarker([lat, lon], {{ var marker = L.circleMarker([lat, lon], {{
radius: 8, radius: 8,
fillColor: color, fillColor: color,
@@ -684,28 +645,11 @@ function addMarker(lat, lon, color, popup, hashId) {{
opacity: 1, opacity: 1,
fillOpacity: 0.85, fillOpacity: 0.85,
}}).bindPopup(popup); }}).bindPopup(popup);
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId }}; marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, firstSeen: firstSeen }};
allMarkers.push(marker); allMarkers.push(marker);
marker.addTo(map); marker.addTo(map);
}} }}
function addNewMarker(lat, lon, color, popup, hashId) {{
var marker = L.circleMarker([lat, lon], {{
radius: 12,
fillColor: color,
color: color,
weight: 4,
opacity: 0.35,
fillOpacity: 0.95,
}}).bindPopup(popup);
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isNew: true }};
allMarkers.push(marker);
marker.addTo(map);
marker.on('add', function() {{
if (marker._path) marker._path.classList.add('marker-new');
}});
}}
function heartIcon(color) {{ function heartIcon(color) {{
var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">' var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">'
+ '<path d="M12 21.35l-1.45-1.32C5.4 15.36 2 12.28 2 8.5 ' + '<path d="M12 21.35l-1.45-1.32C5.4 15.36 2 12.28 2 8.5 '
@@ -721,26 +665,11 @@ function heartIcon(color) {{
}}); }});
}} }}
function starIcon() {{ function addHeartMarker(lat, lon, color, popup, hashId, firstSeen) {{
var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="28" height="28" viewBox="0 0 24 24">'
+ '<path d="M12 2l3.09 6.26L22 9.27l-5 4.87L18.18 22 12 18.27 '
+ '5.82 22 7 14.14 2 9.27l6.91-1.01L12 2z" '
+ 'fill="#FFC107" stroke="#F57F17" stroke-width="1" '
+ 'filter="drop-shadow(0 1px 3px rgba(0,0,0,0.3))"/></svg>';
return L.divIcon({{
html: svg,
className: 'star-icon',
iconSize: [28, 28],
iconAnchor: [14, 14],
popupAnchor: [0, -14],
}});
}}
function addHeartMarker(lat, lon, color, popup, hashId) {{
var marker = L.marker([lat, lon], {{ var marker = L.marker([lat, lon], {{
icon: heartIcon(color), icon: heartIcon(color),
}}).bindPopup(popup); }}).bindPopup(popup);
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isHeart: true }}; marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isHeart: true, firstSeen: firstSeen }};
allMarkers.push(marker); allMarkers.push(marker);
marker.addTo(map); marker.addTo(map);
}} }}
@@ -761,36 +690,6 @@ function saveRatings(ratings) {{
localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings)); localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings));
}} }}
function addRejectStrike(marker) {{
removeRejectStrike(marker);
var color = marker._data.color || '#999';
// SVG "no entry" icon — circle with diagonal line, colored to match marker
var svg = '<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20" height="20">'
+ '<circle cx="12" cy="12" r="10" fill="none" stroke="' + color + '" stroke-width="2.5" opacity="0.85"/>'
+ '<line x1="5.5" y1="5.5" x2="18.5" y2="18.5" stroke="' + color + '" stroke-width="2.5" stroke-linecap="round" opacity="0.85"/>'
+ '</svg>';
var icon = L.divIcon({{
className: 'reject-overlay',
html: svg,
iconSize: [20, 20],
iconAnchor: [10, 10],
}});
var m = L.marker([marker._data.lat, marker._data.lon], {{
icon: icon,
interactive: false,
pane: 'markerPane',
}});
m.addTo(map);
marker._rejectStrike = m;
}}
function removeRejectStrike(marker) {{
if (marker._rejectStrike) {{
map.removeLayer(marker._rejectStrike);
marker._rejectStrike = null;
}}
}}
function applyMarkerStyle(marker, status) {{ function applyMarkerStyle(marker, status) {{
if (marker._data.isHeart) {{ if (marker._data.isHeart) {{
var el = marker._icon; var el = marker._icon;
@@ -805,33 +704,16 @@ function applyMarkerStyle(marker, status) {{
}} }}
}} else {{ }} else {{
if (status === 'fav') {{ if (status === 'fav') {{
removeRejectStrike(marker);
if (!marker._data._origCircle) marker._data._origCircle = true;
var popup = marker.getPopup();
var popupContent = popup ? popup.getContent() : '';
var wasOnMap = map.hasLayer(marker);
if (wasOnMap) map.removeLayer(marker);
var starMarker = L.marker([marker._data.lat, marker._data.lon], {{
icon: starIcon(),
}}).bindPopup(popupContent);
starMarker._data = marker._data;
var idx = allMarkers.indexOf(marker);
if (idx !== -1) allMarkers[idx] = starMarker;
if (wasOnMap) starMarker.addTo(map);
}} else if (status === 'reject') {{
if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{
revertToCircle(marker, {{ radius: 6, fillOpacity: 0.35, fillColor: marker._data.color, color: '#fff', weight: 1 }});
}} else {{
marker.setStyle({{ marker.setStyle({{
radius: 6, fillOpacity: 0.35, fillColor: marker._data.color, color: '#fff', weight: 1, radius: 12, fillOpacity: 1, weight: 3,
fillColor: marker._data.color, color: '#fff',
}});
if (marker._path) marker._path.classList.add('marker-favorite');
}} else if (status === 'reject') {{
marker.setStyle({{
radius: 6, fillOpacity: 0.15, fillColor: '#999', color: '#bbb', weight: 1,
}}); }});
if (marker._path) marker._path.classList.remove('marker-favorite'); if (marker._path) marker._path.classList.remove('marker-favorite');
}}
// Add strikethrough line over the marker
addRejectStrike(marker);
}} else {{
if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{
revertToCircle(marker, {{ radius: 8, fillColor: marker._data.color, color: '#fff', weight: 2, fillOpacity: 0.85 }});
}} else {{ }} else {{
marker.setStyle({{ marker.setStyle({{
radius: 8, fillColor: marker._data.color, color: '#fff', radius: 8, fillColor: marker._data.color, color: '#fff',
@@ -839,23 +721,7 @@ function applyMarkerStyle(marker, status) {{
}}); }});
if (marker._path) marker._path.classList.remove('marker-favorite'); if (marker._path) marker._path.classList.remove('marker-favorite');
}} }}
if (marker._path) marker._path.classList.remove('marker-rejected');
removeRejectStrike(marker);
}} }}
}}
}}
function revertToCircle(marker, style) {{
var popup = marker.getPopup();
var popupContent = popup ? popup.getContent() : '';
var wasOnMap = map.hasLayer(marker);
if (wasOnMap) map.removeLayer(marker);
var cm = L.circleMarker([marker._data.lat, marker._data.lon], style).bindPopup(popupContent);
cm._data = marker._data;
delete cm._data._starRef;
var idx = allMarkers.indexOf(marker);
if (idx !== -1) allMarkers[idx] = cm;
if (wasOnMap) cm.addTo(map);
}} }}
function rateMarker(marker, action) {{ function rateMarker(marker, action) {{
@@ -994,13 +860,25 @@ map.on('popupopen', function(e) {{
}}); }});
// ── Filters ──────────────────────────────────────────────────── // ── Filters ────────────────────────────────────────────────────
function daysAgoDate(days) {{
var d = new Date();
d.setDate(d.getDate() - days);
return d.toISOString().slice(0, 10);
}}
function applyFilters() {{ function applyFilters() {{
var minFloor = parseInt(document.getElementById('min-floor').value); var minFloor = parseInt(document.getElementById('min-floor').value);
var maxPrice = parseInt(document.getElementById('max-price').value); var maxPrice = parseInt(document.getElementById('max-price').value);
var hideRejected = document.getElementById('hide-rejected').checked; var hideRejected = document.getElementById('hide-rejected').checked;
var firstSeenVal = document.getElementById('first-seen-filter').value;
var ratings = loadRatings(); var ratings = loadRatings();
var visible = 0; var visible = 0;
var minFirstSeen = '';
if (firstSeenVal !== 'all') {{
minFirstSeen = daysAgoDate(parseInt(firstSeenVal));
}}
allMarkers.forEach(function(m) {{ allMarkers.forEach(function(m) {{
var popup = m.getPopup().getContent(); var popup = m.getPopup().getContent();
var floorMatch = popup.match(/(\\d+)\\. NP/); var floorMatch = popup.match(/(\\d+)\\. NP/);
@@ -1013,18 +891,19 @@ function applyFilters() {{
if (floor !== null && floor < minFloor) show = false; if (floor !== null && floor < minFloor) show = false;
if (price > maxPrice) show = false; if (price > maxPrice) show = false;
// Date filter
if (minFirstSeen && m._data.firstSeen) {{
if (m._data.firstSeen < minFirstSeen) show = false;
}}
var r = ratings[m._data.hashId]; var r = ratings[m._data.hashId];
if (hideRejected && r && r.status === 'reject') show = false; if (hideRejected && r && r.status === 'reject') show = false;
if (show) {{ if (show) {{
if (!map.hasLayer(m)) m.addTo(map); if (!map.hasLayer(m)) m.addTo(map);
visible++; visible++;
// Show strike line if rejected and visible
if (m._rejectStrike && !map.hasLayer(m._rejectStrike)) m._rejectStrike.addTo(map);
}} else {{ }} else {{
if (map.hasLayer(m)) map.removeLayer(m); if (map.hasLayer(m)) map.removeLayer(m);
// Hide strike line when marker hidden
if (m._rejectStrike && map.hasLayer(m._rejectStrike)) map.removeLayer(m._rejectStrike);
}} }}
}}); }});
@@ -1042,26 +921,6 @@ function applyFilters() {{
// Initialize ratings on load // Initialize ratings on load
restoreRatings(); restoreRatings();
// ── Panel toggle ──────────────────────────────────────────────
function togglePanel() {{
var panel = document.getElementById('info-panel');
var openBtn = document.getElementById('panel-open-btn');
var isOpen = !panel.classList.contains('collapsed');
if (isOpen) {{
panel.classList.add('collapsed');
openBtn.classList.remove('hidden');
}} else {{
panel.classList.remove('collapsed');
openBtn.classList.add('hidden');
}}
}}
// On mobile, start with panel collapsed
if (window.innerWidth <= 600) {{
document.getElementById('info-panel').classList.add('collapsed');
document.getElementById('panel-open-btn').classList.remove('hidden');
}}
</script> </script>
</body> </body>
</html>""" </html>"""

View File

@@ -7,13 +7,13 @@ Výstup: byty_bezrealitky.json
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from datetime import datetime
import json import json
import logging import logging
import math import math
import re import re
import time import time
import urllib.request import urllib.request
from datetime import datetime
from pathlib import Path from pathlib import Path
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -285,10 +285,14 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
# Check cache — if hash_id exists and price unchanged, reuse # Check cache — if hash_id exists and price unchanged, reuse
adv_id = int(adv["id"]) adv_id = int(adv["id"])
adv_price = adv.get("price", 0) or 0 adv_price = adv.get("price", 0) or 0
today = datetime.now().strftime("%Y-%m-%d")
cached = cache.get(adv_id) cached = cache.get(adv_id)
if cached and cached.get("price") == adv_price: if cached and cached.get("price") == adv_price:
cache_hits += 1 cache_hits += 1
logger.debug(f"Cache hit for id={adv_id}") logger.debug(f"Cache hit for id={adv_id}")
cached["last_updated"] = today
if "first_seen" not in cached:
cached["first_seen"] = today
results.append(cached) results.append(cached)
continue continue
@@ -340,6 +344,11 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
if not address: if not address:
address = adv.get('address({"locale":"CS"})', "Praha") address = adv.get('address({"locale":"CS"})', "Praha")
# Preserve first_seen from cache if this is a price-changed re-fetch
first_seen = today
if cached and "first_seen" in cached:
first_seen = cached["first_seen"]
result = { result = {
"hash_id": int(adv["id"]), "hash_id": int(adv["id"]),
"name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {adv.get('surface', '?')}", "name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {adv.get('surface', '?')}",
@@ -356,7 +365,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}", "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
"source": "bezrealitky", "source": "bezrealitky",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"), "first_seen": first_seen,
"last_updated": today,
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -34,26 +34,24 @@ HEADERS = {
BASE_URL = "https://www.city-home.cz" BASE_URL = "https://www.city-home.cz"
def fetch_url(url: str, retries: int = 3) -> str: def fetch_url(url: str) -> str:
"""Fetch URL and return HTML string. Raises HTTPError on 4xx/5xx.""" """Fetch URL and return HTML string."""
for attempt in range(retries): for attempt in range(3):
try: try:
logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}") logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}")
logger.debug(f"Headers: {HEADERS}")
req = urllib.request.Request(url, headers=HEADERS) req = urllib.request.Request(url, headers=HEADERS)
resp = urllib.request.urlopen(req, timeout=30) resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8") html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
return html return html
except urllib.error.HTTPError:
# Don't retry on HTTP errors (404, 403, etc.) — re-raise immediately
raise
except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e: except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
if attempt < retries - 1: if attempt < 2:
wait = (attempt + 1) * 2 wait = (attempt + 1) * 2
logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}") logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}")
time.sleep(wait) time.sleep(wait)
else: else:
logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True) logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True)
raise raise
@@ -127,21 +125,31 @@ def parse_filter_page(html: str) -> list[dict]:
if detail_url and not detail_url.startswith("http"): if detail_url and not detail_url.startswith("http"):
detail_url = BASE_URL + detail_url detail_url = BASE_URL + detail_url
# Parse table cells: [unit_name, unit_type_label, address, floor, disposition, area, transaction, price] # Extract floor from cells — look for pattern like "3.NP" or "2.PP"
cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL) cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
cell_texts = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]
# Cell[2] = address (e.g. "Žateckých 14"), cell[3] = floor (e.g. "3.NP")
project_address = cell_texts[2] if len(cell_texts) > 2 else ""
floor = None floor = None
if len(cell_texts) > 3: floor_text = ""
np_match = re.search(r'(\d+)\.\s*NP', cell_texts[3]) project_name = ""
pp_match = re.search(r'(\d+)\.\s*PP', cell_texts[3])
for cell in cells:
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
# Floor pattern
np_match = re.search(r'(\d+)\.\s*NP', cell_text)
pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
if np_match: if np_match:
floor = int(np_match.group(1)) floor = int(np_match.group(1))
floor_text = cell_text
elif pp_match: elif pp_match:
floor = -int(pp_match.group(1)) floor = -int(pp_match.group(1)) # Underground
floor_text = cell_text
# Extract project name — usually in a cell that's not a number/price/floor
for cell in cells:
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "" not in cell_text and "" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text:
if len(cell_text) > 3 and cell_text != unit_name:
project_name = cell_text
break
listing = { listing = {
"price": int(cena.group(1)), "price": int(cena.group(1)),
@@ -151,58 +159,43 @@ def parse_filter_page(html: str) -> list[dict]:
"project_id": project.group(1) if project else "", "project_id": project.group(1) if project else "",
"transaction": transaction.group(1) if transaction else "", "transaction": transaction.group(1) if transaction else "",
"disposition": dispozition.group(1) if dispozition else "", "disposition": dispozition.group(1) if dispozition else "",
"location": location.group(1) if location else "",
"url": detail_url, "url": detail_url,
"unit_name": unit_name, "unit_name": unit_name,
"floor": floor, "floor": floor,
"project_address": project_address, "project_name": project_name,
} }
listings.append(listing) listings.append(listing)
return listings return listings
def get_lokalita_urls(slug: str) -> list[str]: def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
"""Return candidate lokalita URLs to try in order.""" """Extract GPS coordinates for projects from locality pages."""
return [ # Pattern in JS: ['<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name']
f"{BASE_URL}/projekty/{slug}/lokalita", gps_data = {}
f"{BASE_URL}/bytove-domy/{slug}/lokalita", for match in re.finditer(r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'", html):
f"{BASE_URL}/bytove-domy/{slug}/lokalita1", name = match.group(1).strip()
] lat = float(match.group(2))
lon = float(match.group(3))
gps_data[name] = (lat, lon)
return gps_data
def extract_project_gps(html: str) -> tuple[float, float] | None: def load_previous(json_path: str = "byty_cityhome.json") -> dict[str, str]:
"""Extract project GPS from lokalita page JS variable. """Load first_seen dates from previous run, keyed by hash_id."""
path = Path(json_path)
The page contains: var locations = [['<h4>Name</h4>...', 'LAT', 'LNG', 'CATEGORY', 'Label'], ...] if not path.exists():
Category '1' = the project's own marker. Some projects have two cat-1 entries (data error); return {}
in that case we pick the one whose name contains a digit and is not a transit landmark. try:
""" data = json.loads(path.read_text(encoding="utf-8"))
block = re.search(r'var locations\s*=\s*\[(.*?)\];', html, re.DOTALL) return {str(e["hash_id"]): e.get("first_seen", "") for e in data if "hash_id" in e}
if not block: except (json.JSONDecodeError, KeyError):
return None return {}
entries = re.findall(
r"'<h4>(.*?)</h4>.*?',\s*'([\d.]+)',\s*'([\d.]+)',\s*'1'",
block.group(0),
re.DOTALL,
)
if not entries:
return None
if len(entries) == 1:
return float(entries[0][1]), float(entries[0][2])
# Multiple cat-1 entries: pick the real project marker
transit_re = re.compile(r'nádraží|park|metro|tramvaj|autobus|zastávka', re.IGNORECASE)
for name, lat, lng in entries:
if re.search(r'\d', name) and not transit_re.search(name):
return float(lat), float(lng)
# Fallback: first entry
return float(entries[0][1]), float(entries[0][2])
def scrape(max_pages: int | None = None, max_properties: int | None = None): def scrape(max_pages: int | None = None, max_properties: int | None = None):
previous_first_seen = load_previous()
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Stahuji inzeráty z CityHome (city-home.cz)") logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
logger.info(f"Cena: do {format_price(MAX_PRICE)}") logger.info(f"Cena: do {format_price(MAX_PRICE)}")
@@ -231,24 +224,22 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
# Fetch GPS for each project from locality pages # Fetch GPS for each project from locality pages
project_gps = {} project_gps = {}
for slug in sorted(project_slugs): for slug in sorted(project_slugs):
time.sleep(0.3) time.sleep(0.5)
gps = None
for url in get_lokalita_urls(slug):
try: try:
logger.debug(f"Fetching project GPS: {url}") locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
loc_html = fetch_url(url) logger.debug(f"Fetching project GPS: {locality_url}")
loc_html = fetch_url(locality_url)
gps = extract_project_gps(loc_html) gps = extract_project_gps(loc_html)
if gps: if gps:
break # Take first entry (the project itself)
except Exception as e: first_name, (lat, lon) = next(iter(gps.items()))
logger.debug(f"GPS fetch failed for {url}: {e}") project_gps[slug] = (lat, lon)
continue logger.info(f"{slug}: {lat}, {lon}")
if gps:
project_gps[slug] = gps
logger.info(f"{slug}: {gps[0]}, {gps[1]}")
else: else:
logger.info(f"{slug}: GPS nenalezeno") logger.info(f"{slug}: GPS nenalezeno")
except Exception as e:
logger.warning(f"Error fetching GPS for {slug}: {e}", exc_info=True)
logger.info(f"{slug}: chyba ({e})")
# Step 3: Filter listings # Step 3: Filter listings
logger.info(f"\nFáze 3: Filtrování...") logger.info(f"\nFáze 3: Filtrování...")
@@ -326,37 +317,28 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
lat, lon = gps lat, lon = gps
# locality: use project address from cell (e.g. "Žateckých 14") + city from GPS lookup today = datetime.now().strftime("%Y-%m-%d")
project_address = listing.get("project_address", "") hash_id = f"cityhome_{slug}_{listing['unit_name']}"
# derive city from slug (GPS lookup key) first_seen = previous_first_seen.get(str(hash_id), "") or today
city_map = {
"karlinske-namesti-5": "Praha 8",
"melnicka-12": "Praha 7",
"na-vaclavce-34": "Praha 5",
"nad-kajetankou-12": "Praha 6",
"vosmikovych-3": "Praha 9",
"zateckych-14": "Praha 2",
}
city_str = city_map.get(slug, "Praha")
locality_str = f"{project_address}, {city_str}" if project_address else city_str
result = { result = {
"hash_id": f"cityhome_{slug}_{listing['unit_name']}", "hash_id": hash_id,
"name": f"Prodej bytu {disp}, {int(area)} m² — {project_address}", "name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
"price": price, "price": price,
"price_formatted": format_price(price), "price_formatted": format_price(price),
"locality": locality_str, "locality": f"{listing['project_name']}, Praha",
"lat": lat, "lat": lat,
"lon": lon, "lon": lon,
"disposition": disp, "disposition": disp,
"floor": floor, "floor": floor,
"area": float(area), "area": area,
"building_type": "Cihlová", # CityHome renovuje cihlové domy "building_type": "Cihlová", # CityHome renovuje cihlové domy
"ownership": "neuvedeno", "ownership": "neuvedeno",
"url": url, "url": url,
"source": "cityhome", "source": "cityhome",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"), "first_seen": first_seen,
"last_updated": today,
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -7,7 +7,6 @@ Výstup: byty_idnes.json
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from datetime import datetime
import json import json
import logging import logging
import math import math
@@ -15,6 +14,7 @@ import re
import time import time
import urllib.request import urllib.request
import urllib.parse import urllib.parse
from datetime import datetime
from html.parser import HTMLParser from html.parser import HTMLParser
from pathlib import Path from pathlib import Path
@@ -379,10 +379,14 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
logger.debug(f"Max properties limit reached: {max_properties}") logger.debug(f"Max properties limit reached: {max_properties}")
break break
# Check cache — if hash_id exists and price unchanged, reuse # Check cache — if hash_id exists and price unchanged, reuse
today = datetime.now().strftime("%Y-%m-%d")
cached = cache.get(str(item["id"])) cached = cache.get(str(item["id"]))
if cached and cached.get("price") == item["price"]: if cached and cached.get("price") == item["price"]:
cache_hits += 1 cache_hits += 1
logger.debug(f"Cache hit for id={item['id']}") logger.debug(f"Cache hit for id={item['id']}")
cached["last_updated"] = today
if "first_seen" not in cached:
cached["first_seen"] = today
results.append(cached) results.append(cached)
continue continue
@@ -443,6 +447,11 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
else: else:
building_type = construction.capitalize() building_type = construction.capitalize()
# Preserve first_seen from cache if this is a price-changed re-fetch
first_seen = today
if cached and "first_seen" in cached:
first_seen = cached["first_seen"]
result = { result = {
"hash_id": item["id"], "hash_id": item["id"],
"name": f"Prodej bytu {item['disposition']} {item.get('area', '?')}", "name": f"Prodej bytu {item['disposition']} {item.get('area', '?')}",
@@ -459,7 +468,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"url": item["url"], "url": item["url"],
"source": "idnes", "source": "idnes",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"), "first_seen": first_seen,
"last_updated": today,
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
PSN.cz scraper. PSN.cz scraper.
Stáhne byty na prodej z API /api/units-list — jeden požadavek, žádné stránkování. Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií.
Výstup: byty_psn.json Výstup: byty_psn.json
""" """
from __future__ import annotations from __future__ import annotations
@@ -14,7 +14,6 @@ import subprocess
import time import time
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from urllib.parse import urlencode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -24,37 +23,82 @@ MAX_PRICE = 14_000_000
MIN_AREA = 69 MIN_AREA = 69
MIN_FLOOR = 2 MIN_FLOOR = 2
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1", "5+kk a větší"} WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
# Pouze Praha — ostatní města (Brno, Pardubice, Špindlerův Mlýn) přeskočit
WANTED_CITIES = {"Praha"}
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
BASE_URL = "https://psn.cz" BASE_URL = "https://psn.cz"
UNITS_API = f"{BASE_URL}/api/units-list"
# Known Prague project slugs with GPS (from research)
PRAGUE_PROJECTS = [
{"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
{"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
{"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
{"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
{"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
{"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
{"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
{"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
{"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
{"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
{"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
{"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
{"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
{"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
{"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
{"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
]
def fetch_json(url: str) -> dict: def fetch_url(url: str) -> str:
"""Fetch JSON via curl (urllib SSL may fail on Cloudflare).""" """Fetch URL via curl (urllib SSL too old for Cloudflare)."""
logger.debug(f"HTTP GET: {url}") logger.debug(f"HTTP GET request (via curl): {url}")
logger.debug(f"User-Agent: {UA}")
result = subprocess.run( result = subprocess.run(
["curl", "-s", "-L", "--max-time", "30", ["curl", "-s", "-L", "--max-time", "30",
"-H", f"User-Agent: {UA}", "-H", f"User-Agent: {UA}",
"-H", "Accept: application/json", "-H", "Accept: text/html",
url], url],
capture_output=True, text=True, timeout=60 capture_output=True, text=True, timeout=60
) )
if result.returncode != 0: if result.returncode != 0:
logger.error(f"curl failed (return code {result.returncode}): {result.stderr[:200]}")
raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}") raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
return json.loads(result.stdout) logger.debug(f"HTTP response: size={len(result.stdout)} bytes")
return result.stdout
def fix_gps(lat, lng): def extract_units_from_html(html: str) -> list[dict]:
"""PSN má u některých projektů prohozené lat/lng — opravíme.""" """Extract unit JSON objects from raw HTML with escaped quotes."""
if lat is not None and lng is not None and lat < 20 and lng > 20: # The HTML contains RSC data with escaped JSON: \\"key\\":\\"value\\"
return lng, lat # Step 1: Unescape the double-backslash-quotes to regular quotes
return lat, lng cleaned = html.replace('\\"', '"')
# Step 2: Find each unit by looking for "title":"Byt and walking back to {
units = []
decoder = json.JSONDecoder()
for m in re.finditer(r'"title":"Byt', cleaned):
pos = m.start()
# Walk backwards to find the opening brace
depth = 0
found = False
for i in range(pos - 1, max(pos - 3000, 0), -1):
if cleaned[i] == '}':
depth += 1
elif cleaned[i] == '{':
if depth == 0:
try:
obj, end = decoder.raw_decode(cleaned, i)
if isinstance(obj, dict) and 'price_czk' in obj:
units.append(obj)
found = True
except (json.JSONDecodeError, ValueError):
pass
break
depth -= 1
return units
def format_price(price: int) -> str: def format_price(price: int) -> str:
@@ -66,178 +110,228 @@ def format_price(price: int) -> str:
return " ".join(reversed(parts)) + "" return " ".join(reversed(parts)) + ""
def scrape(max_properties: int | None = None): def load_previous(json_path: str = "byty_psn.json") -> dict[str, str]:
"""Load first_seen dates from previous run, keyed by hash_id."""
path = Path(json_path)
if not path.exists():
return {}
try:
data = json.loads(path.read_text(encoding="utf-8"))
return {str(e["hash_id"]): e.get("first_seen", "") for e in data if "hash_id" in e}
except (json.JSONDecodeError, KeyError):
return {}
def scrape(max_pages: int | None = None, max_properties: int | None = None):
previous_first_seen = load_previous()
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Stahuji inzeráty z PSN.cz") logger.info("Stahuji inzeráty z PSN.cz")
logger.info(f"Cena: do {format_price(MAX_PRICE)}") logger.info(f"Cena: do {format_price(MAX_PRICE)}")
logger.info(f"Min. plocha: {MIN_AREA}") logger.info(f"Min. plocha: {MIN_AREA}")
logger.info(f"Patro: od {MIN_FLOOR}. NP") logger.info(f"Patro: od {MIN_FLOOR}. NP")
logger.info(f"Region: Praha") logger.info(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
if max_pages:
logger.info(f"Max. stran: {max_pages}")
if max_properties: if max_properties:
logger.info(f"Max. bytů: {max_properties}") logger.info(f"Max. bytů: {max_properties}")
logger.info("=" * 60) logger.info("=" * 60)
# Jediný API požadavek — vrátí všechny jednotky (cca 236) # Fetch units from each Prague project
params = urlencode({ all_units = []
"locale": "cs",
"filters": "{}", for proj in PRAGUE_PROJECTS:
"type": "list", page = 1
"order": "price-asc", project_units = []
"offset": 0,
"limit": 500, while True:
}) if max_pages and page > max_pages:
url = f"{UNITS_API}?{params}" logger.debug(f"Max pages limit reached: {max_pages}")
logger.info("Stahuji jednotky z API ...") break
url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
logger.info(f"{proj['name']} — strana {page} ...")
time.sleep(0.5)
try: try:
data = fetch_json(url) html = fetch_url(url)
except Exception as e: except Exception as e:
logger.error(f"Chyba při stahování: {e}", exc_info=True) logger.error(f"Fetch error for {proj['name']}: {e}", exc_info=True)
return []
all_units = data.get("units", {}).get("data", [])
logger.info(f"Staženo jednotek celkem: {len(all_units)}")
# Filtrování
results = []
excluded = {
"prodáno": 0,
"typ": 0,
"město": 0,
"dispozice": 0,
"cena": 0,
"plocha": 0,
"patro": 0,
}
properties_fetched = 0
for unit in all_units:
if max_properties and properties_fetched >= max_properties:
break break
unit_id = unit.get("id", "?") units = extract_units_from_html(html)
logger.debug(f"Project {proj['slug']} page {page}: extracted {len(units)} units")
# Pouze prodej bytů (type_id=0) if not units:
if unit.get("type_id") != 0: if page == 1:
excluded["typ"] += 1 logger.info(f"→ 0 jednotek")
logger.debug(f"id={unit_id}: přeskočen (type_id={unit.get('type_id')}, není prodej bytu)") break
continue
# Pouze volné (ne rezervované, prodané, v přípravě) # Add project info to each unit
sale_status = unit.get("sale_status", "") for unit in units:
if not unit.get("latitude") or not unit.get("longitude"):
unit["latitude"] = proj["lat"]
unit["longitude"] = proj["lon"]
unit["_project_name"] = proj["name"]
unit["_project_slug"] = proj["slug"]
project_units.extend(units)
if page == 1:
logger.info(f"{len(units)} jednotek na stránce")
# Check if there might be more pages
# If we got fewer than expected or same units, stop
if len(units) < 10:
break
page += 1
if page > 10: # Safety limit
break
all_units.extend(project_units)
# Deduplicate by slug
seen_slugs = set()
unique_units = []
for u in all_units:
slug = u.get("slug", "")
if slug and slug not in seen_slugs:
seen_slugs.add(slug)
unique_units.append(u)
elif not slug:
unique_units.append(u)
logger.info(f"\nStaženo celkem: {len(unique_units)} unikátních jednotek")
# Filter
logger.info(f"\nFiltrování...")
results = []
excluded_sold = 0
excluded_type = 0
excluded_disp = 0
excluded_price = 0
excluded_area = 0
excluded_floor = 0
excluded_panel = 0
properties_fetched = 0
for unit in unique_units:
if max_properties and properties_fetched >= max_properties:
logger.debug(f"Max properties limit reached: {max_properties}")
break
unit_id = unit.get("id", unit.get("slug", "unknown"))
# Only free units
is_free = unit.get("is_free", False) is_free = unit.get("is_free", False)
is_sold = unit.get("is_sold", False) is_sold = unit.get("is_sold", False)
if is_sold or not is_free: if is_sold or not is_free:
excluded["prodáno"] += 1 excluded_sold += 1
logger.debug(f"id={unit_id}: přeskočen (status={sale_status})") logger.debug(f"Filter: id={unit_id} - excluded (sold/not free)")
continue continue
# Pouze Praha # Only apartments
city = (unit.get("location") or unit.get("address", {}).get("city") or "").strip() category = str(unit.get("category", "")).lower()
# location field je typicky "Praha 4", "Praha 7" atd. if "byt" not in category and "ateliér" not in category:
city_base = city.split(" ")[0] if city else "" excluded_type += 1
if city_base not in WANTED_CITIES: logger.debug(f"Filter: id={unit_id} - excluded (not apartment, category={category})")
excluded["město"] += 1
logger.debug(f"id={unit_id}: přeskočen (město={city})")
continue continue
# Dispozice # Disposition
disp = unit.get("disposition", "") disp = unit.get("disposition", "")
if disp not in WANTED_DISPOSITIONS: if disp not in WANTED_DISPOSITIONS:
excluded["dispozice"] += 1 excluded_disp += 1
logger.debug(f"id={unit_id}: přeskočen (dispozice={disp})") logger.debug(f"Filter: id={unit_id} - excluded (disposition {disp})")
continue continue
# Cena # Price
price = unit.get("action_price_czk") or unit.get("price_czk") or 0 price = unit.get("price_czk") or unit.get("action_price_czk") or 0
if not price or price <= 0 or price > MAX_PRICE: if price <= 0 or price > MAX_PRICE:
excluded["cena"] += 1 excluded_price += 1
logger.debug(f"id={unit_id}: přeskočen (cena={price})") logger.debug(f"Filter: id={unit_id} - excluded (price {price})")
continue continue
# Plocha # Area
area = unit.get("total_area") or unit.get("floor_area") or 0 area = unit.get("total_area") or unit.get("floor_area") or 0
if area < MIN_AREA: if area < MIN_AREA:
excluded["plocha"] += 1 excluded_area += 1
logger.debug(f"id={unit_id}: přeskočen (plocha={area} m²)") logger.debug(f"Filter: id={unit_id} - excluded (area {area} m²)")
continue continue
# Patro # Floor
floor_str = str(unit.get("floor", "")) floor_str = str(unit.get("floor", ""))
floor = None floor = None
if floor_str: if floor_str:
try: try:
floor = int(floor_str) floor = int(floor_str)
except ValueError: except ValueError:
m = re.search(r'(-?\d+)', floor_str) floor_match = re.search(r'(-?\d+)', floor_str)
if m: if floor_match:
floor = int(m.group(1)) floor = int(floor_match.group(1))
if floor is not None and floor < MIN_FLOOR: if floor is not None and floor < MIN_FLOOR:
excluded["patro"] += 1 excluded_floor += 1
logger.debug(f"id={unit_id}: přeskočen (patro={floor})") logger.debug(f"Filter: id={unit_id} - excluded (floor {floor})")
continue continue
# GPS — opravit prohozené souřadnice # Construction — check for panel
lat_raw = unit.get("latitude") build_type = str(unit.get("build_type", "")).lower()
lng_raw = unit.get("longitude") if "panel" in build_type:
lat, lng = fix_gps(lat_raw, lng_raw) excluded_panel += 1
if not lat or not lng: logger.debug(f"Filter: id={unit_id} - excluded (panel construction)")
logger.warning(f"id={unit_id}: chybí GPS souřadnice, přeskakuji") logger.info(f"✗ Vyloučen: panel ({build_type})")
continue continue
# Sestavit adresu pro locality # Build construction label
addr = unit.get("address") or {} building_type = "neuvedeno"
street = addr.get("street", "") if build_type and build_type != "nevybráno":
street_no = addr.get("street_no", "") if "cihlo" in build_type or "cihla" in build_type:
if street and street_no: building_type = "Cihlová"
locality_str = f"{street} {street_no}, {city}" elif "skelet" in build_type:
elif street: building_type = "Skeletová"
locality_str = f"{street}, {city}"
else: else:
project_name = unit.get("project", "") building_type = build_type.capitalize()
locality_str = f"{project_name}, {city}" if project_name else city
# URL na detail jednotky lat = unit.get("latitude", 0)
unit_slug = unit.get("slug", "") lon = unit.get("longitude", 0)
project_slug = ""
# project_slug lze odvodit z projektu nebo z reference_no slug = unit.get("slug", "")
# API nevrací project_slug přímo — použijeme reference_no nebo jen ID project_slug = unit.get("_project_slug", "")
reference_no = unit.get("reference_no", "") detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}"
if unit_slug:
detail_url = f"{BASE_URL}/prodej/{unit_slug}" today = datetime.now().strftime("%Y-%m-%d")
elif reference_no: hash_id = unit.get("id", slug)
detail_url = f"{BASE_URL}/prodej/{reference_no}" first_seen = previous_first_seen.get(str(hash_id), "") or today
else:
detail_url = BASE_URL
result = { result = {
"hash_id": str(unit_id), "hash_id": hash_id,
"name": f"Prodej bytu {disp}, {int(area)} m² — {unit.get('project', locality_str)}", "name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}",
"price": int(price), "price": int(price),
"price_formatted": format_price(int(price)), "price_formatted": format_price(int(price)),
"locality": locality_str, "locality": f"{unit.get('street', unit.get('_project_name', ''))}, Praha",
"lat": lat, "lat": lat,
"lon": lng, "lon": lon,
"disposition": disp, "disposition": disp,
"floor": floor, "floor": floor,
"area": float(area), "area": area,
"building_type": "neuvedeno", "building_type": building_type,
"ownership": "osobní", "ownership": unit.get("ownership", "neuvedeno") or "neuvedeno",
"url": detail_url, "url": detail_url,
"source": "psn", "source": "psn",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"), "first_seen": first_seen,
"last_updated": today,
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1
logger.info(f"\n{'=' * 60}") logger.info(f"\n{'=' * 60}")
logger.info(f"Výsledky PSN:") logger.info(f"Výsledky PSN:")
logger.info(f" Staženo jednotek: {len(all_units)}") logger.info(f" Celkem jednotek: {len(unique_units)}")
for reason, count in excluded.items(): logger.info(f" Vyloučeno (prodáno): {excluded_sold}")
if count: logger.info(f" Vyloučeno (typ): {excluded_type}")
logger.info(f" Vyloučeno ({reason}): {count}") logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
logger.info(f" Vyloučeno (cena): {excluded_price}")
logger.info(f" Vyloučeno (plocha): {excluded_area}")
logger.info(f" Vyloučeno (patro): {excluded_floor}")
logger.info(f" Vyloučeno (panel): {excluded_panel}")
logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}") logger.info(f"{'=' * 60}")
@@ -246,13 +340,15 @@ def scrape(max_properties: int | None = None):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz") parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
parser.add_argument("--max-pages", type=int, default=None,
help="Maximum number of listing pages per project to scrape")
parser.add_argument("--max-properties", type=int, default=None, parser.add_argument("--max-properties", type=int, default=None,
help="Maximum number of properties to include in results") help="Maximum number of properties to include in results")
parser.add_argument("--log-level", type=str, default="INFO", parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)") help="Logging level (default: INFO)")
args = parser.parse_args() args = parser.parse_args()
# Configure logging
logging.basicConfig( logging.basicConfig(
level=getattr(logging, args.log_level), level=getattr(logging, args.log_level),
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s", format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
@@ -260,7 +356,7 @@ if __name__ == "__main__":
) )
start = time.time() start = time.time()
estates = scrape(max_properties=args.max_properties) estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
if estates: if estates:
json_path = Path("byty_psn.json") json_path = Path("byty_psn.json")
@@ -270,6 +366,6 @@ if __name__ == "__main__":
) )
elapsed = time.time() - start elapsed = time.time() - start
logger.info(f"\n✓ Data uložena: {json_path.resolve()}") logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
logger.info(f"⏱ Celkový čas: {elapsed:.1f} s") logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
else: else:
logger.info("\nŽádné byty z PSN neodpovídají kritériím :(") logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")

View File

@@ -7,13 +7,13 @@ Výstup: byty_realingo.json
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from datetime import datetime
import json import json
import logging import logging
import math import math
import re import re
import time import time
import urllib.request import urllib.request
from datetime import datetime
from pathlib import Path from pathlib import Path
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -239,10 +239,14 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
# Check cache — if hash_id exists and price unchanged, reuse # Check cache — if hash_id exists and price unchanged, reuse
item_id = int(item["id"]) item_id = int(item["id"])
item_price = item.get("price", {}).get("total", 0) or 0 item_price = item.get("price", {}).get("total", 0) or 0
today = datetime.now().strftime("%Y-%m-%d")
cached = cache.get(item_id) cached = cache.get(item_id)
if cached and cached.get("price") == item_price: if cached and cached.get("price") == item_price:
cache_hits += 1 cache_hits += 1
logger.debug(f"Cache hit for id={item_id}") logger.debug(f"Cache hit for id={item_id}")
cached["last_updated"] = today
if "first_seen" not in cached:
cached["first_seen"] = today
results.append(cached) results.append(cached)
continue continue
@@ -299,6 +303,11 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
cat = item.get("category", "") cat = item.get("category", "")
loc = item.get("location", {}) loc = item.get("location", {})
# Preserve first_seen from cache if this is a price-changed re-fetch
first_seen = today
if cached and "first_seen" in cached:
first_seen = cached["first_seen"]
result = { result = {
"hash_id": int(item["id"]), "hash_id": int(item["id"]),
"name": f"Prodej bytu {CATEGORY_LABELS.get(cat, '?')} {item.get('area', {}).get('main', '?')}", "name": f"Prodej bytu {CATEGORY_LABELS.get(cat, '?')} {item.get('area', {}).get('main', '?')}",
@@ -315,7 +324,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"url": f"{BASE_URL}{item['url']}", "url": f"{BASE_URL}{item['url']}",
"source": "realingo", "source": "realingo",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"), "first_seen": first_seen,
"last_updated": today,
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -1,204 +0,0 @@
<!DOCTYPE html>
<html lang="cs">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Scraper status</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: system-ui, -apple-system, sans-serif;
background: #f5f5f5; color: #333;
padding: 24px; max-width: 640px; margin: 0 auto;
}
h1 { font-size: 22px; margin-bottom: 4px; }
.subtitle { color: #888; font-size: 13px; margin-bottom: 24px; }
.card {
background: white; border-radius: 12px; padding: 20px;
box-shadow: 0 1px 4px rgba(0,0,0,0.08); margin-bottom: 16px;
}
.card h2 { font-size: 15px; margin-bottom: 12px; color: #555; }
.timestamp {
font-size: 28px; font-weight: 700; color: #1976D2;
}
.timestamp-ago { font-size: 13px; color: #999; margin-top: 2px; }
/* Source table */
.source-table { width: 100%; border-collapse: collapse; }
.source-table td { padding: 8px 0; border-bottom: 1px solid #f0f0f0; font-size: 14px; }
.source-table tr:last-child td { border-bottom: none; }
.source-table .name { font-weight: 600; }
.source-table .count { text-align: right; font-variant-numeric: tabular-nums; }
.source-table .rejected { text-align: right; color: #999; font-size: 12px; }
.badge {
display: inline-block; padding: 2px 8px; border-radius: 4px;
font-size: 11px; font-weight: 600; color: white;
}
.badge-ok { background: #4CAF50; }
.badge-err { background: #F44336; }
.badge-skip { background: #FF9800; }
/* Summary bar */
.summary-row {
display: flex; justify-content: space-between; align-items: center;
padding: 10px 0; border-bottom: 1px solid #f0f0f0;
}
.summary-row:last-child { border-bottom: none; }
.summary-label { font-size: 13px; color: #666; }
.summary-value { font-size: 18px; font-weight: 700; }
/* Source bar chart */
.bar-row { display: flex; align-items: center; gap: 8px; margin: 4px 0; }
.bar-label { width: 90px; font-size: 12px; text-align: right; color: #666; }
.bar-track { flex: 1; height: 20px; background: #f0f0f0; border-radius: 4px; overflow: hidden; position: relative; }
.bar-fill { height: 100%; border-radius: 4px; transition: width 0.5s ease; }
.bar-count { font-size: 12px; width: 36px; font-variant-numeric: tabular-nums; }
/* Loader */
.loader-wrap {
display: flex; flex-direction: column; align-items: center;
justify-content: center; padding: 60px 0;
}
.spinner {
width: 40px; height: 40px; border: 4px solid #e0e0e0;
border-top-color: #1976D2; border-radius: 50%;
animation: spin 0.8s linear infinite;
}
@keyframes spin { to { transform: rotate(360deg); } }
.loader-text { margin-top: 16px; color: #999; font-size: 14px; }
.error-msg { color: #F44336; padding: 40px 0; text-align: center; }
.link-row { text-align: center; margin-top: 8px; }
.link-row a { color: #1976D2; text-decoration: none; font-size: 14px; }
</style>
</head>
<body>
<h1>Scraper status</h1>
<div class="subtitle">maru-hleda-byt</div>
<div id="content">
<div class="loader-wrap">
<div class="spinner"></div>
<div class="loader-text">Nacitam status...</div>
</div>
</div>
<div class="link-row"><a href="mapa_bytu.html">Otevrit mapu</a></div>
<script>
var COLORS = {
sreality: '#1976D2',
realingo: '#7B1FA2',
bezrealitky: '#E65100',
idnes: '#C62828',
psn: '#2E7D32',
cityhome: '#00838F',
};
function timeAgo(dateStr) {
var d = new Date(dateStr);
var now = new Date();
var diff = Math.floor((now - d) / 1000);
if (diff < 60) return 'prave ted';
if (diff < 3600) return Math.floor(diff / 60) + ' min zpet';
if (diff < 86400) return Math.floor(diff / 3600) + ' hod zpet';
return Math.floor(diff / 86400) + ' dni zpet';
}
function formatDate(dateStr) {
var d = new Date(dateStr);
var day = d.getDate();
var months = ['ledna','unora','brezna','dubna','kvetna','cervna',
'cervence','srpna','zari','rijna','listopadu','prosince'];
var hh = String(d.getHours()).padStart(2, '0');
var mm = String(d.getMinutes()).padStart(2, '0');
return day + '. ' + months[d.getMonth()] + ' ' + d.getFullYear() + ', ' + hh + ':' + mm;
}
function render(data) {
// Check if scrape is currently running
if (data.status === 'running') {
document.getElementById('content').innerHTML =
'<div class="loader-wrap">' +
'<div class="spinner"></div>' +
'<div class="loader-text">Scraper prave bezi...</div>' +
'</div>';
setTimeout(loadStatus, 30000);
return;
}
var sources = data.sources || [];
var totalOk = 0, totalRej = 0;
var maxCount = 0;
sources.forEach(function(s) {
totalOk += s.accepted || 0;
totalRej += s.rejected || 0;
if (s.accepted > maxCount) maxCount = s.accepted;
});
var html = '';
// Timestamp card
html += '<div class="card">';
html += '<h2>Posledni scrape</h2>';
html += '<div class="timestamp">' + formatDate(data.timestamp) + '</div>';
html += '<div class="timestamp-ago">' + timeAgo(data.timestamp) + '</div>';
if (data.duration_sec) {
html += '<div class="timestamp-ago">Trvani: ' + Math.round(data.duration_sec) + 's</div>';
}
html += '</div>';
// Summary card
html += '<div class="card">';
html += '<h2>Souhrn</h2>';
html += '<div class="summary-row"><span class="summary-label">Vyhovujicich bytu</span><span class="summary-value" style="color:#4CAF50">' + totalOk + '</span></div>';
html += '<div class="summary-row"><span class="summary-label">Vyloucenych</span><span class="summary-value" style="color:#999">' + totalRej + '</span></div>';
if (data.deduplicated !== undefined) {
html += '<div class="summary-row"><span class="summary-label">Po deduplikaci (v mape)</span><span class="summary-value" style="color:#1976D2">' + data.deduplicated + '</span></div>';
}
html += '</div>';
// Sources card
html += '<div class="card">';
html += '<h2>Zdroje</h2>';
sources.forEach(function(s) {
var color = COLORS[s.name.toLowerCase()] || '#999';
var pct = maxCount > 0 ? Math.round((s.accepted / maxCount) * 100) : 0;
var badge = s.error
? '<span class="badge badge-err">chyba</span>'
: (s.accepted === 0 ? '<span class="badge badge-skip">0</span>' : '<span class="badge badge-ok">OK</span>');
html += '<div style="margin-bottom:12px;">';
html += '<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:4px;">';
html += '<span style="font-weight:600;font-size:14px;">' + s.name + ' ' + badge + '</span>';
html += '<span style="font-size:12px;color:#999;">' + (s.rejected || 0) + ' vyloucenych</span>';
html += '</div>';
html += '<div class="bar-row">';
html += '<div class="bar-track"><div class="bar-fill" style="width:' + pct + '%;background:' + color + ';"></div></div>';
html += '<span class="bar-count">' + (s.accepted || 0) + '</span>';
html += '</div>';
html += '</div>';
});
html += '</div>';
document.getElementById('content').innerHTML = html;
}
function loadStatus() {
fetch('status.json?t=' + Date.now())
.then(function(r) {
if (!r.ok) throw new Error(r.status);
return r.json();
})
.then(render)
.catch(function(err) {
document.getElementById('content').innerHTML =
'<div class="error-msg">Status zatim neni k dispozici.<br><small>(' + err.message + ')</small></div>';
});
}
loadStatus();
</script>
</body>
</html>