Compare commits
1 Commits
0.01
...
ui-tweaks/
| Author | SHA1 | Date | |
|---|---|---|---|
| b8d4d44164 |
@@ -10,7 +10,7 @@ WORKDIR /app
|
||||
|
||||
COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \
|
||||
scrape_idnes.py scrape_psn.py scrape_cityhome.py \
|
||||
merge_and_map.py regen_map.py run_all.sh ./
|
||||
merge_and_map.py regen_map.py run_all.sh ratings_server.py ./
|
||||
|
||||
COPY build/crontab /etc/crontabs/root
|
||||
COPY build/entrypoint.sh /entrypoint.sh
|
||||
@@ -18,7 +18,7 @@ RUN chmod +x /entrypoint.sh run_all.sh
|
||||
|
||||
RUN mkdir -p /app/data
|
||||
|
||||
EXPOSE 8080
|
||||
EXPOSE 8080 8081
|
||||
|
||||
HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \
|
||||
CMD wget -q -O /dev/null http://localhost:8080/ || exit 1
|
||||
|
||||
@@ -6,7 +6,7 @@ DATA_DIR="/app/data"
|
||||
# Create symlinks so scripts (which write to /app/) persist data to the volume
|
||||
for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \
|
||||
byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \
|
||||
mapa_bytu.html; do
|
||||
mapa_bytu.html ratings.json; do
|
||||
# Remove real file if it exists (e.g. baked into image)
|
||||
[ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f"
|
||||
ln -sf "$DATA_DIR/$f" "/app/$f"
|
||||
@@ -18,5 +18,8 @@ crond -b -l 2
|
||||
echo "[entrypoint] Starting initial scrape in background..."
|
||||
bash /app/run_all.sh &
|
||||
|
||||
echo "[entrypoint] Starting ratings API server on port 8081..."
|
||||
DATA_DIR="$DATA_DIR" python3 /app/ratings_server.py &
|
||||
|
||||
echo "[entrypoint] Starting HTTP server on port 8080..."
|
||||
exec python3 -m http.server 8080 --directory "$DATA_DIR"
|
||||
|
||||
202
generate_status.py
Normal file
202
generate_status.py
Normal file
@@ -0,0 +1,202 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate status.json from scraper JSON outputs and run log."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Directory containing this script; scraper outputs are siblings of it.
HERE = Path(__file__).parent

# Display name -> per-scraper JSON output file. The display names are also
# matched (case-insensitively) against section headers in the run log, so
# they must stay in sync with the step names used by run_all.sh.
SOURCE_FILES = {
    "Sreality": "byty_sreality.json",
    "Realingo": "byty_realingo.json",
    "Bezrealitky": "byty_bezrealitky.json",
    "iDNES": "byty_idnes.json",
    "PSN": "byty_psn.json",
    "CityHome": "byty_cityhome.json",
}

# Deduplicated output produced by merge_and_map.py.
MERGED_FILE = "byty_merged.json"
|
||||
|
||||
|
||||
def count_source(path: Path) -> dict:
    """Return the accepted-listing count and mtime for one scraper output.

    A missing file yields ``{"accepted": 0, "error": "soubor nenalezen"}``;
    an unreadable/unparseable file yields ``accepted: 0`` with the exception
    text under ``error``.
    """
    if not path.exists():
        return {"accepted": 0, "error": "soubor nenalezen"}
    try:
        listings = json.loads(path.read_text(encoding="utf-8"))
        modified = datetime.fromtimestamp(path.stat().st_mtime)
        return {
            "accepted": len(listings),
            "updated_at": modified.isoformat(timespec="seconds"),
        }
    except Exception as exc:
        return {"accepted": 0, "error": str(exc)}
|
||||
|
||||
|
||||
def parse_log(log_path: str) -> dict[str, dict]:
    """Parse the scraper run log and extract per-source statistics.

    Scrapers log summary lines like:
        ✓ Vyhovující byty: 12
        Vyloučeno (prodáno): 5
        Staženo stránek: 3
        Staženo inzerátů: 48
        Celkem bytů v cache: 120
    and section headers like:
        [2/6] Realingo

    Returns a mapping of source name -> stats dict; empty when the log
    is absent or contains no recognizable section headers.
    """
    if not log_path or not os.path.exists(log_path):
        return {}

    with open(log_path, encoding="utf-8") as fh:
        text = fh.read()

    # Sections are delimited by "[N/6] StepName" headers underlined with dashes.
    header_re = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE)
    headers = list(header_re.finditer(text))
    if not headers:
        return {}

    results: dict[str, dict] = {}
    for idx, header in enumerate(headers):
        title = header.group(2).strip()
        body_start = header.end()
        body_end = headers[idx + 1].start() if idx + 1 < len(headers) else len(text)
        body = text[body_start:body_end]

        # One section may cover several sources (e.g. "PSN + CityHome").
        covered = [name for name in SOURCE_FILES if name.lower() in title.lower()]
        if not covered:
            continue

        def grab(pattern: str) -> Optional[int]:
            # First numeric capture in this section, or None when absent.
            found = re.search(pattern, body)
            return int(found.group(1)) if found else None

        # Summary counters present in all/most scrapers; keep only those found.
        counters = {
            "accepted": grab(r'Vyhovující byty[:\s]+(\d+)'),
            "fetched": grab(r'Staženo inzerátů[:\s]+(\d+)'),
            "pages": grab(r'Staženo stránek[:\s]+(\d+)'),
            "cached": grab(r'Celkem bytů v cache[:\s]+(\d+)'),
            "cache_hits": grab(r'Cache hit[:\s]+(\d+)'),
        }
        entry = {key: value for key, value in counters.items() if value is not None}

        # Rejection reasons — "Vyloučeno (<reason>): N" lines gathered per reason.
        reasons = {
            m.group(1): int(m.group(2))
            for m in re.finditer(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)', body)
        }
        if reasons:
            entry["excluded"] = reasons
        else:
            # PSN-style single total, e.g. "Vyloučeno: N".
            fallback = grab(r'Vyloučen\w*[:\s]+(\d+)')
            if fallback is not None:
                entry["excluded_total"] = fallback

        for name in covered:
            results[name] = entry

    return results
|
||||
|
||||
|
||||
def main():
    """Build status.json from scraper outputs and the optional run log.

    CLI: generate_status.py [START_TIME_ISO DURATION_SEC [LOG_FILE]]
    File counts come from the JSON outputs; per-run stats (pages fetched,
    exclusions, cache hits) are layered on top from the parsed log.
    """
    start_time = None
    duration_sec = None

    # argv[1] = ISO start timestamp, argv[2] = duration in seconds (both optional).
    if len(sys.argv) >= 3:
        start_time = sys.argv[1]
        try:
            duration_sec = int(sys.argv[2])
        except ValueError:
            # Non-numeric duration is tolerated; duration_sec stays None.
            pass

    if not start_time:
        start_time = datetime.now().isoformat(timespec="seconds")

    # argv[3] = optional run-log path; parse_log handles None/missing paths.
    log_path = sys.argv[3] if len(sys.argv) >= 4 else None
    log_stats = parse_log(log_path)

    sources = []
    for name, filename in SOURCE_FILES.items():
        path = HERE / filename
        info = count_source(path)
        info["name"] = name

        # Merge log stats
        ls = log_stats.get(name, {})
        for k in ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total"):
            if k in ls:
                info[k] = ls[k]
        # Override accepted from log if available (log is authoritative for latest run)
        if "accepted" in ls:
            info["accepted"] = ls["accepted"]

        sources.append(info)

    # Total accepted before dedup
    total_accepted = sum(s.get("accepted", 0) for s in sources)

    # Merged / deduplicated count
    merged_path = HERE / MERGED_FILE
    deduplicated = 0
    if merged_path.exists():
        try:
            merged = json.loads(merged_path.read_text(encoding="utf-8"))
            deduplicated = len(merged)
        except Exception:
            # Corrupt merged file: report 0 rather than aborting status generation.
            pass

    # Only meaningful when a merged count exists; otherwise report 0.
    duplicates_removed = total_accepted - deduplicated if deduplicated else 0

    status = {
        "status": "done",
        "timestamp": start_time,
        "duration_sec": duration_sec,
        "total_accepted": total_accepted,
        "deduplicated": deduplicated,
        "duplicates_removed": duplicates_removed,
        "sources": sources,
    }

    # Write next to this script (symlinked into the data volume in Docker).
    out = HERE / "status.json"
    out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8")
    # Human-readable summary for the run log (messages are in Czech by design).
    print(f"Status uložen: {out}")
    print(f" Celkem bytů (před dedup): {total_accepted}")
    print(f" Po deduplikaci: {deduplicated}")
    if duplicates_removed:
        print(f" Odstraněno duplikátů: {duplicates_removed}")
    for s in sources:
        acc = s.get("accepted", 0)
        err = s.get("error", "")
        exc = s.get("excluded", {})
        exc_total = sum(exc.values()) if exc else s.get("excluded_total", 0)
        parts = [f"{s['name']:12s}: {acc} bytů"]
        if exc_total:
            parts.append(f"({exc_total} vyloučeno)")
        if err:
            parts.append(f"[CHYBA: {err}]")
        print(" " + " ".join(parts))


if __name__ == "__main__":
    main()
|
||||
961
mapa_bytu.html
961
mapa_bytu.html
File diff suppressed because it is too large
Load Diff
116
ratings_server.py
Normal file
116
ratings_server.py
Normal file
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Minimal HTTP API server for persisting apartment ratings.
|
||||
|
||||
GET /api/ratings → returns ratings.json contents
|
||||
POST /api/ratings → saves entire ratings object
|
||||
GET /api/ratings/export → same as GET, but with download header
|
||||
|
||||
Ratings file: /app/data/ratings.json (or ./ratings.json locally)
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
from pathlib import Path
|
||||
|
||||
# Port and data directory are environment-configurable (Docker-friendly);
# defaults match local development (current directory, port 8081).
PORT = int(os.environ.get("RATINGS_PORT", 8081))
DATA_DIR = Path(os.environ.get("DATA_DIR", "."))
# Persisted ratings live alongside the scraper outputs in the data volume.
RATINGS_FILE = DATA_DIR / "ratings.json"

# Tagged log format so ratings-server lines are distinguishable in the
# combined container log.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [ratings] %(levelname)s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
)
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def load_ratings() -> dict:
    """Return the persisted ratings object, or {} when missing or corrupt."""
    try:
        if not RATINGS_FILE.exists():
            return {}
        raw = RATINGS_FILE.read_text(encoding="utf-8")
        return json.loads(raw)
    except Exception as exc:
        # Corrupt/unreadable file degrades to an empty ratings set.
        log.error("Failed to load ratings: %s", exc)
        return {}
|
||||
|
||||
|
||||
def save_ratings(data: dict) -> None:
    """Serialize *data* as pretty-printed UTF-8 JSON to the ratings file."""
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    RATINGS_FILE.write_text(payload, encoding="utf-8")
|
||||
|
||||
|
||||
class RatingsHandler(BaseHTTPRequestHandler):
    """Minimal JSON API handler for apartment ratings.

    Routes:
        GET  /api/ratings         -> full ratings object
        GET  /api/ratings/export  -> same body, with a download header
        POST /api/ratings         -> replace the stored ratings object
        OPTIONS *                 -> CORS preflight (204)

    All responses carry permissive CORS headers so the map page (served
    on a different port) can call this API from the browser.
    """

    def log_message(self, format, *args):
        # Suppress default HTTP access log (we use our own)
        pass

    def _send_json(self, status: int, body: dict, extra_headers=None):
        """Serialize *body* as JSON and send it with *status* + CORS headers.

        extra_headers: optional dict of additional header name -> value pairs
        (used for the export download header).
        """
        payload = json.dumps(body, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        if extra_headers:
            for k, v in extra_headers.items():
                self.send_header(k, v)
        self.end_headers()
        self.wfile.write(payload)

    def do_OPTIONS(self):
        # CORS preflight
        self.send_response(204)
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()

    def do_GET(self):
        """Serve the ratings object; unknown paths get a JSON 404."""
        if self.path in ("/api/ratings", "/api/ratings/export"):
            ratings = load_ratings()
            extra = None
            if self.path == "/api/ratings/export":
                # Force a file download in the browser instead of inline display.
                extra = {"Content-Disposition": 'attachment; filename="ratings.json"'}
            log.info("GET %s → %d ratings", self.path, len(ratings))
            self._send_json(200, ratings, extra)
        else:
            self._send_json(404, {"error": "not found"})

    def do_POST(self):
        """Replace the stored ratings with the posted JSON object.

        Rejects empty bodies, invalid JSON, and non-object payloads with 400.
        """
        if self.path == "/api/ratings":
            length = int(self.headers.get("Content-Length", 0))
            if length == 0:
                self._send_json(400, {"error": "empty body"})
                return
            try:
                raw = self.rfile.read(length)
                data = json.loads(raw.decode("utf-8"))
            except Exception as e:
                log.warning("Bad request body: %s", e)
                self._send_json(400, {"error": "invalid JSON"})
                return
            # Only a JSON object (dict) is a valid ratings payload.
            if not isinstance(data, dict):
                self._send_json(400, {"error": "expected JSON object"})
                return
            save_ratings(data)
            log.info("POST /api/ratings → saved %d ratings", len(data))
            self._send_json(200, {"ok": True, "count": len(data)})
        else:
            self._send_json(404, {"error": "not found"})
|
||||
|
||||
|
||||
if __name__ == "__main__":
    log.info("Ratings server starting on port %d, data dir: %s", PORT, DATA_DIR)
    log.info("Ratings file: %s", RATINGS_FILE)
    # Bind on all interfaces so the container port mapping reaches us.
    server = HTTPServer(("0.0.0.0", PORT), RatingsHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Clean shutdown on Ctrl-C / SIGINT.
        log.info("Stopped.")
        sys.exit(0)
|
||||
14
run_all.sh
14
run_all.sh
@@ -16,6 +16,12 @@ NC='\033[0m'
|
||||
TOTAL=6
|
||||
CURRENT=0
|
||||
FAILED=0
|
||||
START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S")
|
||||
START_EPOCH=$(date +%s)
|
||||
LOG_FILE="$(pwd)/scrape_run.log"
|
||||
|
||||
# Mark status as running
|
||||
echo '{"status":"running"}' > status.json
|
||||
|
||||
show_help() {
|
||||
echo "Usage: ./run_all.sh [OPTIONS]"
|
||||
@@ -63,6 +69,8 @@ step() {
|
||||
}
|
||||
|
||||
# ── Scrapery (paralelně kde to jde) ─────────────────────────
|
||||
# Tee all output to log file for status generation
|
||||
exec > >(tee -a "$LOG_FILE") 2>&1
|
||||
|
||||
step "Sreality"
|
||||
python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); }
|
||||
@@ -91,6 +99,12 @@ python3 merge_and_map.py || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((F
|
||||
|
||||
# ── Otevření mapy ────────────────────────────────────────────
|
||||
|
||||
# ── Generování statusu ─────────────────────────────────────
|
||||
|
||||
END_EPOCH=$(date +%s)
|
||||
DURATION=$((END_EPOCH - START_EPOCH))
|
||||
python3 generate_status.py "$START_TIME" "$DURATION" "$LOG_FILE"
|
||||
|
||||
echo ""
|
||||
echo "============================================================"
|
||||
if [ $FAILED -eq 0 ]; then
|
||||
|
||||
@@ -347,6 +347,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
"ownership": ownership,
|
||||
"url": sreality_url(hash_id, seo),
|
||||
"image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""),
|
||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||
}
|
||||
results.append(result)
|
||||
details_fetched += 1
|
||||
@@ -373,20 +374,58 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
||||
"""Generate an interactive Leaflet.js HTML map."""
|
||||
|
||||
# Color by disposition
|
||||
color_map = {
|
||||
"3+kk": "#2196F3", # blue
|
||||
"3+1": "#4CAF50", # green
|
||||
"4+kk": "#FF9800", # orange
|
||||
"4+1": "#F44336", # red
|
||||
"5+kk": "#9C27B0", # purple
|
||||
"5+1": "#795548", # brown
|
||||
"6+": "#607D8B", # grey-blue
|
||||
}
|
||||
# Color by price per m² — cool blue→warm red scale, no yellow
|
||||
# Thresholds based on Prague market distribution (p25=120k, p50=144k, p75=162k)
|
||||
price_color_scale = [
|
||||
(110_000, "#1565C0"), # < 110k/m² → deep blue (levné)
|
||||
(130_000, "#42A5F5"), # 110–130k → light blue
|
||||
(150_000, "#66BB6A"), # 130–150k → green (střed)
|
||||
(165_000, "#EF6C00"), # 150–165k → dark orange
|
||||
(float("inf"), "#C62828"), # > 165k → dark red (drahé)
|
||||
]
|
||||
|
||||
def price_color(estate: dict) -> str:
|
||||
price = estate.get("price") or 0
|
||||
area = estate.get("area") or 0
|
||||
if not area:
|
||||
return "#9E9E9E"
|
||||
ppm2 = price / area
|
||||
for threshold, color in price_color_scale:
|
||||
if ppm2 < threshold:
|
||||
return color
|
||||
return "#E53935"
|
||||
|
||||
# Legend bands for info panel (built once)
|
||||
price_legend_items = (
|
||||
'<div style="margin-bottom:4px;font-size:12px;color:#555;font-weight:600;">Cena / m²:</div>'
|
||||
)
|
||||
bands = [
|
||||
("#1565C0", "< 110 000 Kč/m²"),
|
||||
("#42A5F5", "110 – 130 000 Kč/m²"),
|
||||
("#66BB6A", "130 – 150 000 Kč/m²"),
|
||||
("#EF6C00", "150 – 165 000 Kč/m²"),
|
||||
("#C62828", "> 165 000 Kč/m²"),
|
||||
("#9E9E9E", "cena/plocha neuvedena"),
|
||||
]
|
||||
for bcolor, blabel in bands:
|
||||
price_legend_items += (
|
||||
f'<div style="display:flex;align-items:center;gap:6px;margin:2px 0;">'
|
||||
f'<span style="width:14px;height:14px;border-radius:50%;background:{bcolor};'
|
||||
f'display:inline-block;border:2px solid white;box-shadow:0 1px 3px rgba(0,0,0,0.3);flex-shrink:0;"></span>'
|
||||
f'<span>{blabel}</span></div>'
|
||||
)
|
||||
# New marker indicator — bigger dot, no extra border
|
||||
price_legend_items += (
|
||||
'<div style="display:flex;align-items:center;gap:6px;margin:6px 0 0 0;'
|
||||
'padding-top:6px;border-top:1px solid #eee;">'
|
||||
'<span style="width:18px;height:18px;border-radius:50%;background:#66BB6A;'
|
||||
'display:inline-block;box-shadow:0 1px 4px rgba(0,0,0,0.35);flex-shrink:0;"></span>'
|
||||
'<span>Nové (z dnešního scrapu) — větší</span></div>'
|
||||
)
|
||||
|
||||
markers_js = ""
|
||||
for e in estates:
|
||||
color = color_map.get(e["disposition"], "#999999")
|
||||
color = price_color(e)
|
||||
floor_text = f'{e["floor"]}. NP' if e["floor"] else "neuvedeno"
|
||||
area_text = f'{e["area"]} m²' if e["area"] else "neuvedeno"
|
||||
building_text = e["building_type"] or "neuvedeno"
|
||||
@@ -405,11 +444,19 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
||||
|
||||
hash_id = e.get("hash_id", "")
|
||||
|
||||
scraped_at = e.get("scraped_at", "")
|
||||
is_new = scraped_at == datetime.now().strftime("%Y-%m-%d")
|
||||
|
||||
new_badge = (
|
||||
'<span style="margin-left:6px;font-size:11px;background:#FFD600;color:#333;'
|
||||
'padding:1px 6px;border-radius:3px;font-weight:bold;">NOVÉ</span>'
|
||||
if is_new else ""
|
||||
)
|
||||
popup = (
|
||||
f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}">'
|
||||
f'<b style="font-size:14px;">{format_price(e["price"])}</b>'
|
||||
f'<span style="margin-left:8px;font-size:11px;background:{source_color};color:white;'
|
||||
f'padding:1px 6px;border-radius:3px;">{source_label}</span><br>'
|
||||
f'padding:1px 6px;border-radius:3px;">{source_label}</span>{new_badge}<br>'
|
||||
f'<span style="color:#666;">{e["disposition"]} | {area_text} | {floor_text}</span>'
|
||||
f'{floor_note}<br><br>'
|
||||
f'<b>{e["locality"]}</b><br>'
|
||||
@@ -438,26 +485,32 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
||||
popup = popup.replace("'", "\\'").replace("\n", "")
|
||||
|
||||
is_fav = source in ("psn", "cityhome")
|
||||
marker_fn = "addHeartMarker" if is_fav else "addMarker"
|
||||
|
||||
if is_fav:
|
||||
marker_fn = "addHeartMarker"
|
||||
elif is_new:
|
||||
marker_fn = "addNewMarker"
|
||||
else:
|
||||
marker_fn = "addMarker"
|
||||
markers_js += (
|
||||
f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}');\n"
|
||||
)
|
||||
|
||||
# Build legend
|
||||
legend_items = ""
|
||||
# Build legend — price per m² bands + disposition counts
|
||||
legend_items = price_legend_items
|
||||
|
||||
# Disposition counts below the color legend
|
||||
disp_counts = {}
|
||||
for e in estates:
|
||||
d = e["disposition"]
|
||||
disp_counts[d] = disp_counts.get(d, 0) + 1
|
||||
for disp, color in color_map.items():
|
||||
count = disp_counts.get(disp, 0)
|
||||
if count > 0:
|
||||
disp_order = ["3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+"]
|
||||
disp_summary = ", ".join(
|
||||
f"{d} ({disp_counts[d]})" for d in disp_order if d in disp_counts
|
||||
)
|
||||
legend_items += (
|
||||
f'<div style="display:flex;align-items:center;gap:6px;margin:3px 0;">'
|
||||
f'<span style="width:14px;height:14px;border-radius:50%;'
|
||||
f'background:{color};display:inline-block;border:2px solid white;'
|
||||
f'box-shadow:0 1px 3px rgba(0,0,0,0.3);"></span>'
|
||||
f'<span>{disp} ({count})</span></div>'
|
||||
f'<div style="margin-top:8px;padding-top:6px;border-top:1px solid #eee;'
|
||||
f'font-size:12px;color:#666;">{disp_summary}</div>'
|
||||
)
|
||||
|
||||
# Heart marker legend for PSN/CityHome
|
||||
@@ -493,6 +546,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
||||
body {{ font-family: system-ui, -apple-system, sans-serif; }}
|
||||
#map {{ width: 100%; height: 100vh; }}
|
||||
.heart-icon {{ background: none !important; border: none !important; }}
|
||||
.star-icon {{ background: none !important; border: none !important; }}
|
||||
.rate-btn:hover {{ background: #f0f0f0 !important; }}
|
||||
.rate-btn.active-fav {{ background: #FFF9C4 !important; border-color: #FFC107 !important; }}
|
||||
.rate-btn.active-rej {{ background: #FFEBEE !important; border-color: #F44336 !important; }}
|
||||
@@ -503,13 +557,42 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
||||
}}
|
||||
.marker-favorite {{ animation: pulse-glow 2s ease-in-out infinite; border-radius: 50%; }}
|
||||
.heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }}
|
||||
.heart-icon-rej {{ opacity: 0.2 !important; }}
|
||||
.heart-icon-rej {{ opacity: 0.4 !important; filter: grayscale(1); }}
|
||||
.reject-overlay {{ background: none !important; border: none !important; pointer-events: none !important; }}
|
||||
@keyframes pulse-new {{
|
||||
0% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
|
||||
50% {{ stroke-opacity: 0.4; stroke-width: 6px; r: 12; }}
|
||||
100% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
|
||||
}}
|
||||
.marker-new {{ animation: pulse-new 2s ease-in-out infinite; }}
|
||||
.info-panel {{
|
||||
position: absolute; top: 10px; right: 10px; z-index: 1000;
|
||||
background: white; padding: 16px; border-radius: 10px;
|
||||
box-shadow: 0 2px 12px rgba(0,0,0,0.15); max-width: 260px;
|
||||
font-size: 13px; line-height: 1.5;
|
||||
transition: transform 0.3s ease, opacity 0.3s ease;
|
||||
}}
|
||||
.info-panel.collapsed {{
|
||||
transform: translateX(calc(100% + 20px));
|
||||
opacity: 0; pointer-events: none;
|
||||
}}
|
||||
.panel-open-btn {{
|
||||
position: absolute; top: 10px; right: 10px; z-index: 1001;
|
||||
width: 40px; height: 40px; border-radius: 8px;
|
||||
background: white; border: none; cursor: pointer;
|
||||
box-shadow: 0 2px 12px rgba(0,0,0,0.15);
|
||||
font-size: 20px; display: flex; align-items: center; justify-content: center;
|
||||
transition: opacity 0.3s ease;
|
||||
}}
|
||||
.panel-open-btn.hidden {{ opacity: 0; pointer-events: none; }}
|
||||
.panel-close-btn {{
|
||||
position: absolute; top: 8px; right: 8px;
|
||||
width: 28px; height: 28px; border-radius: 6px;
|
||||
background: none; border: 1px solid #ddd; cursor: pointer;
|
||||
font-size: 16px; display: flex; align-items: center; justify-content: center;
|
||||
color: #888;
|
||||
}}
|
||||
.panel-close-btn:hover {{ background: #f0f0f0; color: #333; }}
|
||||
.info-panel h2 {{ font-size: 16px; margin-bottom: 8px; }}
|
||||
.info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }}
|
||||
.filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }}
|
||||
@@ -517,18 +600,26 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
||||
.filter-section input[type="checkbox"] {{ accent-color: #1976D2; }}
|
||||
#floor-filter {{ margin-top: 8px; }}
|
||||
#floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }}
|
||||
.status-link {{ display: block; margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; text-align: center; }}
|
||||
.status-link a {{ color: #1976D2; text-decoration: none; font-size: 12px; }}
|
||||
@media (max-width: 600px) {{
|
||||
.info-panel {{ max-width: calc(100vw - 60px); right: 10px; }}
|
||||
.info-panel.collapsed {{ transform: translateX(calc(100% + 20px)); }}
|
||||
.panel-close-btn {{ top: 6px; right: 6px; }}
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="map"></div>
|
||||
<div class="info-panel">
|
||||
<button class="panel-open-btn hidden" id="panel-open-btn" onclick="togglePanel()">☰</button>
|
||||
<div class="info-panel" id="info-panel">
|
||||
<button class="panel-close-btn" id="panel-close-btn" onclick="togglePanel()">✕</button>
|
||||
<h2>Byty v Praze</h2>
|
||||
<div class="stats">
|
||||
<div>Celkem: <b id="visible-count">{len(estates)}</b> bytů</div>
|
||||
<div>Cena: {min_price} — {max_price}</div>
|
||||
<div>Průměr: {avg_price}</div>
|
||||
</div>
|
||||
<div><b>Dispozice:</b></div>
|
||||
{legend_items}
|
||||
<div class="filter-section">
|
||||
<b>Filtry:</b>
|
||||
@@ -562,6 +653,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
||||
Skrýt zamítnuté
|
||||
</label>
|
||||
</div>
|
||||
<div class="status-link"><a href="status.html">Scraper status</a></div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
@@ -597,6 +689,23 @@ function addMarker(lat, lon, color, popup, hashId) {{
|
||||
marker.addTo(map);
|
||||
}}
|
||||
|
||||
function addNewMarker(lat, lon, color, popup, hashId) {{
|
||||
var marker = L.circleMarker([lat, lon], {{
|
||||
radius: 12,
|
||||
fillColor: color,
|
||||
color: color,
|
||||
weight: 4,
|
||||
opacity: 0.35,
|
||||
fillOpacity: 0.95,
|
||||
}}).bindPopup(popup);
|
||||
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isNew: true }};
|
||||
allMarkers.push(marker);
|
||||
marker.addTo(map);
|
||||
marker.on('add', function() {{
|
||||
if (marker._path) marker._path.classList.add('marker-new');
|
||||
}});
|
||||
}}
|
||||
|
||||
function heartIcon(color) {{
|
||||
var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">'
|
||||
+ '<path d="M12 21.35l-1.45-1.32C5.4 15.36 2 12.28 2 8.5 '
|
||||
@@ -612,6 +721,21 @@ function heartIcon(color) {{
|
||||
}});
|
||||
}}
|
||||
|
||||
function starIcon() {{
|
||||
var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="28" height="28" viewBox="0 0 24 24">'
|
||||
+ '<path d="M12 2l3.09 6.26L22 9.27l-5 4.87L18.18 22 12 18.27 '
|
||||
+ '5.82 22 7 14.14 2 9.27l6.91-1.01L12 2z" '
|
||||
+ 'fill="#FFC107" stroke="#F57F17" stroke-width="1" '
|
||||
+ 'filter="drop-shadow(0 1px 3px rgba(0,0,0,0.3))"/></svg>';
|
||||
return L.divIcon({{
|
||||
html: svg,
|
||||
className: 'star-icon',
|
||||
iconSize: [28, 28],
|
||||
iconAnchor: [14, 14],
|
||||
popupAnchor: [0, -14],
|
||||
}});
|
||||
}}
|
||||
|
||||
function addHeartMarker(lat, lon, color, popup, hashId) {{
|
||||
var marker = L.marker([lat, lon], {{
|
||||
icon: heartIcon(color),
|
||||
@@ -637,6 +761,36 @@ function saveRatings(ratings) {{
|
||||
localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings));
|
||||
}}
|
||||
|
||||
function addRejectStrike(marker) {{
|
||||
removeRejectStrike(marker);
|
||||
var color = marker._data.color || '#999';
|
||||
// SVG "no entry" icon — circle with diagonal line, colored to match marker
|
||||
var svg = '<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20" height="20">'
|
||||
+ '<circle cx="12" cy="12" r="10" fill="none" stroke="' + color + '" stroke-width="2.5" opacity="0.85"/>'
|
||||
+ '<line x1="5.5" y1="5.5" x2="18.5" y2="18.5" stroke="' + color + '" stroke-width="2.5" stroke-linecap="round" opacity="0.85"/>'
|
||||
+ '</svg>';
|
||||
var icon = L.divIcon({{
|
||||
className: 'reject-overlay',
|
||||
html: svg,
|
||||
iconSize: [20, 20],
|
||||
iconAnchor: [10, 10],
|
||||
}});
|
||||
var m = L.marker([marker._data.lat, marker._data.lon], {{
|
||||
icon: icon,
|
||||
interactive: false,
|
||||
pane: 'markerPane',
|
||||
}});
|
||||
m.addTo(map);
|
||||
marker._rejectStrike = m;
|
||||
}}
|
||||
|
||||
function removeRejectStrike(marker) {{
|
||||
if (marker._rejectStrike) {{
|
||||
map.removeLayer(marker._rejectStrike);
|
||||
marker._rejectStrike = null;
|
||||
}}
|
||||
}}
|
||||
|
||||
function applyMarkerStyle(marker, status) {{
|
||||
if (marker._data.isHeart) {{
|
||||
var el = marker._icon;
|
||||
@@ -651,16 +805,33 @@ function applyMarkerStyle(marker, status) {{
|
||||
}}
|
||||
}} else {{
|
||||
if (status === 'fav') {{
|
||||
marker.setStyle({{
|
||||
radius: 12, fillOpacity: 1, weight: 3,
|
||||
fillColor: marker._data.color, color: '#fff',
|
||||
}});
|
||||
if (marker._path) marker._path.classList.add('marker-favorite');
|
||||
removeRejectStrike(marker);
|
||||
if (!marker._data._origCircle) marker._data._origCircle = true;
|
||||
var popup = marker.getPopup();
|
||||
var popupContent = popup ? popup.getContent() : '';
|
||||
var wasOnMap = map.hasLayer(marker);
|
||||
if (wasOnMap) map.removeLayer(marker);
|
||||
var starMarker = L.marker([marker._data.lat, marker._data.lon], {{
|
||||
icon: starIcon(),
|
||||
}}).bindPopup(popupContent);
|
||||
starMarker._data = marker._data;
|
||||
var idx = allMarkers.indexOf(marker);
|
||||
if (idx !== -1) allMarkers[idx] = starMarker;
|
||||
if (wasOnMap) starMarker.addTo(map);
|
||||
}} else if (status === 'reject') {{
|
||||
if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{
|
||||
revertToCircle(marker, {{ radius: 6, fillOpacity: 0.35, fillColor: marker._data.color, color: '#fff', weight: 1 }});
|
||||
}} else {{
|
||||
marker.setStyle({{
|
||||
radius: 6, fillOpacity: 0.15, fillColor: '#999', color: '#bbb', weight: 1,
|
||||
radius: 6, fillOpacity: 0.35, fillColor: marker._data.color, color: '#fff', weight: 1,
|
||||
}});
|
||||
if (marker._path) marker._path.classList.remove('marker-favorite');
|
||||
}}
|
||||
// Add strikethrough line over the marker
|
||||
addRejectStrike(marker);
|
||||
}} else {{
|
||||
if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{
|
||||
revertToCircle(marker, {{ radius: 8, fillColor: marker._data.color, color: '#fff', weight: 2, fillOpacity: 0.85 }});
|
||||
}} else {{
|
||||
marker.setStyle({{
|
||||
radius: 8, fillColor: marker._data.color, color: '#fff',
|
||||
@@ -668,8 +839,24 @@ function applyMarkerStyle(marker, status) {{
|
||||
}});
|
||||
if (marker._path) marker._path.classList.remove('marker-favorite');
|
||||
}}
|
||||
if (marker._path) marker._path.classList.remove('marker-rejected');
|
||||
removeRejectStrike(marker);
|
||||
}}
|
||||
}}
|
||||
}}
|
||||
|
||||
function revertToCircle(marker, style) {{
|
||||
var popup = marker.getPopup();
|
||||
var popupContent = popup ? popup.getContent() : '';
|
||||
var wasOnMap = map.hasLayer(marker);
|
||||
if (wasOnMap) map.removeLayer(marker);
|
||||
var cm = L.circleMarker([marker._data.lat, marker._data.lon], style).bindPopup(popupContent);
|
||||
cm._data = marker._data;
|
||||
delete cm._data._starRef;
|
||||
var idx = allMarkers.indexOf(marker);
|
||||
if (idx !== -1) allMarkers[idx] = cm;
|
||||
if (wasOnMap) cm.addTo(map);
|
||||
}}
|
||||
|
||||
function rateMarker(marker, action) {{
|
||||
var hashId = marker._data.hashId;
|
||||
@@ -832,8 +1019,12 @@ function applyFilters() {{
|
||||
if (show) {{
|
||||
if (!map.hasLayer(m)) m.addTo(map);
|
||||
visible++;
|
||||
// Show strike line if rejected and visible
|
||||
if (m._rejectStrike && !map.hasLayer(m._rejectStrike)) m._rejectStrike.addTo(map);
|
||||
}} else {{
|
||||
if (map.hasLayer(m)) map.removeLayer(m);
|
||||
// Hide strike line when marker hidden
|
||||
if (m._rejectStrike && map.hasLayer(m._rejectStrike)) map.removeLayer(m._rejectStrike);
|
||||
}}
|
||||
}});
|
||||
|
||||
@@ -851,6 +1042,26 @@ function applyFilters() {{
|
||||
// Initialize ratings on load
|
||||
restoreRatings();
|
||||
|
||||
// ── Panel toggle ──────────────────────────────────────────────
|
||||
function togglePanel() {{
|
||||
var panel = document.getElementById('info-panel');
|
||||
var openBtn = document.getElementById('panel-open-btn');
|
||||
var isOpen = !panel.classList.contains('collapsed');
|
||||
if (isOpen) {{
|
||||
panel.classList.add('collapsed');
|
||||
openBtn.classList.remove('hidden');
|
||||
}} else {{
|
||||
panel.classList.remove('collapsed');
|
||||
openBtn.classList.add('hidden');
|
||||
}}
|
||||
}}
|
||||
|
||||
// On mobile, start with panel collapsed
|
||||
if (window.innerWidth <= 600) {{
|
||||
document.getElementById('info-panel').classList.add('collapsed');
|
||||
document.getElementById('panel-open-btn').classList.remove('hidden');
|
||||
}}
|
||||
|
||||
</script>
|
||||
</body>
|
||||
</html>"""
|
||||
|
||||
@@ -7,6 +7,7 @@ Výstup: byty_bezrealitky.json
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
@@ -355,6 +356,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
"url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
|
||||
"source": "bezrealitky",
|
||||
"image": "",
|
||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
@@ -12,6 +12,7 @@ import logging
|
||||
import re
|
||||
import time
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -33,24 +34,26 @@ HEADERS = {
|
||||
BASE_URL = "https://www.city-home.cz"
|
||||
|
||||
|
||||
def fetch_url(url: str) -> str:
|
||||
"""Fetch URL and return HTML string."""
|
||||
for attempt in range(3):
|
||||
def fetch_url(url: str, retries: int = 3) -> str:
|
||||
"""Fetch URL and return HTML string. Raises HTTPError on 4xx/5xx."""
|
||||
for attempt in range(retries):
|
||||
try:
|
||||
logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}")
|
||||
logger.debug(f"Headers: {HEADERS}")
|
||||
logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
|
||||
req = urllib.request.Request(url, headers=HEADERS)
|
||||
resp = urllib.request.urlopen(req, timeout=30)
|
||||
html = resp.read().decode("utf-8")
|
||||
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
|
||||
return html
|
||||
except urllib.error.HTTPError:
|
||||
# Don't retry on HTTP errors (404, 403, etc.) — re-raise immediately
|
||||
raise
|
||||
except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
|
||||
if attempt < 2:
|
||||
if attempt < retries - 1:
|
||||
wait = (attempt + 1) * 2
|
||||
logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}")
|
||||
logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
|
||||
time.sleep(wait)
|
||||
else:
|
||||
logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True)
|
||||
logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
|
||||
@@ -124,31 +127,21 @@ def parse_filter_page(html: str) -> list[dict]:
|
||||
if detail_url and not detail_url.startswith("http"):
|
||||
detail_url = BASE_URL + detail_url
|
||||
|
||||
# Extract floor from cells — look for pattern like "3.NP" or "2.PP"
|
||||
# Parse table cells: [unit_name, unit_type_label, address, floor, disposition, area, transaction, price]
|
||||
cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
|
||||
floor = None
|
||||
floor_text = ""
|
||||
project_name = ""
|
||||
cell_texts = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]
|
||||
|
||||
for cell in cells:
|
||||
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
|
||||
# Floor pattern
|
||||
np_match = re.search(r'(\d+)\.\s*NP', cell_text)
|
||||
pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
|
||||
# Cell[2] = address (e.g. "Žateckých 14"), cell[3] = floor (e.g. "3.NP")
|
||||
project_address = cell_texts[2] if len(cell_texts) > 2 else ""
|
||||
|
||||
floor = None
|
||||
if len(cell_texts) > 3:
|
||||
np_match = re.search(r'(\d+)\.\s*NP', cell_texts[3])
|
||||
pp_match = re.search(r'(\d+)\.\s*PP', cell_texts[3])
|
||||
if np_match:
|
||||
floor = int(np_match.group(1))
|
||||
floor_text = cell_text
|
||||
elif pp_match:
|
||||
floor = -int(pp_match.group(1)) # Underground
|
||||
floor_text = cell_text
|
||||
|
||||
# Extract project name — usually in a cell that's not a number/price/floor
|
||||
for cell in cells:
|
||||
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
|
||||
if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "m²" not in cell_text and "Kč" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text:
|
||||
if len(cell_text) > 3 and cell_text != unit_name:
|
||||
project_name = cell_text
|
||||
break
|
||||
floor = -int(pp_match.group(1))
|
||||
|
||||
listing = {
|
||||
"price": int(cena.group(1)),
|
||||
@@ -158,27 +151,55 @@ def parse_filter_page(html: str) -> list[dict]:
|
||||
"project_id": project.group(1) if project else "",
|
||||
"transaction": transaction.group(1) if transaction else "",
|
||||
"disposition": dispozition.group(1) if dispozition else "",
|
||||
"location": location.group(1) if location else "",
|
||||
"url": detail_url,
|
||||
"unit_name": unit_name,
|
||||
"floor": floor,
|
||||
"project_name": project_name,
|
||||
"project_address": project_address,
|
||||
}
|
||||
listings.append(listing)
|
||||
|
||||
return listings
|
||||
|
||||
|
||||
def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
|
||||
"""Extract GPS coordinates for projects from locality pages."""
|
||||
# Pattern in JS: ['<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name']
|
||||
gps_data = {}
|
||||
for match in re.finditer(r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'", html):
|
||||
name = match.group(1).strip()
|
||||
lat = float(match.group(2))
|
||||
lon = float(match.group(3))
|
||||
gps_data[name] = (lat, lon)
|
||||
return gps_data
|
||||
def get_lokalita_urls(slug: str) -> list[str]:
|
||||
"""Return candidate lokalita URLs to try in order."""
|
||||
return [
|
||||
f"{BASE_URL}/projekty/{slug}/lokalita",
|
||||
f"{BASE_URL}/bytove-domy/{slug}/lokalita",
|
||||
f"{BASE_URL}/bytove-domy/{slug}/lokalita1",
|
||||
]
|
||||
|
||||
|
||||
def extract_project_gps(html: str) -> tuple[float, float] | None:
|
||||
"""Extract project GPS from lokalita page JS variable.
|
||||
|
||||
The page contains: var locations = [['<h4>Name</h4>...', 'LAT', 'LNG', 'CATEGORY', 'Label'], ...]
|
||||
Category '1' = the project's own marker. Some projects have two cat-1 entries (data error);
|
||||
in that case we pick the one whose name contains a digit and is not a transit landmark.
|
||||
"""
|
||||
block = re.search(r'var locations\s*=\s*\[(.*?)\];', html, re.DOTALL)
|
||||
if not block:
|
||||
return None
|
||||
|
||||
entries = re.findall(
|
||||
r"'<h4>(.*?)</h4>.*?',\s*'([\d.]+)',\s*'([\d.]+)',\s*'1'",
|
||||
block.group(0),
|
||||
re.DOTALL,
|
||||
)
|
||||
if not entries:
|
||||
return None
|
||||
|
||||
if len(entries) == 1:
|
||||
return float(entries[0][1]), float(entries[0][2])
|
||||
|
||||
# Multiple cat-1 entries: pick the real project marker
|
||||
transit_re = re.compile(r'nádraží|park|metro|tramvaj|autobus|zastávka', re.IGNORECASE)
|
||||
for name, lat, lng in entries:
|
||||
if re.search(r'\d', name) and not transit_re.search(name):
|
||||
return float(lat), float(lng)
|
||||
|
||||
# Fallback: first entry
|
||||
return float(entries[0][1]), float(entries[0][2])
|
||||
|
||||
|
||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
@@ -210,22 +231,24 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
# Fetch GPS for each project from locality pages
|
||||
project_gps = {}
|
||||
for slug in sorted(project_slugs):
|
||||
time.sleep(0.5)
|
||||
time.sleep(0.3)
|
||||
gps = None
|
||||
for url in get_lokalita_urls(slug):
|
||||
try:
|
||||
locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
|
||||
logger.debug(f"Fetching project GPS: {locality_url}")
|
||||
loc_html = fetch_url(locality_url)
|
||||
logger.debug(f"Fetching project GPS: {url}")
|
||||
loc_html = fetch_url(url)
|
||||
gps = extract_project_gps(loc_html)
|
||||
if gps:
|
||||
# Take first entry (the project itself)
|
||||
first_name, (lat, lon) = next(iter(gps.items()))
|
||||
project_gps[slug] = (lat, lon)
|
||||
logger.info(f"✓ {slug}: {lat}, {lon}")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.debug(f"GPS fetch failed for {url}: {e}")
|
||||
continue
|
||||
|
||||
if gps:
|
||||
project_gps[slug] = gps
|
||||
logger.info(f"✓ {slug}: {gps[0]}, {gps[1]}")
|
||||
else:
|
||||
logger.info(f"✗ {slug}: GPS nenalezeno")
|
||||
except Exception as e:
|
||||
logger.warning(f"Error fetching GPS for {slug}: {e}", exc_info=True)
|
||||
logger.info(f"✗ {slug}: chyba ({e})")
|
||||
|
||||
# Step 3: Filter listings
|
||||
logger.info(f"\nFáze 3: Filtrování...")
|
||||
@@ -303,22 +326,37 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
|
||||
lat, lon = gps
|
||||
|
||||
# locality: use project address from cell (e.g. "Žateckých 14") + city from GPS lookup
|
||||
project_address = listing.get("project_address", "")
|
||||
# derive city from slug (GPS lookup key)
|
||||
city_map = {
|
||||
"karlinske-namesti-5": "Praha 8",
|
||||
"melnicka-12": "Praha 7",
|
||||
"na-vaclavce-34": "Praha 5",
|
||||
"nad-kajetankou-12": "Praha 6",
|
||||
"vosmikovych-3": "Praha 9",
|
||||
"zateckych-14": "Praha 2",
|
||||
}
|
||||
city_str = city_map.get(slug, "Praha")
|
||||
locality_str = f"{project_address}, {city_str}" if project_address else city_str
|
||||
|
||||
result = {
|
||||
"hash_id": f"cityhome_{slug}_{listing['unit_name']}",
|
||||
"name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
|
||||
"name": f"Prodej bytu {disp}, {int(area)} m² — {project_address}",
|
||||
"price": price,
|
||||
"price_formatted": format_price(price),
|
||||
"locality": f"{listing['project_name']}, Praha",
|
||||
"locality": locality_str,
|
||||
"lat": lat,
|
||||
"lon": lon,
|
||||
"disposition": disp,
|
||||
"floor": floor,
|
||||
"area": area,
|
||||
"area": float(area),
|
||||
"building_type": "Cihlová", # CityHome renovuje cihlové domy
|
||||
"ownership": "neuvedeno",
|
||||
"url": url,
|
||||
"source": "cityhome",
|
||||
"image": "",
|
||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
@@ -7,6 +7,7 @@ Výstup: byty_idnes.json
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
@@ -458,6 +459,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
"url": item["url"],
|
||||
"source": "idnes",
|
||||
"image": "",
|
||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
332
scrape_psn.py
332
scrape_psn.py
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PSN.cz scraper.
|
||||
Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií.
|
||||
Stáhne byty na prodej z API /api/units-list — jeden požadavek, žádné stránkování.
|
||||
Výstup: byty_psn.json
|
||||
"""
|
||||
from __future__ import annotations
|
||||
@@ -12,7 +12,9 @@ import logging
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlencode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -22,82 +24,37 @@ MAX_PRICE = 14_000_000
|
||||
MIN_AREA = 69
|
||||
MIN_FLOOR = 2
|
||||
|
||||
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
|
||||
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1", "5+kk a větší"}
|
||||
|
||||
# Pouze Praha — ostatní města (Brno, Pardubice, Špindlerův Mlýn) přeskočit
|
||||
WANTED_CITIES = {"Praha"}
|
||||
|
||||
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
|
||||
BASE_URL = "https://psn.cz"
|
||||
|
||||
# Known Prague project slugs with GPS (from research)
|
||||
PRAGUE_PROJECTS = [
|
||||
{"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
|
||||
{"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
|
||||
{"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
|
||||
{"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
|
||||
{"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
|
||||
{"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
|
||||
{"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
|
||||
{"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
|
||||
{"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
|
||||
{"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
|
||||
{"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
|
||||
{"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
|
||||
{"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
|
||||
{"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
|
||||
{"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
|
||||
{"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
|
||||
]
|
||||
UNITS_API = f"{BASE_URL}/api/units-list"
|
||||
|
||||
|
||||
def fetch_url(url: str) -> str:
|
||||
"""Fetch URL via curl (urllib SSL too old for Cloudflare)."""
|
||||
logger.debug(f"HTTP GET request (via curl): {url}")
|
||||
logger.debug(f"User-Agent: {UA}")
|
||||
def fetch_json(url: str) -> dict:
|
||||
"""Fetch JSON via curl (urllib SSL may fail on Cloudflare)."""
|
||||
logger.debug(f"HTTP GET: {url}")
|
||||
result = subprocess.run(
|
||||
["curl", "-s", "-L", "--max-time", "30",
|
||||
"-H", f"User-Agent: {UA}",
|
||||
"-H", "Accept: text/html",
|
||||
"-H", "Accept: application/json",
|
||||
url],
|
||||
capture_output=True, text=True, timeout=60
|
||||
)
|
||||
if result.returncode != 0:
|
||||
logger.error(f"curl failed (return code {result.returncode}): {result.stderr[:200]}")
|
||||
raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
|
||||
logger.debug(f"HTTP response: size={len(result.stdout)} bytes")
|
||||
return result.stdout
|
||||
return json.loads(result.stdout)
|
||||
|
||||
|
||||
def extract_units_from_html(html: str) -> list[dict]:
|
||||
"""Extract unit JSON objects from raw HTML with escaped quotes."""
|
||||
# The HTML contains RSC data with escaped JSON: \\"key\\":\\"value\\"
|
||||
# Step 1: Unescape the double-backslash-quotes to regular quotes
|
||||
cleaned = html.replace('\\"', '"')
|
||||
|
||||
# Step 2: Find each unit by looking for "title":"Byt and walking back to {
|
||||
units = []
|
||||
decoder = json.JSONDecoder()
|
||||
|
||||
for m in re.finditer(r'"title":"Byt', cleaned):
|
||||
pos = m.start()
|
||||
# Walk backwards to find the opening brace
|
||||
depth = 0
|
||||
found = False
|
||||
for i in range(pos - 1, max(pos - 3000, 0), -1):
|
||||
if cleaned[i] == '}':
|
||||
depth += 1
|
||||
elif cleaned[i] == '{':
|
||||
if depth == 0:
|
||||
try:
|
||||
obj, end = decoder.raw_decode(cleaned, i)
|
||||
if isinstance(obj, dict) and 'price_czk' in obj:
|
||||
units.append(obj)
|
||||
found = True
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
break
|
||||
depth -= 1
|
||||
|
||||
return units
|
||||
def fix_gps(lat, lng):
|
||||
"""PSN má u některých projektů prohozené lat/lng — opravíme."""
|
||||
if lat is not None and lng is not None and lat < 20 and lng > 20:
|
||||
return lng, lat
|
||||
return lat, lng
|
||||
|
||||
|
||||
def format_price(price: int) -> str:
|
||||
@@ -109,209 +66,178 @@ def format_price(price: int) -> str:
|
||||
return " ".join(reversed(parts)) + " Kč"
|
||||
|
||||
|
||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
def scrape(max_properties: int | None = None):
|
||||
logger.info("=" * 60)
|
||||
logger.info("Stahuji inzeráty z PSN.cz")
|
||||
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
||||
logger.info(f"Min. plocha: {MIN_AREA} m²")
|
||||
logger.info(f"Patro: od {MIN_FLOOR}. NP")
|
||||
logger.info(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
|
||||
if max_pages:
|
||||
logger.info(f"Max. stran: {max_pages}")
|
||||
logger.info(f"Region: Praha")
|
||||
if max_properties:
|
||||
logger.info(f"Max. bytů: {max_properties}")
|
||||
logger.info("=" * 60)
|
||||
|
||||
# Fetch units from each Prague project
|
||||
all_units = []
|
||||
|
||||
for proj in PRAGUE_PROJECTS:
|
||||
page = 1
|
||||
project_units = []
|
||||
|
||||
while True:
|
||||
if max_pages and page > max_pages:
|
||||
logger.debug(f"Max pages limit reached: {max_pages}")
|
||||
break
|
||||
url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
|
||||
logger.info(f"{proj['name']} — strana {page} ...")
|
||||
time.sleep(0.5)
|
||||
# Jediný API požadavek — vrátí všechny jednotky (cca 236)
|
||||
params = urlencode({
|
||||
"locale": "cs",
|
||||
"filters": "{}",
|
||||
"type": "list",
|
||||
"order": "price-asc",
|
||||
"offset": 0,
|
||||
"limit": 500,
|
||||
})
|
||||
url = f"{UNITS_API}?{params}"
|
||||
logger.info("Stahuji jednotky z API ...")
|
||||
|
||||
try:
|
||||
html = fetch_url(url)
|
||||
data = fetch_json(url)
|
||||
except Exception as e:
|
||||
logger.error(f"Fetch error for {proj['name']}: {e}", exc_info=True)
|
||||
break
|
||||
logger.error(f"Chyba při stahování: {e}", exc_info=True)
|
||||
return []
|
||||
|
||||
units = extract_units_from_html(html)
|
||||
logger.debug(f"Project {proj['slug']} page {page}: extracted {len(units)} units")
|
||||
all_units = data.get("units", {}).get("data", [])
|
||||
logger.info(f"Staženo jednotek celkem: {len(all_units)}")
|
||||
|
||||
if not units:
|
||||
if page == 1:
|
||||
logger.info(f"→ 0 jednotek")
|
||||
break
|
||||
|
||||
# Add project info to each unit
|
||||
for unit in units:
|
||||
if not unit.get("latitude") or not unit.get("longitude"):
|
||||
unit["latitude"] = proj["lat"]
|
||||
unit["longitude"] = proj["lon"]
|
||||
unit["_project_name"] = proj["name"]
|
||||
unit["_project_slug"] = proj["slug"]
|
||||
|
||||
project_units.extend(units)
|
||||
|
||||
if page == 1:
|
||||
logger.info(f"→ {len(units)} jednotek na stránce")
|
||||
|
||||
# Check if there might be more pages
|
||||
# If we got fewer than expected or same units, stop
|
||||
if len(units) < 10:
|
||||
break
|
||||
|
||||
page += 1
|
||||
if page > 10: # Safety limit
|
||||
break
|
||||
|
||||
all_units.extend(project_units)
|
||||
|
||||
# Deduplicate by slug
|
||||
seen_slugs = set()
|
||||
unique_units = []
|
||||
for u in all_units:
|
||||
slug = u.get("slug", "")
|
||||
if slug and slug not in seen_slugs:
|
||||
seen_slugs.add(slug)
|
||||
unique_units.append(u)
|
||||
elif not slug:
|
||||
unique_units.append(u)
|
||||
|
||||
logger.info(f"\nStaženo celkem: {len(unique_units)} unikátních jednotek")
|
||||
|
||||
# Filter
|
||||
logger.info(f"\nFiltrování...")
|
||||
# Filtrování
|
||||
results = []
|
||||
excluded_sold = 0
|
||||
excluded_type = 0
|
||||
excluded_disp = 0
|
||||
excluded_price = 0
|
||||
excluded_area = 0
|
||||
excluded_floor = 0
|
||||
excluded_panel = 0
|
||||
excluded = {
|
||||
"prodáno": 0,
|
||||
"typ": 0,
|
||||
"město": 0,
|
||||
"dispozice": 0,
|
||||
"cena": 0,
|
||||
"plocha": 0,
|
||||
"patro": 0,
|
||||
}
|
||||
properties_fetched = 0
|
||||
|
||||
for unit in unique_units:
|
||||
for unit in all_units:
|
||||
if max_properties and properties_fetched >= max_properties:
|
||||
logger.debug(f"Max properties limit reached: {max_properties}")
|
||||
break
|
||||
unit_id = unit.get("id", unit.get("slug", "unknown"))
|
||||
# Only free units
|
||||
|
||||
unit_id = unit.get("id", "?")
|
||||
|
||||
# Pouze prodej bytů (type_id=0)
|
||||
if unit.get("type_id") != 0:
|
||||
excluded["typ"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (type_id={unit.get('type_id')}, není prodej bytu)")
|
||||
continue
|
||||
|
||||
# Pouze volné (ne rezervované, prodané, v přípravě)
|
||||
sale_status = unit.get("sale_status", "")
|
||||
is_free = unit.get("is_free", False)
|
||||
is_sold = unit.get("is_sold", False)
|
||||
if is_sold or not is_free:
|
||||
excluded_sold += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (sold/not free)")
|
||||
excluded["prodáno"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (status={sale_status})")
|
||||
continue
|
||||
|
||||
# Only apartments
|
||||
category = str(unit.get("category", "")).lower()
|
||||
if "byt" not in category and "ateliér" not in category:
|
||||
excluded_type += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (not apartment, category={category})")
|
||||
# Pouze Praha
|
||||
city = (unit.get("location") or unit.get("address", {}).get("city") or "").strip()
|
||||
# location field je typicky "Praha 4", "Praha 7" atd.
|
||||
city_base = city.split(" ")[0] if city else ""
|
||||
if city_base not in WANTED_CITIES:
|
||||
excluded["město"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (město={city})")
|
||||
continue
|
||||
|
||||
# Disposition
|
||||
# Dispozice
|
||||
disp = unit.get("disposition", "")
|
||||
if disp not in WANTED_DISPOSITIONS:
|
||||
excluded_disp += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (disposition {disp})")
|
||||
excluded["dispozice"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (dispozice={disp})")
|
||||
continue
|
||||
|
||||
# Price
|
||||
price = unit.get("price_czk") or unit.get("action_price_czk") or 0
|
||||
if price <= 0 or price > MAX_PRICE:
|
||||
excluded_price += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (price {price})")
|
||||
# Cena
|
||||
price = unit.get("action_price_czk") or unit.get("price_czk") or 0
|
||||
if not price or price <= 0 or price > MAX_PRICE:
|
||||
excluded["cena"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (cena={price})")
|
||||
continue
|
||||
|
||||
# Area
|
||||
# Plocha
|
||||
area = unit.get("total_area") or unit.get("floor_area") or 0
|
||||
if area < MIN_AREA:
|
||||
excluded_area += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (area {area} m²)")
|
||||
excluded["plocha"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (plocha={area} m²)")
|
||||
continue
|
||||
|
||||
# Floor
|
||||
# Patro
|
||||
floor_str = str(unit.get("floor", ""))
|
||||
floor = None
|
||||
if floor_str:
|
||||
try:
|
||||
floor = int(floor_str)
|
||||
except ValueError:
|
||||
floor_match = re.search(r'(-?\d+)', floor_str)
|
||||
if floor_match:
|
||||
floor = int(floor_match.group(1))
|
||||
m = re.search(r'(-?\d+)', floor_str)
|
||||
if m:
|
||||
floor = int(m.group(1))
|
||||
|
||||
if floor is not None and floor < MIN_FLOOR:
|
||||
excluded_floor += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (floor {floor})")
|
||||
excluded["patro"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (patro={floor})")
|
||||
continue
|
||||
|
||||
# Construction — check for panel
|
||||
build_type = str(unit.get("build_type", "")).lower()
|
||||
if "panel" in build_type:
|
||||
excluded_panel += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (panel construction)")
|
||||
logger.info(f"✗ Vyloučen: panel ({build_type})")
|
||||
# GPS — opravit prohozené souřadnice
|
||||
lat_raw = unit.get("latitude")
|
||||
lng_raw = unit.get("longitude")
|
||||
lat, lng = fix_gps(lat_raw, lng_raw)
|
||||
if not lat or not lng:
|
||||
logger.warning(f"id={unit_id}: chybí GPS souřadnice, přeskakuji")
|
||||
continue
|
||||
|
||||
# Build construction label
|
||||
building_type = "neuvedeno"
|
||||
if build_type and build_type != "nevybráno":
|
||||
if "cihlo" in build_type or "cihla" in build_type:
|
||||
building_type = "Cihlová"
|
||||
elif "skelet" in build_type:
|
||||
building_type = "Skeletová"
|
||||
# Sestavit adresu pro locality
|
||||
addr = unit.get("address") or {}
|
||||
street = addr.get("street", "")
|
||||
street_no = addr.get("street_no", "")
|
||||
if street and street_no:
|
||||
locality_str = f"{street} {street_no}, {city}"
|
||||
elif street:
|
||||
locality_str = f"{street}, {city}"
|
||||
else:
|
||||
building_type = build_type.capitalize()
|
||||
project_name = unit.get("project", "")
|
||||
locality_str = f"{project_name}, {city}" if project_name else city
|
||||
|
||||
lat = unit.get("latitude", 0)
|
||||
lon = unit.get("longitude", 0)
|
||||
|
||||
slug = unit.get("slug", "")
|
||||
project_slug = unit.get("_project_slug", "")
|
||||
detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}"
|
||||
# URL na detail jednotky
|
||||
unit_slug = unit.get("slug", "")
|
||||
project_slug = ""
|
||||
# project_slug lze odvodit z projektu nebo z reference_no
|
||||
# API nevrací project_slug přímo — použijeme reference_no nebo jen ID
|
||||
reference_no = unit.get("reference_no", "")
|
||||
if unit_slug:
|
||||
detail_url = f"{BASE_URL}/prodej/{unit_slug}"
|
||||
elif reference_no:
|
||||
detail_url = f"{BASE_URL}/prodej/{reference_no}"
|
||||
else:
|
||||
detail_url = BASE_URL
|
||||
|
||||
result = {
|
||||
"hash_id": unit.get("id", slug),
|
||||
"name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}",
|
||||
"hash_id": str(unit_id),
|
||||
"name": f"Prodej bytu {disp}, {int(area)} m² — {unit.get('project', locality_str)}",
|
||||
"price": int(price),
|
||||
"price_formatted": format_price(int(price)),
|
||||
"locality": f"{unit.get('street', unit.get('_project_name', ''))}, Praha",
|
||||
"locality": locality_str,
|
||||
"lat": lat,
|
||||
"lon": lon,
|
||||
"lon": lng,
|
||||
"disposition": disp,
|
||||
"floor": floor,
|
||||
"area": area,
|
||||
"building_type": building_type,
|
||||
"ownership": unit.get("ownership", "neuvedeno") or "neuvedeno",
|
||||
"area": float(area),
|
||||
"building_type": "neuvedeno",
|
||||
"ownership": "osobní",
|
||||
"url": detail_url,
|
||||
"source": "psn",
|
||||
"image": "",
|
||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
logger.info(f"\n{'=' * 60}")
|
||||
logger.info(f"Výsledky PSN:")
|
||||
logger.info(f" Celkem jednotek: {len(unique_units)}")
|
||||
logger.info(f" Vyloučeno (prodáno): {excluded_sold}")
|
||||
logger.info(f" Vyloučeno (typ): {excluded_type}")
|
||||
logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
|
||||
logger.info(f" Vyloučeno (cena): {excluded_price}")
|
||||
logger.info(f" Vyloučeno (plocha): {excluded_area}")
|
||||
logger.info(f" Vyloučeno (patro): {excluded_floor}")
|
||||
logger.info(f" Vyloučeno (panel): {excluded_panel}")
|
||||
logger.info(f" Staženo jednotek: {len(all_units)}")
|
||||
for reason, count in excluded.items():
|
||||
if count:
|
||||
logger.info(f" Vyloučeno ({reason}): {count}")
|
||||
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
||||
logger.info(f"{'=' * 60}")
|
||||
|
||||
@@ -320,15 +246,13 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
|
||||
parser.add_argument("--max-pages", type=int, default=None,
|
||||
help="Maximum number of listing pages per project to scrape")
|
||||
parser.add_argument("--max-properties", type=int, default=None,
|
||||
help="Maximum number of properties to include in results")
|
||||
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||
parser.add_argument("--log-level", type=str, default="INFO",
|
||||
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||
help="Logging level (default: INFO)")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, args.log_level),
|
||||
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
|
||||
@@ -336,7 +260,7 @@ if __name__ == "__main__":
|
||||
)
|
||||
|
||||
start = time.time()
|
||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
||||
estates = scrape(max_properties=args.max_properties)
|
||||
|
||||
if estates:
|
||||
json_path = Path("byty_psn.json")
|
||||
@@ -346,6 +270,6 @@ if __name__ == "__main__":
|
||||
)
|
||||
elapsed = time.time() - start
|
||||
logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
|
||||
logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
|
||||
logger.info(f"⏱ Celkový čas: {elapsed:.1f} s")
|
||||
else:
|
||||
logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")
|
||||
|
||||
@@ -7,6 +7,7 @@ Výstup: byty_realingo.json
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
@@ -314,6 +315,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
"url": f"{BASE_URL}{item['url']}",
|
||||
"source": "realingo",
|
||||
"image": "",
|
||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
204
status.html
Normal file
204
status.html
Normal file
@@ -0,0 +1,204 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="cs">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Scraper status</title>
|
||||
<style>
|
||||
* { margin: 0; padding: 0; box-sizing: border-box; }
|
||||
body {
|
||||
font-family: system-ui, -apple-system, sans-serif;
|
||||
background: #f5f5f5; color: #333;
|
||||
padding: 24px; max-width: 640px; margin: 0 auto;
|
||||
}
|
||||
h1 { font-size: 22px; margin-bottom: 4px; }
|
||||
.subtitle { color: #888; font-size: 13px; margin-bottom: 24px; }
|
||||
.card {
|
||||
background: white; border-radius: 12px; padding: 20px;
|
||||
box-shadow: 0 1px 4px rgba(0,0,0,0.08); margin-bottom: 16px;
|
||||
}
|
||||
.card h2 { font-size: 15px; margin-bottom: 12px; color: #555; }
|
||||
.timestamp {
|
||||
font-size: 28px; font-weight: 700; color: #1976D2;
|
||||
}
|
||||
.timestamp-ago { font-size: 13px; color: #999; margin-top: 2px; }
|
||||
|
||||
/* Source table */
|
||||
.source-table { width: 100%; border-collapse: collapse; }
|
||||
.source-table td { padding: 8px 0; border-bottom: 1px solid #f0f0f0; font-size: 14px; }
|
||||
.source-table tr:last-child td { border-bottom: none; }
|
||||
.source-table .name { font-weight: 600; }
|
||||
.source-table .count { text-align: right; font-variant-numeric: tabular-nums; }
|
||||
.source-table .rejected { text-align: right; color: #999; font-size: 12px; }
|
||||
.badge {
|
||||
display: inline-block; padding: 2px 8px; border-radius: 4px;
|
||||
font-size: 11px; font-weight: 600; color: white;
|
||||
}
|
||||
.badge-ok { background: #4CAF50; }
|
||||
.badge-err { background: #F44336; }
|
||||
.badge-skip { background: #FF9800; }
|
||||
|
||||
/* Summary bar */
|
||||
.summary-row {
|
||||
display: flex; justify-content: space-between; align-items: center;
|
||||
padding: 10px 0; border-bottom: 1px solid #f0f0f0;
|
||||
}
|
||||
.summary-row:last-child { border-bottom: none; }
|
||||
.summary-label { font-size: 13px; color: #666; }
|
||||
.summary-value { font-size: 18px; font-weight: 700; }
|
||||
|
||||
/* Source bar chart */
|
||||
.bar-row { display: flex; align-items: center; gap: 8px; margin: 4px 0; }
|
||||
.bar-label { width: 90px; font-size: 12px; text-align: right; color: #666; }
|
||||
.bar-track { flex: 1; height: 20px; background: #f0f0f0; border-radius: 4px; overflow: hidden; position: relative; }
|
||||
.bar-fill { height: 100%; border-radius: 4px; transition: width 0.5s ease; }
|
||||
.bar-count { font-size: 12px; width: 36px; font-variant-numeric: tabular-nums; }
|
||||
|
||||
/* Loader */
|
||||
.loader-wrap {
|
||||
display: flex; flex-direction: column; align-items: center;
|
||||
justify-content: center; padding: 60px 0;
|
||||
}
|
||||
.spinner {
|
||||
width: 40px; height: 40px; border: 4px solid #e0e0e0;
|
||||
border-top-color: #1976D2; border-radius: 50%;
|
||||
animation: spin 0.8s linear infinite;
|
||||
}
|
||||
@keyframes spin { to { transform: rotate(360deg); } }
|
||||
.loader-text { margin-top: 16px; color: #999; font-size: 14px; }
|
||||
|
||||
.error-msg { color: #F44336; padding: 40px 0; text-align: center; }
|
||||
.link-row { text-align: center; margin-top: 8px; }
|
||||
.link-row a { color: #1976D2; text-decoration: none; font-size: 14px; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h1>Scraper status</h1>
|
||||
<div class="subtitle">maru-hleda-byt</div>
|
||||
|
||||
<div id="content">
|
||||
<div class="loader-wrap">
|
||||
<div class="spinner"></div>
|
||||
<div class="loader-text">Nacitam status...</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="link-row"><a href="mapa_bytu.html">Otevrit mapu</a></div>
|
||||
|
||||
<script>
|
||||
// Brand color per scraper source; keys are lowercase source names
// (looked up via s.name.toLowerCase() when rendering the bar chart).
var COLORS = {
  sreality:    '#1976D2',
  realingo:    '#7B1FA2',
  bezrealitky: '#E65100',
  idnes:       '#C62828',
  psn:         '#2E7D32',
  cityhome:    '#00838F'
};
|
||||
|
||||
// Return a short Czech relative-time label (ASCII, no diacritics)
// for how long ago `dateStr` was, measured against the current time.
function timeAgo(dateStr) {
  var elapsed = Math.floor((Date.now() - new Date(dateStr).getTime()) / 1000);
  if (elapsed < 60) return 'prave ted';
  // [upper bound in seconds, divisor, suffix] — first matching bound wins.
  var units = [
    [3600, 60, ' min zpet'],
    [86400, 3600, ' hod zpet']
  ];
  for (var i = 0; i < units.length; i++) {
    if (elapsed < units[i][0]) {
      return Math.floor(elapsed / units[i][1]) + units[i][2];
    }
  }
  return Math.floor(elapsed / 86400) + ' dni zpet';
}
|
||||
|
||||
// Format a date string as e.g. "5. brezna 2024, 09:07".
// Month names are Czech genitive forms, deliberately ASCII-only.
function formatDate(dateStr) {
  var MONTHS = ['ledna','unora','brezna','dubna','kvetna','cervna',
                'cervence','srpna','zari','rijna','listopadu','prosince'];
  var dt = new Date(dateStr);
  var pad = function (n) { return String(n).padStart(2, '0'); };
  return dt.getDate() + '. ' + MONTHS[dt.getMonth()] + ' ' + dt.getFullYear() +
         ', ' + pad(dt.getHours()) + ':' + pad(dt.getMinutes());
}
|
||||
|
||||
// Build the status page from a parsed status.json payload and inject it
// into #content. Expects (per this file's usage): data.status, data.timestamp,
// data.duration_sec, data.deduplicated and data.sources[] with
// {name, accepted, rejected, error}.
function render(data) {
  var target = document.getElementById('content');

  // While a scrape is in progress the backend reports status "running":
  // show the spinner and poll again in 30 s.
  if (data.status === 'running') {
    target.innerHTML =
      '<div class="loader-wrap">' +
      '<div class="spinner"></div>' +
      '<div class="loader-text">Scraper prave bezi...</div>' +
      '</div>';
    setTimeout(loadStatus, 30000);
    return;
  }

  var srcList = data.sources || [];
  var acceptedTotal = 0;
  var rejectedTotal = 0;
  var topCount = 0;
  for (var i = 0; i < srcList.length; i++) {
    acceptedTotal += srcList[i].accepted || 0;
    rejectedTotal += srcList[i].rejected || 0;
    if (srcList[i].accepted > topCount) topCount = srcList[i].accepted;
  }

  var out = [];

  // Card 1: timestamp of the last completed scrape.
  out.push('<div class="card">');
  out.push('<h2>Posledni scrape</h2>');
  out.push('<div class="timestamp">' + formatDate(data.timestamp) + '</div>');
  out.push('<div class="timestamp-ago">' + timeAgo(data.timestamp) + '</div>');
  if (data.duration_sec) {
    out.push('<div class="timestamp-ago">Trvani: ' + Math.round(data.duration_sec) + 's</div>');
  }
  out.push('</div>');

  // Card 2: aggregate accepted/rejected counts across all sources.
  out.push('<div class="card">');
  out.push('<h2>Souhrn</h2>');
  out.push('<div class="summary-row"><span class="summary-label">Vyhovujicich bytu</span><span class="summary-value" style="color:#4CAF50">' + acceptedTotal + '</span></div>');
  out.push('<div class="summary-row"><span class="summary-label">Vyloucenych</span><span class="summary-value" style="color:#999">' + rejectedTotal + '</span></div>');
  if (data.deduplicated !== undefined) {
    out.push('<div class="summary-row"><span class="summary-label">Po deduplikaci (v mape)</span><span class="summary-value" style="color:#1976D2">' + data.deduplicated + '</span></div>');
  }
  out.push('</div>');

  // Card 3: one bar row per source, scaled against the largest count.
  out.push('<div class="card">');
  out.push('<h2>Zdroje</h2>');
  for (var j = 0; j < srcList.length; j++) {
    var entry = srcList[j];
    var barColor = COLORS[entry.name.toLowerCase()] || '#999';
    var widthPct = topCount > 0 ? Math.round((entry.accepted / topCount) * 100) : 0;
    var badge;
    if (entry.error) {
      badge = '<span class="badge badge-err">chyba</span>';
    } else if (entry.accepted === 0) {
      badge = '<span class="badge badge-skip">0</span>';
    } else {
      badge = '<span class="badge badge-ok">OK</span>';
    }

    out.push('<div style="margin-bottom:12px;">');
    out.push('<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:4px;">');
    out.push('<span style="font-weight:600;font-size:14px;">' + entry.name + ' ' + badge + '</span>');
    out.push('<span style="font-size:12px;color:#999;">' + (entry.rejected || 0) + ' vyloucenych</span>');
    out.push('</div>');
    out.push('<div class="bar-row">');
    out.push('<div class="bar-track"><div class="bar-fill" style="width:' + widthPct + '%;background:' + barColor + ';"></div></div>');
    out.push('<span class="bar-count">' + (entry.accepted || 0) + '</span>');
    out.push('</div>');
    out.push('</div>');
  }
  out.push('</div>');

  target.innerHTML = out.join('');
}
|
||||
|
||||
// Fetch status.json (cache-busted with a timestamp query) and hand the
// parsed payload to render(); on any failure show a placeholder message.
function loadStatus() {
  var onFail = function (err) {
    document.getElementById('content').innerHTML =
      '<div class="error-msg">Status zatim neni k dispozici.<br><small>(' + err.message + ')</small></div>';
  };
  fetch('status.json?t=' + Date.now())
    .then(function (resp) {
      if (!resp.ok) throw new Error(resp.status);
      return resp.json();
    })
    .then(render)
    .catch(onFail);
}

// Kick off the first fetch on page load.
loadStatus();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user