1 Commits

Author SHA1 Message Date
b8d4d44164 Rewrite PSN + CityHome scrapers, add price/m² map coloring, ratings system, and status dashboard
- Rewrite PSN scraper to use /api/units-list endpoint (single API call, no HTML parsing)
- Fix CityHome scraper: GPS from multiple URL patterns, address from table cells, no 404 retries
- Color map markers by price/m² instead of disposition (blue→green→orange→red scale)
- Add persistent rating system (favorite/reject) with Flask ratings server and localStorage fallback
- Rejected markers show original color at reduced opacity with 🚫 SVG overlay
- Favorite markers shown as star icons with gold pulse animation
- Add "new today" marker logic (scraped_at == today) with larger pulsing green outline
- Add filter panel with floor, price, hide-rejected controls and ☰/✕ toggle buttons
- Add generate_status.py for scraper run statistics and status.html dashboard
- Add scraped_at field to all scrapers for freshness tracking
- Update run_all.sh with log capture and status generation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 15:15:25 +01:00
19 changed files with 13199 additions and 1808 deletions

View File

@@ -30,7 +30,6 @@ jobs:
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
TAG=${{ inputs.tag }} TAG=${{ inputs.tag }}
fi fi
REPO=gitea.home.hrajfrisbee.cz/${{ github.repository }} IMAGE=gitea.home.hrajfrisbee.cz/${{ github.repository }}:$TAG
docker build -f build/Dockerfile -t $REPO:$TAG -t $REPO:latest . docker build -f build/Dockerfile -t $IMAGE .
docker push $REPO:$TAG docker push $IMAGE
docker push $REPO:latest

View File

@@ -10,7 +10,7 @@ WORKDIR /app
COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \ COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \
scrape_idnes.py scrape_psn.py scrape_cityhome.py \ scrape_idnes.py scrape_psn.py scrape_cityhome.py \
merge_and_map.py regen_map.py run_all.sh ./ merge_and_map.py regen_map.py run_all.sh ratings_server.py ./
COPY build/crontab /etc/crontabs/root COPY build/crontab /etc/crontabs/root
COPY build/entrypoint.sh /entrypoint.sh COPY build/entrypoint.sh /entrypoint.sh
@@ -18,7 +18,7 @@ RUN chmod +x /entrypoint.sh run_all.sh
RUN mkdir -p /app/data RUN mkdir -p /app/data
EXPOSE 8080 EXPOSE 8080 8081
HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \ HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \
CMD wget -q -O /dev/null http://localhost:8080/ || exit 1 CMD wget -q -O /dev/null http://localhost:8080/ || exit 1

View File

@@ -1 +1 @@
0 6,18 * * * cd /app && bash /app/run_all.sh --data-dir /app/data >> /proc/1/fd/1 2>> /proc/1/fd/2 0 6,18 * * * cd /app && bash /app/run_all.sh >> /proc/1/fd/1 2>> /proc/1/fd/2

View File

@@ -3,11 +3,23 @@ set -euo pipefail
DATA_DIR="/app/data" DATA_DIR="/app/data"
# Create symlinks so scripts (which write to /app/) persist data to the volume
for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \
byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \
mapa_bytu.html ratings.json; do
# Remove real file if it exists (e.g. baked into image)
[ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f"
ln -sf "$DATA_DIR/$f" "/app/$f"
done
echo "[entrypoint] Starting crond..." echo "[entrypoint] Starting crond..."
crond -b -l 2 crond -b -l 2
echo "[entrypoint] Starting initial scrape in background..." echo "[entrypoint] Starting initial scrape in background..."
bash /app/run_all.sh --data-dir "$DATA_DIR" & bash /app/run_all.sh &
echo "[entrypoint] Starting ratings API server on port 8081..."
DATA_DIR="$DATA_DIR" python3 /app/ratings_server.py &
echo "[entrypoint] Starting HTTP server on port 8080..." echo "[entrypoint] Starting HTTP server on port 8080..."
exec python3 -m http.server 8080 --directory "$DATA_DIR" exec python3 -m http.server 8080 --directory "$DATA_DIR"

View File

@@ -1,9 +0,0 @@
#!/bin/bash
docker rm -f maru-hleda-byt
# gitea registry login with kacerr / token
docker run -d --name maru-hleda-byt \
-p 8080:8080 \
-v /srv/maru-hleda-byt/data:/app/data \
gitea.home.hrajfrisbee.cz/littlemeat/maru-hleda-byt:0.01

File diff suppressed because it is too large Load Diff

202
generate_status.py Normal file
View File

@@ -0,0 +1,202 @@
#!/usr/bin/env python3
"""Generate status.json from scraper JSON outputs and run log."""
from __future__ import annotations
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
# Directory this script lives in; scraper outputs are expected as sibling files.
HERE = Path(__file__).parent

# Dashboard display name → scraper output file, in the order shown on status.html.
SOURCE_FILES = {
    "Sreality": "byty_sreality.json",
    "Realingo": "byty_realingo.json",
    "Bezrealitky": "byty_bezrealitky.json",
    "iDNES": "byty_idnes.json",
    "PSN": "byty_psn.json",
    "CityHome": "byty_cityhome.json",
}

# Deduplicated output produced by merge_and_map.py.
MERGED_FILE = "byty_merged.json"
def count_source(path: Path) -> dict:
    """Inspect one scraper output file.

    Returns a dict with the number of accepted listings (``accepted``) and
    the file's modification time (``updated_at``), or an ``error`` message
    when the file is missing or cannot be parsed.
    """
    if not path.exists():
        # Czech: "file not found" — shown verbatim on the status dashboard.
        return {"accepted": 0, "error": "soubor nenalezen"}
    try:
        listings = json.loads(path.read_text(encoding="utf-8"))
        stamp = datetime.fromtimestamp(path.stat().st_mtime)
        return {
            "accepted": len(listings),
            "updated_at": stamp.isoformat(timespec="seconds"),
        }
    except Exception as exc:
        # Unreadable / invalid JSON — report the reason instead of crashing.
        return {"accepted": 0, "error": str(exc)}
def parse_log(log_path: Optional[str]) -> dict[str, dict]:
    """Parse scraper run log and extract per-source statistics.

    Scrapers log summary lines like:
        ✓ Vyhovující byty: 12
        Vyloučeno (prodáno): 5
        Staženo stránek: 3
        Staženo inzerátů: 48
        Celkem bytů v cache: 120
    and section headers like:
        [2/6] Realingo

    Returns a mapping of source name (key of SOURCE_FILES) to a dict of
    whichever numeric stats were found in that source's log section.
    Returns {} when the log is absent or contains no section headers.
    """
    # main() may pass None when no log file argument was given.
    if not log_path or not os.path.exists(log_path):
        return {}
    with open(log_path, encoding="utf-8") as f:
        content = f.read()
    # Split into per-source sections by the [N/6] Step header
    # Each section header looks like "[2/6] Realingo\n----..."
    section_pattern = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE)
    sections_found = list(section_pattern.finditer(content))
    if not sections_found:
        return {}
    stats = {}
    for i, match in enumerate(sections_found):
        step_name = match.group(2).strip()
        # Section body runs from the end of this header to the start of the
        # next header (or end of file for the last section).
        start = match.end()
        end = sections_found[i + 1].start() if i + 1 < len(sections_found) else len(content)
        section_text = content[start:end]
        # Identify which sources this section covers
        # "PSN + CityHome" covers both
        source_names = []
        for name in SOURCE_FILES:
            if name.lower() in step_name.lower():
                source_names.append(name)
        if not source_names:
            continue

        # Helper closes over section_text, so it must live inside the loop.
        def extract(pattern: str) -> Optional[int]:
            m = re.search(pattern, section_text)
            return int(m.group(1)) if m else None

        # Lines present in all/most scrapers
        accepted = extract(r'Vyhovující byty[:\s]+(\d+)')
        fetched = extract(r'Staženo inzerátů[:\s]+(\d+)')
        pages = extract(r'Staženo stránek[:\s]+(\d+)')
        cached = extract(r'Celkem bytů v cache[:\s]+(\d+)')
        cache_hits = extract(r'Cache hit[:\s]+(\d+)')
        # Rejection reasons — collect all into a dict
        excluded = {}
        for m in re.finditer(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)', section_text):
            excluded[m.group(1)] = int(m.group(2))
        # Also PSN-style "Vyloučeno (prodáno): N"
        # Fallback matches an un-parenthesized total ("Vyloučeno: N").
        total_excluded = sum(excluded.values()) if excluded else extract(r'Vyloučen\w*[:\s]+(\d+)')
        entry = {}
        if accepted is not None:
            entry["accepted"] = accepted
        if fetched is not None:
            entry["fetched"] = fetched
        if pages is not None:
            entry["pages"] = pages
        if cached is not None:
            entry["cached"] = cached
        if cache_hits is not None:
            entry["cache_hits"] = cache_hits
        if excluded:
            entry["excluded"] = excluded
        elif total_excluded is not None:
            entry["excluded_total"] = total_excluded
        # NOTE(review): a combined step ("PSN + CityHome") assigns the SAME
        # dict object to every covered source; callers only read it, so the
        # aliasing is currently harmless.
        for name in source_names:
            stats[name] = entry
    return stats
def main():
    """CLI entry point: build status.json from scraper outputs and the run log.

    Positional argv: [start_time_iso] [duration_seconds] [log_file] — all
    optional; missing values fall back to "now" / None.
    """
    started = None
    duration = None
    if len(sys.argv) >= 3:
        started = sys.argv[1]
        try:
            duration = int(sys.argv[2])
        except ValueError:
            pass
    if not started:
        started = datetime.now().isoformat(timespec="seconds")
    log_file = sys.argv[3] if len(sys.argv) >= 4 else None
    per_source_log = parse_log(log_file)

    sources = []
    for label, fname in SOURCE_FILES.items():
        entry = count_source(HERE / fname)
        entry["name"] = label
        # Fold in the stats parsed from the run log.
        extra = per_source_log.get(label, {})
        for key in ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total"):
            if key in extra:
                entry[key] = extra[key]
        # The log reflects the latest run, so it overrides the file count.
        if "accepted" in extra:
            entry["accepted"] = extra["accepted"]
        sources.append(entry)

    # Sum before deduplication — duplicates across sources are still counted.
    total_accepted = sum(item.get("accepted", 0) for item in sources)

    deduplicated = 0
    merged_file = HERE / MERGED_FILE
    if merged_file.exists():
        try:
            deduplicated = len(json.loads(merged_file.read_text(encoding="utf-8")))
        except Exception:
            pass
    duplicates_removed = total_accepted - deduplicated if deduplicated else 0

    status = {
        "status": "done",
        "timestamp": started,
        "duration_sec": duration,
        "total_accepted": total_accepted,
        "deduplicated": deduplicated,
        "duplicates_removed": duplicates_removed,
        "sources": sources,
    }
    out = HERE / "status.json"
    out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8")

    # Human-readable summary (Czech) mirroring the JSON contents.
    print(f"Status uložen: {out}")
    print(f" Celkem bytů (před dedup): {total_accepted}")
    print(f" Po deduplikaci: {deduplicated}")
    if duplicates_removed:
        print(f" Odstraněno duplikátů: {duplicates_removed}")
    for item in sources:
        acc = item.get("accepted", 0)
        err = item.get("error", "")
        exc = item.get("excluded", {})
        exc_total = sum(exc.values()) if exc else item.get("excluded_total", 0)
        pieces = [f"{item['name']:12s}: {acc} bytů"]
        if exc_total:
            pieces.append(f"({exc_total} vyloučeno)")
        if err:
            pieces.append(f"[CHYBA: {err}]")
        print(" " + " ".join(pieces))


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -7,7 +7,6 @@ PSN a CityHome mají při deduplikaci prioritu (načtou se první).
""" """
from __future__ import annotations from __future__ import annotations
import argparse
import json import json
import re import re
from pathlib import Path from pathlib import Path
@@ -41,7 +40,7 @@ def dedup_key(estate: dict) -> str:
return f"{street}_{price}_{area}" return f"{street}_{price}_{area}"
def main(data_dir: str = "."): def main():
# Definice zdrojů — PSN a CityHome jako první (mají prioritu při deduplikaci) # Definice zdrojů — PSN a CityHome jako první (mají prioritu při deduplikaci)
sources = [ sources = [
("PSN", "byty_psn.json"), ("PSN", "byty_psn.json"),
@@ -52,11 +51,10 @@ def main(data_dir: str = "."):
("iDNES", "byty_idnes.json"), ("iDNES", "byty_idnes.json"),
] ]
data_path = Path(data_dir)
all_estates = [] all_estates = []
for label, filename in sources: for label, filename in sources:
path = data_path / filename path = Path(filename)
if path.exists(): if path.exists():
data = json.loads(path.read_text(encoding="utf-8")) data = json.loads(path.read_text(encoding="utf-8"))
# Ensure source is set (Sreality legacy) # Ensure source is set (Sreality legacy)
@@ -81,19 +79,6 @@ def main(data_dir: str = "."):
if key in seen_keys: if key in seen_keys:
dupes += 1 dupes += 1
existing = seen_keys[key] existing = seen_keys[key]
# Merge timestamps: keep earliest first_seen, latest last_updated
e_first = e.get("first_seen", "")
ex_first = existing.get("first_seen", "")
if e_first and ex_first:
existing["first_seen"] = min(e_first, ex_first)
elif e_first:
existing["first_seen"] = e_first
e_updated = e.get("last_updated", "")
ex_updated = existing.get("last_updated", "")
if e_updated and ex_updated:
existing["last_updated"] = max(e_updated, ex_updated)
elif e_updated:
existing["last_updated"] = e_updated
# Log it # Log it
print(f" Duplikát: {e['locality']} | {format_price(e['price'])} | {e.get('area', '?')}" print(f" Duplikát: {e['locality']} | {format_price(e['price'])} | {e.get('area', '?')}"
f"({e.get('source', '?')} vs {existing.get('source', '?')})") f"({e.get('source', '?')} vs {existing.get('source', '?')})")
@@ -113,7 +98,7 @@ def main(data_dir: str = "."):
print(f" {src}: {count}") print(f" {src}: {count}")
# Save merged data # Save merged data
merged_path = data_path / "byty_merged.json" merged_path = Path("byty_merged.json")
merged_path.write_text( merged_path.write_text(
json.dumps(deduplicated, ensure_ascii=False, indent=2), json.dumps(deduplicated, ensure_ascii=False, indent=2),
encoding="utf-8", encoding="utf-8",
@@ -121,12 +106,8 @@ def main(data_dir: str = "."):
print(f"\n✓ Sloučená data: {merged_path.resolve()}") print(f"\n✓ Sloučená data: {merged_path.resolve()}")
# Generate map # Generate map
generate_map(deduplicated, output_path=str(data_path / "mapa_bytu.html")) generate_map(deduplicated)
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Merge scraped data and generate map") main()
parser.add_argument("--data-dir", type=str, default=".",
help="Directory for reading/writing data files (default: current dir)")
args = parser.parse_args()
main(data_dir=args.data_dir)

116
ratings_server.py Normal file
View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""
Minimal HTTP API server for persisting apartment ratings.
GET /api/ratings → returns ratings.json contents
POST /api/ratings → saves entire ratings object
GET /api/ratings/export → same as GET, but with download header
Ratings file: /app/data/ratings.json (or ./ratings.json locally)
"""
import json
import logging
import os
import sys
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
# Port the ratings API listens on; overridable via env for local testing.
PORT = int(os.environ.get("RATINGS_PORT", 8081))
# Directory holding ratings.json (in Docker this is the /app/data volume).
DATA_DIR = Path(os.environ.get("DATA_DIR", "."))
RATINGS_FILE = DATA_DIR / "ratings.json"

# ISO-like timestamps, tagged "[ratings]" to distinguish from other services' logs.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [ratings] %(levelname)s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
)
log = logging.getLogger(__name__)
def load_ratings() -> dict:
    """Return the persisted ratings object, or {} when missing or unreadable."""
    try:
        if not RATINGS_FILE.exists():
            return {}
        return json.loads(RATINGS_FILE.read_text(encoding="utf-8"))
    except Exception as exc:
        # Corrupt/unreadable file: log and serve an empty ratings set.
        log.error("Failed to load ratings: %s", exc)
        return {}
def save_ratings(data: dict) -> None:
    """Persist the full ratings object to RATINGS_FILE.

    Writes atomically: serialize to a temp file in the same directory, then
    os.replace() it over the target, so a crash mid-write cannot leave a
    truncated/corrupt ratings.json behind.
    """
    tmp = RATINGS_FILE.with_name(RATINGS_FILE.name + ".tmp")
    tmp.write_text(
        json.dumps(data, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    # Atomic on POSIX when source and destination share a filesystem.
    os.replace(tmp, RATINGS_FILE)
class RatingsHandler(BaseHTTPRequestHandler):
    """Minimal JSON API handler for apartment ratings.

    Routes:
        GET  /api/ratings        → current ratings object
        GET  /api/ratings/export → same, with a download header
        POST /api/ratings        → replace the whole ratings object

    Every response carries permissive CORS headers so the map page (served
    from a different port) can call this API from the browser.
    """

    def log_message(self, format, *args):
        # Suppress default HTTP access log (we use our own)
        pass

    def _send_json(self, status: int, body: dict, extra_headers=None):
        """Serialize *body* as JSON and send it with status + CORS headers."""
        payload = json.dumps(body, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        if extra_headers:
            for k, v in extra_headers.items():
                self.send_header(k, v)
        self.end_headers()
        self.wfile.write(payload)

    def do_OPTIONS(self):
        # CORS preflight: no body, just the allow headers.
        self.send_response(204)
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()

    def do_GET(self):
        if self.path in ("/api/ratings", "/api/ratings/export"):
            ratings = load_ratings()
            extra = None
            if self.path == "/api/ratings/export":
                extra = {"Content-Disposition": 'attachment; filename="ratings.json"'}
            # FIX: original format string "GET %s%d ratings" ran the path and
            # count together (e.g. "/api/ratings42 ratings"); add a separator.
            log.info("GET %s → %d ratings", self.path, len(ratings))
            self._send_json(200, ratings, extra)
        else:
            self._send_json(404, {"error": "not found"})

    def do_POST(self):
        if self.path == "/api/ratings":
            length = int(self.headers.get("Content-Length", 0))
            if length == 0:
                self._send_json(400, {"error": "empty body"})
                return
            try:
                raw = self.rfile.read(length)
                data = json.loads(raw.decode("utf-8"))
            except Exception as e:
                log.warning("Bad request body: %s", e)
                self._send_json(400, {"error": "invalid JSON"})
                return
            # The client must POST the whole ratings object, not a list/scalar.
            if not isinstance(data, dict):
                self._send_json(400, {"error": "expected JSON object"})
                return
            save_ratings(data)
            log.info("POST /api/ratings → saved %d ratings", len(data))
            self._send_json(200, {"ok": True, "count": len(data)})
        else:
            self._send_json(404, {"error": "not found"})
if __name__ == "__main__":
    log.info("Ratings server starting on port %d, data dir: %s", PORT, DATA_DIR)
    log.info("Ratings file: %s", RATINGS_FILE)
    # Bind on all interfaces — in Docker the container port is mapped out.
    server = HTTPServer(("0.0.0.0", PORT), RatingsHandler)
    try:
        # Blocks forever; Ctrl+C exits cleanly with status 0.
        server.serve_forever()
    except KeyboardInterrupt:
        log.info("Stopped.")
        sys.exit(0)

View File

@@ -5,7 +5,6 @@ Doplní chybějící plochy ze Sreality API, opraví URL, aplikuje filtry.
""" """
from __future__ import annotations from __future__ import annotations
import argparse
import json import json
import time import time
import urllib.request import urllib.request
@@ -58,9 +57,8 @@ def fetch_area(hash_id: int) -> int | None:
return None return None
def main(data_dir: str = "."): def main():
data_path = Path(data_dir) json_path = Path("byty_sreality.json")
json_path = data_path / "byty_sreality.json"
if not json_path.exists(): if not json_path.exists():
print("Soubor byty_sreality.json nenalezen. Nejprve spusť scrape_and_map.py") print("Soubor byty_sreality.json nenalezen. Nejprve spusť scrape_and_map.py")
return return
@@ -102,19 +100,15 @@ def main(data_dir: str = "."):
print(f"Zbývá: {len(filtered)} bytů") print(f"Zbývá: {len(filtered)} bytů")
# Save updated data # Save updated data
filtered_path = data_path / "byty_sreality.json" filtered_path = Path("byty_sreality.json")
filtered_path.write_text( filtered_path.write_text(
json.dumps(filtered, ensure_ascii=False, indent=2), json.dumps(filtered, ensure_ascii=False, indent=2),
encoding="utf-8", encoding="utf-8",
) )
# Generate map # Generate map
generate_map(filtered, output_path=str(data_path / "mapa_bytu.html")) generate_map(filtered)
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Regenerate map from existing data") main()
parser.add_argument("--data-dir", type=str, default=".",
help="Directory for reading/writing data files (default: current dir)")
args = parser.parse_args()
main(data_dir=args.data_dir)

View File

@@ -4,7 +4,6 @@
# Použití: ./run_all.sh # Použití: ./run_all.sh
# Nebo s limity: ./run_all.sh --max-pages 1 --max-properties 10 # Nebo s limity: ./run_all.sh --max-pages 1 --max-properties 10
# Nebo s logováním: ./run_all.sh --log-level DEBUG # Nebo s logováním: ./run_all.sh --log-level DEBUG
# S vlastním adresářem: ./run_all.sh --data-dir /app/data
# ============================================================ # ============================================================
set -euo pipefail set -euo pipefail
cd "$(dirname "$0")" cd "$(dirname "$0")"
@@ -17,6 +16,12 @@ NC='\033[0m'
TOTAL=6 TOTAL=6
CURRENT=0 CURRENT=0
FAILED=0 FAILED=0
START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S")
START_EPOCH=$(date +%s)
LOG_FILE="$(pwd)/scrape_run.log"
# Mark status as running
echo '{"status":"running"}' > status.json
show_help() { show_help() {
echo "Usage: ./run_all.sh [OPTIONS]" echo "Usage: ./run_all.sh [OPTIONS]"
@@ -27,19 +32,16 @@ show_help() {
echo " --max-pages N Maximální počet stránek ke stažení z každého zdroje" echo " --max-pages N Maximální počet stránek ke stažení z každého zdroje"
echo " --max-properties N Maximální počet nemovitostí ke stažení z každého zdroje" echo " --max-properties N Maximální počet nemovitostí ke stažení z každého zdroje"
echo " --log-level LEVEL Úroveň logování (DEBUG, INFO, WARNING, ERROR)" echo " --log-level LEVEL Úroveň logování (DEBUG, INFO, WARNING, ERROR)"
echo " --data-dir DIR Adresář pro čtení/zápis datových souborů (default: .)"
echo " -h, --help Zobrazí tuto nápovědu" echo " -h, --help Zobrazí tuto nápovědu"
echo "" echo ""
echo "Examples:" echo "Examples:"
echo " ./run_all.sh # plný běh" echo " ./run_all.sh # plný běh"
echo " ./run_all.sh --max-pages 1 --max-properties 10 # rychlý test" echo " ./run_all.sh --max-pages 1 --max-properties 10 # rychlý test"
echo " ./run_all.sh --log-level DEBUG # s debug logováním" echo " ./run_all.sh --log-level DEBUG # s debug logováním"
echo " ./run_all.sh --data-dir /app/data # Docker produkce"
} }
# Parse arguments # Parse arguments
SCRAPER_ARGS="" SCRAPER_ARGS=""
DATA_DIR="."
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
-h|--help) -h|--help)
@@ -50,10 +52,6 @@ while [[ $# -gt 0 ]]; do
SCRAPER_ARGS="$SCRAPER_ARGS $1 $2" SCRAPER_ARGS="$SCRAPER_ARGS $1 $2"
shift 2 shift 2
;; ;;
--data-dir)
DATA_DIR="$2"
shift 2
;;
*) *)
echo "Unknown argument: $1" echo "Unknown argument: $1"
echo "" echo ""
@@ -63,8 +61,6 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
SCRAPER_ARGS="$SCRAPER_ARGS --data-dir $DATA_DIR"
step() { step() {
CURRENT=$((CURRENT + 1)) CURRENT=$((CURRENT + 1))
echo "" echo ""
@@ -73,6 +69,8 @@ step() {
} }
# ── Scrapery (paralelně kde to jde) ───────────────────────── # ── Scrapery (paralelně kde to jde) ─────────────────────────
# Tee all output to log file for status generation
exec > >(tee -a "$LOG_FILE") 2>&1
step "Sreality" step "Sreality"
python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); } python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); }
@@ -97,10 +95,16 @@ wait $PID_CH || { echo -e "${RED}✗ CityHome selhalo${NC}"; FAILED=$((FAILED +
# ── Sloučení + mapa ────────────────────────────────────────── # ── Sloučení + mapa ──────────────────────────────────────────
step "Sloučení dat a generování mapy" step "Sloučení dat a generování mapy"
python3 merge_and_map.py --data-dir "$DATA_DIR" || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((FAILED + 1)); } python3 merge_and_map.py || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((FAILED + 1)); }
# ── Otevření mapy ──────────────────────────────────────────── # ── Otevření mapy ────────────────────────────────────────────
# ── Generování statusu ─────────────────────────────────────
END_EPOCH=$(date +%s)
DURATION=$((END_EPOCH - START_EPOCH))
python3 generate_status.py "$START_TIME" "$DURATION" "$LOG_FILE"
echo "" echo ""
echo "============================================================" echo "============================================================"
if [ $FAILED -eq 0 ]; then if [ $FAILED -eq 0 ]; then
@@ -110,4 +114,4 @@ else
fi fi
echo "============================================================" echo "============================================================"
command -v open &>/dev/null && open "$DATA_DIR/mapa_bytu.html" || true command -v open &>/dev/null && open mapa_bytu.html || true

View File

@@ -207,10 +207,10 @@ def load_cache(json_path: str = "byty_sreality.json") -> dict[int, dict]:
return {} return {}
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."): def scrape(max_pages: int | None = None, max_properties: int | None = None):
"""Main scraping function. Returns list of filtered estates.""" """Main scraping function. Returns list of filtered estates."""
all_estates_raw = [] all_estates_raw = []
cache = load_cache(str(Path(data_dir) / "byty_sreality.json")) cache = load_cache()
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Stahuji inzeráty ze Sreality.cz") logger.info("Stahuji inzeráty ze Sreality.cz")
@@ -272,13 +272,9 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
# Check cache — if hash_id exists and price unchanged, reuse # Check cache — if hash_id exists and price unchanged, reuse
cached = cache.get(hash_id) cached = cache.get(hash_id)
today = datetime.now().strftime("%Y-%m-%d")
if cached and cached.get("price") == estate.get("price", 0): if cached and cached.get("price") == estate.get("price", 0):
cache_hits += 1 cache_hits += 1
logger.debug(f"Cache hit for hash_id={hash_id}") logger.debug(f"Cache hit for hash_id={hash_id}")
cached["last_updated"] = today
if "first_seen" not in cached:
cached["first_seen"] = today
results.append(cached) results.append(cached)
continue continue
@@ -336,11 +332,6 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
disp_cb = estate.get("_disposition_cb") or estate.get("seo", {}).get("category_sub_cb") disp_cb = estate.get("_disposition_cb") or estate.get("seo", {}).get("category_sub_cb")
seo = estate.get("seo", {}) seo = estate.get("seo", {})
# Preserve first_seen from cache if this is a price-changed re-fetch
first_seen = today
if cached and "first_seen" in cached:
first_seen = cached["first_seen"]
result = { result = {
"hash_id": hash_id, "hash_id": hash_id,
"name": estate.get("name", ""), "name": estate.get("name", ""),
@@ -356,8 +347,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
"ownership": ownership, "ownership": ownership,
"url": sreality_url(hash_id, seo), "url": sreality_url(hash_id, seo),
"image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""), "image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""),
"first_seen": first_seen, "scraped_at": datetime.now().strftime("%Y-%m-%d"),
"last_updated": today,
} }
results.append(result) results.append(result)
details_fetched += 1 details_fetched += 1
@@ -384,26 +374,58 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
"""Generate an interactive Leaflet.js HTML map.""" """Generate an interactive Leaflet.js HTML map."""
# Color by disposition # Color by price per m² — cool blue→warm red scale, no yellow
color_map = { # Thresholds based on Prague market distribution (p25=120k, p50=144k, p75=162k)
"3+kk": "#2196F3", # blue price_color_scale = [
"3+1": "#4CAF50", # green (110_000, "#1565C0"), # < 110k/m² → deep blue (levné)
"4+kk": "#FF9800", # orange (130_000, "#42A5F5"), # 110130k → light blue
"4+1": "#F44336", # red (150_000, "#66BB6A"), # 130150k → green (střed)
"5+kk": "#9C27B0", # purple (165_000, "#EF6C00"), # 150165k → dark orange
"5+1": "#795548", # brown (float("inf"), "#C62828"), # > 165k → dark red (drahé)
"6+": "#607D8B", # grey-blue ]
}
def fmt_date(d): def price_color(estate: dict) -> str:
"""Format ISO date (YYYY-MM-DD) to Czech format (DD.MM.YYYY).""" price = estate.get("price") or 0
if d and len(d) == 10: area = estate.get("area") or 0
return f"{d[8:10]}.{d[5:7]}.{d[:4]}" if not area:
return "" return "#9E9E9E"
ppm2 = price / area
for threshold, color in price_color_scale:
if ppm2 < threshold:
return color
return "#E53935"
# Legend bands for info panel (built once)
price_legend_items = (
'<div style="margin-bottom:4px;font-size:12px;color:#555;font-weight:600;">Cena / m²:</div>'
)
bands = [
("#1565C0", "< 110 000 Kč/m²"),
("#42A5F5", "110 130 000 Kč/m²"),
("#66BB6A", "130 150 000 Kč/m²"),
("#EF6C00", "150 165 000 Kč/m²"),
("#C62828", "> 165 000 Kč/m²"),
("#9E9E9E", "cena/plocha neuvedena"),
]
for bcolor, blabel in bands:
price_legend_items += (
f'<div style="display:flex;align-items:center;gap:6px;margin:2px 0;">'
f'<span style="width:14px;height:14px;border-radius:50%;background:{bcolor};'
f'display:inline-block;border:2px solid white;box-shadow:0 1px 3px rgba(0,0,0,0.3);flex-shrink:0;"></span>'
f'<span>{blabel}</span></div>'
)
# New marker indicator — bigger dot, no extra border
price_legend_items += (
'<div style="display:flex;align-items:center;gap:6px;margin:6px 0 0 0;'
'padding-top:6px;border-top:1px solid #eee;">'
'<span style="width:18px;height:18px;border-radius:50%;background:#66BB6A;'
'display:inline-block;box-shadow:0 1px 4px rgba(0,0,0,0.35);flex-shrink:0;"></span>'
'<span>Nové (z dnešního scrapu) — větší</span></div>'
)
markers_js = "" markers_js = ""
for e in estates: for e in estates:
color = color_map.get(e["disposition"], "#999999") color = price_color(e)
floor_text = f'{e["floor"]}. NP' if e["floor"] else "neuvedeno" floor_text = f'{e["floor"]}. NP' if e["floor"] else "neuvedeno"
area_text = f'{e["area"]}' if e["area"] else "neuvedeno" area_text = f'{e["area"]}' if e["area"] else "neuvedeno"
building_text = e["building_type"] or "neuvedeno" building_text = e["building_type"] or "neuvedeno"
@@ -421,31 +443,15 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
source_color = source_colors.get(source, "#999") source_color = source_colors.get(source, "#999")
hash_id = e.get("hash_id", "") hash_id = e.get("hash_id", "")
first_seen = e.get("first_seen", "")
last_updated = e.get("last_updated", "")
first_seen_fmt = fmt_date(first_seen) scraped_at = e.get("scraped_at", "")
last_updated_fmt = fmt_date(last_updated) is_new = scraped_at == datetime.now().strftime("%Y-%m-%d")
# "NOVÉ" badge if first_seen equals latest scrape date
new_badge = ""
if first_seen and first_seen == last_updated:
new_badge = ( new_badge = (
'<span style="margin-left:6px;font-size:10px;background:#4CAF50;color:white;' '<span style="margin-left:6px;font-size:11px;background:#FFD600;color:#333;'
'padding:1px 5px;border-radius:3px;font-weight:bold;">NOVÉ</span>' 'padding:1px 6px;border-radius:3px;font-weight:bold;">NOVÉ</span>'
if is_new else ""
) )
# Date info line
date_line = ""
if first_seen_fmt:
date_line = (
f'<div style="margin-top:4px;font-size:11px;color:#888;">'
f'Přidáno: {first_seen_fmt}'
)
if last_updated_fmt and last_updated != first_seen:
date_line += f' · Aktualizace: {last_updated_fmt}'
date_line += '</div>'
popup = ( popup = (
f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}">' f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}">'
f'<b style="font-size:14px;">{format_price(e["price"])}</b>' f'<b style="font-size:14px;">{format_price(e["price"])}</b>'
@@ -455,8 +461,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
f'{floor_note}<br><br>' f'{floor_note}<br><br>'
f'<b>{e["locality"]}</b><br>' f'<b>{e["locality"]}</b><br>'
f'Stavba: {building_text}<br>' f'Stavba: {building_text}<br>'
f'Vlastnictví: {ownership_text}' f'Vlastnictví: {ownership_text}<br><br>'
f'{date_line}<br>'
f'<a href="{e["url"]}" target="_blank" ' f'<a href="{e["url"]}" target="_blank" '
f'style="color:{source_color};text-decoration:none;font-weight:bold;">' f'style="color:{source_color};text-decoration:none;font-weight:bold;">'
f'→ Otevřít na {source_label}</a>' f'→ Otevřít na {source_label}</a>'
@@ -480,26 +485,32 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
popup = popup.replace("'", "\\'").replace("\n", "") popup = popup.replace("'", "\\'").replace("\n", "")
is_fav = source in ("psn", "cityhome") is_fav = source in ("psn", "cityhome")
marker_fn = "addHeartMarker" if is_fav else "addMarker"
if is_fav:
marker_fn = "addHeartMarker"
elif is_new:
marker_fn = "addNewMarker"
else:
marker_fn = "addMarker"
markers_js += ( markers_js += (
f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}', '{first_seen}');\n" f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}');\n"
) )
# Build legend # Build legend — price per m² bands + disposition counts
legend_items = "" legend_items = price_legend_items
# Disposition counts below the color legend
disp_counts = {} disp_counts = {}
for e in estates: for e in estates:
d = e["disposition"] d = e["disposition"]
disp_counts[d] = disp_counts.get(d, 0) + 1 disp_counts[d] = disp_counts.get(d, 0) + 1
for disp, color in color_map.items(): disp_order = ["3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+"]
count = disp_counts.get(disp, 0) disp_summary = ", ".join(
if count > 0: f"{d} ({disp_counts[d]})" for d in disp_order if d in disp_counts
)
legend_items += ( legend_items += (
f'<div style="display:flex;align-items:center;gap:6px;margin:3px 0;">' f'<div style="margin-top:8px;padding-top:6px;border-top:1px solid #eee;'
f'<span style="width:14px;height:14px;border-radius:50%;' f'font-size:12px;color:#666;">{disp_summary}</div>'
f'background:{color};display:inline-block;border:2px solid white;'
f'box-shadow:0 1px 3px rgba(0,0,0,0.3);"></span>'
f'<span>{disp} ({count})</span></div>'
) )
# Heart marker legend for PSN/CityHome # Heart marker legend for PSN/CityHome
@@ -535,6 +546,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
body {{ font-family: system-ui, -apple-system, sans-serif; }} body {{ font-family: system-ui, -apple-system, sans-serif; }}
#map {{ width: 100%; height: 100vh; }} #map {{ width: 100%; height: 100vh; }}
.heart-icon {{ background: none !important; border: none !important; }} .heart-icon {{ background: none !important; border: none !important; }}
.star-icon {{ background: none !important; border: none !important; }}
.rate-btn:hover {{ background: #f0f0f0 !important; }} .rate-btn:hover {{ background: #f0f0f0 !important; }}
.rate-btn.active-fav {{ background: #FFF9C4 !important; border-color: #FFC107 !important; }} .rate-btn.active-fav {{ background: #FFF9C4 !important; border-color: #FFC107 !important; }}
.rate-btn.active-rej {{ background: #FFEBEE !important; border-color: #F44336 !important; }} .rate-btn.active-rej {{ background: #FFEBEE !important; border-color: #F44336 !important; }}
@@ -545,13 +557,42 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
}} }}
.marker-favorite {{ animation: pulse-glow 2s ease-in-out infinite; border-radius: 50%; }} .marker-favorite {{ animation: pulse-glow 2s ease-in-out infinite; border-radius: 50%; }}
.heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }} .heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }}
.heart-icon-rej {{ opacity: 0.2 !important; }} .heart-icon-rej {{ opacity: 0.4 !important; filter: grayscale(1); }}
.reject-overlay {{ background: none !important; border: none !important; pointer-events: none !important; }}
@keyframes pulse-new {{
0% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
50% {{ stroke-opacity: 0.4; stroke-width: 6px; r: 12; }}
100% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
}}
.marker-new {{ animation: pulse-new 2s ease-in-out infinite; }}
.info-panel {{ .info-panel {{
position: absolute; top: 10px; right: 10px; z-index: 1000; position: absolute; top: 10px; right: 10px; z-index: 1000;
background: white; padding: 16px; border-radius: 10px; background: white; padding: 16px; border-radius: 10px;
box-shadow: 0 2px 12px rgba(0,0,0,0.15); max-width: 260px; box-shadow: 0 2px 12px rgba(0,0,0,0.15); max-width: 260px;
font-size: 13px; line-height: 1.5; font-size: 13px; line-height: 1.5;
transition: transform 0.3s ease, opacity 0.3s ease;
}} }}
.info-panel.collapsed {{
transform: translateX(calc(100% + 20px));
opacity: 0; pointer-events: none;
}}
.panel-open-btn {{
position: absolute; top: 10px; right: 10px; z-index: 1001;
width: 40px; height: 40px; border-radius: 8px;
background: white; border: none; cursor: pointer;
box-shadow: 0 2px 12px rgba(0,0,0,0.15);
font-size: 20px; display: flex; align-items: center; justify-content: center;
transition: opacity 0.3s ease;
}}
.panel-open-btn.hidden {{ opacity: 0; pointer-events: none; }}
.panel-close-btn {{
position: absolute; top: 8px; right: 8px;
width: 28px; height: 28px; border-radius: 6px;
background: none; border: 1px solid #ddd; cursor: pointer;
font-size: 16px; display: flex; align-items: center; justify-content: center;
color: #888;
}}
.panel-close-btn:hover {{ background: #f0f0f0; color: #333; }}
.info-panel h2 {{ font-size: 16px; margin-bottom: 8px; }} .info-panel h2 {{ font-size: 16px; margin-bottom: 8px; }}
.info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }} .info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }}
.filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }} .filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }}
@@ -559,18 +600,26 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
.filter-section input[type="checkbox"] {{ accent-color: #1976D2; }} .filter-section input[type="checkbox"] {{ accent-color: #1976D2; }}
#floor-filter {{ margin-top: 8px; }} #floor-filter {{ margin-top: 8px; }}
#floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }} #floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }}
.status-link {{ display: block; margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; text-align: center; }}
.status-link a {{ color: #1976D2; text-decoration: none; font-size: 12px; }}
@media (max-width: 600px) {{
.info-panel {{ max-width: calc(100vw - 60px); right: 10px; }}
.info-panel.collapsed {{ transform: translateX(calc(100% + 20px)); }}
.panel-close-btn {{ top: 6px; right: 6px; }}
}}
</style> </style>
</head> </head>
<body> <body>
<div id="map"></div> <div id="map"></div>
<div class="info-panel"> <button class="panel-open-btn hidden" id="panel-open-btn" onclick="togglePanel()">☰</button>
<div class="info-panel" id="info-panel">
<button class="panel-close-btn" id="panel-close-btn" onclick="togglePanel()">✕</button>
<h2>Byty v Praze</h2> <h2>Byty v Praze</h2>
<div class="stats"> <div class="stats">
<div>Celkem: <b id="visible-count">{len(estates)}</b> bytů</div> <div>Celkem: <b id="visible-count">{len(estates)}</b> bytů</div>
<div>Cena: {min_price}{max_price}</div> <div>Cena: {min_price}{max_price}</div>
<div>Průměr: {avg_price}</div> <div>Průměr: {avg_price}</div>
</div> </div>
<div><b>Dispozice:</b></div>
{legend_items} {legend_items}
<div class="filter-section"> <div class="filter-section">
<b>Filtry:</b> <b>Filtry:</b>
@@ -594,17 +643,6 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
</select> </select>
</label> </label>
</div> </div>
<div style="margin-top:6px;">
<label>Přidáno:
<select id="first-seen-filter" onchange="applyFilters()">
<option value="all">Vše</option>
<option value="1">Posledních 24h</option>
<option value="3">Poslední 3 dny</option>
<option value="7">Poslední týden</option>
<option value="14">Posledních 14 dní</option>
</select>
</label>
</div>
</div> </div>
<div class="filter-section"> <div class="filter-section">
<div id="rating-counts" style="margin-bottom:6px;font-size:12px;color:#666;"> <div id="rating-counts" style="margin-bottom:6px;font-size:12px;color:#666;">
@@ -615,6 +653,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
Skrýt zamítnuté Skrýt zamítnuté
</label> </label>
</div> </div>
<div class="status-link"><a href="status.html">Scraper status</a></div>
</div> </div>
<script> <script>
@@ -636,7 +675,7 @@ L.tileLayer('https://{{s}}.basemaps.cartocdn.com/light_only_labels/{{z}}/{{x}}/{
var allMarkers = []; var allMarkers = [];
function addMarker(lat, lon, color, popup, hashId, firstSeen) {{ function addMarker(lat, lon, color, popup, hashId) {{
var marker = L.circleMarker([lat, lon], {{ var marker = L.circleMarker([lat, lon], {{
radius: 8, radius: 8,
fillColor: color, fillColor: color,
@@ -645,11 +684,28 @@ function addMarker(lat, lon, color, popup, hashId, firstSeen) {{
opacity: 1, opacity: 1,
fillOpacity: 0.85, fillOpacity: 0.85,
}}).bindPopup(popup); }}).bindPopup(popup);
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, firstSeen: firstSeen }}; marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId }};
allMarkers.push(marker); allMarkers.push(marker);
marker.addTo(map); marker.addTo(map);
}} }}
function addNewMarker(lat, lon, color, popup, hashId) {{
var marker = L.circleMarker([lat, lon], {{
radius: 12,
fillColor: color,
color: color,
weight: 4,
opacity: 0.35,
fillOpacity: 0.95,
}}).bindPopup(popup);
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isNew: true }};
allMarkers.push(marker);
marker.addTo(map);
marker.on('add', function() {{
if (marker._path) marker._path.classList.add('marker-new');
}});
}}
function heartIcon(color) {{ function heartIcon(color) {{
var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">' var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">'
+ '<path d="M12 21.35l-1.45-1.32C5.4 15.36 2 12.28 2 8.5 ' + '<path d="M12 21.35l-1.45-1.32C5.4 15.36 2 12.28 2 8.5 '
@@ -665,11 +721,26 @@ function heartIcon(color) {{
}}); }});
}} }}
function addHeartMarker(lat, lon, color, popup, hashId, firstSeen) {{ function starIcon() {{
var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="28" height="28" viewBox="0 0 24 24">'
+ '<path d="M12 2l3.09 6.26L22 9.27l-5 4.87L18.18 22 12 18.27 '
+ '5.82 22 7 14.14 2 9.27l6.91-1.01L12 2z" '
+ 'fill="#FFC107" stroke="#F57F17" stroke-width="1" '
+ 'filter="drop-shadow(0 1px 3px rgba(0,0,0,0.3))"/></svg>';
return L.divIcon({{
html: svg,
className: 'star-icon',
iconSize: [28, 28],
iconAnchor: [14, 14],
popupAnchor: [0, -14],
}});
}}
function addHeartMarker(lat, lon, color, popup, hashId) {{
var marker = L.marker([lat, lon], {{ var marker = L.marker([lat, lon], {{
icon: heartIcon(color), icon: heartIcon(color),
}}).bindPopup(popup); }}).bindPopup(popup);
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isHeart: true, firstSeen: firstSeen }}; marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isHeart: true }};
allMarkers.push(marker); allMarkers.push(marker);
marker.addTo(map); marker.addTo(map);
}} }}
@@ -690,6 +761,36 @@ function saveRatings(ratings) {{
localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings)); localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings));
}} }}
function addRejectStrike(marker) {{
removeRejectStrike(marker);
var color = marker._data.color || '#999';
// SVG "no entry" icon — circle with diagonal line, colored to match marker
var svg = '<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20" height="20">'
+ '<circle cx="12" cy="12" r="10" fill="none" stroke="' + color + '" stroke-width="2.5" opacity="0.85"/>'
+ '<line x1="5.5" y1="5.5" x2="18.5" y2="18.5" stroke="' + color + '" stroke-width="2.5" stroke-linecap="round" opacity="0.85"/>'
+ '</svg>';
var icon = L.divIcon({{
className: 'reject-overlay',
html: svg,
iconSize: [20, 20],
iconAnchor: [10, 10],
}});
var m = L.marker([marker._data.lat, marker._data.lon], {{
icon: icon,
interactive: false,
pane: 'markerPane',
}});
m.addTo(map);
marker._rejectStrike = m;
}}
function removeRejectStrike(marker) {{
if (marker._rejectStrike) {{
map.removeLayer(marker._rejectStrike);
marker._rejectStrike = null;
}}
}}
function applyMarkerStyle(marker, status) {{ function applyMarkerStyle(marker, status) {{
if (marker._data.isHeart) {{ if (marker._data.isHeart) {{
var el = marker._icon; var el = marker._icon;
@@ -704,16 +805,33 @@ function applyMarkerStyle(marker, status) {{
}} }}
}} else {{ }} else {{
if (status === 'fav') {{ if (status === 'fav') {{
marker.setStyle({{ removeRejectStrike(marker);
radius: 12, fillOpacity: 1, weight: 3, if (!marker._data._origCircle) marker._data._origCircle = true;
fillColor: marker._data.color, color: '#fff', var popup = marker.getPopup();
}}); var popupContent = popup ? popup.getContent() : '';
if (marker._path) marker._path.classList.add('marker-favorite'); var wasOnMap = map.hasLayer(marker);
if (wasOnMap) map.removeLayer(marker);
var starMarker = L.marker([marker._data.lat, marker._data.lon], {{
icon: starIcon(),
}}).bindPopup(popupContent);
starMarker._data = marker._data;
var idx = allMarkers.indexOf(marker);
if (idx !== -1) allMarkers[idx] = starMarker;
if (wasOnMap) starMarker.addTo(map);
}} else if (status === 'reject') {{ }} else if (status === 'reject') {{
if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{
revertToCircle(marker, {{ radius: 6, fillOpacity: 0.35, fillColor: marker._data.color, color: '#fff', weight: 1 }});
}} else {{
marker.setStyle({{ marker.setStyle({{
radius: 6, fillOpacity: 0.15, fillColor: '#999', color: '#bbb', weight: 1, radius: 6, fillOpacity: 0.35, fillColor: marker._data.color, color: '#fff', weight: 1,
}}); }});
if (marker._path) marker._path.classList.remove('marker-favorite'); if (marker._path) marker._path.classList.remove('marker-favorite');
}}
// Add strikethrough line over the marker
addRejectStrike(marker);
}} else {{
if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{
revertToCircle(marker, {{ radius: 8, fillColor: marker._data.color, color: '#fff', weight: 2, fillOpacity: 0.85 }});
}} else {{ }} else {{
marker.setStyle({{ marker.setStyle({{
radius: 8, fillColor: marker._data.color, color: '#fff', radius: 8, fillColor: marker._data.color, color: '#fff',
@@ -721,8 +839,24 @@ function applyMarkerStyle(marker, status) {{
}}); }});
if (marker._path) marker._path.classList.remove('marker-favorite'); if (marker._path) marker._path.classList.remove('marker-favorite');
}} }}
if (marker._path) marker._path.classList.remove('marker-rejected');
removeRejectStrike(marker);
}} }}
}} }}
}}
function revertToCircle(marker, style) {{
var popup = marker.getPopup();
var popupContent = popup ? popup.getContent() : '';
var wasOnMap = map.hasLayer(marker);
if (wasOnMap) map.removeLayer(marker);
var cm = L.circleMarker([marker._data.lat, marker._data.lon], style).bindPopup(popupContent);
cm._data = marker._data;
delete cm._data._starRef;
var idx = allMarkers.indexOf(marker);
if (idx !== -1) allMarkers[idx] = cm;
if (wasOnMap) cm.addTo(map);
}}
function rateMarker(marker, action) {{ function rateMarker(marker, action) {{
var hashId = marker._data.hashId; var hashId = marker._data.hashId;
@@ -860,25 +994,13 @@ map.on('popupopen', function(e) {{
}}); }});
// ── Filters ──────────────────────────────────────────────────── // ── Filters ────────────────────────────────────────────────────
function daysAgoDate(days) {{
var d = new Date();
d.setDate(d.getDate() - days);
return d.toISOString().slice(0, 10);
}}
function applyFilters() {{ function applyFilters() {{
var minFloor = parseInt(document.getElementById('min-floor').value); var minFloor = parseInt(document.getElementById('min-floor').value);
var maxPrice = parseInt(document.getElementById('max-price').value); var maxPrice = parseInt(document.getElementById('max-price').value);
var hideRejected = document.getElementById('hide-rejected').checked; var hideRejected = document.getElementById('hide-rejected').checked;
var firstSeenVal = document.getElementById('first-seen-filter').value;
var ratings = loadRatings(); var ratings = loadRatings();
var visible = 0; var visible = 0;
var minFirstSeen = '';
if (firstSeenVal !== 'all') {{
minFirstSeen = daysAgoDate(parseInt(firstSeenVal));
}}
allMarkers.forEach(function(m) {{ allMarkers.forEach(function(m) {{
var popup = m.getPopup().getContent(); var popup = m.getPopup().getContent();
var floorMatch = popup.match(/(\\d+)\\. NP/); var floorMatch = popup.match(/(\\d+)\\. NP/);
@@ -891,19 +1013,18 @@ function applyFilters() {{
if (floor !== null && floor < minFloor) show = false; if (floor !== null && floor < minFloor) show = false;
if (price > maxPrice) show = false; if (price > maxPrice) show = false;
// Date filter
if (minFirstSeen && m._data.firstSeen) {{
if (m._data.firstSeen < minFirstSeen) show = false;
}}
var r = ratings[m._data.hashId]; var r = ratings[m._data.hashId];
if (hideRejected && r && r.status === 'reject') show = false; if (hideRejected && r && r.status === 'reject') show = false;
if (show) {{ if (show) {{
if (!map.hasLayer(m)) m.addTo(map); if (!map.hasLayer(m)) m.addTo(map);
visible++; visible++;
// Show strike line if rejected and visible
if (m._rejectStrike && !map.hasLayer(m._rejectStrike)) m._rejectStrike.addTo(map);
}} else {{ }} else {{
if (map.hasLayer(m)) map.removeLayer(m); if (map.hasLayer(m)) map.removeLayer(m);
// Hide strike line when marker hidden
if (m._rejectStrike && map.hasLayer(m._rejectStrike)) map.removeLayer(m._rejectStrike);
}} }}
}}); }});
@@ -921,6 +1042,26 @@ function applyFilters() {{
// Initialize ratings on load // Initialize ratings on load
restoreRatings(); restoreRatings();
// ── Panel toggle ──────────────────────────────────────────────
function togglePanel() {{
var panel = document.getElementById('info-panel');
var openBtn = document.getElementById('panel-open-btn');
var isOpen = !panel.classList.contains('collapsed');
if (isOpen) {{
panel.classList.add('collapsed');
openBtn.classList.remove('hidden');
}} else {{
panel.classList.remove('collapsed');
openBtn.classList.add('hidden');
}}
}}
// On mobile, start with panel collapsed
if (window.innerWidth <= 600) {{
document.getElementById('info-panel').classList.add('collapsed');
document.getElementById('panel-open-btn').classList.remove('hidden');
}}
</script> </script>
</body> </body>
</html>""" </html>"""
@@ -939,8 +1080,6 @@ if __name__ == "__main__":
parser.add_argument("--max-properties", type=int, help="Maximum number of properties to fetch details for") parser.add_argument("--max-properties", type=int, help="Maximum number of properties to fetch details for")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)") help="Logging level (default: INFO)")
parser.add_argument("--data-dir", type=str, default=".",
help="Directory for reading/writing data files (default: current dir)")
args = parser.parse_args() args = parser.parse_args()
# Configure logging # Configure logging
@@ -950,13 +1089,12 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()] handlers=[logging.StreamHandler()]
) )
data_dir = Path(args.data_dir)
start = time.time() start = time.time()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir) estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
if estates: if estates:
# Save raw data as JSON backup # Save raw data as JSON backup
json_path = data_dir / "byty_sreality.json" json_path = Path("byty_sreality.json")
json_path.write_text( json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2), json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8", encoding="utf-8",
@@ -964,7 +1102,7 @@ if __name__ == "__main__":
logger.info(f"✓ Data uložena: {json_path.resolve()}") logger.info(f"✓ Data uložena: {json_path.resolve()}")
# Generate map # Generate map
map_path = generate_map(estates, output_path=str(data_dir / "mapa_bytu.html")) map_path = generate_map(estates)
elapsed = time.time() - start elapsed = time.time() - start
logger.info(f"\n⏱ Celkový čas: {elapsed:.0f} s") logger.info(f"\n⏱ Celkový čas: {elapsed:.0f} s")
logger.info(f"\nOtevři v prohlížeči:\n file://{map_path}") logger.info(f"\nOtevři v prohlížeči:\n file://{map_path}")

View File

@@ -7,13 +7,13 @@ Výstup: byty_bezrealitky.json
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from datetime import datetime
import json import json
import logging import logging
import math import math
import re import re
import time import time
import urllib.request import urllib.request
from datetime import datetime
from pathlib import Path from pathlib import Path
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -170,8 +170,8 @@ def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
return {} return {}
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."): def scrape(max_pages: int | None = None, max_properties: int | None = None):
cache = load_cache(str(Path(data_dir) / "byty_bezrealitky.json")) cache = load_cache()
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Stahuji inzeráty z Bezrealitky.cz") logger.info("Stahuji inzeráty z Bezrealitky.cz")
@@ -285,14 +285,10 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
# Check cache — if hash_id exists and price unchanged, reuse # Check cache — if hash_id exists and price unchanged, reuse
adv_id = int(adv["id"]) adv_id = int(adv["id"])
adv_price = adv.get("price", 0) or 0 adv_price = adv.get("price", 0) or 0
today = datetime.now().strftime("%Y-%m-%d")
cached = cache.get(adv_id) cached = cache.get(adv_id)
if cached and cached.get("price") == adv_price: if cached and cached.get("price") == adv_price:
cache_hits += 1 cache_hits += 1
logger.debug(f"Cache hit for id={adv_id}") logger.debug(f"Cache hit for id={adv_id}")
cached["last_updated"] = today
if "first_seen" not in cached:
cached["first_seen"] = today
results.append(cached) results.append(cached)
continue continue
@@ -344,11 +340,6 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
if not address: if not address:
address = adv.get('address({"locale":"CS"})', "Praha") address = adv.get('address({"locale":"CS"})', "Praha")
# Preserve first_seen from cache if this is a price-changed re-fetch
first_seen = today
if cached and "first_seen" in cached:
first_seen = cached["first_seen"]
result = { result = {
"hash_id": int(adv["id"]), "hash_id": int(adv["id"]),
"name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {adv.get('surface', '?')}", "name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {adv.get('surface', '?')}",
@@ -365,8 +356,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
"url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}", "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
"source": "bezrealitky", "source": "bezrealitky",
"image": "", "image": "",
"first_seen": first_seen, "scraped_at": datetime.now().strftime("%Y-%m-%d"),
"last_updated": today,
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1
@@ -395,8 +385,6 @@ if __name__ == "__main__":
help="Maximum number of properties to fetch details for") help="Maximum number of properties to fetch details for")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)") help="Logging level (default: INFO)")
parser.add_argument("--data-dir", type=str, default=".",
help="Directory for reading/writing data files (default: current dir)")
args = parser.parse_args() args = parser.parse_args()
# Configure logging # Configure logging
@@ -406,12 +394,11 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()] handlers=[logging.StreamHandler()]
) )
data_dir = Path(args.data_dir)
start = time.time() start = time.time()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir) estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
if estates: if estates:
json_path = data_dir / "byty_bezrealitky.json" json_path = Path("byty_bezrealitky.json")
json_path.write_text( json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2), json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8", encoding="utf-8",

View File

@@ -34,24 +34,26 @@ HEADERS = {
BASE_URL = "https://www.city-home.cz" BASE_URL = "https://www.city-home.cz"
def fetch_url(url: str) -> str: def fetch_url(url: str, retries: int = 3) -> str:
"""Fetch URL and return HTML string.""" """Fetch URL and return HTML string. Raises HTTPError on 4xx/5xx."""
for attempt in range(3): for attempt in range(retries):
try: try:
logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}") logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
logger.debug(f"Headers: {HEADERS}")
req = urllib.request.Request(url, headers=HEADERS) req = urllib.request.Request(url, headers=HEADERS)
resp = urllib.request.urlopen(req, timeout=30) resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8") html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
return html return html
except urllib.error.HTTPError:
# Don't retry on HTTP errors (404, 403, etc.) — re-raise immediately
raise
except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e: except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
if attempt < 2: if attempt < retries - 1:
wait = (attempt + 1) * 2 wait = (attempt + 1) * 2
logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}") logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
time.sleep(wait) time.sleep(wait)
else: else:
logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True) logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
raise raise
@@ -125,31 +127,21 @@ def parse_filter_page(html: str) -> list[dict]:
if detail_url and not detail_url.startswith("http"): if detail_url and not detail_url.startswith("http"):
detail_url = BASE_URL + detail_url detail_url = BASE_URL + detail_url
# Extract floor from cells — look for pattern like "3.NP" or "2.PP" # Parse table cells: [unit_name, unit_type_label, address, floor, disposition, area, transaction, price]
cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL) cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
floor = None cell_texts = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]
floor_text = ""
project_name = ""
for cell in cells: # Cell[2] = address (e.g. "Žateckých 14"), cell[3] = floor (e.g. "3.NP")
cell_text = re.sub(r'<[^>]+>', '', cell).strip() project_address = cell_texts[2] if len(cell_texts) > 2 else ""
# Floor pattern
np_match = re.search(r'(\d+)\.\s*NP', cell_text) floor = None
pp_match = re.search(r'(\d+)\.\s*PP', cell_text) if len(cell_texts) > 3:
np_match = re.search(r'(\d+)\.\s*NP', cell_texts[3])
pp_match = re.search(r'(\d+)\.\s*PP', cell_texts[3])
if np_match: if np_match:
floor = int(np_match.group(1)) floor = int(np_match.group(1))
floor_text = cell_text
elif pp_match: elif pp_match:
floor = -int(pp_match.group(1)) # Underground floor = -int(pp_match.group(1))
floor_text = cell_text
# Extract project name — usually in a cell that's not a number/price/floor
for cell in cells:
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "" not in cell_text and "" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text:
if len(cell_text) > 3 and cell_text != unit_name:
project_name = cell_text
break
listing = { listing = {
"price": int(cena.group(1)), "price": int(cena.group(1)),
@@ -159,43 +151,58 @@ def parse_filter_page(html: str) -> list[dict]:
"project_id": project.group(1) if project else "", "project_id": project.group(1) if project else "",
"transaction": transaction.group(1) if transaction else "", "transaction": transaction.group(1) if transaction else "",
"disposition": dispozition.group(1) if dispozition else "", "disposition": dispozition.group(1) if dispozition else "",
"location": location.group(1) if location else "",
"url": detail_url, "url": detail_url,
"unit_name": unit_name, "unit_name": unit_name,
"floor": floor, "floor": floor,
"project_name": project_name, "project_address": project_address,
} }
listings.append(listing) listings.append(listing)
return listings return listings
def extract_project_gps(html: str) -> dict[str, tuple[float, float]]: def get_lokalita_urls(slug: str) -> list[str]:
"""Extract GPS coordinates for projects from locality pages.""" """Return candidate lokalita URLs to try in order."""
# Pattern in JS: ['<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name'] return [
gps_data = {} f"{BASE_URL}/projekty/{slug}/lokalita",
for match in re.finditer(r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'", html): f"{BASE_URL}/bytove-domy/{slug}/lokalita",
name = match.group(1).strip() f"{BASE_URL}/bytove-domy/{slug}/lokalita1",
lat = float(match.group(2)) ]
lon = float(match.group(3))
gps_data[name] = (lat, lon)
return gps_data
def load_previous(json_path: str = "byty_cityhome.json") -> dict[str, str]: def extract_project_gps(html: str) -> tuple[float, float] | None:
"""Load first_seen dates from previous run, keyed by hash_id.""" """Extract project GPS from lokalita page JS variable.
path = Path(json_path)
if not path.exists(): The page contains: var locations = [['<h4>Name</h4>...', 'LAT', 'LNG', 'CATEGORY', 'Label'], ...]
return {} Category '1' = the project's own marker. Some projects have two cat-1 entries (data error);
try: in that case we pick the one whose name contains a digit and is not a transit landmark.
data = json.loads(path.read_text(encoding="utf-8")) """
return {str(e["hash_id"]): e.get("first_seen", "") for e in data if "hash_id" in e} block = re.search(r'var locations\s*=\s*\[(.*?)\];', html, re.DOTALL)
except (json.JSONDecodeError, KeyError): if not block:
return {} return None
entries = re.findall(
r"'<h4>(.*?)</h4>.*?',\s*'([\d.]+)',\s*'([\d.]+)',\s*'1'",
block.group(0),
re.DOTALL,
)
if not entries:
return None
if len(entries) == 1:
return float(entries[0][1]), float(entries[0][2])
# Multiple cat-1 entries: pick the real project marker
transit_re = re.compile(r'nádraží|park|metro|tramvaj|autobus|zastávka', re.IGNORECASE)
for name, lat, lng in entries:
if re.search(r'\d', name) and not transit_re.search(name):
return float(lat), float(lng)
# Fallback: first entry
return float(entries[0][1]), float(entries[0][2])
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."): def scrape(max_pages: int | None = None, max_properties: int | None = None):
previous_first_seen = load_previous(str(Path(data_dir) / "byty_cityhome.json"))
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Stahuji inzeráty z CityHome (city-home.cz)") logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
logger.info(f"Cena: do {format_price(MAX_PRICE)}") logger.info(f"Cena: do {format_price(MAX_PRICE)}")
@@ -224,22 +231,24 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
# Fetch GPS for each project from locality pages # Fetch GPS for each project from locality pages
project_gps = {} project_gps = {}
for slug in sorted(project_slugs): for slug in sorted(project_slugs):
time.sleep(0.5) time.sleep(0.3)
gps = None
for url in get_lokalita_urls(slug):
try: try:
locality_url = f"{BASE_URL}/projekty/{slug}/lokalita" logger.debug(f"Fetching project GPS: {url}")
logger.debug(f"Fetching project GPS: {locality_url}") loc_html = fetch_url(url)
loc_html = fetch_url(locality_url)
gps = extract_project_gps(loc_html) gps = extract_project_gps(loc_html)
if gps: if gps:
# Take first entry (the project itself) break
first_name, (lat, lon) = next(iter(gps.items())) except Exception as e:
project_gps[slug] = (lat, lon) logger.debug(f"GPS fetch failed for {url}: {e}")
logger.info(f"{slug}: {lat}, {lon}") continue
if gps:
project_gps[slug] = gps
logger.info(f"{slug}: {gps[0]}, {gps[1]}")
else: else:
logger.info(f"{slug}: GPS nenalezeno") logger.info(f"{slug}: GPS nenalezeno")
except Exception as e:
logger.warning(f"Error fetching GPS for {slug}: {e}", exc_info=True)
logger.info(f"{slug}: chyba ({e})")
# Step 3: Filter listings # Step 3: Filter listings
logger.info(f"\nFáze 3: Filtrování...") logger.info(f"\nFáze 3: Filtrování...")
@@ -317,28 +326,37 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
lat, lon = gps lat, lon = gps
today = datetime.now().strftime("%Y-%m-%d") # locality: use project address from cell (e.g. "Žateckých 14") + city from GPS lookup
hash_id = f"cityhome_{slug}_{listing['unit_name']}" project_address = listing.get("project_address", "")
first_seen = previous_first_seen.get(str(hash_id), "") or today # derive city from slug (GPS lookup key)
city_map = {
"karlinske-namesti-5": "Praha 8",
"melnicka-12": "Praha 7",
"na-vaclavce-34": "Praha 5",
"nad-kajetankou-12": "Praha 6",
"vosmikovych-3": "Praha 9",
"zateckych-14": "Praha 2",
}
city_str = city_map.get(slug, "Praha")
locality_str = f"{project_address}, {city_str}" if project_address else city_str
result = { result = {
"hash_id": hash_id, "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
"name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}", "name": f"Prodej bytu {disp}, {int(area)} m² — {project_address}",
"price": price, "price": price,
"price_formatted": format_price(price), "price_formatted": format_price(price),
"locality": f"{listing['project_name']}, Praha", "locality": locality_str,
"lat": lat, "lat": lat,
"lon": lon, "lon": lon,
"disposition": disp, "disposition": disp,
"floor": floor, "floor": floor,
"area": area, "area": float(area),
"building_type": "Cihlová", # CityHome renovuje cihlové domy "building_type": "Cihlová", # CityHome renovuje cihlové domy
"ownership": "neuvedeno", "ownership": "neuvedeno",
"url": url, "url": url,
"source": "cityhome", "source": "cityhome",
"image": "", "image": "",
"first_seen": first_seen, "scraped_at": datetime.now().strftime("%Y-%m-%d"),
"last_updated": today,
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1
@@ -367,8 +385,6 @@ if __name__ == "__main__":
help="Maximum number of properties to include in results") help="Maximum number of properties to include in results")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)") help="Logging level (default: INFO)")
parser.add_argument("--data-dir", type=str, default=".",
help="Directory for reading/writing data files (default: current dir)")
args = parser.parse_args() args = parser.parse_args()
# Configure logging # Configure logging
@@ -378,12 +394,11 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()] handlers=[logging.StreamHandler()]
) )
data_dir = Path(args.data_dir)
start = time.time() start = time.time()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir) estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
if estates: if estates:
json_path = data_dir / "byty_cityhome.json" json_path = Path("byty_cityhome.json")
json_path.write_text( json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2), json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8", encoding="utf-8",

View File

@@ -7,6 +7,7 @@ Výstup: byty_idnes.json
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from datetime import datetime
import json import json
import logging import logging
import math import math
@@ -14,7 +15,6 @@ import re
import time import time
import urllib.request import urllib.request
import urllib.parse import urllib.parse
from datetime import datetime
from html.parser import HTMLParser from html.parser import HTMLParser
from pathlib import Path from pathlib import Path
@@ -278,8 +278,8 @@ def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
return {} return {}
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."): def scrape(max_pages: int | None = None, max_properties: int | None = None):
cache = load_cache(str(Path(data_dir) / "byty_idnes.json")) cache = load_cache()
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Stahuji inzeráty z Reality iDNES") logger.info("Stahuji inzeráty z Reality iDNES")
@@ -379,14 +379,10 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
logger.debug(f"Max properties limit reached: {max_properties}") logger.debug(f"Max properties limit reached: {max_properties}")
break break
# Check cache — if hash_id exists and price unchanged, reuse # Check cache — if hash_id exists and price unchanged, reuse
today = datetime.now().strftime("%Y-%m-%d")
cached = cache.get(str(item["id"])) cached = cache.get(str(item["id"]))
if cached and cached.get("price") == item["price"]: if cached and cached.get("price") == item["price"]:
cache_hits += 1 cache_hits += 1
logger.debug(f"Cache hit for id={item['id']}") logger.debug(f"Cache hit for id={item['id']}")
cached["last_updated"] = today
if "first_seen" not in cached:
cached["first_seen"] = today
results.append(cached) results.append(cached)
continue continue
@@ -447,11 +443,6 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
else: else:
building_type = construction.capitalize() building_type = construction.capitalize()
# Preserve first_seen from cache if this is a price-changed re-fetch
first_seen = today
if cached and "first_seen" in cached:
first_seen = cached["first_seen"]
result = { result = {
"hash_id": item["id"], "hash_id": item["id"],
"name": f"Prodej bytu {item['disposition']} {item.get('area', '?')}", "name": f"Prodej bytu {item['disposition']} {item.get('area', '?')}",
@@ -468,8 +459,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
"url": item["url"], "url": item["url"],
"source": "idnes", "source": "idnes",
"image": "", "image": "",
"first_seen": first_seen, "scraped_at": datetime.now().strftime("%Y-%m-%d"),
"last_updated": today,
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1
@@ -499,8 +489,6 @@ if __name__ == "__main__":
help="Maximum number of properties to fetch details for") help="Maximum number of properties to fetch details for")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)") help="Logging level (default: INFO)")
parser.add_argument("--data-dir", type=str, default=".",
help="Directory for reading/writing data files (default: current dir)")
args = parser.parse_args() args = parser.parse_args()
# Configure logging # Configure logging
@@ -510,12 +498,11 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()] handlers=[logging.StreamHandler()]
) )
data_dir = Path(args.data_dir)
start = time.time() start = time.time()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir) estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
if estates: if estates:
json_path = data_dir / "byty_idnes.json" json_path = Path("byty_idnes.json")
json_path.write_text( json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2), json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8", encoding="utf-8",

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
PSN.cz scraper. PSN.cz scraper.
Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií. Stáhne byty na prodej z API /api/units-list — jeden požadavek, žádné stránkování.
Výstup: byty_psn.json Výstup: byty_psn.json
""" """
from __future__ import annotations from __future__ import annotations
@@ -14,6 +14,7 @@ import subprocess
import time import time
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from urllib.parse import urlencode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -23,82 +24,37 @@ MAX_PRICE = 14_000_000
MIN_AREA = 69 MIN_AREA = 69
MIN_FLOOR = 2 MIN_FLOOR = 2
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"} WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1", "5+kk a větší"}
# Pouze Praha — ostatní města (Brno, Pardubice, Špindlerův Mlýn) přeskočit
WANTED_CITIES = {"Praha"}
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
BASE_URL = "https://psn.cz" BASE_URL = "https://psn.cz"
UNITS_API = f"{BASE_URL}/api/units-list"
# Known Prague project slugs with GPS (from research)
PRAGUE_PROJECTS = [
{"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
{"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
{"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
{"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
{"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
{"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
{"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
{"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
{"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
{"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
{"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
{"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
{"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
{"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
{"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
{"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
]
def fetch_url(url: str) -> str: def fetch_json(url: str) -> dict:
"""Fetch URL via curl (urllib SSL too old for Cloudflare).""" """Fetch JSON via curl (urllib SSL may fail on Cloudflare)."""
logger.debug(f"HTTP GET request (via curl): {url}") logger.debug(f"HTTP GET: {url}")
logger.debug(f"User-Agent: {UA}")
result = subprocess.run( result = subprocess.run(
["curl", "-s", "-L", "--max-time", "30", ["curl", "-s", "-L", "--max-time", "30",
"-H", f"User-Agent: {UA}", "-H", f"User-Agent: {UA}",
"-H", "Accept: text/html", "-H", "Accept: application/json",
url], url],
capture_output=True, text=True, timeout=60 capture_output=True, text=True, timeout=60
) )
if result.returncode != 0: if result.returncode != 0:
logger.error(f"curl failed (return code {result.returncode}): {result.stderr[:200]}")
raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}") raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
logger.debug(f"HTTP response: size={len(result.stdout)} bytes") return json.loads(result.stdout)
return result.stdout
def extract_units_from_html(html: str) -> list[dict]: def fix_gps(lat, lng):
"""Extract unit JSON objects from raw HTML with escaped quotes.""" """PSN má u některých projektů prohozené lat/lng — opravíme."""
# The HTML contains RSC data with escaped JSON: \\"key\\":\\"value\\" if lat is not None and lng is not None and lat < 20 and lng > 20:
# Step 1: Unescape the double-backslash-quotes to regular quotes return lng, lat
cleaned = html.replace('\\"', '"') return lat, lng
# Step 2: Find each unit by looking for "title":"Byt and walking back to {
units = []
decoder = json.JSONDecoder()
for m in re.finditer(r'"title":"Byt', cleaned):
pos = m.start()
# Walk backwards to find the opening brace
depth = 0
found = False
for i in range(pos - 1, max(pos - 3000, 0), -1):
if cleaned[i] == '}':
depth += 1
elif cleaned[i] == '{':
if depth == 0:
try:
obj, end = decoder.raw_decode(cleaned, i)
if isinstance(obj, dict) and 'price_czk' in obj:
units.append(obj)
found = True
except (json.JSONDecodeError, ValueError):
pass
break
depth -= 1
return units
def format_price(price: int) -> str: def format_price(price: int) -> str:
@@ -110,228 +66,178 @@ def format_price(price: int) -> str:
return " ".join(reversed(parts)) + "" return " ".join(reversed(parts)) + ""
def load_previous(json_path: str = "byty_psn.json") -> dict[str, str]: def scrape(max_properties: int | None = None):
"""Load first_seen dates from previous run, keyed by hash_id."""
path = Path(json_path)
if not path.exists():
return {}
try:
data = json.loads(path.read_text(encoding="utf-8"))
return {str(e["hash_id"]): e.get("first_seen", "") for e in data if "hash_id" in e}
except (json.JSONDecodeError, KeyError):
return {}
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
previous_first_seen = load_previous(str(Path(data_dir) / "byty_psn.json"))
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Stahuji inzeráty z PSN.cz") logger.info("Stahuji inzeráty z PSN.cz")
logger.info(f"Cena: do {format_price(MAX_PRICE)}") logger.info(f"Cena: do {format_price(MAX_PRICE)}")
logger.info(f"Min. plocha: {MIN_AREA}") logger.info(f"Min. plocha: {MIN_AREA}")
logger.info(f"Patro: od {MIN_FLOOR}. NP") logger.info(f"Patro: od {MIN_FLOOR}. NP")
logger.info(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)") logger.info(f"Region: Praha")
if max_pages:
logger.info(f"Max. stran: {max_pages}")
if max_properties: if max_properties:
logger.info(f"Max. bytů: {max_properties}") logger.info(f"Max. bytů: {max_properties}")
logger.info("=" * 60) logger.info("=" * 60)
# Fetch units from each Prague project # Jediný API požadavek — vrátí všechny jednotky (cca 236)
all_units = [] params = urlencode({
"locale": "cs",
for proj in PRAGUE_PROJECTS: "filters": "{}",
page = 1 "type": "list",
project_units = [] "order": "price-asc",
"offset": 0,
while True: "limit": 500,
if max_pages and page > max_pages: })
logger.debug(f"Max pages limit reached: {max_pages}") url = f"{UNITS_API}?{params}"
break logger.info("Stahuji jednotky z API ...")
url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
logger.info(f"{proj['name']} — strana {page} ...")
time.sleep(0.5)
try: try:
html = fetch_url(url) data = fetch_json(url)
except Exception as e: except Exception as e:
logger.error(f"Fetch error for {proj['name']}: {e}", exc_info=True) logger.error(f"Chyba při stahování: {e}", exc_info=True)
break return []
units = extract_units_from_html(html) all_units = data.get("units", {}).get("data", [])
logger.debug(f"Project {proj['slug']} page {page}: extracted {len(units)} units") logger.info(f"Staženo jednotek celkem: {len(all_units)}")
if not units: # Filtrování
if page == 1:
logger.info(f"→ 0 jednotek")
break
# Add project info to each unit
for unit in units:
if not unit.get("latitude") or not unit.get("longitude"):
unit["latitude"] = proj["lat"]
unit["longitude"] = proj["lon"]
unit["_project_name"] = proj["name"]
unit["_project_slug"] = proj["slug"]
project_units.extend(units)
if page == 1:
logger.info(f"{len(units)} jednotek na stránce")
# Check if there might be more pages
# If we got fewer than expected or same units, stop
if len(units) < 10:
break
page += 1
if page > 10: # Safety limit
break
all_units.extend(project_units)
# Deduplicate by slug
seen_slugs = set()
unique_units = []
for u in all_units:
slug = u.get("slug", "")
if slug and slug not in seen_slugs:
seen_slugs.add(slug)
unique_units.append(u)
elif not slug:
unique_units.append(u)
logger.info(f"\nStaženo celkem: {len(unique_units)} unikátních jednotek")
# Filter
logger.info(f"\nFiltrování...")
results = [] results = []
excluded_sold = 0 excluded = {
excluded_type = 0 "prodáno": 0,
excluded_disp = 0 "typ": 0,
excluded_price = 0 "město": 0,
excluded_area = 0 "dispozice": 0,
excluded_floor = 0 "cena": 0,
excluded_panel = 0 "plocha": 0,
"patro": 0,
}
properties_fetched = 0 properties_fetched = 0
for unit in unique_units: for unit in all_units:
if max_properties and properties_fetched >= max_properties: if max_properties and properties_fetched >= max_properties:
logger.debug(f"Max properties limit reached: {max_properties}")
break break
unit_id = unit.get("id", unit.get("slug", "unknown"))
# Only free units unit_id = unit.get("id", "?")
# Pouze prodej bytů (type_id=0)
if unit.get("type_id") != 0:
excluded["typ"] += 1
logger.debug(f"id={unit_id}: přeskočen (type_id={unit.get('type_id')}, není prodej bytu)")
continue
# Pouze volné (ne rezervované, prodané, v přípravě)
sale_status = unit.get("sale_status", "")
is_free = unit.get("is_free", False) is_free = unit.get("is_free", False)
is_sold = unit.get("is_sold", False) is_sold = unit.get("is_sold", False)
if is_sold or not is_free: if is_sold or not is_free:
excluded_sold += 1 excluded["prodáno"] += 1
logger.debug(f"Filter: id={unit_id} - excluded (sold/not free)") logger.debug(f"id={unit_id}: přeskočen (status={sale_status})")
continue continue
# Only apartments # Pouze Praha
category = str(unit.get("category", "")).lower() city = (unit.get("location") or unit.get("address", {}).get("city") or "").strip()
if "byt" not in category and "ateliér" not in category: # location field je typicky "Praha 4", "Praha 7" atd.
excluded_type += 1 city_base = city.split(" ")[0] if city else ""
logger.debug(f"Filter: id={unit_id} - excluded (not apartment, category={category})") if city_base not in WANTED_CITIES:
excluded["město"] += 1
logger.debug(f"id={unit_id}: přeskočen (město={city})")
continue continue
# Disposition # Dispozice
disp = unit.get("disposition", "") disp = unit.get("disposition", "")
if disp not in WANTED_DISPOSITIONS: if disp not in WANTED_DISPOSITIONS:
excluded_disp += 1 excluded["dispozice"] += 1
logger.debug(f"Filter: id={unit_id} - excluded (disposition {disp})") logger.debug(f"id={unit_id}: přeskočen (dispozice={disp})")
continue continue
# Price # Cena
price = unit.get("price_czk") or unit.get("action_price_czk") or 0 price = unit.get("action_price_czk") or unit.get("price_czk") or 0
if price <= 0 or price > MAX_PRICE: if not price or price <= 0 or price > MAX_PRICE:
excluded_price += 1 excluded["cena"] += 1
logger.debug(f"Filter: id={unit_id} - excluded (price {price})") logger.debug(f"id={unit_id}: přeskočen (cena={price})")
continue continue
# Area # Plocha
area = unit.get("total_area") or unit.get("floor_area") or 0 area = unit.get("total_area") or unit.get("floor_area") or 0
if area < MIN_AREA: if area < MIN_AREA:
excluded_area += 1 excluded["plocha"] += 1
logger.debug(f"Filter: id={unit_id} - excluded (area {area} m²)") logger.debug(f"id={unit_id}: přeskočen (plocha={area} m²)")
continue continue
# Floor # Patro
floor_str = str(unit.get("floor", "")) floor_str = str(unit.get("floor", ""))
floor = None floor = None
if floor_str: if floor_str:
try: try:
floor = int(floor_str) floor = int(floor_str)
except ValueError: except ValueError:
floor_match = re.search(r'(-?\d+)', floor_str) m = re.search(r'(-?\d+)', floor_str)
if floor_match: if m:
floor = int(floor_match.group(1)) floor = int(m.group(1))
if floor is not None and floor < MIN_FLOOR: if floor is not None and floor < MIN_FLOOR:
excluded_floor += 1 excluded["patro"] += 1
logger.debug(f"Filter: id={unit_id} - excluded (floor {floor})") logger.debug(f"id={unit_id}: přeskočen (patro={floor})")
continue continue
# Construction — check for panel # GPS — opravit prohozené souřadnice
build_type = str(unit.get("build_type", "")).lower() lat_raw = unit.get("latitude")
if "panel" in build_type: lng_raw = unit.get("longitude")
excluded_panel += 1 lat, lng = fix_gps(lat_raw, lng_raw)
logger.debug(f"Filter: id={unit_id} - excluded (panel construction)") if not lat or not lng:
logger.info(f"✗ Vyloučen: panel ({build_type})") logger.warning(f"id={unit_id}: chybí GPS souřadnice, přeskakuji")
continue continue
# Build construction label # Sestavit adresu pro locality
building_type = "neuvedeno" addr = unit.get("address") or {}
if build_type and build_type != "nevybráno": street = addr.get("street", "")
if "cihlo" in build_type or "cihla" in build_type: street_no = addr.get("street_no", "")
building_type = "Cihlová" if street and street_no:
elif "skelet" in build_type: locality_str = f"{street} {street_no}, {city}"
building_type = "Skeletová" elif street:
locality_str = f"{street}, {city}"
else: else:
building_type = build_type.capitalize() project_name = unit.get("project", "")
locality_str = f"{project_name}, {city}" if project_name else city
lat = unit.get("latitude", 0) # URL na detail jednotky
lon = unit.get("longitude", 0) unit_slug = unit.get("slug", "")
project_slug = ""
slug = unit.get("slug", "") # project_slug lze odvodit z projektu nebo z reference_no
project_slug = unit.get("_project_slug", "") # API nevrací project_slug přímo — použijeme reference_no nebo jen ID
detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}" reference_no = unit.get("reference_no", "")
if unit_slug:
today = datetime.now().strftime("%Y-%m-%d") detail_url = f"{BASE_URL}/prodej/{unit_slug}"
hash_id = unit.get("id", slug) elif reference_no:
first_seen = previous_first_seen.get(str(hash_id), "") or today detail_url = f"{BASE_URL}/prodej/{reference_no}"
else:
detail_url = BASE_URL
result = { result = {
"hash_id": hash_id, "hash_id": str(unit_id),
"name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}", "name": f"Prodej bytu {disp}, {int(area)} m² — {unit.get('project', locality_str)}",
"price": int(price), "price": int(price),
"price_formatted": format_price(int(price)), "price_formatted": format_price(int(price)),
"locality": f"{unit.get('street', unit.get('_project_name', ''))}, Praha", "locality": locality_str,
"lat": lat, "lat": lat,
"lon": lon, "lon": lng,
"disposition": disp, "disposition": disp,
"floor": floor, "floor": floor,
"area": area, "area": float(area),
"building_type": building_type, "building_type": "neuvedeno",
"ownership": unit.get("ownership", "neuvedeno") or "neuvedeno", "ownership": "osobní",
"url": detail_url, "url": detail_url,
"source": "psn", "source": "psn",
"image": "", "image": "",
"first_seen": first_seen, "scraped_at": datetime.now().strftime("%Y-%m-%d"),
"last_updated": today,
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1
logger.info(f"\n{'=' * 60}") logger.info(f"\n{'=' * 60}")
logger.info(f"Výsledky PSN:") logger.info(f"Výsledky PSN:")
logger.info(f" Celkem jednotek: {len(unique_units)}") logger.info(f" Staženo jednotek: {len(all_units)}")
logger.info(f" Vyloučeno (prodáno): {excluded_sold}") for reason, count in excluded.items():
logger.info(f" Vyloučeno (typ): {excluded_type}") if count:
logger.info(f" Vyloučeno (dispozice): {excluded_disp}") logger.info(f" Vyloučeno ({reason}): {count}")
logger.info(f" Vyloučeno (cena): {excluded_price}")
logger.info(f" Vyloučeno (plocha): {excluded_area}")
logger.info(f" Vyloučeno (patro): {excluded_floor}")
logger.info(f" Vyloučeno (panel): {excluded_panel}")
logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}") logger.info(f"{'=' * 60}")
@@ -340,35 +246,30 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz") parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
parser.add_argument("--max-pages", type=int, default=None,
help="Maximum number of listing pages per project to scrape")
parser.add_argument("--max-properties", type=int, default=None, parser.add_argument("--max-properties", type=int, default=None,
help="Maximum number of properties to include in results") help="Maximum number of properties to include in results")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], parser.add_argument("--log-level", type=str, default="INFO",
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)") help="Logging level (default: INFO)")
parser.add_argument("--data-dir", type=str, default=".",
help="Directory for reading/writing data files (default: current dir)")
args = parser.parse_args() args = parser.parse_args()
# Configure logging
logging.basicConfig( logging.basicConfig(
level=getattr(logging, args.log_level), level=getattr(logging, args.log_level),
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s", format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
handlers=[logging.StreamHandler()] handlers=[logging.StreamHandler()]
) )
data_dir = Path(args.data_dir)
start = time.time() start = time.time()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir) estates = scrape(max_properties=args.max_properties)
if estates: if estates:
json_path = data_dir / "byty_psn.json" json_path = Path("byty_psn.json")
json_path.write_text( json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2), json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8", encoding="utf-8",
) )
elapsed = time.time() - start elapsed = time.time() - start
logger.info(f"\n✓ Data uložena: {json_path.resolve()}") logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
logger.info(f"⏱ Celkový čas: {elapsed:.0f} s") logger.info(f"⏱ Celkový čas: {elapsed:.1f} s")
else: else:
logger.info("\nŽádné byty z PSN neodpovídají kritériím :(") logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")

View File

@@ -7,13 +7,13 @@ Výstup: byty_realingo.json
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from datetime import datetime
import json import json
import logging import logging
import math import math
import re import re
import time import time
import urllib.request import urllib.request
from datetime import datetime
from pathlib import Path from pathlib import Path
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -135,8 +135,8 @@ def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
return {} return {}
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."): def scrape(max_pages: int | None = None, max_properties: int | None = None):
cache = load_cache(str(Path(data_dir) / "byty_realingo.json")) cache = load_cache()
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Stahuji inzeráty z Realingo.cz") logger.info("Stahuji inzeráty z Realingo.cz")
@@ -239,14 +239,10 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
# Check cache — if hash_id exists and price unchanged, reuse # Check cache — if hash_id exists and price unchanged, reuse
item_id = int(item["id"]) item_id = int(item["id"])
item_price = item.get("price", {}).get("total", 0) or 0 item_price = item.get("price", {}).get("total", 0) or 0
today = datetime.now().strftime("%Y-%m-%d")
cached = cache.get(item_id) cached = cache.get(item_id)
if cached and cached.get("price") == item_price: if cached and cached.get("price") == item_price:
cache_hits += 1 cache_hits += 1
logger.debug(f"Cache hit for id={item_id}") logger.debug(f"Cache hit for id={item_id}")
cached["last_updated"] = today
if "first_seen" not in cached:
cached["first_seen"] = today
results.append(cached) results.append(cached)
continue continue
@@ -303,11 +299,6 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
cat = item.get("category", "") cat = item.get("category", "")
loc = item.get("location", {}) loc = item.get("location", {})
# Preserve first_seen from cache if this is a price-changed re-fetch
first_seen = today
if cached and "first_seen" in cached:
first_seen = cached["first_seen"]
result = { result = {
"hash_id": int(item["id"]), "hash_id": int(item["id"]),
"name": f"Prodej bytu {CATEGORY_LABELS.get(cat, '?')} {item.get('area', {}).get('main', '?')}", "name": f"Prodej bytu {CATEGORY_LABELS.get(cat, '?')} {item.get('area', {}).get('main', '?')}",
@@ -324,8 +315,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None, data
"url": f"{BASE_URL}{item['url']}", "url": f"{BASE_URL}{item['url']}",
"source": "realingo", "source": "realingo",
"image": "", "image": "",
"first_seen": first_seen, "scraped_at": datetime.now().strftime("%Y-%m-%d"),
"last_updated": today,
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1
@@ -354,8 +344,6 @@ if __name__ == "__main__":
help="Maximum number of properties to fetch details for") help="Maximum number of properties to fetch details for")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)") help="Logging level (default: INFO)")
parser.add_argument("--data-dir", type=str, default=".",
help="Directory for reading/writing data files (default: current dir)")
args = parser.parse_args() args = parser.parse_args()
# Configure logging # Configure logging
@@ -365,12 +353,11 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()] handlers=[logging.StreamHandler()]
) )
data_dir = Path(args.data_dir)
start = time.time() start = time.time()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir) estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
if estates: if estates:
json_path = data_dir / "byty_realingo.json" json_path = Path("byty_realingo.json")
json_path.write_text( json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2), json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8", encoding="utf-8",

204
status.html Normal file
View File

@@ -0,0 +1,204 @@
<!DOCTYPE html>
<html lang="cs">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Scraper status</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: system-ui, -apple-system, sans-serif;
background: #f5f5f5; color: #333;
padding: 24px; max-width: 640px; margin: 0 auto;
}
h1 { font-size: 22px; margin-bottom: 4px; }
.subtitle { color: #888; font-size: 13px; margin-bottom: 24px; }
.card {
background: white; border-radius: 12px; padding: 20px;
box-shadow: 0 1px 4px rgba(0,0,0,0.08); margin-bottom: 16px;
}
    .card h2 { font-size: 15px; margin-bottom: 12px; color: #555; }
    /* Last-scrape headline: big blue timestamp with a muted "ago" line below */
    .timestamp {
      font-size: 28px; font-weight: 700; color: #1976D2;
    }
    .timestamp-ago { font-size: 13px; color: #999; margin-top: 2px; }
    /* Source table */
    .source-table { width: 100%; border-collapse: collapse; }
    .source-table td { padding: 8px 0; border-bottom: 1px solid #f0f0f0; font-size: 14px; }
    .source-table tr:last-child td { border-bottom: none; }
    .source-table .name { font-weight: 600; }
    .source-table .count { text-align: right; font-variant-numeric: tabular-nums; }
    .source-table .rejected { text-align: right; color: #999; font-size: 12px; }
    /* Status badges rendered by render(): ok (green) / chyba (red) / 0 results (orange) */
    .badge {
      display: inline-block; padding: 2px 8px; border-radius: 4px;
      font-size: 11px; font-weight: 600; color: white;
    }
    .badge-ok { background: #4CAF50; }
    .badge-err { background: #F44336; }
    .badge-skip { background: #FF9800; }
    /* Summary bar */
    .summary-row {
      display: flex; justify-content: space-between; align-items: center;
      padding: 10px 0; border-bottom: 1px solid #f0f0f0;
    }
    .summary-row:last-child { border-bottom: none; }
    .summary-label { font-size: 13px; color: #666; }
    .summary-value { font-size: 18px; font-weight: 700; }
    /* Source bar chart: width of .bar-fill is set inline by render() as a percentage */
    .bar-row { display: flex; align-items: center; gap: 8px; margin: 4px 0; }
    .bar-label { width: 90px; font-size: 12px; text-align: right; color: #666; }
    .bar-track { flex: 1; height: 20px; background: #f0f0f0; border-radius: 4px; overflow: hidden; position: relative; }
    .bar-fill { height: 100%; border-radius: 4px; transition: width 0.5s ease; }
    .bar-count { font-size: 12px; width: 36px; font-variant-numeric: tabular-nums; }
    /* Loader */
    .loader-wrap {
      display: flex; flex-direction: column; align-items: center;
      justify-content: center; padding: 60px 0;
    }
    .spinner {
      width: 40px; height: 40px; border: 4px solid #e0e0e0;
      border-top-color: #1976D2; border-radius: 50%;
      animation: spin 0.8s linear infinite;
    }
    @keyframes spin { to { transform: rotate(360deg); } }
    .loader-text { margin-top: 16px; color: #999; font-size: 14px; }
    .error-msg { color: #F44336; padding: 40px 0; text-align: center; }
    .link-row { text-align: center; margin-top: 8px; }
    .link-row a { color: #1976D2; text-decoration: none; font-size: 14px; }
</style>
</head>
<body>
  <h1>Scraper status</h1>
  <div class="subtitle">maru-hleda-byt</div>
  <!-- Initial spinner placeholder; loadStatus()/render() below replace this innerHTML -->
  <div id="content">
    <div class="loader-wrap">
      <div class="spinner"></div>
      <div class="loader-text">Nacitam status...</div>
    </div>
  </div>
  <div class="link-row"><a href="mapa_bytu.html">Otevrit mapu</a></div>
<script>
var COLORS = {
sreality: '#1976D2',
realingo: '#7B1FA2',
bezrealitky: '#E65100',
idnes: '#C62828',
psn: '#2E7D32',
cityhome: '#00838F',
};
function timeAgo(dateStr) {
var d = new Date(dateStr);
var now = new Date();
var diff = Math.floor((now - d) / 1000);
if (diff < 60) return 'prave ted';
if (diff < 3600) return Math.floor(diff / 60) + ' min zpet';
if (diff < 86400) return Math.floor(diff / 3600) + ' hod zpet';
return Math.floor(diff / 86400) + ' dni zpet';
}
function formatDate(dateStr) {
var d = new Date(dateStr);
var day = d.getDate();
var months = ['ledna','unora','brezna','dubna','kvetna','cervna',
'cervence','srpna','zari','rijna','listopadu','prosince'];
var hh = String(d.getHours()).padStart(2, '0');
var mm = String(d.getMinutes()).padStart(2, '0');
return day + '. ' + months[d.getMonth()] + ' ' + d.getFullYear() + ', ' + hh + ':' + mm;
}
function render(data) {
// Check if scrape is currently running
if (data.status === 'running') {
document.getElementById('content').innerHTML =
'<div class="loader-wrap">' +
'<div class="spinner"></div>' +
'<div class="loader-text">Scraper prave bezi...</div>' +
'</div>';
setTimeout(loadStatus, 30000);
return;
}
var sources = data.sources || [];
var totalOk = 0, totalRej = 0;
var maxCount = 0;
sources.forEach(function(s) {
totalOk += s.accepted || 0;
totalRej += s.rejected || 0;
if (s.accepted > maxCount) maxCount = s.accepted;
});
var html = '';
// Timestamp card
html += '<div class="card">';
html += '<h2>Posledni scrape</h2>';
html += '<div class="timestamp">' + formatDate(data.timestamp) + '</div>';
html += '<div class="timestamp-ago">' + timeAgo(data.timestamp) + '</div>';
if (data.duration_sec) {
html += '<div class="timestamp-ago">Trvani: ' + Math.round(data.duration_sec) + 's</div>';
}
html += '</div>';
// Summary card
html += '<div class="card">';
html += '<h2>Souhrn</h2>';
html += '<div class="summary-row"><span class="summary-label">Vyhovujicich bytu</span><span class="summary-value" style="color:#4CAF50">' + totalOk + '</span></div>';
html += '<div class="summary-row"><span class="summary-label">Vyloucenych</span><span class="summary-value" style="color:#999">' + totalRej + '</span></div>';
if (data.deduplicated !== undefined) {
html += '<div class="summary-row"><span class="summary-label">Po deduplikaci (v mape)</span><span class="summary-value" style="color:#1976D2">' + data.deduplicated + '</span></div>';
}
html += '</div>';
// Sources card
html += '<div class="card">';
html += '<h2>Zdroje</h2>';
sources.forEach(function(s) {
var color = COLORS[s.name.toLowerCase()] || '#999';
var pct = maxCount > 0 ? Math.round((s.accepted / maxCount) * 100) : 0;
var badge = s.error
? '<span class="badge badge-err">chyba</span>'
: (s.accepted === 0 ? '<span class="badge badge-skip">0</span>' : '<span class="badge badge-ok">OK</span>');
html += '<div style="margin-bottom:12px;">';
html += '<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:4px;">';
html += '<span style="font-weight:600;font-size:14px;">' + s.name + ' ' + badge + '</span>';
html += '<span style="font-size:12px;color:#999;">' + (s.rejected || 0) + ' vyloucenych</span>';
html += '</div>';
html += '<div class="bar-row">';
html += '<div class="bar-track"><div class="bar-fill" style="width:' + pct + '%;background:' + color + ';"></div></div>';
html += '<span class="bar-count">' + (s.accepted || 0) + '</span>';
html += '</div>';
html += '</div>';
});
html += '</div>';
document.getElementById('content').innerHTML = html;
}
function loadStatus() {
fetch('status.json?t=' + Date.now())
.then(function(r) {
if (!r.ok) throw new Error(r.status);
return r.json();
})
.then(render)
.catch(function(err) {
document.getElementById('content').innerHTML =
'<div class="error-msg">Status zatim neni k dispozici.<br><small>(' + err.message + ')</small></div>';
});
}
loadStatus();
</script>
</body>
</html>