4 Commits

Author SHA1 Message Date
Jan Novak
a0ae8319be Add Gitea Actions CI pipeline for Docker image builds
All checks were successful
Build and Push / build (push) Successful in 14s
Triggers on tag push or manual dispatch. Builds the image using
build/Dockerfile and pushes to the Gitea container registry.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-14 23:31:59 +01:00
Jan Novak
a1ef9d1632 Add comprehensive project documentation
Cover the full pipeline (scrapers, merge, map generation), all 6 data
sources with their parsing methods, filter criteria, CLI arguments,
Docker setup, caching, rate limiting, and project structure.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-14 23:20:54 +01:00
Jan Novak
f3f2f3625f Add validation mode, structured logging, and CLI args to all scrapers
- Replace print() with Python logging module across all 6 scrapers
  for configurable log levels (DEBUG/INFO/WARNING/ERROR)
- Add --max-pages, --max-properties, and --log-level CLI arguments
  to each scraper via argparse for limiting scrape scope
- Add validation Make targets (validation, validation-local,
  validation-local-debug) for quick test runs with limited data
- Update run_all.sh to parse and forward CLI args to all scrapers
- Update mapa_bytu.html with latest scrape results

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-14 23:12:59 +01:00
Jan Novak
8373e5e910 add docker build, makefile, and some more shit before we move forward 2026-02-14 22:18:02 +01:00
22 changed files with 395 additions and 32982 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@@ -1,31 +0,0 @@
{
"permissions": {
"allow": [
"WebFetch(domain:github.com)",
"WebFetch(domain:www.sreality.cz)",
"WebFetch(domain:webscraping.pro)",
"WebFetch(domain:raw.githubusercontent.com)",
"Bash(python3:*)",
"Bash(open:*)",
"WebFetch(domain:www.realingo.cz)",
"WebFetch(domain:api.realingo.cz)",
"Bash(curl:*)",
"Bash(grep:*)",
"WebFetch(domain:www.realitni-pes.cz)",
"WebFetch(domain:www.bezrealitky.cz)",
"WebFetch(domain:apify.com)",
"WebFetch(domain:www.bezrealitky.com)",
"WebFetch(domain:reality.idnes.cz)",
"Bash(# Final checks: robots.txt and response time for rate limiting clues curl -s -L -H \"\"User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/120.0.0.0 Safari/537.36\"\" \"\"https://reality.idnes.cz/robots.txt\"\")",
"WebFetch(domain:www.cityhome.cz)",
"WebFetch(domain:www.psn.cz)",
"WebFetch(domain:www.city-home.cz)",
"WebFetch(domain:psn.cz)",
"WebFetch(domain:api.psn.cz)",
"Bash(done)",
"Bash(# Final summary: count total units across all projects\n# Get the total count from the unitsCountData we already extracted\necho \"\"From unitsCountData on /prodej page:\"\"\necho \"\" type_id 0 \\(Prodej bytů a ateliérů\\): 146\"\"\necho \"\" type_id 1 \\(Prodej komerčních nemovitostí\\): 14\"\"\necho \"\" type_id 2 \\(Pronájem bytů\\): 3\"\"\necho \"\" type_id 3 \\(Pronájem komerčních nemovitostí\\): 48\"\"\necho \"\"\"\"\necho \"\"Total for-sale projects: 19\"\"\necho \"\"\"\"\necho \"\"Disposition counts from the data:\"\"\npython3 << 'PYEOF'\n# Extract disposition counts from prodej page\nimport re\n\nwith open\\('/tmp/psn_prodej_p1.html', 'r', encoding='utf-8'\\) as f:\n html = f.read\\(\\)\n\n# Find disposition data\nidx = html.find\\('\\\\\\\\\"disposition\\\\\\\\\":['\\)\nif idx >= 0:\n chunk = html[idx:idx+2000].replace\\('\\\\\\\\\"', '\"'\\)\n # Extract name and count pairs\n import re\n pairs = re.findall\\(r'\"name\":\"\\([^\"]+\\)\",\"count\":\\(\\\\d+\\)', chunk\\)\n for name, count in pairs:\n print\\(f\" {name}: {count}\"\\)\nPYEOF)",
"Bash(ls:*)",
"Bash(chmod:*)"
]
}
}

View File

@@ -10,7 +10,7 @@ WORKDIR /app
COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \ COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \
scrape_idnes.py scrape_psn.py scrape_cityhome.py \ scrape_idnes.py scrape_psn.py scrape_cityhome.py \
merge_and_map.py regen_map.py run_all.sh ratings_server.py ./ merge_and_map.py regen_map.py run_all.sh ./
COPY build/crontab /etc/crontabs/root COPY build/crontab /etc/crontabs/root
COPY build/entrypoint.sh /entrypoint.sh COPY build/entrypoint.sh /entrypoint.sh
@@ -18,7 +18,7 @@ RUN chmod +x /entrypoint.sh run_all.sh
RUN mkdir -p /app/data RUN mkdir -p /app/data
EXPOSE 8080 8081 EXPOSE 8080
HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \ HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \
CMD wget -q -O /dev/null http://localhost:8080/ || exit 1 CMD wget -q -O /dev/null http://localhost:8080/ || exit 1

View File

@@ -6,7 +6,7 @@ DATA_DIR="/app/data"
# Create symlinks so scripts (which write to /app/) persist data to the volume # Create symlinks so scripts (which write to /app/) persist data to the volume
for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \ for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \
byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \ byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \
mapa_bytu.html ratings.json; do mapa_bytu.html; do
# Remove real file if it exists (e.g. baked into image) # Remove real file if it exists (e.g. baked into image)
[ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f" [ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f"
ln -sf "$DATA_DIR/$f" "/app/$f" ln -sf "$DATA_DIR/$f" "/app/$f"
@@ -18,8 +18,5 @@ crond -b -l 2
echo "[entrypoint] Starting initial scrape in background..." echo "[entrypoint] Starting initial scrape in background..."
bash /app/run_all.sh & bash /app/run_all.sh &
echo "[entrypoint] Starting ratings API server on port 8081..."
DATA_DIR="$DATA_DIR" python3 /app/ratings_server.py &
echo "[entrypoint] Starting HTTP server on port 8080..." echo "[entrypoint] Starting HTTP server on port 8080..."
exec python3 -m http.server 8080 --directory "$DATA_DIR" exec python3 -m http.server 8080 --directory "$DATA_DIR"

View File

@@ -1,427 +0,0 @@
[
{
"hash_id": 990183,
"name": "Prodej bytu 3+kk 86 m²",
"price": 10385000,
"price_formatted": "10 385 000 Kč",
"locality": "Ke Tvrzi, Praha - Královice",
"lat": 50.0390519,
"lon": 14.63862,
"disposition": "3+kk",
"floor": 2,
"area": 86,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/990183-nabidka-prodej-bytu-ke-tvrzi-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 989862,
"name": "Prodej bytu 3+kk 73 m²",
"price": 12790000,
"price_formatted": "12 790 000 Kč",
"locality": "Vrázova, Praha - Smíchov",
"lat": 50.0711312,
"lon": 14.4076652,
"disposition": "3+kk",
"floor": 3,
"area": 73,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/989862-nabidka-prodej-bytu-vrazova-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 981278,
"name": "Prodej bytu 3+kk 70 m²",
"price": 11890000,
"price_formatted": "11 890 000 Kč",
"locality": "Argentinská, Praha - Holešovice",
"lat": 50.1026043,
"lon": 14.4435365,
"disposition": "3+kk",
"floor": 3,
"area": 70,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/981278-nabidka-prodej-bytu-argentinska-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 989817,
"name": "Prodej bytu 3+kk 88 m²",
"price": 13490000,
"price_formatted": "13 490 000 Kč",
"locality": "Miroslava Hajna, Praha - Letňany",
"lat": 50.1406487,
"lon": 14.5207541,
"disposition": "3+kk",
"floor": 2,
"area": 88,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/989817-nabidka-prodej-bytu-miroslava-hajna-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 970257,
"name": "Prodej bytu 3+1 106 m²",
"price": 12950000,
"price_formatted": "12 950 000 Kč",
"locality": "Novákových, Praha - Libeň",
"lat": 50.1034771,
"lon": 14.4758735,
"disposition": "3+1",
"floor": 5,
"area": 106,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/970257-nabidka-prodej-bytu-novakovych-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 972406,
"name": "Prodej bytu 3+kk 83 m²",
"price": 10490000,
"price_formatted": "10 490 000 Kč",
"locality": "Na Výrovně, Praha - Stodůlky",
"lat": 50.0396067,
"lon": 14.3167022,
"disposition": "3+kk",
"floor": 2,
"area": 83,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/972406-nabidka-prodej-bytu-na-vyrovne",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 967142,
"name": "Prodej bytu 3+kk 78 m²",
"price": 11648000,
"price_formatted": "11 648 000 Kč",
"locality": "Na Míčánkách, Praha - Vršovice",
"lat": 50.0713284,
"lon": 14.4638722,
"disposition": "3+kk",
"floor": 6,
"area": 78,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/967142-nabidka-prodej-bytu-na-micankach",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 955977,
"name": "Prodej bytu 4+kk 75 m²",
"price": 10363000,
"price_formatted": "10 363 000 Kč",
"locality": "Karla Guta, Praha - Uhříněves",
"lat": 50.03017,
"lon": 14.5940072,
"disposition": "4+kk",
"floor": 4,
"area": 75,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/955977-nabidka-prodej-bytu-karla-guta",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 974557,
"name": "Prodej bytu 4+kk 94 m²",
"price": 13499900,
"price_formatted": "13 499 900 Kč",
"locality": "V Dolině, Praha - Michle",
"lat": 50.0579963,
"lon": 14.4682887,
"disposition": "4+kk",
"floor": 8,
"area": 94,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/974557-nabidka-prodej-bytu-v-doline-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 988498,
"name": "Prodej bytu 3+1 75 m²",
"price": 11400000,
"price_formatted": "11 400 000 Kč",
"locality": "5. května, Praha - Nusle",
"lat": 50.0604096,
"lon": 14.4326302,
"disposition": "3+1",
"floor": 4,
"area": 75,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/988498-nabidka-prodej-bytu-5-kvetna-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 985285,
"name": "Prodej bytu 3+kk 70 m²",
"price": 12200000,
"price_formatted": "12 200 000 Kč",
"locality": "Klausova, Praha - Stodůlky",
"lat": 50.0370204,
"lon": 14.3432643,
"disposition": "3+kk",
"floor": 5,
"area": 70,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/985285-nabidka-prodej-bytu-klausova-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 965526,
"name": "Prodej bytu 3+kk 77 m²",
"price": 11890000,
"price_formatted": "11 890 000 Kč",
"locality": "Vinohradská, Praha - Strašnice",
"lat": 50.0776726,
"lon": 14.4870072,
"disposition": "3+kk",
"floor": 16,
"area": 77,
"building_type": "Smíšená",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/965526-nabidka-prodej-bytu-vinohradska-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 924811,
"name": "Prodej bytu 3+kk 75 m²",
"price": 13390000,
"price_formatted": "13 390 000 Kč",
"locality": "Waltariho, Praha - Hloubětín",
"lat": 50.1076717,
"lon": 14.5248559,
"disposition": "3+kk",
"floor": 4,
"area": 75,
"building_type": "Smíšená",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/924811-nabidka-prodej-bytu-waltariho-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 985859,
"name": "Prodej bytu 3+1 80 m²",
"price": 9000000,
"price_formatted": "9 000 000 Kč",
"locality": "Staňkova, Praha - Háje",
"lat": 50.0377128,
"lon": 14.5311557,
"disposition": "3+1",
"floor": 2,
"area": 80,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/985859-nabidka-prodej-bytu-stankova-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 985583,
"name": "Prodej bytu 3+kk 76 m²",
"price": 10850000,
"price_formatted": "10 850 000 Kč",
"locality": "Boloňská, Praha - Horní Měcholupy",
"lat": 50.047328,
"lon": 14.5565277,
"disposition": "3+kk",
"floor": 4,
"area": 76,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/985583-nabidka-prodej-bytu-bolonska-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 981178,
"name": "Prodej bytu 4+kk 86 m²",
"price": 11990000,
"price_formatted": "11 990 000 Kč",
"locality": "Sušilova, Praha - Uhříněves",
"lat": 50.032081,
"lon": 14.5885148,
"disposition": "4+kk",
"floor": 2,
"area": 86,
"building_type": "SKELET",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/981178-nabidka-prodej-bytu-susilova-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 973216,
"name": "Prodej bytu 4+1 82 m²",
"price": 11357000,
"price_formatted": "11 357 000 Kč",
"locality": "Nad Kapličkou, Praha - Strašnice",
"lat": 50.0839509,
"lon": 14.4904493,
"disposition": "4+1",
"floor": 2,
"area": 82,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/973216-nabidka-prodej-bytu-nad-kaplickou-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 868801,
"name": "Prodej bytu 3+kk 109 m²",
"price": 7299000,
"price_formatted": "7 299 000 Kč",
"locality": "Pod Karlovem, Praha - Vinohrady",
"lat": 50.0676313,
"lon": 14.432498,
"disposition": "3+kk",
"floor": 5,
"area": 109,
"building_type": "Cihlová",
"ownership": "Družstevní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/868801-nabidka-prodej-bytu-pod-karlovem-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 868795,
"name": "Prodej bytu 3+kk 106 m²",
"price": 6299000,
"price_formatted": "6 299 000 Kč",
"locality": "Pod Karlovem, Praha - Vinohrady",
"lat": 50.0676313,
"lon": 14.432498,
"disposition": "3+kk",
"floor": 2,
"area": 106,
"building_type": "Cihlová",
"ownership": "Družstevní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/868795-nabidka-prodej-bytu-pod-karlovem-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 981890,
"name": "Prodej bytu 3+1 84 m²",
"price": 12980000,
"price_formatted": "12 980 000 Kč",
"locality": "Novákových, Praha - Libeň",
"lat": 50.103273,
"lon": 14.4746894,
"disposition": "3+1",
"floor": 2,
"area": 84,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/981890-nabidka-prodej-bytu-novakovych-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 976276,
"name": "Prodej bytu 3+kk 75 m²",
"price": 13490000,
"price_formatted": "13 490 000 Kč",
"locality": "Svornosti, Praha - Smíchov",
"lat": 50.0673284,
"lon": 14.4095087,
"disposition": "3+kk",
"floor": 2,
"area": 75,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/976276-nabidka-prodej-bytu-svornosti-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 950787,
"name": "Prodej bytu 3+kk 70 m²",
"price": 9999000,
"price_formatted": "9 999 000 Kč",
"locality": "Sečská, Praha - Strašnice",
"lat": 50.071191,
"lon": 14.5035501,
"disposition": "3+kk",
"floor": 3,
"area": 70,
"building_type": "Smíšená",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/950787-nabidka-prodej-bytu-secska-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 978045,
"name": "Prodej bytu 3+kk 76 m²",
"price": 11133000,
"price_formatted": "11 133 000 Kč",
"locality": "K Vinoři, Praha - Kbely",
"lat": 50.1329656,
"lon": 14.5618499,
"disposition": "3+kk",
"floor": 2,
"area": 76,
"building_type": "Smíšená",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/978045-nabidka-prodej-bytu-k-vinori",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 974552,
"name": "Prodej bytu 3+1 75 m²",
"price": 11000000,
"price_formatted": "11 000 000 Kč",
"locality": "Vejražkova, Praha - Košíře",
"lat": 50.0637808,
"lon": 14.3612275,
"disposition": "3+1",
"floor": 2,
"area": 75,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/974552-nabidka-prodej-bytu-vejrazkova-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 955010,
"name": "Prodej bytu 3+kk 70 m²",
"price": 12290000,
"price_formatted": "12 290 000 Kč",
"locality": "Břeclavská, Praha - Kyje",
"lat": 50.0951045,
"lon": 14.5454237,
"disposition": "3+kk",
"floor": 2,
"area": 70,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/955010-nabidka-prodej-bytu-breclavska-hlavni-mesto-praha",
"source": "bezrealitky",
"image": ""
}
]

View File

@@ -1 +0,0 @@
[]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1 +0,0 @@
[]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,202 +0,0 @@
#!/usr/bin/env python3
"""Generate status.json from scraper JSON outputs and run log."""
from __future__ import annotations
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
# Directory containing this script; scraper outputs, the merged file and
# status.json are all resolved relative to it in main().
HERE = Path(__file__).parent
# Display name of each source -> file name of that scraper's JSON output.
# The display names are also matched (case-insensitively) against the
# section headers of the run log in parse_log().
SOURCE_FILES = {
    "Sreality": "byty_sreality.json",
    "Realingo": "byty_realingo.json",
    "Bezrealitky": "byty_bezrealitky.json",
    "iDNES": "byty_idnes.json",
    "PSN": "byty_psn.json",
    "CityHome": "byty_cityhome.json",
}
# File whose entry count main() reports as the deduplicated total.
MERGED_FILE = "byty_merged.json"
def count_source(path: Path) -> dict:
    """Summarise one scraper output file.

    Args:
        path: Location of a scraper's JSON output; expected to hold a JSON
            array with one element per accepted listing.

    Returns:
        ``{"accepted": <count>, "updated_at": <file mtime, ISO seconds>}``
        on success, otherwise ``{"accepted": 0, "error": <message>}`` when
        the file is missing, unreadable, not valid JSON, or not a JSON array.
    """
    if not path.exists():
        return {"accepted": 0, "error": "soubor nenalezen"}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        if not isinstance(data, list):
            # len() of a dict or string would otherwise be reported as a
            # number of listings.
            return {
                "accepted": 0,
                "error": f"neočekávaný formát ({type(data).__name__} místo seznamu)",
            }
        mtime = datetime.fromtimestamp(path.stat().st_mtime).isoformat(timespec="seconds")
        return {"accepted": len(data), "updated_at": mtime}
    except Exception as e:
        # Best effort: status generation must not fail because one source
        # file is broken; the message is surfaced in status.json instead.
        return {"accepted": 0, "error": str(e)}
def _first_int(pattern: str, text: str) -> Optional[int]:
    """Return capture group 1 of the first match of ``pattern`` in ``text`` as an int, or None."""
    m = re.search(pattern, text)
    return int(m.group(1)) if m else None


def parse_log(log_path: str) -> dict[str, dict]:
    """Parse the scraper run log and extract per-source statistics.

    Scrapers log summary lines like:
        ✓ Vyhovující byty: 12
        Vyloučeno (prodáno): 5
        Staženo stránek: 3
        Staženo inzerátů: 48
        Celkem bytů v cache: 120
    and section headers like:
        [2/6] Realingo
    followed by a line of dashes.

    Args:
        log_path: Path to the run log; may be empty/None or point to a
            missing file, in which case no statistics are returned.

    Returns:
        Mapping of source name (a key of SOURCE_FILES) to a dict holding any
        of ``accepted``, ``fetched``, ``pages``, ``cached``, ``cache_hits``
        and either ``excluded`` (reason -> count) or ``excluded_total``.
        Only values actually found in the log are present.

    Notes:
        * A header naming several sources ("PSN + CityHome") gives each of
          them the same numbers, taken from the first matching line of the
          shared section. Each source receives its own copy of the dict.
        * If a source appears in more than one section, the last one wins.
    """
    if not log_path or not os.path.exists(log_path):
        return {}
    with open(log_path, encoding="utf-8") as f:
        content = f.read()

    # Split into per-source sections by the "[N/6] Name" header followed by
    # a dashed underline.
    section_pattern = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE)
    sections_found = list(section_pattern.finditer(content))
    if not sections_found:
        return {}

    # (result key, pattern) for the single-number summary lines, in the
    # order they are stored in each entry.
    counters = (
        ("accepted", r'Vyhovující byty[:\s]+(\d+)'),
        ("fetched", r'Staženo inzerátů[:\s]+(\d+)'),
        ("pages", r'Staženo stránek[:\s]+(\d+)'),
        ("cached", r'Celkem bytů v cache[:\s]+(\d+)'),
        ("cache_hits", r'Cache hit[:\s]+(\d+)'),
    )

    stats = {}
    for i, match in enumerate(sections_found):
        step_name = match.group(2).strip()
        start = match.end()
        end = sections_found[i + 1].start() if i + 1 < len(sections_found) else len(content)
        section_text = content[start:end]

        # Identify which sources this section covers; "PSN + CityHome"
        # covers both.
        source_names = [name for name in SOURCE_FILES if name.lower() in step_name.lower()]
        if not source_names:
            continue

        entry = {}
        for key, pattern in counters:
            value = _first_int(pattern, section_text)
            if value is not None:
                entry[key] = value

        # Rejection reasons of the form "Vyloučeno (reason): N".
        excluded = {}
        for m in re.finditer(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)', section_text):
            excluded[m.group(1)] = int(m.group(2))
        if excluded:
            entry["excluded"] = excluded
        else:
            # Fall back to a bare "Vyloučen…: N" total without a reason.
            total_excluded = _first_int(r'Vyloučen\w*[:\s]+(\d+)', section_text)
            if total_excluded is not None:
                entry["excluded_total"] = total_excluded

        for name in source_names:
            # Give every source an independent copy so that mutating one
            # source's stats can never leak into another's.
            per_source = dict(entry)
            if "excluded" in per_source:
                per_source["excluded"] = dict(per_source["excluded"])
            stats[name] = per_source
    return stats
def main():
    """Generate status.json next to this script and print a short summary.

    Positional command-line arguments (all optional):
        argv[1]  run start timestamp; only used when argv[2] is also given,
                 otherwise the current time is used
        argv[2]  run duration in seconds (integer)
        argv[3]  path to the scraper run log parsed by parse_log()

    For every entry of SOURCE_FILES the accepted count is taken from the
    JSON output and then overridden/enriched with numbers found in the log.
    The deduplicated count is the number of entries in MERGED_FILE.
    Problems with optional inputs are reported on stderr and never abort
    status generation.
    """
    start_time = None
    duration_sec = None
    if len(sys.argv) >= 3:
        start_time = sys.argv[1]
        try:
            duration_sec = int(sys.argv[2])
        except ValueError:
            # Keep going with an unknown duration, but say so.
            print(f"Varování: neplatná doba běhu {sys.argv[2]!r}, ignoruji", file=sys.stderr)
    if not start_time:
        start_time = datetime.now().isoformat(timespec="seconds")
    log_path = sys.argv[3] if len(sys.argv) >= 4 else None
    log_stats = parse_log(log_path)

    sources = []
    for name, filename in SOURCE_FILES.items():
        path = HERE / filename
        info = count_source(path)
        info["name"] = name
        # Merge log stats
        ls = log_stats.get(name, {})
        for k in ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total"):
            if k in ls:
                info[k] = ls[k]
        # Override accepted from log if available (log is authoritative for latest run)
        if "accepted" in ls:
            info["accepted"] = ls["accepted"]
        sources.append(info)

    # Total accepted before dedup
    total_accepted = sum(s.get("accepted", 0) for s in sources)

    # Merged / deduplicated count
    merged_path = HERE / MERGED_FILE
    deduplicated = 0
    if merged_path.exists():
        try:
            merged = json.loads(merged_path.read_text(encoding="utf-8"))
            deduplicated = len(merged)
        except Exception as e:
            # Best effort: report 0 deduplicated, but do not hide the cause.
            print(f"Varování: {MERGED_FILE} nelze načíst: {e}", file=sys.stderr)
    duplicates_removed = total_accepted - deduplicated if deduplicated else 0

    status = {
        "status": "done",
        "timestamp": start_time,
        "duration_sec": duration_sec,
        "total_accepted": total_accepted,
        "deduplicated": deduplicated,
        "duplicates_removed": duplicates_removed,
        "sources": sources,
    }
    out = HERE / "status.json"
    out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"Status uložen: {out}")
    print(f" Celkem bytů (před dedup): {total_accepted}")
    print(f" Po deduplikaci: {deduplicated}")
    if duplicates_removed:
        print(f" Odstraněno duplikátů: {duplicates_removed}")
    for s in sources:
        acc = s.get("accepted", 0)
        err = s.get("error", "")
        exc = s.get("excluded", {})
        exc_total = sum(exc.values()) if exc else s.get("excluded_total", 0)
        parts = [f"{s['name']:12s}: {acc} bytů"]
        if exc_total:
            parts.append(f"({exc_total} vyloučeno)")
        if err:
            parts.append(f"[CHYBA: {err}]")
        print(" " + " ".join(parts))
# Generate the status file only when run as a script, not on import.
if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -1,116 +0,0 @@
#!/usr/bin/env python3
"""
Minimal HTTP API server for persisting apartment ratings.
GET /api/ratings → returns ratings.json contents
POST /api/ratings → saves entire ratings object
GET /api/ratings/export → same as GET, but with download header
Ratings file: /app/data/ratings.json (or ./ratings.json locally)
"""
import json
import logging
import os
import sys
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
# TCP port the server listens on; RATINGS_PORT overrides the default 8081.
PORT = int(os.environ.get("RATINGS_PORT", 8081))
# Directory holding the ratings file; DATA_DIR overrides the default of the
# current working directory.
DATA_DIR = Path(os.environ.get("DATA_DIR", "."))
RATINGS_FILE = DATA_DIR / "ratings.json"
# Log at INFO and above, each line timestamped and tagged "[ratings]".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [ratings] %(levelname)s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
)
log = logging.getLogger(__name__)
def load_ratings() -> dict:
    """Read the stored ratings from RATINGS_FILE.

    Returns:
        The JSON object stored in the file, or an empty dict when the file
        does not exist, cannot be read or parsed, or holds something other
        than a JSON object. Failures are logged, never raised.
    """
    try:
        if RATINGS_FILE.exists():
            data = json.loads(RATINGS_FILE.read_text(encoding="utf-8"))
            if isinstance(data, dict):
                return data
            # Valid JSON of the wrong shape (list, number, ...) would break
            # the "-> dict" contract callers rely on.
            log.error(
                "Ratings file %s does not contain a JSON object (got %s)",
                RATINGS_FILE,
                type(data).__name__,
            )
    except Exception as e:
        log.error("Failed to load ratings: %s", e)
    return {}
def save_ratings(data: dict) -> None:
    """Persist ``data`` to RATINGS_FILE as pretty-printed UTF-8 JSON.

    The payload is written to a temporary sibling file and then renamed
    over the target, so an interrupted write cannot leave a truncated
    ratings file behind (which load_ratings() would read as "no ratings").

    Args:
        data: The complete ratings object; it replaces the previous content.

    Raises:
        OSError: If the file cannot be written or renamed.
        TypeError / ValueError: If ``data`` is not JSON-serialisable.
    """
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    # Follow a symlinked ratings file so the rename replaces the real file
    # rather than the link itself.
    target = RATINGS_FILE.resolve()
    tmp_path = target.with_name(target.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    os.replace(tmp_path, target)
class RatingsHandler(BaseHTTPRequestHandler):
    """Request handler for the ratings API.

    Routes:
        GET     /api/ratings         -> stored ratings as JSON
        GET     /api/ratings/export  -> same payload with a download header
        POST    /api/ratings         -> replace stored ratings with the posted JSON object
        OPTIONS (any path)           -> CORS preflight, 204 with no body

    Every response carries CORS headers allowing any origin. This handler
    performs no authentication of its own.
    """

    # Upper bound on accepted POST bodies; larger requests get 413 without
    # the body being read into memory.
    MAX_BODY_BYTES = 1024 * 1024

    def log_message(self, format, *args):
        # Suppress default HTTP access log (we use our own)
        pass

    def _send_cors_headers(self):
        """Emit the CORS headers shared by every response."""
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")

    def _send_json(self, status: int, body: dict, extra_headers=None):
        """Send ``body`` as a UTF-8 JSON response.

        Args:
            status: HTTP status code.
            body: JSON-serialisable payload.
            extra_headers: Optional mapping of additional header name -> value.
        """
        payload = json.dumps(body, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self._send_cors_headers()
        if extra_headers:
            for k, v in extra_headers.items():
                self.send_header(k, v)
        self.end_headers()
        self.wfile.write(payload)

    def do_OPTIONS(self):
        """Answer a CORS preflight request."""
        self.send_response(204)
        self._send_cors_headers()
        self.end_headers()

    def do_GET(self):
        """Return the stored ratings, optionally as a file download."""
        if self.path not in ("/api/ratings", "/api/ratings/export"):
            self._send_json(404, {"error": "not found"})
            return
        ratings = load_ratings()
        extra = None
        if self.path == "/api/ratings/export":
            extra = {"Content-Disposition": 'attachment; filename="ratings.json"'}
        log.info("GET %s → %d ratings", self.path, len(ratings))
        self._send_json(200, ratings, extra)

    def do_POST(self):
        """Replace the stored ratings with the posted JSON object."""
        if self.path != "/api/ratings":
            self._send_json(404, {"error": "not found"})
            return

        # A malformed or negative Content-Length must produce a 400, not an
        # unhandled exception or a read that blocks until the peer hangs up.
        try:
            length = int(self.headers.get("Content-Length", 0))
        except (TypeError, ValueError):
            self._send_json(400, {"error": "invalid Content-Length"})
            return
        if length < 0:
            self._send_json(400, {"error": "invalid Content-Length"})
            return
        if length == 0:
            self._send_json(400, {"error": "empty body"})
            return
        if length > self.MAX_BODY_BYTES:
            self._send_json(413, {"error": "body too large"})
            return

        try:
            raw = self.rfile.read(length)
            data = json.loads(raw.decode("utf-8"))
        except Exception as e:
            log.warning("Bad request body: %s", e)
            self._send_json(400, {"error": "invalid JSON"})
            return
        if not isinstance(data, dict):
            self._send_json(400, {"error": "expected JSON object"})
            return

        try:
            save_ratings(data)
        except OSError as e:
            # Tell the client the save failed instead of dropping the
            # connection without a response.
            log.error("Failed to save ratings: %s", e)
            self._send_json(500, {"error": "could not save ratings"})
            return
        log.info("POST /api/ratings → saved %d ratings", len(data))
        self._send_json(200, {"ok": True, "count": len(data)})
# Script entry point: start the ratings API and serve until interrupted.
if __name__ == "__main__":
    log.info("Ratings server starting on port %d, data dir: %s", PORT, DATA_DIR)
    log.info("Ratings file: %s", RATINGS_FILE)
    # Bind to all interfaces on the configured port.
    server = HTTPServer(("0.0.0.0", PORT), RatingsHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C: log the shutdown and exit with status 0.
        log.info("Stopped.")
        sys.exit(0)

View File

@@ -16,12 +16,6 @@ NC='\033[0m'
TOTAL=6 TOTAL=6
CURRENT=0 CURRENT=0
FAILED=0 FAILED=0
START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S")
START_EPOCH=$(date +%s)
LOG_FILE="$(pwd)/scrape_run.log"
# Mark status as running
echo '{"status":"running"}' > status.json
show_help() { show_help() {
echo "Usage: ./run_all.sh [OPTIONS]" echo "Usage: ./run_all.sh [OPTIONS]"
@@ -69,8 +63,6 @@ step() {
} }
# ── Scrapery (paralelně kde to jde) ───────────────────────── # ── Scrapery (paralelně kde to jde) ─────────────────────────
# Tee all output to log file for status generation
exec > >(tee -a "$LOG_FILE") 2>&1
step "Sreality" step "Sreality"
python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); } python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); }
@@ -99,12 +91,6 @@ python3 merge_and_map.py || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((F
# ── Otevření mapy ──────────────────────────────────────────── # ── Otevření mapy ────────────────────────────────────────────
# ── Generování statusu ─────────────────────────────────────
END_EPOCH=$(date +%s)
DURATION=$((END_EPOCH - START_EPOCH))
python3 generate_status.py "$START_TIME" "$DURATION" "$LOG_FILE"
echo "" echo ""
echo "============================================================" echo "============================================================"
if [ $FAILED -eq 0 ]; then if [ $FAILED -eq 0 ]; then

View File

@@ -347,7 +347,6 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"ownership": ownership, "ownership": ownership,
"url": sreality_url(hash_id, seo), "url": sreality_url(hash_id, seo),
"image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""), "image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""),
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
} }
results.append(result) results.append(result)
details_fetched += 1 details_fetched += 1
@@ -374,58 +373,20 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
"""Generate an interactive Leaflet.js HTML map.""" """Generate an interactive Leaflet.js HTML map."""
# Color by price per m² — cool blue→warm red scale, no yellow # Color by disposition
# Thresholds based on Prague market distribution (p25=120k, p50=144k, p75=162k) color_map = {
price_color_scale = [ "3+kk": "#2196F3", # blue
(110_000, "#1565C0"), # < 110k/m² → deep blue (levné) "3+1": "#4CAF50", # green
(130_000, "#42A5F5"), # 110130k → light blue "4+kk": "#FF9800", # orange
(150_000, "#66BB6A"), # 130150k → green (střed) "4+1": "#F44336", # red
(165_000, "#EF6C00"), # 150165k → dark orange "5+kk": "#9C27B0", # purple
(float("inf"), "#C62828"), # > 165k → dark red (drahé) "5+1": "#795548", # brown
] "6+": "#607D8B", # grey-blue
}
def price_color(estate: dict) -> str:
price = estate.get("price") or 0
area = estate.get("area") or 0
if not area:
return "#9E9E9E"
ppm2 = price / area
for threshold, color in price_color_scale:
if ppm2 < threshold:
return color
return "#E53935"
# Legend bands for info panel (built once)
price_legend_items = (
'<div style="margin-bottom:4px;font-size:12px;color:#555;font-weight:600;">Cena / m²:</div>'
)
bands = [
("#1565C0", "< 110 000 Kč/m²"),
("#42A5F5", "110 130 000 Kč/m²"),
("#66BB6A", "130 150 000 Kč/m²"),
("#EF6C00", "150 165 000 Kč/m²"),
("#C62828", "> 165 000 Kč/m²"),
("#9E9E9E", "cena/plocha neuvedena"),
]
for bcolor, blabel in bands:
price_legend_items += (
f'<div style="display:flex;align-items:center;gap:6px;margin:2px 0;">'
f'<span style="width:14px;height:14px;border-radius:50%;background:{bcolor};'
f'display:inline-block;border:2px solid white;box-shadow:0 1px 3px rgba(0,0,0,0.3);flex-shrink:0;"></span>'
f'<span>{blabel}</span></div>'
)
# New marker indicator — bigger dot, no extra border
price_legend_items += (
'<div style="display:flex;align-items:center;gap:6px;margin:6px 0 0 0;'
'padding-top:6px;border-top:1px solid #eee;">'
'<span style="width:18px;height:18px;border-radius:50%;background:#66BB6A;'
'display:inline-block;box-shadow:0 1px 4px rgba(0,0,0,0.35);flex-shrink:0;"></span>'
'<span>Nové (z dnešního scrapu) — větší</span></div>'
)
markers_js = "" markers_js = ""
for e in estates: for e in estates:
color = price_color(e) color = color_map.get(e["disposition"], "#999999")
floor_text = f'{e["floor"]}. NP' if e["floor"] else "neuvedeno" floor_text = f'{e["floor"]}. NP' if e["floor"] else "neuvedeno"
area_text = f'{e["area"]}' if e["area"] else "neuvedeno" area_text = f'{e["area"]}' if e["area"] else "neuvedeno"
building_text = e["building_type"] or "neuvedeno" building_text = e["building_type"] or "neuvedeno"
@@ -444,19 +405,11 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
hash_id = e.get("hash_id", "") hash_id = e.get("hash_id", "")
scraped_at = e.get("scraped_at", "")
is_new = scraped_at == datetime.now().strftime("%Y-%m-%d")
new_badge = (
'<span style="margin-left:6px;font-size:11px;background:#FFD600;color:#333;'
'padding:1px 6px;border-radius:3px;font-weight:bold;">NOVÉ</span>'
if is_new else ""
)
popup = ( popup = (
f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}">' f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}">'
f'<b style="font-size:14px;">{format_price(e["price"])}</b>' f'<b style="font-size:14px;">{format_price(e["price"])}</b>'
f'<span style="margin-left:8px;font-size:11px;background:{source_color};color:white;' f'<span style="margin-left:8px;font-size:11px;background:{source_color};color:white;'
f'padding:1px 6px;border-radius:3px;">{source_label}</span>{new_badge}<br>' f'padding:1px 6px;border-radius:3px;">{source_label}</span><br>'
f'<span style="color:#666;">{e["disposition"]} | {area_text} | {floor_text}</span>' f'<span style="color:#666;">{e["disposition"]} | {area_text} | {floor_text}</span>'
f'{floor_note}<br><br>' f'{floor_note}<br><br>'
f'<b>{e["locality"]}</b><br>' f'<b>{e["locality"]}</b><br>'
@@ -485,33 +438,27 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
popup = popup.replace("'", "\\'").replace("\n", "") popup = popup.replace("'", "\\'").replace("\n", "")
is_fav = source in ("psn", "cityhome") is_fav = source in ("psn", "cityhome")
marker_fn = "addHeartMarker" if is_fav else "addMarker"
if is_fav:
marker_fn = "addHeartMarker"
elif is_new:
marker_fn = "addNewMarker"
else:
marker_fn = "addMarker"
markers_js += ( markers_js += (
f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}');\n" f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}');\n"
) )
# Build legend — price per m² bands + disposition counts # Build legend
legend_items = price_legend_items legend_items = ""
# Disposition counts below the color legend
disp_counts = {} disp_counts = {}
for e in estates: for e in estates:
d = e["disposition"] d = e["disposition"]
disp_counts[d] = disp_counts.get(d, 0) + 1 disp_counts[d] = disp_counts.get(d, 0) + 1
disp_order = ["3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+"] for disp, color in color_map.items():
disp_summary = ", ".join( count = disp_counts.get(disp, 0)
f"{d} ({disp_counts[d]})" for d in disp_order if d in disp_counts if count > 0:
) legend_items += (
legend_items += ( f'<div style="display:flex;align-items:center;gap:6px;margin:3px 0;">'
f'<div style="margin-top:8px;padding-top:6px;border-top:1px solid #eee;' f'<span style="width:14px;height:14px;border-radius:50%;'
f'font-size:12px;color:#666;">{disp_summary}</div>' f'background:{color};display:inline-block;border:2px solid white;'
) f'box-shadow:0 1px 3px rgba(0,0,0,0.3);"></span>'
f'<span>{disp} ({count})</span></div>'
)
# Heart marker legend for PSN/CityHome # Heart marker legend for PSN/CityHome
fav_count = sum(1 for e in estates if e.get("source") in ("psn", "cityhome")) fav_count = sum(1 for e in estates if e.get("source") in ("psn", "cityhome"))
@@ -546,7 +493,6 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
body {{ font-family: system-ui, -apple-system, sans-serif; }} body {{ font-family: system-ui, -apple-system, sans-serif; }}
#map {{ width: 100%; height: 100vh; }} #map {{ width: 100%; height: 100vh; }}
.heart-icon {{ background: none !important; border: none !important; }} .heart-icon {{ background: none !important; border: none !important; }}
.star-icon {{ background: none !important; border: none !important; }}
.rate-btn:hover {{ background: #f0f0f0 !important; }} .rate-btn:hover {{ background: #f0f0f0 !important; }}
.rate-btn.active-fav {{ background: #FFF9C4 !important; border-color: #FFC107 !important; }} .rate-btn.active-fav {{ background: #FFF9C4 !important; border-color: #FFC107 !important; }}
.rate-btn.active-rej {{ background: #FFEBEE !important; border-color: #F44336 !important; }} .rate-btn.active-rej {{ background: #FFEBEE !important; border-color: #F44336 !important; }}
@@ -557,42 +503,13 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
}} }}
.marker-favorite {{ animation: pulse-glow 2s ease-in-out infinite; border-radius: 50%; }} .marker-favorite {{ animation: pulse-glow 2s ease-in-out infinite; border-radius: 50%; }}
.heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }} .heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }}
.heart-icon-rej {{ opacity: 0.4 !important; filter: grayscale(1); }} .heart-icon-rej {{ opacity: 0.2 !important; }}
.reject-overlay {{ background: none !important; border: none !important; pointer-events: none !important; }}
@keyframes pulse-new {{
0% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
50% {{ stroke-opacity: 0.4; stroke-width: 6px; r: 12; }}
100% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
}}
.marker-new {{ animation: pulse-new 2s ease-in-out infinite; }}
.info-panel {{ .info-panel {{
position: absolute; top: 10px; right: 10px; z-index: 1000; position: absolute; top: 10px; right: 10px; z-index: 1000;
background: white; padding: 16px; border-radius: 10px; background: white; padding: 16px; border-radius: 10px;
box-shadow: 0 2px 12px rgba(0,0,0,0.15); max-width: 260px; box-shadow: 0 2px 12px rgba(0,0,0,0.15); max-width: 260px;
font-size: 13px; line-height: 1.5; font-size: 13px; line-height: 1.5;
transition: transform 0.3s ease, opacity 0.3s ease;
}} }}
.info-panel.collapsed {{
transform: translateX(calc(100% + 20px));
opacity: 0; pointer-events: none;
}}
.panel-open-btn {{
position: absolute; top: 10px; right: 10px; z-index: 1001;
width: 40px; height: 40px; border-radius: 8px;
background: white; border: none; cursor: pointer;
box-shadow: 0 2px 12px rgba(0,0,0,0.15);
font-size: 20px; display: flex; align-items: center; justify-content: center;
transition: opacity 0.3s ease;
}}
.panel-open-btn.hidden {{ opacity: 0; pointer-events: none; }}
.panel-close-btn {{
position: absolute; top: 8px; right: 8px;
width: 28px; height: 28px; border-radius: 6px;
background: none; border: 1px solid #ddd; cursor: pointer;
font-size: 16px; display: flex; align-items: center; justify-content: center;
color: #888;
}}
.panel-close-btn:hover {{ background: #f0f0f0; color: #333; }}
.info-panel h2 {{ font-size: 16px; margin-bottom: 8px; }} .info-panel h2 {{ font-size: 16px; margin-bottom: 8px; }}
.info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }} .info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }}
.filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }} .filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }}
@@ -600,26 +517,18 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
.filter-section input[type="checkbox"] {{ accent-color: #1976D2; }} .filter-section input[type="checkbox"] {{ accent-color: #1976D2; }}
#floor-filter {{ margin-top: 8px; }} #floor-filter {{ margin-top: 8px; }}
#floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }} #floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }}
.status-link {{ display: block; margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; text-align: center; }}
.status-link a {{ color: #1976D2; text-decoration: none; font-size: 12px; }}
@media (max-width: 600px) {{
.info-panel {{ max-width: calc(100vw - 60px); right: 10px; }}
.info-panel.collapsed {{ transform: translateX(calc(100% + 20px)); }}
.panel-close-btn {{ top: 6px; right: 6px; }}
}}
</style> </style>
</head> </head>
<body> <body>
<div id="map"></div> <div id="map"></div>
<button class="panel-open-btn hidden" id="panel-open-btn" onclick="togglePanel()">☰</button> <div class="info-panel">
<div class="info-panel" id="info-panel">
<button class="panel-close-btn" id="panel-close-btn" onclick="togglePanel()">✕</button>
<h2>Byty v Praze</h2> <h2>Byty v Praze</h2>
<div class="stats"> <div class="stats">
<div>Celkem: <b id="visible-count">{len(estates)}</b> bytů</div> <div>Celkem: <b id="visible-count">{len(estates)}</b> bytů</div>
<div>Cena: {min_price}{max_price}</div> <div>Cena: {min_price}{max_price}</div>
<div>Průměr: {avg_price}</div> <div>Průměr: {avg_price}</div>
</div> </div>
<div><b>Dispozice:</b></div>
{legend_items} {legend_items}
<div class="filter-section"> <div class="filter-section">
<b>Filtry:</b> <b>Filtry:</b>
@@ -653,7 +562,6 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
Skrýt zamítnuté Skrýt zamítnuté
</label> </label>
</div> </div>
<div class="status-link"><a href="status.html">Scraper status</a></div>
</div> </div>
<script> <script>
@@ -689,23 +597,6 @@ function addMarker(lat, lon, color, popup, hashId) {{
marker.addTo(map); marker.addTo(map);
}} }}
function addNewMarker(lat, lon, color, popup, hashId) {{
var marker = L.circleMarker([lat, lon], {{
radius: 12,
fillColor: color,
color: color,
weight: 4,
opacity: 0.35,
fillOpacity: 0.95,
}}).bindPopup(popup);
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isNew: true }};
allMarkers.push(marker);
marker.addTo(map);
marker.on('add', function() {{
if (marker._path) marker._path.classList.add('marker-new');
}});
}}
function heartIcon(color) {{ function heartIcon(color) {{
var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">' var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">'
+ '<path d="M12 21.35l-1.45-1.32C5.4 15.36 2 12.28 2 8.5 ' + '<path d="M12 21.35l-1.45-1.32C5.4 15.36 2 12.28 2 8.5 '
@@ -721,21 +612,6 @@ function heartIcon(color) {{
}}); }});
}} }}
function starIcon() {{
var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="28" height="28" viewBox="0 0 24 24">'
+ '<path d="M12 2l3.09 6.26L22 9.27l-5 4.87L18.18 22 12 18.27 '
+ '5.82 22 7 14.14 2 9.27l6.91-1.01L12 2z" '
+ 'fill="#FFC107" stroke="#F57F17" stroke-width="1" '
+ 'filter="drop-shadow(0 1px 3px rgba(0,0,0,0.3))"/></svg>';
return L.divIcon({{
html: svg,
className: 'star-icon',
iconSize: [28, 28],
iconAnchor: [14, 14],
popupAnchor: [0, -14],
}});
}}
function addHeartMarker(lat, lon, color, popup, hashId) {{ function addHeartMarker(lat, lon, color, popup, hashId) {{
var marker = L.marker([lat, lon], {{ var marker = L.marker([lat, lon], {{
icon: heartIcon(color), icon: heartIcon(color),
@@ -761,36 +637,6 @@ function saveRatings(ratings) {{
localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings)); localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings));
}} }}
function addRejectStrike(marker) {{
removeRejectStrike(marker);
var color = marker._data.color || '#999';
// SVG "no entry" icon — circle with diagonal line, colored to match marker
var svg = '<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20" height="20">'
+ '<circle cx="12" cy="12" r="10" fill="none" stroke="' + color + '" stroke-width="2.5" opacity="0.85"/>'
+ '<line x1="5.5" y1="5.5" x2="18.5" y2="18.5" stroke="' + color + '" stroke-width="2.5" stroke-linecap="round" opacity="0.85"/>'
+ '</svg>';
var icon = L.divIcon({{
className: 'reject-overlay',
html: svg,
iconSize: [20, 20],
iconAnchor: [10, 10],
}});
var m = L.marker([marker._data.lat, marker._data.lon], {{
icon: icon,
interactive: false,
pane: 'markerPane',
}});
m.addTo(map);
marker._rejectStrike = m;
}}
function removeRejectStrike(marker) {{
if (marker._rejectStrike) {{
map.removeLayer(marker._rejectStrike);
marker._rejectStrike = null;
}}
}}
function applyMarkerStyle(marker, status) {{ function applyMarkerStyle(marker, status) {{
if (marker._data.isHeart) {{ if (marker._data.isHeart) {{
var el = marker._icon; var el = marker._icon;
@@ -805,59 +651,26 @@ function applyMarkerStyle(marker, status) {{
}} }}
}} else {{ }} else {{
if (status === 'fav') {{ if (status === 'fav') {{
removeRejectStrike(marker); marker.setStyle({{
if (!marker._data._origCircle) marker._data._origCircle = true; radius: 12, fillOpacity: 1, weight: 3,
var popup = marker.getPopup(); fillColor: marker._data.color, color: '#fff',
var popupContent = popup ? popup.getContent() : ''; }});
var wasOnMap = map.hasLayer(marker); if (marker._path) marker._path.classList.add('marker-favorite');
if (wasOnMap) map.removeLayer(marker);
var starMarker = L.marker([marker._data.lat, marker._data.lon], {{
icon: starIcon(),
}}).bindPopup(popupContent);
starMarker._data = marker._data;
var idx = allMarkers.indexOf(marker);
if (idx !== -1) allMarkers[idx] = starMarker;
if (wasOnMap) starMarker.addTo(map);
}} else if (status === 'reject') {{ }} else if (status === 'reject') {{
if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{ marker.setStyle({{
revertToCircle(marker, {{ radius: 6, fillOpacity: 0.35, fillColor: marker._data.color, color: '#fff', weight: 1 }}); radius: 6, fillOpacity: 0.15, fillColor: '#999', color: '#bbb', weight: 1,
}} else {{ }});
marker.setStyle({{ if (marker._path) marker._path.classList.remove('marker-favorite');
radius: 6, fillOpacity: 0.35, fillColor: marker._data.color, color: '#fff', weight: 1,
}});
if (marker._path) marker._path.classList.remove('marker-favorite');
}}
// Add strikethrough line over the marker
addRejectStrike(marker);
}} else {{ }} else {{
if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{ marker.setStyle({{
revertToCircle(marker, {{ radius: 8, fillColor: marker._data.color, color: '#fff', weight: 2, fillOpacity: 0.85 }}); radius: 8, fillColor: marker._data.color, color: '#fff',
}} else {{ weight: 2, fillOpacity: 0.85,
marker.setStyle({{ }});
radius: 8, fillColor: marker._data.color, color: '#fff', if (marker._path) marker._path.classList.remove('marker-favorite');
weight: 2, fillOpacity: 0.85,
}});
if (marker._path) marker._path.classList.remove('marker-favorite');
}}
if (marker._path) marker._path.classList.remove('marker-rejected');
removeRejectStrike(marker);
}} }}
}} }}
}} }}
function revertToCircle(marker, style) {{
var popup = marker.getPopup();
var popupContent = popup ? popup.getContent() : '';
var wasOnMap = map.hasLayer(marker);
if (wasOnMap) map.removeLayer(marker);
var cm = L.circleMarker([marker._data.lat, marker._data.lon], style).bindPopup(popupContent);
cm._data = marker._data;
delete cm._data._starRef;
var idx = allMarkers.indexOf(marker);
if (idx !== -1) allMarkers[idx] = cm;
if (wasOnMap) cm.addTo(map);
}}
function rateMarker(marker, action) {{ function rateMarker(marker, action) {{
var hashId = marker._data.hashId; var hashId = marker._data.hashId;
var ratings = loadRatings(); var ratings = loadRatings();
@@ -1019,12 +832,8 @@ function applyFilters() {{
if (show) {{ if (show) {{
if (!map.hasLayer(m)) m.addTo(map); if (!map.hasLayer(m)) m.addTo(map);
visible++; visible++;
// Show strike line if rejected and visible
if (m._rejectStrike && !map.hasLayer(m._rejectStrike)) m._rejectStrike.addTo(map);
}} else {{ }} else {{
if (map.hasLayer(m)) map.removeLayer(m); if (map.hasLayer(m)) map.removeLayer(m);
// Hide strike line when marker hidden
if (m._rejectStrike && map.hasLayer(m._rejectStrike)) map.removeLayer(m._rejectStrike);
}} }}
}}); }});
@@ -1042,26 +851,6 @@ function applyFilters() {{
// Initialize ratings on load // Initialize ratings on load
restoreRatings(); restoreRatings();
// ── Panel toggle ──────────────────────────────────────────────
function togglePanel() {{
var panel = document.getElementById('info-panel');
var openBtn = document.getElementById('panel-open-btn');
var isOpen = !panel.classList.contains('collapsed');
if (isOpen) {{
panel.classList.add('collapsed');
openBtn.classList.remove('hidden');
}} else {{
panel.classList.remove('collapsed');
openBtn.classList.add('hidden');
}}
}}
// On mobile, start with panel collapsed
if (window.innerWidth <= 600) {{
document.getElementById('info-panel').classList.add('collapsed');
document.getElementById('panel-open-btn').classList.remove('hidden');
}}
</script> </script>
</body> </body>
</html>""" </html>"""

View File

@@ -7,7 +7,6 @@ Výstup: byty_bezrealitky.json
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from datetime import datetime
import json import json
import logging import logging
import math import math
@@ -356,7 +355,6 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}", "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
"source": "bezrealitky", "source": "bezrealitky",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -12,7 +12,6 @@ import logging
import re import re
import time import time
import urllib.request import urllib.request
from datetime import datetime
from pathlib import Path from pathlib import Path
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -34,26 +33,24 @@ HEADERS = {
BASE_URL = "https://www.city-home.cz" BASE_URL = "https://www.city-home.cz"
def fetch_url(url: str, retries: int = 3) -> str: def fetch_url(url: str) -> str:
"""Fetch URL and return HTML string. Raises HTTPError on 4xx/5xx.""" """Fetch URL and return HTML string."""
for attempt in range(retries): for attempt in range(3):
try: try:
logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}") logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}")
logger.debug(f"Headers: {HEADERS}")
req = urllib.request.Request(url, headers=HEADERS) req = urllib.request.Request(url, headers=HEADERS)
resp = urllib.request.urlopen(req, timeout=30) resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8") html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
return html return html
except urllib.error.HTTPError:
# Don't retry on HTTP errors (404, 403, etc.) — re-raise immediately
raise
except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e: except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
if attempt < retries - 1: if attempt < 2:
wait = (attempt + 1) * 2 wait = (attempt + 1) * 2
logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}") logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}")
time.sleep(wait) time.sleep(wait)
else: else:
logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True) logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True)
raise raise
@@ -127,21 +124,31 @@ def parse_filter_page(html: str) -> list[dict]:
if detail_url and not detail_url.startswith("http"): if detail_url and not detail_url.startswith("http"):
detail_url = BASE_URL + detail_url detail_url = BASE_URL + detail_url
# Parse table cells: [unit_name, unit_type_label, address, floor, disposition, area, transaction, price] # Extract floor from cells — look for pattern like "3.NP" or "2.PP"
cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL) cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
cell_texts = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]
# Cell[2] = address (e.g. "Žateckých 14"), cell[3] = floor (e.g. "3.NP")
project_address = cell_texts[2] if len(cell_texts) > 2 else ""
floor = None floor = None
if len(cell_texts) > 3: floor_text = ""
np_match = re.search(r'(\d+)\.\s*NP', cell_texts[3]) project_name = ""
pp_match = re.search(r'(\d+)\.\s*PP', cell_texts[3])
for cell in cells:
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
# Floor pattern
np_match = re.search(r'(\d+)\.\s*NP', cell_text)
pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
if np_match: if np_match:
floor = int(np_match.group(1)) floor = int(np_match.group(1))
floor_text = cell_text
elif pp_match: elif pp_match:
floor = -int(pp_match.group(1)) floor = -int(pp_match.group(1)) # Underground
floor_text = cell_text
# Extract project name — usually in a cell that's not a number/price/floor
for cell in cells:
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "" not in cell_text and "" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text:
if len(cell_text) > 3 and cell_text != unit_name:
project_name = cell_text
break
listing = { listing = {
"price": int(cena.group(1)), "price": int(cena.group(1)),
@@ -151,55 +158,27 @@ def parse_filter_page(html: str) -> list[dict]:
"project_id": project.group(1) if project else "", "project_id": project.group(1) if project else "",
"transaction": transaction.group(1) if transaction else "", "transaction": transaction.group(1) if transaction else "",
"disposition": dispozition.group(1) if dispozition else "", "disposition": dispozition.group(1) if dispozition else "",
"location": location.group(1) if location else "",
"url": detail_url, "url": detail_url,
"unit_name": unit_name, "unit_name": unit_name,
"floor": floor, "floor": floor,
"project_address": project_address, "project_name": project_name,
} }
listings.append(listing) listings.append(listing)
return listings return listings
def get_lokalita_urls(slug: str) -> list[str]: def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
"""Return candidate lokalita URLs to try in order.""" """Extract GPS coordinates for projects from locality pages."""
return [ # Pattern in JS: ['<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name']
f"{BASE_URL}/projekty/{slug}/lokalita", gps_data = {}
f"{BASE_URL}/bytove-domy/{slug}/lokalita", for match in re.finditer(r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'", html):
f"{BASE_URL}/bytove-domy/{slug}/lokalita1", name = match.group(1).strip()
] lat = float(match.group(2))
lon = float(match.group(3))
gps_data[name] = (lat, lon)
def extract_project_gps(html: str) -> tuple[float, float] | None: return gps_data
"""Extract project GPS from lokalita page JS variable.
The page contains: var locations = [['<h4>Name</h4>...', 'LAT', 'LNG', 'CATEGORY', 'Label'], ...]
Category '1' = the project's own marker. Some projects have two cat-1 entries (data error);
in that case we pick the one whose name contains a digit and is not a transit landmark.
"""
block = re.search(r'var locations\s*=\s*\[(.*?)\];', html, re.DOTALL)
if not block:
return None
entries = re.findall(
r"'<h4>(.*?)</h4>.*?',\s*'([\d.]+)',\s*'([\d.]+)',\s*'1'",
block.group(0),
re.DOTALL,
)
if not entries:
return None
if len(entries) == 1:
return float(entries[0][1]), float(entries[0][2])
# Multiple cat-1 entries: pick the real project marker
transit_re = re.compile(r'nádraží|park|metro|tramvaj|autobus|zastávka', re.IGNORECASE)
for name, lat, lng in entries:
if re.search(r'\d', name) and not transit_re.search(name):
return float(lat), float(lng)
# Fallback: first entry
return float(entries[0][1]), float(entries[0][2])
def scrape(max_pages: int | None = None, max_properties: int | None = None): def scrape(max_pages: int | None = None, max_properties: int | None = None):
@@ -231,24 +210,22 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
# Fetch GPS for each project from locality pages # Fetch GPS for each project from locality pages
project_gps = {} project_gps = {}
for slug in sorted(project_slugs): for slug in sorted(project_slugs):
time.sleep(0.3) time.sleep(0.5)
gps = None try:
for url in get_lokalita_urls(slug): locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
try: logger.debug(f"Fetching project GPS: {locality_url}")
logger.debug(f"Fetching project GPS: {url}") loc_html = fetch_url(locality_url)
loc_html = fetch_url(url) gps = extract_project_gps(loc_html)
gps = extract_project_gps(loc_html) if gps:
if gps: # Take first entry (the project itself)
break first_name, (lat, lon) = next(iter(gps.items()))
except Exception as e: project_gps[slug] = (lat, lon)
logger.debug(f"GPS fetch failed for {url}: {e}") logger.info(f"{slug}: {lat}, {lon}")
continue else:
logger.info(f"{slug}: GPS nenalezeno")
if gps: except Exception as e:
project_gps[slug] = gps logger.warning(f"Error fetching GPS for {slug}: {e}", exc_info=True)
logger.info(f" {slug}: {gps[0]}, {gps[1]}") logger.info(f" {slug}: chyba ({e})")
else:
logger.info(f"{slug}: GPS nenalezeno")
# Step 3: Filter listings # Step 3: Filter listings
logger.info(f"\nFáze 3: Filtrování...") logger.info(f"\nFáze 3: Filtrování...")
@@ -326,37 +303,22 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
lat, lon = gps lat, lon = gps
# locality: use project address from cell (e.g. "Žateckých 14") + city from GPS lookup
project_address = listing.get("project_address", "")
# derive city from slug (GPS lookup key)
city_map = {
"karlinske-namesti-5": "Praha 8",
"melnicka-12": "Praha 7",
"na-vaclavce-34": "Praha 5",
"nad-kajetankou-12": "Praha 6",
"vosmikovych-3": "Praha 9",
"zateckych-14": "Praha 2",
}
city_str = city_map.get(slug, "Praha")
locality_str = f"{project_address}, {city_str}" if project_address else city_str
result = { result = {
"hash_id": f"cityhome_{slug}_{listing['unit_name']}", "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
"name": f"Prodej bytu {disp}, {int(area)} m² — {project_address}", "name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
"price": price, "price": price,
"price_formatted": format_price(price), "price_formatted": format_price(price),
"locality": locality_str, "locality": f"{listing['project_name']}, Praha",
"lat": lat, "lat": lat,
"lon": lon, "lon": lon,
"disposition": disp, "disposition": disp,
"floor": floor, "floor": floor,
"area": float(area), "area": area,
"building_type": "Cihlová", # CityHome renovuje cihlové domy "building_type": "Cihlová", # CityHome renovuje cihlové domy
"ownership": "neuvedeno", "ownership": "neuvedeno",
"url": url, "url": url,
"source": "cityhome", "source": "cityhome",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -7,7 +7,6 @@ Výstup: byty_idnes.json
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from datetime import datetime
import json import json
import logging import logging
import math import math
@@ -459,7 +458,6 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"url": item["url"], "url": item["url"],
"source": "idnes", "source": "idnes",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
PSN.cz scraper. PSN.cz scraper.
Stáhne byty na prodej z API /api/units-list — jeden požadavek, žádné stránkování. Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií.
Výstup: byty_psn.json Výstup: byty_psn.json
""" """
from __future__ import annotations from __future__ import annotations
@@ -12,9 +12,7 @@ import logging
import re import re
import subprocess import subprocess
import time import time
from datetime import datetime
from pathlib import Path from pathlib import Path
from urllib.parse import urlencode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -24,37 +22,82 @@ MAX_PRICE = 14_000_000
MIN_AREA = 69 MIN_AREA = 69
MIN_FLOOR = 2 MIN_FLOOR = 2
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1", "5+kk a větší"} WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
# Pouze Praha — ostatní města (Brno, Pardubice, Špindlerův Mlýn) přeskočit
WANTED_CITIES = {"Praha"}
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
BASE_URL = "https://psn.cz" BASE_URL = "https://psn.cz"
UNITS_API = f"{BASE_URL}/api/units-list"
# Known Prague project slugs with GPS (from research)
PRAGUE_PROJECTS = [
{"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
{"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
{"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
{"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
{"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
{"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
{"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
{"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
{"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
{"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
{"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
{"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
{"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
{"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
{"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
{"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
]
def fetch_json(url: str) -> dict: def fetch_url(url: str) -> str:
"""Fetch JSON via curl (urllib SSL may fail on Cloudflare).""" """Fetch URL via curl (urllib SSL too old for Cloudflare)."""
logger.debug(f"HTTP GET: {url}") logger.debug(f"HTTP GET request (via curl): {url}")
logger.debug(f"User-Agent: {UA}")
result = subprocess.run( result = subprocess.run(
["curl", "-s", "-L", "--max-time", "30", ["curl", "-s", "-L", "--max-time", "30",
"-H", f"User-Agent: {UA}", "-H", f"User-Agent: {UA}",
"-H", "Accept: application/json", "-H", "Accept: text/html",
url], url],
capture_output=True, text=True, timeout=60 capture_output=True, text=True, timeout=60
) )
if result.returncode != 0: if result.returncode != 0:
logger.error(f"curl failed (return code {result.returncode}): {result.stderr[:200]}")
raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}") raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
return json.loads(result.stdout) logger.debug(f"HTTP response: size={len(result.stdout)} bytes")
return result.stdout
def fix_gps(lat, lng): def extract_units_from_html(html: str) -> list[dict]:
"""PSN má u některých projektů prohozené lat/lng — opravíme.""" """Extract unit JSON objects from raw HTML with escaped quotes."""
if lat is not None and lng is not None and lat < 20 and lng > 20: # The HTML contains RSC data with escaped JSON: \\"key\\":\\"value\\"
return lng, lat # Step 1: Unescape the double-backslash-quotes to regular quotes
return lat, lng cleaned = html.replace('\\"', '"')
# Step 2: Find each unit by looking for "title":"Byt and walking back to {
units = []
decoder = json.JSONDecoder()
for m in re.finditer(r'"title":"Byt', cleaned):
pos = m.start()
# Walk backwards to find the opening brace
depth = 0
found = False
for i in range(pos - 1, max(pos - 3000, 0), -1):
if cleaned[i] == '}':
depth += 1
elif cleaned[i] == '{':
if depth == 0:
try:
obj, end = decoder.raw_decode(cleaned, i)
if isinstance(obj, dict) and 'price_czk' in obj:
units.append(obj)
found = True
except (json.JSONDecodeError, ValueError):
pass
break
depth -= 1
return units
def format_price(price: int) -> str: def format_price(price: int) -> str:
@@ -66,178 +109,209 @@ def format_price(price: int) -> str:
return " ".join(reversed(parts)) + "" return " ".join(reversed(parts)) + ""
def scrape(max_properties: int | None = None): def scrape(max_pages: int | None = None, max_properties: int | None = None):
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Stahuji inzeráty z PSN.cz") logger.info("Stahuji inzeráty z PSN.cz")
logger.info(f"Cena: do {format_price(MAX_PRICE)}") logger.info(f"Cena: do {format_price(MAX_PRICE)}")
logger.info(f"Min. plocha: {MIN_AREA}") logger.info(f"Min. plocha: {MIN_AREA}")
logger.info(f"Patro: od {MIN_FLOOR}. NP") logger.info(f"Patro: od {MIN_FLOOR}. NP")
logger.info(f"Region: Praha") logger.info(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
if max_pages:
logger.info(f"Max. stran: {max_pages}")
if max_properties: if max_properties:
logger.info(f"Max. bytů: {max_properties}") logger.info(f"Max. bytů: {max_properties}")
logger.info("=" * 60) logger.info("=" * 60)
# Jediný API požadavek — vrátí všechny jednotky (cca 236) # Fetch units from each Prague project
params = urlencode({ all_units = []
"locale": "cs",
"filters": "{}",
"type": "list",
"order": "price-asc",
"offset": 0,
"limit": 500,
})
url = f"{UNITS_API}?{params}"
logger.info("Stahuji jednotky z API ...")
try: for proj in PRAGUE_PROJECTS:
data = fetch_json(url) page = 1
except Exception as e: project_units = []
logger.error(f"Chyba při stahování: {e}", exc_info=True)
return []
all_units = data.get("units", {}).get("data", []) while True:
logger.info(f"Staženo jednotek celkem: {len(all_units)}") if max_pages and page > max_pages:
logger.debug(f"Max pages limit reached: {max_pages}")
break
url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
logger.info(f"{proj['name']} — strana {page} ...")
time.sleep(0.5)
# Filtrování try:
html = fetch_url(url)
except Exception as e:
logger.error(f"Fetch error for {proj['name']}: {e}", exc_info=True)
break
units = extract_units_from_html(html)
logger.debug(f"Project {proj['slug']} page {page}: extracted {len(units)} units")
if not units:
if page == 1:
logger.info(f"→ 0 jednotek")
break
# Add project info to each unit
for unit in units:
if not unit.get("latitude") or not unit.get("longitude"):
unit["latitude"] = proj["lat"]
unit["longitude"] = proj["lon"]
unit["_project_name"] = proj["name"]
unit["_project_slug"] = proj["slug"]
project_units.extend(units)
if page == 1:
logger.info(f"{len(units)} jednotek na stránce")
# Check if there might be more pages
# If we got fewer than expected or same units, stop
if len(units) < 10:
break
page += 1
if page > 10: # Safety limit
break
all_units.extend(project_units)
# Deduplicate by slug
seen_slugs = set()
unique_units = []
for u in all_units:
slug = u.get("slug", "")
if slug and slug not in seen_slugs:
seen_slugs.add(slug)
unique_units.append(u)
elif not slug:
unique_units.append(u)
logger.info(f"\nStaženo celkem: {len(unique_units)} unikátních jednotek")
# Filter
logger.info(f"\nFiltrování...")
results = [] results = []
excluded = { excluded_sold = 0
"prodáno": 0, excluded_type = 0
"typ": 0, excluded_disp = 0
"město": 0, excluded_price = 0
"dispozice": 0, excluded_area = 0
"cena": 0, excluded_floor = 0
"plocha": 0, excluded_panel = 0
"patro": 0,
}
properties_fetched = 0 properties_fetched = 0
for unit in all_units: for unit in unique_units:
if max_properties and properties_fetched >= max_properties: if max_properties and properties_fetched >= max_properties:
logger.debug(f"Max properties limit reached: {max_properties}")
break break
unit_id = unit.get("id", unit.get("slug", "unknown"))
unit_id = unit.get("id", "?") # Only free units
# Pouze prodej bytů (type_id=0)
if unit.get("type_id") != 0:
excluded["typ"] += 1
logger.debug(f"id={unit_id}: přeskočen (type_id={unit.get('type_id')}, není prodej bytu)")
continue
# Pouze volné (ne rezervované, prodané, v přípravě)
sale_status = unit.get("sale_status", "")
is_free = unit.get("is_free", False) is_free = unit.get("is_free", False)
is_sold = unit.get("is_sold", False) is_sold = unit.get("is_sold", False)
if is_sold or not is_free: if is_sold or not is_free:
excluded["prodáno"] += 1 excluded_sold += 1
logger.debug(f"id={unit_id}: přeskočen (status={sale_status})") logger.debug(f"Filter: id={unit_id} - excluded (sold/not free)")
continue continue
# Pouze Praha # Only apartments
city = (unit.get("location") or unit.get("address", {}).get("city") or "").strip() category = str(unit.get("category", "")).lower()
# location field je typicky "Praha 4", "Praha 7" atd. if "byt" not in category and "ateliér" not in category:
city_base = city.split(" ")[0] if city else "" excluded_type += 1
if city_base not in WANTED_CITIES: logger.debug(f"Filter: id={unit_id} - excluded (not apartment, category={category})")
excluded["město"] += 1
logger.debug(f"id={unit_id}: přeskočen (město={city})")
continue continue
# Dispozice # Disposition
disp = unit.get("disposition", "") disp = unit.get("disposition", "")
if disp not in WANTED_DISPOSITIONS: if disp not in WANTED_DISPOSITIONS:
excluded["dispozice"] += 1 excluded_disp += 1
logger.debug(f"id={unit_id}: přeskočen (dispozice={disp})") logger.debug(f"Filter: id={unit_id} - excluded (disposition {disp})")
continue continue
# Cena # Price
price = unit.get("action_price_czk") or unit.get("price_czk") or 0 price = unit.get("price_czk") or unit.get("action_price_czk") or 0
if not price or price <= 0 or price > MAX_PRICE: if price <= 0 or price > MAX_PRICE:
excluded["cena"] += 1 excluded_price += 1
logger.debug(f"id={unit_id}: přeskočen (cena={price})") logger.debug(f"Filter: id={unit_id} - excluded (price {price})")
continue continue
# Plocha # Area
area = unit.get("total_area") or unit.get("floor_area") or 0 area = unit.get("total_area") or unit.get("floor_area") or 0
if area < MIN_AREA: if area < MIN_AREA:
excluded["plocha"] += 1 excluded_area += 1
logger.debug(f"id={unit_id}: přeskočen (plocha={area} m²)") logger.debug(f"Filter: id={unit_id} - excluded (area {area} m²)")
continue continue
# Patro # Floor
floor_str = str(unit.get("floor", "")) floor_str = str(unit.get("floor", ""))
floor = None floor = None
if floor_str: if floor_str:
try: try:
floor = int(floor_str) floor = int(floor_str)
except ValueError: except ValueError:
m = re.search(r'(-?\d+)', floor_str) floor_match = re.search(r'(-?\d+)', floor_str)
if m: if floor_match:
floor = int(m.group(1)) floor = int(floor_match.group(1))
if floor is not None and floor < MIN_FLOOR: if floor is not None and floor < MIN_FLOOR:
excluded["patro"] += 1 excluded_floor += 1
logger.debug(f"id={unit_id}: přeskočen (patro={floor})") logger.debug(f"Filter: id={unit_id} - excluded (floor {floor})")
continue continue
# GPS — opravit prohozené souřadnice # Construction — check for panel
lat_raw = unit.get("latitude") build_type = str(unit.get("build_type", "")).lower()
lng_raw = unit.get("longitude") if "panel" in build_type:
lat, lng = fix_gps(lat_raw, lng_raw) excluded_panel += 1
if not lat or not lng: logger.debug(f"Filter: id={unit_id} - excluded (panel construction)")
logger.warning(f"id={unit_id}: chybí GPS souřadnice, přeskakuji") logger.info(f"✗ Vyloučen: panel ({build_type})")
continue continue
# Sestavit adresu pro locality # Build construction label
addr = unit.get("address") or {} building_type = "neuvedeno"
street = addr.get("street", "") if build_type and build_type != "nevybráno":
street_no = addr.get("street_no", "") if "cihlo" in build_type or "cihla" in build_type:
if street and street_no: building_type = "Cihlová"
locality_str = f"{street} {street_no}, {city}" elif "skelet" in build_type:
elif street: building_type = "Skeletová"
locality_str = f"{street}, {city}" else:
else: building_type = build_type.capitalize()
project_name = unit.get("project", "")
locality_str = f"{project_name}, {city}" if project_name else city
# URL na detail jednotky lat = unit.get("latitude", 0)
unit_slug = unit.get("slug", "") lon = unit.get("longitude", 0)
project_slug = ""
# project_slug lze odvodit z projektu nebo z reference_no slug = unit.get("slug", "")
# API nevrací project_slug přímo — použijeme reference_no nebo jen ID project_slug = unit.get("_project_slug", "")
reference_no = unit.get("reference_no", "") detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}"
if unit_slug:
detail_url = f"{BASE_URL}/prodej/{unit_slug}"
elif reference_no:
detail_url = f"{BASE_URL}/prodej/{reference_no}"
else:
detail_url = BASE_URL
result = { result = {
"hash_id": str(unit_id), "hash_id": unit.get("id", slug),
"name": f"Prodej bytu {disp}, {int(area)} m² — {unit.get('project', locality_str)}", "name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}",
"price": int(price), "price": int(price),
"price_formatted": format_price(int(price)), "price_formatted": format_price(int(price)),
"locality": locality_str, "locality": f"{unit.get('street', unit.get('_project_name', ''))}, Praha",
"lat": lat, "lat": lat,
"lon": lng, "lon": lon,
"disposition": disp, "disposition": disp,
"floor": floor, "floor": floor,
"area": float(area), "area": area,
"building_type": "neuvedeno", "building_type": building_type,
"ownership": "osobní", "ownership": unit.get("ownership", "neuvedeno") or "neuvedeno",
"url": detail_url, "url": detail_url,
"source": "psn", "source": "psn",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1
logger.info(f"\n{'=' * 60}") logger.info(f"\n{'=' * 60}")
logger.info(f"Výsledky PSN:") logger.info(f"Výsledky PSN:")
logger.info(f" Staženo jednotek: {len(all_units)}") logger.info(f" Celkem jednotek: {len(unique_units)}")
for reason, count in excluded.items(): logger.info(f" Vyloučeno (prodáno): {excluded_sold}")
if count: logger.info(f" Vyloučeno (typ): {excluded_type}")
logger.info(f" Vyloučeno ({reason}): {count}") logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
logger.info(f" Vyloučeno (cena): {excluded_price}")
logger.info(f" Vyloučeno (plocha): {excluded_area}")
logger.info(f" Vyloučeno (patro): {excluded_floor}")
logger.info(f" Vyloučeno (panel): {excluded_panel}")
logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}") logger.info(f"{'=' * 60}")
@@ -246,13 +320,15 @@ def scrape(max_properties: int | None = None):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz") parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
parser.add_argument("--max-pages", type=int, default=None,
help="Maximum number of listing pages per project to scrape")
parser.add_argument("--max-properties", type=int, default=None, parser.add_argument("--max-properties", type=int, default=None,
help="Maximum number of properties to include in results") help="Maximum number of properties to include in results")
parser.add_argument("--log-level", type=str, default="INFO", parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)") help="Logging level (default: INFO)")
args = parser.parse_args() args = parser.parse_args()
# Configure logging
logging.basicConfig( logging.basicConfig(
level=getattr(logging, args.log_level), level=getattr(logging, args.log_level),
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s", format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
@@ -260,7 +336,7 @@ if __name__ == "__main__":
) )
start = time.time() start = time.time()
estates = scrape(max_properties=args.max_properties) estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
if estates: if estates:
json_path = Path("byty_psn.json") json_path = Path("byty_psn.json")
@@ -270,6 +346,6 @@ if __name__ == "__main__":
) )
elapsed = time.time() - start elapsed = time.time() - start
logger.info(f"\n✓ Data uložena: {json_path.resolve()}") logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
logger.info(f"⏱ Celkový čas: {elapsed:.1f} s") logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
else: else:
logger.info("\nŽádné byty z PSN neodpovídají kritériím :(") logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")

View File

@@ -7,7 +7,6 @@ Výstup: byty_realingo.json
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from datetime import datetime
import json import json
import logging import logging
import math import math
@@ -315,7 +314,6 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"url": f"{BASE_URL}{item['url']}", "url": f"{BASE_URL}{item['url']}",
"source": "realingo", "source": "realingo",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -1,204 +0,0 @@
<!DOCTYPE html>
<html lang="cs">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Scraper status</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: system-ui, -apple-system, sans-serif;
background: #f5f5f5; color: #333;
padding: 24px; max-width: 640px; margin: 0 auto;
}
h1 { font-size: 22px; margin-bottom: 4px; }
.subtitle { color: #888; font-size: 13px; margin-bottom: 24px; }
.card {
background: white; border-radius: 12px; padding: 20px;
box-shadow: 0 1px 4px rgba(0,0,0,0.08); margin-bottom: 16px;
}
.card h2 { font-size: 15px; margin-bottom: 12px; color: #555; }
.timestamp {
font-size: 28px; font-weight: 700; color: #1976D2;
}
.timestamp-ago { font-size: 13px; color: #999; margin-top: 2px; }
/* Source table */
.source-table { width: 100%; border-collapse: collapse; }
.source-table td { padding: 8px 0; border-bottom: 1px solid #f0f0f0; font-size: 14px; }
.source-table tr:last-child td { border-bottom: none; }
.source-table .name { font-weight: 600; }
.source-table .count { text-align: right; font-variant-numeric: tabular-nums; }
.source-table .rejected { text-align: right; color: #999; font-size: 12px; }
.badge {
display: inline-block; padding: 2px 8px; border-radius: 4px;
font-size: 11px; font-weight: 600; color: white;
}
.badge-ok { background: #4CAF50; }
.badge-err { background: #F44336; }
.badge-skip { background: #FF9800; }
/* Summary bar */
.summary-row {
display: flex; justify-content: space-between; align-items: center;
padding: 10px 0; border-bottom: 1px solid #f0f0f0;
}
.summary-row:last-child { border-bottom: none; }
.summary-label { font-size: 13px; color: #666; }
.summary-value { font-size: 18px; font-weight: 700; }
/* Source bar chart */
.bar-row { display: flex; align-items: center; gap: 8px; margin: 4px 0; }
.bar-label { width: 90px; font-size: 12px; text-align: right; color: #666; }
.bar-track { flex: 1; height: 20px; background: #f0f0f0; border-radius: 4px; overflow: hidden; position: relative; }
.bar-fill { height: 100%; border-radius: 4px; transition: width 0.5s ease; }
.bar-count { font-size: 12px; width: 36px; font-variant-numeric: tabular-nums; }
/* Loader */
.loader-wrap {
display: flex; flex-direction: column; align-items: center;
justify-content: center; padding: 60px 0;
}
.spinner {
width: 40px; height: 40px; border: 4px solid #e0e0e0;
border-top-color: #1976D2; border-radius: 50%;
animation: spin 0.8s linear infinite;
}
@keyframes spin { to { transform: rotate(360deg); } }
.loader-text { margin-top: 16px; color: #999; font-size: 14px; }
.error-msg { color: #F44336; padding: 40px 0; text-align: center; }
.link-row { text-align: center; margin-top: 8px; }
.link-row a { color: #1976D2; text-decoration: none; font-size: 14px; }
</style>
</head>
<body>
<h1>Scraper status</h1>
<div class="subtitle">maru-hleda-byt</div>
<div id="content">
<div class="loader-wrap">
<div class="spinner"></div>
<div class="loader-text">Nacitam status...</div>
</div>
</div>
<div class="link-row"><a href="mapa_bytu.html">Otevrit mapu</a></div>
<script>
// Bar colour for each scraper source, keyed by the lower-cased source name.
// render() looks a source up here and falls back to '#999' when there is no entry.
var COLORS = {
sreality: '#1976D2',
realingo: '#7B1FA2',
bezrealitky: '#E65100',
idnes: '#C62828',
psn: '#2E7D32',
cityhome: '#00838F',
};
/**
 * Describe how long ago a timestamp occurred, as a short (ASCII) Czech phrase.
 *
 * @param {string|number|Date} dateStr - Any value accepted by `new Date()`.
 * @returns {string} 'prave ted' for anything under a minute (including
 *   timestamps in the future), then whole minutes, hours or days;
 *   an empty string when the input cannot be parsed as a date.
 */
function timeAgo(dateStr) {
  var then = new Date(dateStr);
  var elapsedSec = Math.floor((Date.now() - then.getTime()) / 1000);

  // An unparseable timestamp yields NaN, which fails every comparison below
  // and would otherwise surface to the user as "NaN dni zpet".
  if (isNaN(elapsedSec)) return '';

  if (elapsedSec < 60) return 'prave ted';
  if (elapsedSec < 3600) return Math.floor(elapsedSec / 60) + ' min zpet';
  if (elapsedSec < 86400) return Math.floor(elapsedSec / 3600) + ' hod zpet';
  return Math.floor(elapsedSec / 86400) + ' dni zpet';
}
/**
 * Format a timestamp as "D. <month> YYYY, HH:MM" in the browser's local time
 * zone, using (ASCII) Czech genitive month names.
 *
 * @param {string|number|Date} dateStr - Any value accepted by `new Date()`.
 * @returns {string} The formatted date, or an empty string when the input
 *   cannot be parsed as a date.
 */
function formatDate(dateStr) {
  var MONTHS = ['ledna', 'unora', 'brezna', 'dubna', 'kvetna', 'cervna',
    'cervence', 'srpna', 'zari', 'rijna', 'listopadu', 'prosince'];

  var d = new Date(dateStr);
  // Without this guard an invalid date renders as "NaN. undefined NaN, NaN:NaN".
  if (isNaN(d.getTime())) return '';

  function pad2(n) {
    return String(n).padStart(2, '0');
  }

  return d.getDate() + '. ' + MONTHS[d.getMonth()] + ' ' + d.getFullYear() +
    ', ' + pad2(d.getHours()) + ':' + pad2(d.getMinutes());
}
/**
 * Render the scraper status payload into the #content element.
 *
 * Fields read from `data`:
 *   - status        'running' shows a spinner and re-polls via loadStatus()
 *                   after 30 s; any other value renders the report.
 *   - timestamp     passed to formatDate() and timeAgo().
 *   - duration_sec  optional; shown rounded to whole seconds when truthy.
 *   - deduplicated  optional; shown when not undefined.
 *   - sources       optional array of { name, accepted, rejected, error }.
 *
 * Per-source values are normalised before use: missing or non-numeric
 * `accepted` / `rejected` count as 0 and a missing `name` becomes '', so one
 * incomplete entry can neither produce a "NaN%" bar width nor throw and blank
 * the whole report. Text taken from the payload is HTML-escaped before it is
 * written through innerHTML.
 *
 * @param {Object} data - Parsed contents of status.json.
 */
function render(data) {
  var content = document.getElementById('content');

  // Escape text that originates from the payload before it reaches innerHTML.
  function escapeHtml(value) {
    return String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }

  // Check if scrape is currently running
  if (data.status === 'running') {
    content.innerHTML =
      '<div class="loader-wrap">' +
      '<div class="spinner"></div>' +
      '<div class="loader-text">Scraper prave bezi...</div>' +
      '</div>';
    setTimeout(loadStatus, 30000);
    return;
  }

  // Normalise every source once so totals, badge, bar width and the printed
  // counts all work from the same numbers.
  var sources = (data.sources || []).map(function (s) {
    return {
      name: s.name == null ? '' : String(s.name),
      accepted: Number(s.accepted) || 0,
      rejected: Number(s.rejected) || 0,
      error: s.error
    };
  });

  var totalOk = 0, totalRej = 0;
  var maxCount = 0; // largest accepted count; the bar widths are relative to it
  sources.forEach(function (s) {
    totalOk += s.accepted;
    totalRej += s.rejected;
    if (s.accepted > maxCount) maxCount = s.accepted;
  });

  var html = '';

  // Timestamp card
  html += '<div class="card">';
  html += '<h2>Posledni scrape</h2>';
  html += '<div class="timestamp">' + formatDate(data.timestamp) + '</div>';
  html += '<div class="timestamp-ago">' + timeAgo(data.timestamp) + '</div>';
  if (data.duration_sec) {
    html += '<div class="timestamp-ago">Trvani: ' + Math.round(data.duration_sec) + 's</div>';
  }
  html += '</div>';

  // Summary card
  html += '<div class="card">';
  html += '<h2>Souhrn</h2>';
  html += '<div class="summary-row"><span class="summary-label">Vyhovujicich bytu</span><span class="summary-value" style="color:#4CAF50">' + totalOk + '</span></div>';
  html += '<div class="summary-row"><span class="summary-label">Vyloucenych</span><span class="summary-value" style="color:#999">' + totalRej + '</span></div>';
  if (data.deduplicated !== undefined) {
    html += '<div class="summary-row"><span class="summary-label">Po deduplikaci (v mape)</span><span class="summary-value" style="color:#1976D2">' + escapeHtml(data.deduplicated) + '</span></div>';
  }
  html += '</div>';

  // Sources card
  html += '<div class="card">';
  html += '<h2>Zdroje</h2>';
  sources.forEach(function (s) {
    var key = s.name.toLowerCase();
    // Own-property check so a name such as "constructor" cannot pick up a
    // value inherited from Object.prototype.
    var color = (Object.prototype.hasOwnProperty.call(COLORS, key) && COLORS[key]) || '#999';
    var pct = maxCount > 0 ? Math.round((s.accepted / maxCount) * 100) : 0;
    var badge = s.error
      ? '<span class="badge badge-err">chyba</span>'
      : (s.accepted === 0 ? '<span class="badge badge-skip">0</span>' : '<span class="badge badge-ok">OK</span>');
    html += '<div style="margin-bottom:12px;">';
    html += '<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:4px;">';
    html += '<span style="font-weight:600;font-size:14px;">' + escapeHtml(s.name) + ' ' + badge + '</span>';
    html += '<span style="font-size:12px;color:#999;">' + s.rejected + ' vyloucenych</span>';
    html += '</div>';
    html += '<div class="bar-row">';
    html += '<div class="bar-track"><div class="bar-fill" style="width:' + pct + '%;background:' + color + ';"></div></div>';
    html += '<span class="bar-count">' + s.accepted + '</span>';
    html += '</div>';
    html += '</div>';
  });
  html += '</div>';

  content.innerHTML = html;
}
/**
 * Fetch status.json and hand the parsed payload to render().
 *
 * The current time is appended as a query parameter so each request uses a
 * distinct URL. A non-OK HTTP response, a JSON parse failure, or an exception
 * thrown by render() all end in the same "status not available" message
 * written into #content.
 */
function loadStatus() {
  var url = 'status.json?t=' + Date.now();

  // Reject on HTTP errors; the status code becomes the error message.
  function parseBody(response) {
    if (!response.ok) throw new Error(response.status);
    return response.json();
  }

  function showUnavailable(err) {
    document.getElementById('content').innerHTML =
      '<div class="error-msg">Status zatim neni k dispozici.<br><small>(' + err.message + ')</small></div>';
  }

  fetch(url)
    .then(parseBody)
    .then(render)
    .catch(showUnavailable);
}
loadStatus();
</script>
</body>
</html>