6 Commits

Author SHA1 Message Date
7d3021efbf Remove tracked generated/data files and fix map link on status page
- Remove byty_*.json, mapa_bytu.html, .DS_Store and settings.local.json from git tracking
  (already in .gitignore, files kept locally)
- Fix "Otevřít mapu" link on scraper status page: / → /mapa_bytu.html

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 18:50:16 +01:00
23d208a5b7 Merge pull request 'Add scraper status collection and presentation' (#3) from add-scraper-statuses into main
Reviewed-on: #3
2026-02-26 09:04:23 +00:00
Jan Novak
00c9144010 Fix DATA_DIR usage in stats/history paths, set env in Dockerfile, add validation docs
All checks were successful
Build and Push / build (push) Successful in 5s
- scraper_stats.py: respect DATA_DIR env var when writing stats_*.json files
- generate_status.py: read stats files and write history from DATA_DIR instead of HERE
- build/Dockerfile: set DATA_DIR=/app/data as default env var
- docs/validation.md: end-to-end Docker validation recipe

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-26 09:46:16 +01:00
Jan Novak
44c02b45b4 Increase history retention to 20, run scrapers every 4 hours
All checks were successful
Build and Push / build (push) Successful in 7s
- generate_status.py: raise --keep default from 5 to 20 entries
- build/crontab: change schedule from 06:00/18:00 to every 4 hours (*/4)
  covers 6 runs/day ≈ 3.3 days of history at default retention

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-26 08:53:27 +01:00
Jan Novak
5fb3b984b6 Add status dashboard, server, scraper stats, and DATA_DIR support
All checks were successful
Build and Push / build (push) Successful in 7s
Key changes:
- Replace ratings_server.py + status.html with a unified server.py that
  serves the map, scraper status dashboard, and ratings API in one process
- Add scraper_stats.py utility: each scraper writes per-run stats (fetched,
  accepted, excluded, duration) to stats_<source>.json for the status page
- generate_status.py: respect DATA_DIR env var so status.json lands in the
  configured data directory instead of always the project root
- run_all.sh: replace the {"status":"running"} overwrite of status.json with
  a dedicated scraper_running.json lock file; trap on EXIT ensures cleanup
  even on kill/error, preventing the previous run's results from being wiped
- server.py: detect running state via scraper_running.json existence instead
  of status["status"] field, eliminating the dual-use race condition
- Makefile: add serve (local dev), debug (Docker debug container) targets;
  add SERVER_PORT variable
- build/Dockerfile + entrypoint.sh: switch to server.py, set DATA_DIR,
  adjust volume mounts
- .gitignore: add *.json and *.log to keep runtime data files out of VCS
- mapa_bytu.html: price-per-m² colouring, status link, UX tweaks

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-26 00:30:25 +01:00
6f49533c94 Merge pull request 'Rewrite PSN + CityHome scrapers, add price/m² map coloring, ratings system, and status dashboard' (#2) from ui-tweaks/2026-02-17 into main
Reviewed-on: #2
2026-02-25 21:26:51 +00:00
28 changed files with 958 additions and 32747 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@@ -1,31 +0,0 @@
{
"permissions": {
"allow": [
"WebFetch(domain:github.com)",
"WebFetch(domain:www.sreality.cz)",
"WebFetch(domain:webscraping.pro)",
"WebFetch(domain:raw.githubusercontent.com)",
"Bash(python3:*)",
"Bash(open:*)",
"WebFetch(domain:www.realingo.cz)",
"WebFetch(domain:api.realingo.cz)",
"Bash(curl:*)",
"Bash(grep:*)",
"WebFetch(domain:www.realitni-pes.cz)",
"WebFetch(domain:www.bezrealitky.cz)",
"WebFetch(domain:apify.com)",
"WebFetch(domain:www.bezrealitky.com)",
"WebFetch(domain:reality.idnes.cz)",
"Bash(# Final checks: robots.txt and response time for rate limiting clues curl -s -L -H \"\"User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/120.0.0.0 Safari/537.36\"\" \"\"https://reality.idnes.cz/robots.txt\"\")",
"WebFetch(domain:www.cityhome.cz)",
"WebFetch(domain:www.psn.cz)",
"WebFetch(domain:www.city-home.cz)",
"WebFetch(domain:psn.cz)",
"WebFetch(domain:api.psn.cz)",
"Bash(done)",
"Bash(# Final summary: count total units across all projects\n# Get the total count from the unitsCountData we already extracted\necho \"\"From unitsCountData on /prodej page:\"\"\necho \"\" type_id 0 \\(Prodej bytů a ateliérů\\): 146\"\"\necho \"\" type_id 1 \\(Prodej komerčních nemovitostí\\): 14\"\"\necho \"\" type_id 2 \\(Pronájem bytů\\): 3\"\"\necho \"\" type_id 3 \\(Pronájem komerčních nemovitostí\\): 48\"\"\necho \"\"\"\"\necho \"\"Total for-sale projects: 19\"\"\necho \"\"\"\"\necho \"\"Disposition counts from the data:\"\"\npython3 << 'PYEOF'\n# Extract disposition counts from prodej page\nimport re\n\nwith open\\('/tmp/psn_prodej_p1.html', 'r', encoding='utf-8'\\) as f:\n html = f.read\\(\\)\n\n# Find disposition data\nidx = html.find\\('\\\\\\\\\"disposition\\\\\\\\\":['\\)\nif idx >= 0:\n chunk = html[idx:idx+2000].replace\\('\\\\\\\\\"', '\"'\\)\n # Extract name and count pairs\n import re\n pairs = re.findall\\(r'\"name\":\"\\([^\"]+\\)\",\"count\":\\(\\\\d+\\)', chunk\\)\n for name, count in pairs:\n print\\(f\" {name}: {count}\"\\)\nPYEOF)",
"Bash(ls:*)",
"Bash(chmod:*)"
]
}
}

5
.gitignore vendored
View File

@@ -1,3 +1,8 @@
.vscode/ .vscode/
__pycache__/ __pycache__/
.DS_Store
byty_*.json byty_*.json
*.json
*.log
mapa_bytu.html

View File

@@ -3,9 +3,13 @@ CONTAINER_NAME := maru-hleda-byt
VOLUME_NAME := maru-hleda-byt-data VOLUME_NAME := maru-hleda-byt-data
VALIDATION_CONTAINER := maru-hleda-byt-validation VALIDATION_CONTAINER := maru-hleda-byt-validation
VALIDATION_VOLUME := maru-hleda-byt-validation-data VALIDATION_VOLUME := maru-hleda-byt-validation-data
DEBUG_CONTAINER := maru-hleda-byt-debug
DEBUG_VOLUME := maru-hleda-byt-debug-data
DEBUG_PORT ?= 8082
PORT := 8080 PORT := 8080
SERVER_PORT ?= 8080
.PHONY: build run stop logs scrape restart clean help validation validation-local validation-stop validation-local-debug .PHONY: build run stop logs scrape restart clean help serve validation validation-local validation-stop validation-local-debug debug debug-stop
help: help:
@echo "Available targets:" @echo "Available targets:"
@@ -20,6 +24,9 @@ help:
@echo " validation-local-debug - Run validation locally with DEBUG logging" @echo " validation-local-debug - Run validation locally with DEBUG logging"
@echo " restart - Restart the container (stop and run again)" @echo " restart - Restart the container (stop and run again)"
@echo " clean - Stop container and remove the Docker image" @echo " clean - Stop container and remove the Docker image"
@echo " serve - Start server.py locally on port 8080"
@echo " debug - Build and run debug Docker container with limited scrape (port $(DEBUG_PORT))"
@echo " debug-stop - Stop and remove the debug Docker container"
@echo " help - Show this help message" @echo " help - Show this help message"
build: build:
@@ -59,6 +66,27 @@ validation-stop:
@docker rm $(VALIDATION_CONTAINER) 2>/dev/null || true @docker rm $(VALIDATION_CONTAINER) 2>/dev/null || true
@echo "Validation container stopped and removed" @echo "Validation container stopped and removed"
debug: build
@docker stop $(DEBUG_CONTAINER) 2>/dev/null || true
@docker rm $(DEBUG_CONTAINER) 2>/dev/null || true
docker run -d --name $(DEBUG_CONTAINER) \
-p $(DEBUG_PORT):8080 \
-v $(DEBUG_VOLUME):/app/data \
-e LOG_LEVEL=DEBUG \
$(IMAGE_NAME)
@sleep 2
docker exec $(DEBUG_CONTAINER) bash /app/run_all.sh --max-pages 1 --max-properties 10
@echo "Debug app at http://localhost:$(DEBUG_PORT)/mapa_bytu.html"
@echo "Debug status at http://localhost:$(DEBUG_PORT)/scrapers-status"
debug-stop:
@docker stop $(DEBUG_CONTAINER) 2>/dev/null || true
@docker rm $(DEBUG_CONTAINER) 2>/dev/null || true
@echo "Debug container stopped and removed"
serve:
DATA_DIR=. SERVER_PORT=$(SERVER_PORT) python3 server.py
validation-local: validation-local:
./run_all.sh --max-pages 1 --max-properties 10 ./run_all.sh --max-pages 1 --max-properties 10

View File

@@ -5,12 +5,14 @@ RUN apk add --no-cache curl bash tzdata \
&& echo "Europe/Prague" > /etc/timezone && echo "Europe/Prague" > /etc/timezone
ENV PYTHONUNBUFFERED=1 ENV PYTHONUNBUFFERED=1
ENV DATA_DIR=/app/data
WORKDIR /app WORKDIR /app
COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \ COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \
scrape_idnes.py scrape_psn.py scrape_cityhome.py \ scrape_idnes.py scrape_psn.py scrape_cityhome.py \
merge_and_map.py regen_map.py run_all.sh ratings_server.py ./ merge_and_map.py regen_map.py generate_status.py scraper_stats.py \
run_all.sh server.py ./
COPY build/crontab /etc/crontabs/root COPY build/crontab /etc/crontabs/root
COPY build/entrypoint.sh /entrypoint.sh COPY build/entrypoint.sh /entrypoint.sh
@@ -18,7 +20,7 @@ RUN chmod +x /entrypoint.sh run_all.sh
RUN mkdir -p /app/data RUN mkdir -p /app/data
EXPOSE 8080 8081 EXPOSE 8080
HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \ HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \
CMD wget -q -O /dev/null http://localhost:8080/ || exit 1 CMD wget -q -O /dev/null http://localhost:8080/ || exit 1

View File

@@ -1 +1 @@
0 6,18 * * * cd /app && bash /app/run_all.sh >> /proc/1/fd/1 2>> /proc/1/fd/2 0 */4 * * * cd /app && bash /app/run_all.sh >> /proc/1/fd/1 2>> /proc/1/fd/2

View File

@@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
set -euo pipefail set -euo pipefail
DATA_DIR="/app/data" export DATA_DIR="/app/data"
# Create symlinks so scripts (which write to /app/) persist data to the volume # Create symlinks so scripts (which write to /app/) persist data to the volume
for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \ for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \
@@ -18,8 +18,5 @@ crond -b -l 2
echo "[entrypoint] Starting initial scrape in background..." echo "[entrypoint] Starting initial scrape in background..."
bash /app/run_all.sh & bash /app/run_all.sh &
echo "[entrypoint] Starting ratings API server on port 8081..." echo "[entrypoint] Starting server on port 8080..."
DATA_DIR="$DATA_DIR" python3 /app/ratings_server.py & exec python3 /app/server.py
echo "[entrypoint] Starting HTTP server on port 8080..."
exec python3 -m http.server 8080 --directory "$DATA_DIR"

View File

@@ -1,427 +0,0 @@
[
{
"hash_id": 990183,
"name": "Prodej bytu 3+kk 86 m²",
"price": 10385000,
"price_formatted": "10 385 000 Kč",
"locality": "Ke Tvrzi, Praha - Královice",
"lat": 50.0390519,
"lon": 14.63862,
"disposition": "3+kk",
"floor": 2,
"area": 86,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/990183-nabidka-prodej-bytu-ke-tvrzi-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 989862,
"name": "Prodej bytu 3+kk 73 m²",
"price": 12790000,
"price_formatted": "12 790 000 Kč",
"locality": "Vrázova, Praha - Smíchov",
"lat": 50.0711312,
"lon": 14.4076652,
"disposition": "3+kk",
"floor": 3,
"area": 73,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/989862-nabidka-prodej-bytu-vrazova-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 981278,
"name": "Prodej bytu 3+kk 70 m²",
"price": 11890000,
"price_formatted": "11 890 000 Kč",
"locality": "Argentinská, Praha - Holešovice",
"lat": 50.1026043,
"lon": 14.4435365,
"disposition": "3+kk",
"floor": 3,
"area": 70,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/981278-nabidka-prodej-bytu-argentinska-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 989817,
"name": "Prodej bytu 3+kk 88 m²",
"price": 13490000,
"price_formatted": "13 490 000 Kč",
"locality": "Miroslava Hajna, Praha - Letňany",
"lat": 50.1406487,
"lon": 14.5207541,
"disposition": "3+kk",
"floor": 2,
"area": 88,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/989817-nabidka-prodej-bytu-miroslava-hajna-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 970257,
"name": "Prodej bytu 3+1 106 m²",
"price": 12950000,
"price_formatted": "12 950 000 Kč",
"locality": "Novákových, Praha - Libeň",
"lat": 50.1034771,
"lon": 14.4758735,
"disposition": "3+1",
"floor": 5,
"area": 106,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/970257-nabidka-prodej-bytu-novakovych-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 972406,
"name": "Prodej bytu 3+kk 83 m²",
"price": 10490000,
"price_formatted": "10 490 000 Kč",
"locality": "Na Výrovně, Praha - Stodůlky",
"lat": 50.0396067,
"lon": 14.3167022,
"disposition": "3+kk",
"floor": 2,
"area": 83,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/972406-nabidka-prodej-bytu-na-vyrovne",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 967142,
"name": "Prodej bytu 3+kk 78 m²",
"price": 11648000,
"price_formatted": "11 648 000 Kč",
"locality": "Na Míčánkách, Praha - Vršovice",
"lat": 50.0713284,
"lon": 14.4638722,
"disposition": "3+kk",
"floor": 6,
"area": 78,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/967142-nabidka-prodej-bytu-na-micankach",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 955977,
"name": "Prodej bytu 4+kk 75 m²",
"price": 10363000,
"price_formatted": "10 363 000 Kč",
"locality": "Karla Guta, Praha - Uhříněves",
"lat": 50.03017,
"lon": 14.5940072,
"disposition": "4+kk",
"floor": 4,
"area": 75,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/955977-nabidka-prodej-bytu-karla-guta",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 974557,
"name": "Prodej bytu 4+kk 94 m²",
"price": 13499900,
"price_formatted": "13 499 900 Kč",
"locality": "V Dolině, Praha - Michle",
"lat": 50.0579963,
"lon": 14.4682887,
"disposition": "4+kk",
"floor": 8,
"area": 94,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/974557-nabidka-prodej-bytu-v-doline-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 988498,
"name": "Prodej bytu 3+1 75 m²",
"price": 11400000,
"price_formatted": "11 400 000 Kč",
"locality": "5. května, Praha - Nusle",
"lat": 50.0604096,
"lon": 14.4326302,
"disposition": "3+1",
"floor": 4,
"area": 75,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/988498-nabidka-prodej-bytu-5-kvetna-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 985285,
"name": "Prodej bytu 3+kk 70 m²",
"price": 12200000,
"price_formatted": "12 200 000 Kč",
"locality": "Klausova, Praha - Stodůlky",
"lat": 50.0370204,
"lon": 14.3432643,
"disposition": "3+kk",
"floor": 5,
"area": 70,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/985285-nabidka-prodej-bytu-klausova-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 965526,
"name": "Prodej bytu 3+kk 77 m²",
"price": 11890000,
"price_formatted": "11 890 000 Kč",
"locality": "Vinohradská, Praha - Strašnice",
"lat": 50.0776726,
"lon": 14.4870072,
"disposition": "3+kk",
"floor": 16,
"area": 77,
"building_type": "Smíšená",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/965526-nabidka-prodej-bytu-vinohradska-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 924811,
"name": "Prodej bytu 3+kk 75 m²",
"price": 13390000,
"price_formatted": "13 390 000 Kč",
"locality": "Waltariho, Praha - Hloubětín",
"lat": 50.1076717,
"lon": 14.5248559,
"disposition": "3+kk",
"floor": 4,
"area": 75,
"building_type": "Smíšená",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/924811-nabidka-prodej-bytu-waltariho-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 985859,
"name": "Prodej bytu 3+1 80 m²",
"price": 9000000,
"price_formatted": "9 000 000 Kč",
"locality": "Staňkova, Praha - Háje",
"lat": 50.0377128,
"lon": 14.5311557,
"disposition": "3+1",
"floor": 2,
"area": 80,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/985859-nabidka-prodej-bytu-stankova-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 985583,
"name": "Prodej bytu 3+kk 76 m²",
"price": 10850000,
"price_formatted": "10 850 000 Kč",
"locality": "Boloňská, Praha - Horní Měcholupy",
"lat": 50.047328,
"lon": 14.5565277,
"disposition": "3+kk",
"floor": 4,
"area": 76,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/985583-nabidka-prodej-bytu-bolonska-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 981178,
"name": "Prodej bytu 4+kk 86 m²",
"price": 11990000,
"price_formatted": "11 990 000 Kč",
"locality": "Sušilova, Praha - Uhříněves",
"lat": 50.032081,
"lon": 14.5885148,
"disposition": "4+kk",
"floor": 2,
"area": 86,
"building_type": "SKELET",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/981178-nabidka-prodej-bytu-susilova-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 973216,
"name": "Prodej bytu 4+1 82 m²",
"price": 11357000,
"price_formatted": "11 357 000 Kč",
"locality": "Nad Kapličkou, Praha - Strašnice",
"lat": 50.0839509,
"lon": 14.4904493,
"disposition": "4+1",
"floor": 2,
"area": 82,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/973216-nabidka-prodej-bytu-nad-kaplickou-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 868801,
"name": "Prodej bytu 3+kk 109 m²",
"price": 7299000,
"price_formatted": "7 299 000 Kč",
"locality": "Pod Karlovem, Praha - Vinohrady",
"lat": 50.0676313,
"lon": 14.432498,
"disposition": "3+kk",
"floor": 5,
"area": 109,
"building_type": "Cihlová",
"ownership": "Družstevní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/868801-nabidka-prodej-bytu-pod-karlovem-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 868795,
"name": "Prodej bytu 3+kk 106 m²",
"price": 6299000,
"price_formatted": "6 299 000 Kč",
"locality": "Pod Karlovem, Praha - Vinohrady",
"lat": 50.0676313,
"lon": 14.432498,
"disposition": "3+kk",
"floor": 2,
"area": 106,
"building_type": "Cihlová",
"ownership": "Družstevní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/868795-nabidka-prodej-bytu-pod-karlovem-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 981890,
"name": "Prodej bytu 3+1 84 m²",
"price": 12980000,
"price_formatted": "12 980 000 Kč",
"locality": "Novákových, Praha - Libeň",
"lat": 50.103273,
"lon": 14.4746894,
"disposition": "3+1",
"floor": 2,
"area": 84,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/981890-nabidka-prodej-bytu-novakovych-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 976276,
"name": "Prodej bytu 3+kk 75 m²",
"price": 13490000,
"price_formatted": "13 490 000 Kč",
"locality": "Svornosti, Praha - Smíchov",
"lat": 50.0673284,
"lon": 14.4095087,
"disposition": "3+kk",
"floor": 2,
"area": 75,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/976276-nabidka-prodej-bytu-svornosti-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 950787,
"name": "Prodej bytu 3+kk 70 m²",
"price": 9999000,
"price_formatted": "9 999 000 Kč",
"locality": "Sečská, Praha - Strašnice",
"lat": 50.071191,
"lon": 14.5035501,
"disposition": "3+kk",
"floor": 3,
"area": 70,
"building_type": "Smíšená",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/950787-nabidka-prodej-bytu-secska-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 978045,
"name": "Prodej bytu 3+kk 76 m²",
"price": 11133000,
"price_formatted": "11 133 000 Kč",
"locality": "K Vinoři, Praha - Kbely",
"lat": 50.1329656,
"lon": 14.5618499,
"disposition": "3+kk",
"floor": 2,
"area": 76,
"building_type": "Smíšená",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/978045-nabidka-prodej-bytu-k-vinori",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 974552,
"name": "Prodej bytu 3+1 75 m²",
"price": 11000000,
"price_formatted": "11 000 000 Kč",
"locality": "Vejražkova, Praha - Košíře",
"lat": 50.0637808,
"lon": 14.3612275,
"disposition": "3+1",
"floor": 2,
"area": 75,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/974552-nabidka-prodej-bytu-vejrazkova-praha",
"source": "bezrealitky",
"image": ""
},
{
"hash_id": 955010,
"name": "Prodej bytu 3+kk 70 m²",
"price": 12290000,
"price_formatted": "12 290 000 Kč",
"locality": "Břeclavská, Praha - Kyje",
"lat": 50.0951045,
"lon": 14.5454237,
"disposition": "3+kk",
"floor": 2,
"area": 70,
"building_type": "Cihlová",
"ownership": "Osobní",
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/955010-nabidka-prodej-bytu-breclavska-hlavni-mesto-praha",
"source": "bezrealitky",
"image": ""
}
]

View File

@@ -1 +0,0 @@
[]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1 +0,0 @@
[]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

123
docs/validation.md Normal file
View File

@@ -0,0 +1,123 @@
# Validation Recipe
End-to-end check that scraping, data persistence, history, and the status page all work correctly in Docker.
## What it verifies
- All scrapers run and write output to `DATA_DIR` (`/app/data`)
- `stats_*.json` land in `/app/data/` (not in `/app/`)
- `status.json` and `scraper_history.json` land in `/app/data/`
- `/api/status`, `/api/status/history`, and `/scrapers-status` serve correct data
- History accumulates across runs
## Steps
### 1. Build the image
```bash
make build
```
### 2. Start a clean validation container
```bash
# Stop/remove any leftover container and volume from a previous run
docker stop maru-hleda-byt-validation 2>/dev/null; docker rm maru-hleda-byt-validation 2>/dev/null
docker volume rm maru-hleda-byt-validation-data 2>/dev/null
docker run -d --name maru-hleda-byt-validation \
-p 8081:8080 \
-v maru-hleda-byt-validation-data:/app/data \
maru-hleda-byt
```
Give the container ~3 seconds to start. The entrypoint launches a background full scrape automatically — suppress it so only controlled runs execute:
```bash
sleep 3
docker exec maru-hleda-byt-validation pkill -f run_all.sh 2>/dev/null || true
docker exec maru-hleda-byt-validation rm -f /app/data/scraper_running.json 2>/dev/null || true
```
### 3. Run a limited scrape (run 1)
```bash
docker exec maru-hleda-byt-validation bash /app/run_all.sh --max-pages 1 --max-properties 10
```
Expected output (last few lines):
```
Status uložen: /app/data/status.json
Historie uložena: /app/data/scraper_history.json (1 záznamů)
```
### 4. Verify data files are in `/app/data/`
```bash
docker exec maru-hleda-byt-validation ls /app/data/
```
Expected files:
```
byty_cityhome.json byty_idnes.json byty_merged.json
byty_realingo.json byty_sreality.json
mapa_bytu.html
scraper_history.json
stats_bezrealitky.json stats_cityhome.json stats_idnes.json
stats_realingo.json stats_sreality.json
status.json
```
### 5. Run a second limited scrape (run 2)
```bash
docker exec maru-hleda-byt-validation bash /app/run_all.sh --max-pages 1 --max-properties 10
```
Expected last line: `Historie uložena: /app/data/scraper_history.json (2 záznamů)`
### 6. Verify history via API
```bash
curl -s http://localhost:8081/api/status/history | python3 -c "
import json, sys
h = json.load(sys.stdin)
print(f'{len(h)} entries:')
for i, e in enumerate(h):
    print(f' [{i}] {e[\"timestamp\"]} total={e[\"total_accepted\"]}')
"
```
Expected: 2 entries with different timestamps.
```bash
curl -s http://localhost:8081/api/status | python3 -c "
import json, sys; s=json.load(sys.stdin)
print(f'status={s[\"status\"]} total={s[\"total_accepted\"]} ts={s[\"timestamp\"]}')
"
```
Expected: `status=done total=<N> ts=<latest timestamp>`
### 7. Check the status page
Open http://localhost:8081/scrapers-status in a browser (or `curl -s http://localhost:8081/scrapers-status | grep -c "clickable-row"` — should print `2`).
### 8. Clean up
```bash
docker stop maru-hleda-byt-validation && docker rm maru-hleda-byt-validation
docker volume rm maru-hleda-byt-validation-data
```
Or use the Makefile shortcut:
```bash
make validation-stop
```
## Notes
- PSN scraper does not support `--max-pages` and will always fail with this command; `success=False` in history is expected during validation.
- Bezrealitky may return 0 results with a 1-page limit; `byty_bezrealitky.json` will be absent from `/app/data/` in that case — this is normal.
- `make validation` (the Makefile target) runs the same limited scrape but does not suppress the background startup scrape, so two concurrent runs may occur. Use the manual steps above for a clean controlled test.

View File

@@ -1,16 +1,15 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""Generate status.json from scraper JSON outputs and run log.""" """Generate status.json from scraper JSON outputs and per-scraper stats files."""
from __future__ import annotations from __future__ import annotations
import argparse
import json import json
import os import os
import re
import sys
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import Optional
HERE = Path(__file__).parent HERE = Path(__file__).parent
DATA_DIR = Path(os.environ.get("DATA_DIR", HERE))
SOURCE_FILES = { SOURCE_FILES = {
"Sreality": "byty_sreality.json", "Sreality": "byty_sreality.json",
@@ -21,7 +20,17 @@ SOURCE_FILES = {
"CityHome": "byty_cityhome.json", "CityHome": "byty_cityhome.json",
} }
STATS_FILES = {
"Sreality": "stats_sreality.json",
"Realingo": "stats_realingo.json",
"Bezrealitky": "stats_bezrealitky.json",
"iDNES": "stats_idnes.json",
"PSN": "stats_psn.json",
"CityHome": "stats_cityhome.json",
}
MERGED_FILE = "byty_merged.json" MERGED_FILE = "byty_merged.json"
HISTORY_FILE = "scraper_history.json"
def count_source(path: Path) -> dict: def count_source(path: Path) -> dict:
@@ -36,105 +45,51 @@ def count_source(path: Path) -> dict:
return {"accepted": 0, "error": str(e)} return {"accepted": 0, "error": str(e)}
def parse_log(log_path: str) -> dict[str, dict]: def read_scraper_stats(path: Path) -> dict:
"""Parse scraper run log and extract per-source statistics. """Load a per-scraper stats JSON. Returns {} on missing or corrupt file."""
if not path.exists():
Scrapers log summary lines like: return {}
✓ Vyhovující byty: 12 try:
Vyloučeno (prodáno): 5 data = json.loads(path.read_text(encoding="utf-8"))
Staženo stránek: 3 return data if isinstance(data, dict) else {}
Staženo inzerátů: 48 except Exception:
Celkem bytů v cache: 120
and section headers like:
[2/6] Realingo
"""
if not log_path or not os.path.exists(log_path):
return {} return {}
with open(log_path, encoding="utf-8") as f:
content = f.read()
# Split into per-source sections by the [N/6] Step header def append_to_history(status: dict, keep: int) -> None:
# Each section header looks like "[2/6] Realingo\n----..." """Append the current status entry to scraper_history.json, keeping only `keep` latest."""
section_pattern = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE) history_path = DATA_DIR / HISTORY_FILE
sections_found = list(section_pattern.finditer(content)) history: list = []
if history_path.exists():
try:
history = json.loads(history_path.read_text(encoding="utf-8"))
if not isinstance(history, list):
history = []
except Exception:
history = []
if not sections_found: history.append(status)
return {}
stats = {} # Keep only the N most recent entries
for i, match in enumerate(sections_found): if keep > 0 and len(history) > keep:
step_name = match.group(2).strip() history = history[-keep:]
start = match.end()
end = sections_found[i + 1].start() if i + 1 < len(sections_found) else len(content)
section_text = content[start:end]
# Identify which sources this section covers history_path.write_text(json.dumps(history, ensure_ascii=False, indent=2), encoding="utf-8")
# "PSN + CityHome" covers both print(f"Historie uložena: {history_path} ({len(history)} záznamů)")
source_names = []
for name in SOURCE_FILES:
if name.lower() in step_name.lower():
source_names.append(name)
if not source_names:
continue
# Parse numeric summary lines
def extract(pattern: str) -> Optional[int]:
m = re.search(pattern, section_text)
return int(m.group(1)) if m else None
# Lines present in all/most scrapers
accepted = extract(r'Vyhovující byty[:\s]+(\d+)')
fetched = extract(r'Staženo inzerátů[:\s]+(\d+)')
pages = extract(r'Staženo stránek[:\s]+(\d+)')
cached = extract(r'Celkem bytů v cache[:\s]+(\d+)')
cache_hits = extract(r'Cache hit[:\s]+(\d+)')
# Rejection reasons — collect all into a dict
excluded = {}
for m in re.finditer(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)', section_text):
excluded[m.group(1)] = int(m.group(2))
# Also PSN-style "Vyloučeno (prodáno): N"
total_excluded = sum(excluded.values()) if excluded else extract(r'Vyloučen\w*[:\s]+(\d+)')
entry = {}
if accepted is not None:
entry["accepted"] = accepted
if fetched is not None:
entry["fetched"] = fetched
if pages is not None:
entry["pages"] = pages
if cached is not None:
entry["cached"] = cached
if cache_hits is not None:
entry["cache_hits"] = cache_hits
if excluded:
entry["excluded"] = excluded
elif total_excluded is not None:
entry["excluded_total"] = total_excluded
for name in source_names:
stats[name] = entry
return stats
def main(): def main():
start_time = None parser = argparse.ArgumentParser(description="Generate status.json from scraper outputs.")
duration_sec = None parser.add_argument("--start-time", dest="start_time", default=None,
help="ISO timestamp of scrape start (default: now)")
parser.add_argument("--duration", dest="duration", type=int, default=None,
help="Run duration in seconds")
parser.add_argument("--keep", dest="keep", type=int, default=20,
help="Number of history entries to keep (default: 20, 0=unlimited)")
args = parser.parse_args()
if len(sys.argv) >= 3: start_time = args.start_time or datetime.now().isoformat(timespec="seconds")
start_time = sys.argv[1] duration_sec = args.duration
try:
duration_sec = int(sys.argv[2])
except ValueError:
pass
if not start_time:
start_time = datetime.now().isoformat(timespec="seconds")
log_path = sys.argv[3] if len(sys.argv) >= 4 else None
log_stats = parse_log(log_path)
sources = [] sources = []
for name, filename in SOURCE_FILES.items(): for name, filename in SOURCE_FILES.items():
@@ -142,14 +97,12 @@ def main():
info = count_source(path) info = count_source(path)
info["name"] = name info["name"] = name
# Merge log stats # Merge in stats from the per-scraper stats file (authoritative for run data)
ls = log_stats.get(name, {}) stats = read_scraper_stats(DATA_DIR / STATS_FILES[name])
for k in ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total"): for key in ("accepted", "fetched", "pages", "cache_hits", "excluded", "excluded_total",
if k in ls: "success", "duration_sec", "error"):
info[k] = ls[k] if key in stats:
# Override accepted from log if available (log is authoritative for latest run) info[key] = stats[key]
if "accepted" in ls:
info["accepted"] = ls["accepted"]
sources.append(info) sources.append(info)
@@ -168,17 +121,21 @@ def main():
duplicates_removed = total_accepted - deduplicated if deduplicated else 0 duplicates_removed = total_accepted - deduplicated if deduplicated else 0
# Top-level success: True if no source has an error
success = not any("error" in s for s in sources)
status = { status = {
"status": "done", "status": "done",
"timestamp": start_time, "timestamp": start_time,
"duration_sec": duration_sec, "duration_sec": duration_sec,
"success": success,
"total_accepted": total_accepted, "total_accepted": total_accepted,
"deduplicated": deduplicated, "deduplicated": deduplicated,
"duplicates_removed": duplicates_removed, "duplicates_removed": duplicates_removed,
"sources": sources, "sources": sources,
} }
out = HERE / "status.json" out = DATA_DIR / "status.json"
out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8") out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"Status uložen: {out}") print(f"Status uložen: {out}")
print(f" Celkem bytů (před dedup): {total_accepted}") print(f" Celkem bytů (před dedup): {total_accepted}")
@@ -197,6 +154,8 @@ def main():
parts.append(f"[CHYBA: {err}]") parts.append(f"[CHYBA: {err}]")
print(" " + " ".join(parts)) print(" " + " ".join(parts))
append_to_history(status, args.keep)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

File diff suppressed because it is too large Load Diff

View File

@@ -1,116 +0,0 @@
#!/usr/bin/env python3
"""
Minimal HTTP API server for persisting apartment ratings.
GET /api/ratings → returns ratings.json contents
POST /api/ratings → saves entire ratings object
GET /api/ratings/export → same as GET, but with download header
Ratings file: /app/data/ratings.json (or ./ratings.json locally)
"""
import json
import logging
import os
import sys
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
PORT = int(os.environ.get("RATINGS_PORT", 8081))
DATA_DIR = Path(os.environ.get("DATA_DIR", "."))
RATINGS_FILE = DATA_DIR / "ratings.json"
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [ratings] %(levelname)s %(message)s",
datefmt="%Y-%m-%dT%H:%M:%S",
)
log = logging.getLogger(__name__)
def load_ratings() -> dict:
    """Return the parsed contents of RATINGS_FILE.

    Falls back to an empty dict when the file does not exist or when reading
    or parsing it raises; the failure is logged and never propagated.
    """
    stored = {}
    try:
        if RATINGS_FILE.exists():
            raw_text = RATINGS_FILE.read_text(encoding="utf-8")
            stored = json.loads(raw_text)
    except Exception as exc:
        log.error("Failed to load ratings: %s", exc)
    return stored
def save_ratings(data: dict) -> None:
    """Persist ``data`` to RATINGS_FILE as indented UTF-8 JSON.

    The payload is serialised first, written to a sibling temporary file and
    then moved over the target with os.replace(), so an interrupted write
    cannot leave a truncated ratings file behind.

    Raises:
        TypeError, ValueError: if ``data`` cannot be serialised to JSON.
        OSError: if the temporary file cannot be written or moved into place.
    """
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    tmp_path = RATINGS_FILE.with_name(RATINGS_FILE.name + ".tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        os.replace(tmp_path, RATINGS_FILE)
    finally:
        # After a successful replace the temp file no longer exists; on any
        # failure remove the leftover so it does not accumulate.
        if tmp_path.exists():
            tmp_path.unlink()
class RatingsHandler(BaseHTTPRequestHandler):
    """Request handler for the ratings JSON API.

    Routes handled here:
        GET     /api/ratings         -> ratings returned by load_ratings()
        GET     /api/ratings/export  -> same payload plus a download header
        POST    /api/ratings         -> replaces the stored ratings object
        OPTIONS (any path)           -> CORS preflight, answered with 204

    Any other GET/POST path yields a JSON 404. Every response carries the
    same permissive CORS headers.
    """

    def log_message(self, format, *args):
        # Suppress default HTTP access log (we use our own)
        pass

    def _send_cors_headers(self):
        """Emit the CORS headers shared by JSON responses and preflight replies."""
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")

    def _send_json(self, status: int, body: dict, extra_headers=None):
        """Serialise ``body`` as UTF-8 JSON and send it with ``status``.

        ``extra_headers`` is an optional mapping of additional header names
        to values, sent after the standard ones.
        """
        payload = json.dumps(body, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self._send_cors_headers()
        if extra_headers:
            for k, v in extra_headers.items():
                self.send_header(k, v)
        self.end_headers()
        self.wfile.write(payload)

    def do_OPTIONS(self):
        # CORS preflight
        self.send_response(204)
        self._send_cors_headers()
        self.end_headers()

    def do_GET(self):
        """Serve the stored ratings; the export route adds a download header."""
        if self.path not in ("/api/ratings", "/api/ratings/export"):
            self._send_json(404, {"error": "not found"})
            return
        ratings = load_ratings()
        extra = None
        if self.path == "/api/ratings/export":
            extra = {"Content-Disposition": 'attachment; filename="ratings.json"'}
        # Separator added: the previous format string fused path and count.
        log.info("GET %s → %d ratings", self.path, len(ratings))
        self._send_json(200, ratings, extra)

    def do_POST(self):
        """Replace the stored ratings with the JSON object in the request body.

        Responds 400 for a bad Content-Length, an empty body, invalid JSON or
        a non-object payload, and 500 when the ratings cannot be written.
        """
        if self.path != "/api/ratings":
            self._send_json(404, {"error": "not found"})
            return

        # A non-numeric header used to raise out of the handler, and a
        # negative one made rfile.read() wait for EOF; reject both up front.
        try:
            length = int(self.headers.get("Content-Length", 0))
        except (TypeError, ValueError):
            length = -1
        if length < 0:
            self._send_json(400, {"error": "invalid Content-Length"})
            return
        if length == 0:
            self._send_json(400, {"error": "empty body"})
            return

        try:
            raw = self.rfile.read(length)
            data = json.loads(raw.decode("utf-8"))
        except Exception as e:
            log.warning("Bad request body: %s", e)
            self._send_json(400, {"error": "invalid JSON"})
            return
        if not isinstance(data, dict):
            self._send_json(400, {"error": "expected JSON object"})
            return

        # Report storage failures to the client instead of dropping the
        # connection without a response.
        try:
            save_ratings(data)
        except OSError as e:
            log.error("Failed to save ratings: %s", e)
            self._send_json(500, {"error": "failed to save ratings"})
            return
        log.info("POST /api/ratings → saved %d ratings", len(data))
        self._send_json(200, {"ok": True, "count": len(data)})
if __name__ == "__main__":
    # Entry point: log the effective configuration, then serve until interrupted.
    log.info("Ratings server starting on port %d, data dir: %s", PORT, DATA_DIR)
    log.info("Ratings file: %s", RATINGS_FILE)
    # Leaving the with-block calls server_close(), so the listening socket is
    # released explicitly even when serve_forever() is interrupted.
    with HTTPServer(("0.0.0.0", PORT), RatingsHandler) as server:
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            log.info("Stopped.")
            sys.exit(0)

View File

@@ -20,8 +20,10 @@ START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S")
START_EPOCH=$(date +%s) START_EPOCH=$(date +%s)
LOG_FILE="$(pwd)/scrape_run.log" LOG_FILE="$(pwd)/scrape_run.log"
# Mark status as running # Mark scraper as running; cleaned up on exit (even on error/kill)
echo '{"status":"running"}' > status.json LOCK_FILE="${DATA_DIR:-.}/scraper_running.json"
echo '{"running":true,"started_at":"'"$START_TIME"'"}' > "$LOCK_FILE"
trap 'rm -f "$LOCK_FILE"' EXIT
show_help() { show_help() {
echo "Usage: ./run_all.sh [OPTIONS]" echo "Usage: ./run_all.sh [OPTIONS]"
@@ -32,16 +34,19 @@ show_help() {
echo " --max-pages N Maximální počet stránek ke stažení z každého zdroje" echo " --max-pages N Maximální počet stránek ke stažení z každého zdroje"
echo " --max-properties N Maximální počet nemovitostí ke stažení z každého zdroje" echo " --max-properties N Maximální počet nemovitostí ke stažení z každého zdroje"
echo " --log-level LEVEL Úroveň logování (DEBUG, INFO, WARNING, ERROR)" echo " --log-level LEVEL Úroveň logování (DEBUG, INFO, WARNING, ERROR)"
# Default shown here must match generate_status.py's --keep default (20).
echo " --keep N Počet běhů v historii (výchozí: 20, 0=neomezeno)"
echo " -h, --help Zobrazí tuto nápovědu" echo " -h, --help Zobrazí tuto nápovědu"
echo "" echo ""
echo "Examples:" echo "Examples:"
echo " ./run_all.sh # plný běh" echo " ./run_all.sh # plný běh"
echo " ./run_all.sh --max-pages 1 --max-properties 10 # rychlý test" echo " ./run_all.sh --max-pages 1 --max-properties 10 # rychlý test"
echo " ./run_all.sh --log-level DEBUG # s debug logováním" echo " ./run_all.sh --log-level DEBUG # s debug logováním"
echo " ./run_all.sh --keep 10 # uchovej 10 běhů v historii"
} }
# Parse arguments # Parse arguments
SCRAPER_ARGS="" SCRAPER_ARGS=""
KEEP_ARG=""
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
-h|--help) -h|--help)
@@ -52,6 +57,10 @@ while [[ $# -gt 0 ]]; do
SCRAPER_ARGS="$SCRAPER_ARGS $1 $2" SCRAPER_ARGS="$SCRAPER_ARGS $1 $2"
shift 2 shift 2
;; ;;
--keep)
KEEP_ARG="--keep $2"
shift 2
;;
*) *)
echo "Unknown argument: $1" echo "Unknown argument: $1"
echo "" echo ""
@@ -103,7 +112,7 @@ python3 merge_and_map.py || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((F
END_EPOCH=$(date +%s) END_EPOCH=$(date +%s)
DURATION=$((END_EPOCH - START_EPOCH)) DURATION=$((END_EPOCH - START_EPOCH))
python3 generate_status.py "$START_TIME" "$DURATION" "$LOG_FILE" python3 generate_status.py --start-time "$START_TIME" --duration "$DURATION" $KEEP_ARG
echo "" echo ""
echo "============================================================" echo "============================================================"

View File

@@ -15,6 +15,9 @@ import urllib.request
import urllib.parse import urllib.parse
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from scraper_stats import write_stats
STATS_FILE = "stats_sreality.json"
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -209,6 +212,8 @@ def load_cache(json_path: str = "byty_sreality.json") -> dict[int, dict]:
def scrape(max_pages: int | None = None, max_properties: int | None = None): def scrape(max_pages: int | None = None, max_properties: int | None = None):
"""Main scraping function. Returns list of filtered estates.""" """Main scraping function. Returns list of filtered estates."""
_run_start = time.time()
_run_ts = datetime.now().isoformat(timespec="seconds")
all_estates_raw = [] all_estates_raw = []
cache = load_cache() cache = load_cache()
@@ -366,6 +371,21 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}") logger.info(f"{'=' * 60}")
write_stats(STATS_FILE, {
"source": "Sreality",
"timestamp": _run_ts,
"duration_sec": round(time.time() - _run_start, 1),
"success": True,
"accepted": len(results),
"fetched": len(unique_estates),
"cache_hits": cache_hits,
"excluded": {
"panel/síd": excluded_panel,
"<69 m²": excluded_small,
"bez GPS": excluded_no_gps,
"bez detailu": excluded_no_detail,
},
})
return results return results
@@ -653,7 +673,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
Skrýt zamítnuté Skrýt zamítnuté
</label> </label>
</div> </div>
<div class="status-link"><a href="status.html">Scraper status</a></div> <div class="status-link"><a href="/scrapers-status">Scraper status</a></div>
</div> </div>
<script> <script>
@@ -1089,8 +1109,22 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()] handlers=[logging.StreamHandler()]
) )
_run_ts = datetime.now().isoformat(timespec="seconds")
start = time.time() start = time.time()
try:
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
except Exception as e:
logger.error(f"Scraper failed: {e}", exc_info=True)
write_stats(STATS_FILE, {
"source": "Sreality",
"timestamp": _run_ts,
"duration_sec": round(time.time() - start, 1),
"success": False,
"accepted": 0,
"fetched": 0,
"error": str(e),
})
raise
if estates: if estates:
# Save raw data as JSON backup # Save raw data as JSON backup

View File

@@ -15,6 +15,9 @@ import re
import time import time
import urllib.request import urllib.request
from pathlib import Path from pathlib import Path
from scraper_stats import write_stats
STATS_FILE = "stats_bezrealitky.json"
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -171,6 +174,8 @@ def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
def scrape(max_pages: int | None = None, max_properties: int | None = None): def scrape(max_pages: int | None = None, max_properties: int | None = None):
_run_start = time.time()
_run_ts = datetime.now().isoformat(timespec="seconds")
cache = load_cache() cache = load_cache()
logger.info("=" * 60) logger.info("=" * 60)
@@ -374,6 +379,25 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}") logger.info(f"{'=' * 60}")
write_stats(STATS_FILE, {
"source": "Bezrealitky",
"timestamp": _run_ts,
"duration_sec": round(time.time() - _run_start, 1),
"success": True,
"accepted": len(results),
"fetched": len(all_adverts),
"pages": page - 1,
"cache_hits": cache_hits,
"excluded": {
"dispozice": excluded_disp,
"cena": excluded_price,
"plocha": excluded_area,
"bez GPS": excluded_no_gps,
"panel/síd": excluded_panel,
"patro": excluded_floor,
"bez detailu": excluded_detail,
},
})
return results return results
@@ -394,8 +418,22 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()] handlers=[logging.StreamHandler()]
) )
_run_ts = datetime.now().isoformat(timespec="seconds")
start = time.time() start = time.time()
try:
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
except Exception as e:
logger.error(f"Scraper failed: {e}", exc_info=True)
write_stats(STATS_FILE, {
"source": "Bezrealitky",
"timestamp": _run_ts,
"duration_sec": round(time.time() - start, 1),
"success": False,
"accepted": 0,
"fetched": 0,
"error": str(e),
})
raise
if estates: if estates:
json_path = Path("byty_bezrealitky.json") json_path = Path("byty_bezrealitky.json")

View File

@@ -14,6 +14,9 @@ import time
import urllib.request import urllib.request
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from scraper_stats import write_stats
STATS_FILE = "stats_cityhome.json"
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -203,6 +206,8 @@ def extract_project_gps(html: str) -> tuple[float, float] | None:
def scrape(max_pages: int | None = None, max_properties: int | None = None): def scrape(max_pages: int | None = None, max_properties: int | None = None):
_run_start = time.time()
_run_ts = datetime.now().isoformat(timespec="seconds")
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Stahuji inzeráty z CityHome (city-home.cz)") logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
logger.info(f"Cena: do {format_price(MAX_PRICE)}") logger.info(f"Cena: do {format_price(MAX_PRICE)}")
@@ -374,6 +379,23 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}") logger.info(f"{'=' * 60}")
write_stats(STATS_FILE, {
"source": "CityHome",
"timestamp": _run_ts,
"duration_sec": round(time.time() - _run_start, 1),
"success": True,
"accepted": len(results),
"fetched": len(all_listings),
"excluded": {
"prodáno": excluded_sold,
"typ": excluded_type,
"dispozice": excluded_disp,
"cena": excluded_price,
"plocha": excluded_area,
"patro": excluded_floor,
"bez GPS": excluded_no_gps,
},
})
return results return results
@@ -394,8 +416,22 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()] handlers=[logging.StreamHandler()]
) )
_run_ts = datetime.now().isoformat(timespec="seconds")
start = time.time() start = time.time()
try:
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
except Exception as e:
logger.error(f"Scraper failed: {e}", exc_info=True)
write_stats(STATS_FILE, {
"source": "CityHome",
"timestamp": _run_ts,
"duration_sec": round(time.time() - start, 1),
"success": False,
"accepted": 0,
"fetched": 0,
"error": str(e),
})
raise
if estates: if estates:
json_path = Path("byty_cityhome.json") json_path = Path("byty_cityhome.json")

View File

@@ -17,6 +17,9 @@ import urllib.request
import urllib.parse import urllib.parse
from html.parser import HTMLParser from html.parser import HTMLParser
from pathlib import Path from pathlib import Path
from scraper_stats import write_stats
STATS_FILE = "stats_idnes.json"
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -279,6 +282,8 @@ def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
def scrape(max_pages: int | None = None, max_properties: int | None = None): def scrape(max_pages: int | None = None, max_properties: int | None = None):
_run_start = time.time()
_run_ts = datetime.now().isoformat(timespec="seconds")
cache = load_cache() cache = load_cache()
logger.info("=" * 60) logger.info("=" * 60)
@@ -478,6 +483,25 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}") logger.info(f"{'=' * 60}")
write_stats(STATS_FILE, {
"source": "iDNES",
"timestamp": _run_ts,
"duration_sec": round(time.time() - _run_start, 1),
"success": True,
"accepted": len(results),
"fetched": len(all_listings),
"pages": page,
"cache_hits": cache_hits,
"excluded": {
"cena": excluded_price,
"plocha": excluded_area,
"dispozice": excluded_disp,
"panel/síd": excluded_panel,
"patro": excluded_floor,
"bez GPS": excluded_no_gps,
"bez detailu": excluded_detail,
},
})
return results return results
@@ -498,8 +522,22 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()] handlers=[logging.StreamHandler()]
) )
_run_ts = datetime.now().isoformat(timespec="seconds")
start = time.time() start = time.time()
try:
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
except Exception as e:
logger.error(f"Scraper failed: {e}", exc_info=True)
write_stats(STATS_FILE, {
"source": "iDNES",
"timestamp": _run_ts,
"duration_sec": round(time.time() - start, 1),
"success": False,
"accepted": 0,
"fetched": 0,
"error": str(e),
})
raise
if estates: if estates:
json_path = Path("byty_idnes.json") json_path = Path("byty_idnes.json")

View File

@@ -15,6 +15,9 @@ import time
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from urllib.parse import urlencode from urllib.parse import urlencode
from scraper_stats import write_stats
STATS_FILE = "stats_psn.json"
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -67,6 +70,8 @@ def format_price(price: int) -> str:
def scrape(max_properties: int | None = None): def scrape(max_properties: int | None = None):
_run_start = time.time()
_run_ts = datetime.now().isoformat(timespec="seconds")
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Stahuji inzeráty z PSN.cz") logger.info("Stahuji inzeráty z PSN.cz")
logger.info(f"Cena: do {format_price(MAX_PRICE)}") logger.info(f"Cena: do {format_price(MAX_PRICE)}")
@@ -93,6 +98,15 @@ def scrape(max_properties: int | None = None):
data = fetch_json(url) data = fetch_json(url)
except Exception as e: except Exception as e:
logger.error(f"Chyba při stahování: {e}", exc_info=True) logger.error(f"Chyba při stahování: {e}", exc_info=True)
write_stats(STATS_FILE, {
"source": "PSN",
"timestamp": _run_ts,
"duration_sec": round(time.time() - _run_start, 1),
"success": False,
"accepted": 0,
"fetched": 0,
"error": str(e),
})
return [] return []
all_units = data.get("units", {}).get("data", []) all_units = data.get("units", {}).get("data", [])
@@ -241,6 +255,15 @@ def scrape(max_properties: int | None = None):
logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}") logger.info(f"{'=' * 60}")
write_stats(STATS_FILE, {
"source": "PSN",
"timestamp": _run_ts,
"duration_sec": round(time.time() - _run_start, 1),
"success": True,
"accepted": len(results),
"fetched": len(all_units),
"excluded": excluded,
})
return results return results
@@ -259,8 +282,22 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()] handlers=[logging.StreamHandler()]
) )
_run_ts = datetime.now().isoformat(timespec="seconds")
start = time.time() start = time.time()
try:
estates = scrape(max_properties=args.max_properties) estates = scrape(max_properties=args.max_properties)
except Exception as e:
logger.error(f"Scraper failed: {e}", exc_info=True)
write_stats(STATS_FILE, {
"source": "PSN",
"timestamp": _run_ts,
"duration_sec": round(time.time() - start, 1),
"success": False,
"accepted": 0,
"fetched": 0,
"error": str(e),
})
raise
if estates: if estates:
json_path = Path("byty_psn.json") json_path = Path("byty_psn.json")

View File

@@ -15,6 +15,9 @@ import re
import time import time
import urllib.request import urllib.request
from pathlib import Path from pathlib import Path
from scraper_stats import write_stats
STATS_FILE = "stats_realingo.json"
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -136,6 +139,8 @@ def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
def scrape(max_pages: int | None = None, max_properties: int | None = None): def scrape(max_pages: int | None = None, max_properties: int | None = None):
_run_start = time.time()
_run_ts = datetime.now().isoformat(timespec="seconds")
cache = load_cache() cache = load_cache()
logger.info("=" * 60) logger.info("=" * 60)
@@ -333,6 +338,25 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}") logger.info(f"{'=' * 60}")
write_stats(STATS_FILE, {
"source": "Realingo",
"timestamp": _run_ts,
"duration_sec": round(time.time() - _run_start, 1),
"success": True,
"accepted": len(results),
"fetched": len(all_listings),
"pages": page - 1,
"cache_hits": cache_hits,
"excluded": {
"dispozice": excluded_category,
"cena": excluded_price,
"plocha": excluded_area,
"bez GPS": excluded_no_gps,
"panel/síd": excluded_panel,
"patro": excluded_floor,
"bez detailu": excluded_detail,
},
})
return results return results
@@ -353,8 +377,22 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()] handlers=[logging.StreamHandler()]
) )
_run_ts = datetime.now().isoformat(timespec="seconds")
start = time.time() start = time.time()
try:
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
except Exception as e:
logger.error(f"Scraper failed: {e}", exc_info=True)
write_stats(STATS_FILE, {
"source": "Realingo",
"timestamp": _run_ts,
"duration_sec": round(time.time() - start, 1),
"success": False,
"accepted": 0,
"fetched": 0,
"error": str(e),
})
raise
if estates: if estates:
json_path = Path("byty_realingo.json") json_path = Path("byty_realingo.json")

15
scraper_stats.py Normal file
View File

@@ -0,0 +1,15 @@
"""Shared utility for writing per-scraper run statistics to JSON."""
from __future__ import annotations
import json
import os
from pathlib import Path
HERE = Path(__file__).parent
DATA_DIR = Path(os.environ.get("DATA_DIR", HERE))
def write_stats(filename: str, stats: dict) -> None:
    """Write scraper run stats dict to the data directory.

    Args:
        filename: name of the stats file, resolved relative to DATA_DIR.
        stats: JSON-serialisable mapping describing one scraper run.

    The target directory is created when missing, and the file is written to
    a temporary sibling and moved into place with os.replace(), so a reader
    never observes a partially written stats file.

    Raises:
        TypeError, ValueError: if ``stats`` cannot be serialised to JSON.
        OSError: if the directory or file cannot be created or written.
    """
    path = DATA_DIR / filename
    # DATA_DIR comes from the environment and may not exist yet.
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(stats, ensure_ascii=False, indent=2)
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        os.replace(tmp_path, path)
    finally:
        # Only present when the write or the replace failed.
        if tmp_path.exists():
            tmp_path.unlink()

477
server.py Normal file
View File

@@ -0,0 +1,477 @@
#!/usr/bin/env python3
"""
General-purpose HTTP server for maru-hleda-byt.
Serves static files from DATA_DIR and additionally handles:
GET /scrapers-status → SSR scraper status page
GET /api/ratings → ratings.json contents
POST /api/ratings → save entire ratings object
GET /api/ratings/export → same as GET, with download header
GET /api/status → status.json contents (JSON)
GET /api/status/history → scraper_history.json contents (JSON)
"""
from __future__ import annotations
import functools
import json
import logging
import os
import sys
from datetime import datetime
from http.server import HTTPServer, SimpleHTTPRequestHandler
from pathlib import Path
PORT = int(os.environ.get("SERVER_PORT", 8080))
DATA_DIR = Path(os.environ.get("DATA_DIR", "."))
RATINGS_FILE = DATA_DIR / "ratings.json"
_LOG_LEVEL = getattr(logging, os.environ.get("LOG_LEVEL", "INFO").upper(), logging.INFO)
logging.basicConfig(
level=_LOG_LEVEL,
format="%(asctime)s [server] %(levelname)s %(message)s",
datefmt="%Y-%m-%dT%H:%M:%S",
)
log = logging.getLogger(__name__)
# ── Helpers ──────────────────────────────────────────────────────────────────
COLORS = {
"sreality": "#1976D2",
"realingo": "#7B1FA2",
"bezrealitky": "#E65100",
"idnes": "#C62828",
"psn": "#2E7D32",
"cityhome": "#00838F",
}
MONTHS_CZ = [
"ledna", "února", "března", "dubna", "května", "června",
"července", "srpna", "září", "října", "listopadu", "prosince",
]
def _load_json(path: Path, default=None):
"""Read and parse JSON file; return default on missing or parse error."""
log.debug("_load_json: %s", path.resolve())
try:
if path.exists():
return json.loads(path.read_text(encoding="utf-8"))
except Exception as e:
log.warning("Failed to load %s: %s", path, e)
return default
def _fmt_date(iso_str: str) -> str:
"""Format ISO timestamp as Czech date string."""
try:
d = datetime.fromisoformat(iso_str)
return f"{d.day}. {MONTHS_CZ[d.month - 1]} {d.year}, {d.hour:02d}:{d.minute:02d}"
except Exception:
return iso_str
def load_ratings() -> dict:
    """Return the ratings stored in RATINGS_FILE as a dict.

    An empty dict is returned when _load_json() falls back to its default,
    and also when the file holds valid JSON that is not an object (for
    example ``null`` or a list), so callers always receive a mapping.
    """
    data = _load_json(RATINGS_FILE, default={})
    if not isinstance(data, dict):
        log.warning("Ignoring non-object ratings in %s", RATINGS_FILE)
        return {}
    return data
def save_ratings(data: dict) -> None:
    """Persist ``data`` to RATINGS_FILE as indented UTF-8 JSON.

    The payload is serialised first, written to a sibling temporary file and
    then moved over the target with os.replace(), so an interrupted write
    cannot leave a truncated ratings file behind.

    Raises:
        TypeError, ValueError: if ``data`` cannot be serialised to JSON.
        OSError: if the temporary file cannot be written or moved into place.
    """
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    tmp_path = RATINGS_FILE.with_name(RATINGS_FILE.name + ".tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        os.replace(tmp_path, RATINGS_FILE)
    finally:
        # After a successful replace the temp file no longer exists; on any
        # failure remove the leftover so it does not accumulate.
        if tmp_path.exists():
            tmp_path.unlink()
# ── SSR status page ──────────────────────────────────────────────────────────
_CSS = """\
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: system-ui, -apple-system, sans-serif;
background: #f5f5f5; color: #333;
padding: 24px; max-width: 640px; margin: 0 auto;
}
h1 { font-size: 22px; margin-bottom: 4px; }
.subtitle { color: #888; font-size: 13px; margin-bottom: 24px; }
.card {
background: white; border-radius: 12px; padding: 20px;
box-shadow: 0 1px 4px rgba(0,0,0,0.08); margin-bottom: 16px;
}
.card h2 { font-size: 15px; margin-bottom: 12px; color: #555; }
.timestamp { font-size: 28px; font-weight: 700; color: #1976D2; }
.timestamp-sub { font-size: 13px; color: #999; margin-top: 2px; }
.summary-row {
display: flex; justify-content: space-between; align-items: center;
padding: 10px 0; border-bottom: 1px solid #f0f0f0;
}
.summary-row:last-child { border-bottom: none; }
.summary-label { font-size: 13px; color: #666; }
.summary-value { font-size: 18px; font-weight: 700; }
.badge {
display: inline-block; padding: 2px 8px; border-radius: 4px;
font-size: 11px; font-weight: 600; color: white;
}
.badge-ok { background: #4CAF50; }
.badge-err { background: #F44336; }
.badge-skip { background: #FF9800; }
.bar-row { display: flex; align-items: center; gap: 8px; margin: 4px 0; }
.bar-track { flex: 1; height: 20px; background: #f0f0f0; border-radius: 4px; overflow: hidden; }
.bar-fill { height: 100%; border-radius: 4px; }
.bar-count { font-size: 12px; width: 36px; font-variant-numeric: tabular-nums; }
.loader-wrap {
display: flex; flex-direction: column; align-items: center;
justify-content: center; padding: 60px 0;
}
.spinner {
width: 40px; height: 40px; border: 4px solid #e0e0e0;
border-top-color: #1976D2; border-radius: 50%;
animation: spin 0.8s linear infinite;
}
@keyframes spin { to { transform: rotate(360deg); } }
.loader-text { margin-top: 16px; color: #999; font-size: 14px; }
.link-row { text-align: center; margin-top: 8px; }
.link-row a { color: #1976D2; text-decoration: none; font-size: 14px; }
.history-table { width: 100%; border-collapse: collapse; font-size: 12px; }
.history-table th {
text-align: left; font-weight: 600; color: #999; font-size: 11px;
padding: 4px 6px 8px 6px; border-bottom: 2px solid #f0f0f0;
}
.history-table td { padding: 7px 6px; border-bottom: 1px solid #f5f5f5; vertical-align: middle; }
.history-table tr:last-child td { border-bottom: none; }
.history-table tr.latest td { background: #f8fbff; font-weight: 600; }
.src-nums { display: flex; gap: 4px; flex-wrap: wrap; }
.src-chip {
display: inline-block; padding: 1px 5px; border-radius: 3px;
font-size: 10px; color: white; font-variant-numeric: tabular-nums;
}
.clickable-row { cursor: pointer; }
.clickable-row:hover td { background: #f0f7ff !important; }
/* Modal */
#md-overlay {
position: fixed; inset: 0; background: rgba(0,0,0,0.45);
display: flex; align-items: flex-start; justify-content: center;
z-index: 1000; padding: 40px 16px; overflow-y: auto;
}
#md-box {
background: white; border-radius: 12px; padding: 24px;
width: 100%; max-width: 620px; position: relative;
box-shadow: 0 8px 32px rgba(0,0,0,0.24); margin: auto;
}
#md-close {
position: absolute; top: 10px; right: 14px;
background: none; border: none; font-size: 26px; cursor: pointer;
color: #aaa; line-height: 1;
}
#md-close:hover { color: #333; }
#md-box h3 { font-size: 15px; margin-bottom: 14px; padding-right: 24px; }
.md-summary { display: flex; gap: 20px; flex-wrap: wrap; font-size: 13px; margin-bottom: 16px; color: #555; }
.md-summary b { color: #333; }
.detail-table { width: 100%; border-collapse: collapse; font-size: 12px; }
.detail-table th {
text-align: left; color: #999; font-size: 11px; font-weight: 600;
padding: 4px 8px 6px 0; border-bottom: 2px solid #f0f0f0; white-space: nowrap;
}
.detail-table td { padding: 6px 8px 6px 0; border-bottom: 1px solid #f5f5f5; vertical-align: top; }
.detail-table tr:last-child td { border-bottom: none; }
"""
_SOURCE_ORDER = ["Sreality", "Realingo", "Bezrealitky", "iDNES", "PSN", "CityHome"]
_SOURCE_ABBR = ["Sre", "Rea", "Bez", "iDN", "PSN", "CH"]
def _sources_html(sources: list) -> str:
if not sources:
return ""
max_count = max((s.get("accepted", 0) for s in sources), default=1) or 1
parts = ['<div class="card"><h2>Zdroje</h2>']
for s in sources:
name = s.get("name", "?")
accepted = s.get("accepted", 0)
error = s.get("error")
exc = s.get("excluded", {})
excluded_total = sum(exc.values()) if isinstance(exc, dict) else s.get("excluded_total", 0)
color = COLORS.get(name.lower(), "#999")
pct = round(accepted / max_count * 100) if max_count else 0
if error:
badge = '<span class="badge badge-err">chyba</span>'
elif accepted == 0:
badge = '<span class="badge badge-skip">0</span>'
else:
badge = '<span class="badge badge-ok">OK</span>'
parts.append(
f'<div style="margin-bottom:12px;">'
f'<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:4px;">'
f'<span style="font-weight:600;font-size:14px;">{name} {badge}</span>'
f'<span style="font-size:12px;color:#999;">{excluded_total} vyloučených</span>'
f'</div>'
f'<div class="bar-row">'
f'<div class="bar-track"><div class="bar-fill" style="width:{pct}%;background:{color};"></div></div>'
f'<span class="bar-count">{accepted}</span>'
f'</div></div>'
)
parts.append("</div>")
return "".join(parts)
def _history_html(history: list) -> str:
if not history:
return ""
rows = list(reversed(history))
parts = [
'<div class="card">'
'<h2>Historie běhů <span style="font-size:11px;font-weight:400;color:#bbb;"> klikni pro detaily</span></h2>',
'<table class="history-table"><thead><tr>',
'<th>Datum</th><th>Trvání</th><th>Přijato&nbsp;/&nbsp;Dedup</th><th>Zdroje</th><th>OK</th>',
'</tr></thead><tbody>',
]
for i, entry in enumerate(rows):
row_class = ' class="latest clickable-row"' if i == 0 else ' class="clickable-row"'
src_map = {s["name"]: s for s in entry.get("sources", []) if "name" in s}
chips = "".join(
f'<span class="src-chip" style="background:{"#F44336" if (src_map.get(name) or {}).get("error") else COLORS.get(name.lower(), "#999")}" title="{name}">'
f'{abbr}&nbsp;{src_map[name].get("accepted", 0) if name in src_map else "-"}</span>'
for name, abbr in zip(_SOURCE_ORDER, _SOURCE_ABBR)
)
ok_badge = (
'<span class="badge badge-err">chyba</span>'
if entry.get("success") is False
else '<span class="badge badge-ok">OK</span>'
)
dur = f'{entry["duration_sec"]}s' if entry.get("duration_sec") is not None else "-"
parts.append(
f'<tr{row_class} data-idx="{i}">'
f'<td>{_fmt_date(entry.get("timestamp", ""))}</td>'
f'<td>{dur}</td>'
f'<td>{entry.get("total_accepted", "-")}&nbsp;/&nbsp;{entry.get("deduplicated", "-")}</td>'
f'<td><div class="src-nums">{chips}</div></td>'
f'<td>{ok_badge}</td>'
f'</tr>'
)
parts.append("</tbody></table></div>")
return "".join(parts)
def _modal_script(rows_json: str) -> str:
"""Return the modal overlay HTML + JS for the history detail popup."""
return (
'<div id="md-overlay" style="display:none">'
'<div id="md-box"><button id="md-close">&times;</button>'
'<div id="md-body"></div></div></div>\n'
'<script>\n(function(){\n'
f'var H={rows_json};\n'
'var C={"sreality":"#1976D2","realingo":"#7B1FA2","bezrealitky":"#E65100","idnes":"#C62828","psn":"#2E7D32","cityhome":"#00838F"};\n'
'var MN=["ledna","února","března","dubna","května","června","července","srpna","září","října","listopadu","prosince"];\n'
'function fd(s){var d=new Date(s);return d.getDate()+". "+MN[d.getMonth()]+" "+d.getFullYear()+", "+String(d.getHours()).padStart(2,"0")+":"+String(d.getMinutes()).padStart(2,"0");}\n'
'function openModal(idx){\n'
' var e=H[idx],src=e.sources||[];\n'
' var h="<h3>Detaily b\u011bhu \u2013 "+fd(e.timestamp)+"</h3>";\n'
' h+="<div class=\\"md-summary\\">";\n'
' if(e.duration_sec!=null) h+="<span><b>Trvání:</b> "+e.duration_sec+"s</span>";\n'
' if(e.total_accepted!=null) h+="<span><b>Přijato:</b> "+e.total_accepted+"</span>";\n'
' if(e.deduplicated!=null) h+="<span><b>Po dedup:</b> "+e.deduplicated+"</span>";\n'
' h+="</div>";\n'
' h+="<table class=\\"detail-table\\"><thead><tr>";\n'
' h+="<th>Zdroj</th><th>Přijato</th><th>Staženo</th><th>Stránky</th><th>Cache</th><th>Vyloučeno</th><th>Čas</th><th>OK</th>";\n'
' h+="</tr></thead><tbody>";\n'
' src.forEach(function(s){\n'
' var nm=s.name||"?",col=C[nm.toLowerCase()]||"#999";\n'
' var exc=s.excluded||{};\n'
' var excStr=Object.entries(exc).filter(function(kv){return kv[1]>0;}).map(function(kv){return kv[0]+":&nbsp;"+kv[1];}).join(", ")||"\u2013";\n'
' var ok=s.error?"<span class=\\"badge badge-err\\" title=\\""+s.error+"\\">chyba</span>":"<span class=\\"badge badge-ok\\">OK</span>";\n'
' var dot="<span style=\\"display:inline-block;width:8px;height:8px;border-radius:50%;background:"+col+";margin-right:5px;\\"></span>";\n'
' h+="<tr>";\n'
' h+="<td>"+dot+nm+"</td>";\n'
' h+="<td>"+(s.accepted!=null?s.accepted:"\u2013")+"</td>";\n'
' h+="<td>"+(s.fetched!=null?s.fetched:"\u2013")+"</td>";\n'
' h+="<td>"+(s.pages!=null?s.pages:"\u2013")+"</td>";\n'
' h+="<td>"+(s.cache_hits!=null?s.cache_hits:"\u2013")+"</td>";\n'
' h+="<td style=\\"font-size:11px;color:#666;\\">"+excStr+"</td>";\n'
' h+="<td>"+(s.duration_sec!=null?s.duration_sec+"s":"\u2013")+"</td>";\n'
' h+="<td>"+ok+"</td></tr>";\n'
' });\n'
' h+="</tbody></table>";\n'
' document.getElementById("md-body").innerHTML=h;\n'
' document.getElementById("md-overlay").style.display="flex";\n'
'}\n'
'function closeModal(){document.getElementById("md-overlay").style.display="none";}\n'
'var tb=document.querySelector(".history-table tbody");\n'
'if(tb)tb.addEventListener("click",function(e){var tr=e.target.closest("tr[data-idx]");if(tr)openModal(parseInt(tr.dataset.idx,10));});\n'
'document.getElementById("md-close").addEventListener("click",closeModal);\n'
'document.getElementById("md-overlay").addEventListener("click",function(e){if(e.target===this)closeModal();});\n'
'document.addEventListener("keydown",function(e){if(e.key==="Escape")closeModal();});\n'
'})();\n</script>'
)
def _render_status_html(status: dict | None, history: list, is_running: bool = False) -> str:
    """Generate the complete HTML page for /scrapers-status.

    One of three page variants is produced, checked in this order:

    1. ``is_running`` is true: a spinner page that reloads itself every
       30 seconds via ``<meta http-equiv="refresh">``.
    2. ``status`` is None: an error card saying no status is available.
    3. Otherwise: the dashboard (last scrape, summary, per-source bars,
       history table) followed by the history detail modal.

    Args:
        status: Status dict of the last run, or None when it could not be loaded.
        history: List of history entries; it is reversed before being embedded
            for the modal script.
        is_running: Whether a scrape is currently in progress.

    Returns:
        The full HTML document as a string.
    """
    head_open = (
        '<!DOCTYPE html>\n<html lang="cs">\n<head>\n'
        '<meta charset="UTF-8">\n'
        '<meta name="viewport" content="width=device-width, initial-scale=1.0">\n'
        f'<title>Scraper status</title>\n<style>{_CSS}</style>\n'
    )
    page_header = '<h1>Scraper status</h1>\n<div class="subtitle">maru-hleda-byt</div>\n'
    footer = '<div class="link-row"><a href="/mapa_bytu.html">Otevřít mapu</a></div>'

    # ── Running state ─────────────────────────────────────────────────────────
    # Checked before the "no status" case: while a scrape is in progress and
    # no status has been written yet, the auto-refreshing spinner is shown
    # rather than a static error page.
    if is_running:
        return (
            head_open
            + '<meta http-equiv="refresh" content="30">\n'
            + '</head>\n<body>\n' + page_header
            + '<div class="loader-wrap"><div class="spinner"></div>'
            + '<div class="loader-text">Scraper právě běží…</div></div>\n'
            + footer + '\n</body>\n</html>'
        )

    # ── No status available ───────────────────────────────────────────────────
    if status is None:
        return (
            head_open + '</head>\n<body>\n' + page_header
            + '<div class="card"><p style="color:#F44336">Status není k dispozici.</p></div>\n'
            + footer + '\n</body>\n</html>'
        )

    # ── Done state ────────────────────────────────────────────────────────────
    ts = status.get("timestamp", "")
    duration = status.get("duration_sec")
    total_accepted = status.get("total_accepted", 0)
    deduplicated = status.get("deduplicated")

    ts_card = (
        '<div class="card"><h2>Poslední scrape</h2>'
        f'<div class="timestamp">{_fmt_date(ts)}</div>'
        + (f'<div class="timestamp-sub">Trvání: {round(duration)}s</div>' if duration is not None else "")
        + '</div>'
    )

    # The "after deduplication" row is only shown when the status carries it.
    sum_card = (
        '<div class="card"><h2>Souhrn</h2>'
        f'<div class="summary-row"><span class="summary-label">Vyhovujících bytů</span>'
        f'<span class="summary-value" style="color:#4CAF50">{total_accepted}</span></div>'
        + (
            f'<div class="summary-row"><span class="summary-label">Po deduplikaci (v mapě)</span>'
            f'<span class="summary-value" style="color:#1976D2">{deduplicated}</span></div>'
            if deduplicated is not None else ""
        )
        + '</div>'
    )

    # Reversed so that array index i in the modal script corresponds to the
    # i-th row of the reversed history.
    rows_for_js = list(reversed(history))

    body = (
        page_header
        + ts_card + "\n"
        + sum_card + "\n"
        + _sources_html(status.get("sources", [])) + "\n"
        + _history_html(history) + "\n"
        + footer
    )
    modal = _modal_script(json.dumps(rows_for_js, ensure_ascii=False))
    return head_open + '</head>\n<body>\n' + body + '\n' + modal + '\n</body>\n</html>'
# ── HTTP handler ──────────────────────────────────────────────────────────────
class Handler(SimpleHTTPRequestHandler):
    """Request handler for the JSON API, the scraper status page and static files.

    Routes:
      * ``GET /api/ratings`` and ``GET /api/ratings/export``: stored ratings
      * ``GET /api/status`` and ``GET /api/status/history``: scraper status JSON
      * ``GET /scrapers-status``: server-rendered status page
      * ``POST /api/ratings``: replace the stored ratings
      * any other GET: ``SimpleHTTPRequestHandler`` static file serving

    Routing ignores the query string and fragment, so cache-busting URLs such
    as ``/api/status?t=123`` reach the same endpoint as the bare path.
    """

    def log_message(self, format, *args):
        pass  # suppress default access log; use our own where needed

    def _route_path(self) -> str:
        """Return the request path with any ``?query`` / ``#fragment`` removed."""
        return self.path.split("?", 1)[0].split("#", 1)[0]

    def _send_cors_headers(self):
        """Emit the permissive CORS headers shared by JSON and preflight responses."""
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")

    def _send_json(self, status: int, body, extra_headers=None):
        """Send ``body`` serialized as UTF-8 JSON with the given HTTP status.

        Args:
            status: HTTP status code.
            body: Any JSON-serializable value.
            extra_headers: Optional mapping of additional response headers.
        """
        payload = json.dumps(body, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self._send_cors_headers()
        if extra_headers:
            for k, v in extra_headers.items():
                self.send_header(k, v)
        self.end_headers()
        self.wfile.write(payload)

    def do_OPTIONS(self):
        """Answer CORS preflight requests with 204 and the CORS headers."""
        self.send_response(204)
        self._send_cors_headers()
        self.end_headers()

    def do_GET(self):
        """Dispatch GET requests to the API, the status page, or static files."""
        route = self._route_path()
        if route.startswith("/api/"):
            self._handle_api_get()
        elif route.rstrip("/") == "/scrapers-status":
            self._serve_status_page()
        else:
            log.debug("GET %s → static file: %s", self.path, self.translate_path(self.path))
            super().do_GET()

    def _handle_api_get(self):
        """Serve the read-only JSON endpoints under ``/api/``."""
        route = self._route_path()
        if route in ("/api/ratings", "/api/ratings/export"):
            ratings = load_ratings()
            extra = None
            if route == "/api/ratings/export":
                # Make browsers download the export instead of displaying it.
                extra = {"Content-Disposition": 'attachment; filename="ratings.json"'}
            log.info("GET %s → %d ratings", self.path, len(ratings))
            self._send_json(200, ratings, extra)
        elif route == "/api/status":
            data = _load_json(DATA_DIR / "status.json")
            if data is None:
                self._send_json(404, {"error": "status not available"})
                return
            log.info("GET /api/status → ok")
            self._send_json(200, data)
        elif route == "/api/status/history":
            data = _load_json(DATA_DIR / "scraper_history.json", default=[])
            if not isinstance(data, list):
                data = []
            log.info("GET /api/status/history → %d entries", len(data))
            self._send_json(200, data)
        else:
            self._send_json(404, {"error": "not found"})

    def _serve_status_page(self):
        """Render and send the HTML scraper status page."""
        status = _load_json(DATA_DIR / "status.json")
        history = _load_json(DATA_DIR / "scraper_history.json", default=[])
        if not isinstance(history, list):
            history = []
        # The mere existence of this file is treated as "a scrape is in progress".
        is_running = (DATA_DIR / "scraper_running.json").exists()
        html = _render_status_html(status, history, is_running)
        payload = html.encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def do_POST(self):
        """Handle ``POST /api/ratings``: store the JSON object sent in the body.

        Responds 400 for a missing, empty, malformed or non-object body and
        404 for any other path.
        """
        if self._route_path() != "/api/ratings":
            self._send_json(404, {"error": "not found"})
            return

        # A non-numeric Content-Length would raise out of the handler, and a
        # negative one would make rfile.read() wait for EOF; reject both.
        try:
            length = int(self.headers.get("Content-Length", 0))
        except (TypeError, ValueError):
            length = -1
        if length < 0:
            self._send_json(400, {"error": "invalid Content-Length"})
            return
        if length == 0:
            self._send_json(400, {"error": "empty body"})
            return

        try:
            raw = self.rfile.read(length)
            data = json.loads(raw.decode("utf-8"))
        except Exception as e:
            log.warning("Bad request body: %s", e)
            self._send_json(400, {"error": "invalid JSON"})
            return
        if not isinstance(data, dict):
            self._send_json(400, {"error": "expected JSON object"})
            return

        save_ratings(data)
        log.info("POST /api/ratings → saved %d ratings", len(data))
        self._send_json(200, {"ok": True, "count": len(data)})
if __name__ == "__main__":
    log.info("Server starting on port %d, data dir: %s", PORT, DATA_DIR)
    # Pre-bind the handler's ``directory`` argument so static files are served
    # from DATA_DIR; the server listens on all interfaces.
    httpd = HTTPServer(
        ("0.0.0.0", PORT),
        functools.partial(Handler, directory=str(DATA_DIR)),
    )
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the server: log it and exit cleanly.
        log.info("Stopped.")
        sys.exit(0)

View File

@@ -1,204 +0,0 @@
<!DOCTYPE html>
<html lang="cs">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Scraper status</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: system-ui, -apple-system, sans-serif;
background: #f5f5f5; color: #333;
padding: 24px; max-width: 640px; margin: 0 auto;
}
h1 { font-size: 22px; margin-bottom: 4px; }
.subtitle { color: #888; font-size: 13px; margin-bottom: 24px; }
.card {
background: white; border-radius: 12px; padding: 20px;
box-shadow: 0 1px 4px rgba(0,0,0,0.08); margin-bottom: 16px;
}
.card h2 { font-size: 15px; margin-bottom: 12px; color: #555; }
.timestamp {
font-size: 28px; font-weight: 700; color: #1976D2;
}
.timestamp-ago { font-size: 13px; color: #999; margin-top: 2px; }
/* Source table */
.source-table { width: 100%; border-collapse: collapse; }
.source-table td { padding: 8px 0; border-bottom: 1px solid #f0f0f0; font-size: 14px; }
.source-table tr:last-child td { border-bottom: none; }
.source-table .name { font-weight: 600; }
.source-table .count { text-align: right; font-variant-numeric: tabular-nums; }
.source-table .rejected { text-align: right; color: #999; font-size: 12px; }
.badge {
display: inline-block; padding: 2px 8px; border-radius: 4px;
font-size: 11px; font-weight: 600; color: white;
}
.badge-ok { background: #4CAF50; }
.badge-err { background: #F44336; }
.badge-skip { background: #FF9800; }
/* Summary bar */
.summary-row {
display: flex; justify-content: space-between; align-items: center;
padding: 10px 0; border-bottom: 1px solid #f0f0f0;
}
.summary-row:last-child { border-bottom: none; }
.summary-label { font-size: 13px; color: #666; }
.summary-value { font-size: 18px; font-weight: 700; }
/* Source bar chart */
.bar-row { display: flex; align-items: center; gap: 8px; margin: 4px 0; }
.bar-label { width: 90px; font-size: 12px; text-align: right; color: #666; }
.bar-track { flex: 1; height: 20px; background: #f0f0f0; border-radius: 4px; overflow: hidden; position: relative; }
.bar-fill { height: 100%; border-radius: 4px; transition: width 0.5s ease; }
.bar-count { font-size: 12px; width: 36px; font-variant-numeric: tabular-nums; }
/* Loader */
.loader-wrap {
display: flex; flex-direction: column; align-items: center;
justify-content: center; padding: 60px 0;
}
.spinner {
width: 40px; height: 40px; border: 4px solid #e0e0e0;
border-top-color: #1976D2; border-radius: 50%;
animation: spin 0.8s linear infinite;
}
@keyframes spin { to { transform: rotate(360deg); } }
.loader-text { margin-top: 16px; color: #999; font-size: 14px; }
.error-msg { color: #F44336; padding: 40px 0; text-align: center; }
.link-row { text-align: center; margin-top: 8px; }
.link-row a { color: #1976D2; text-decoration: none; font-size: 14px; }
</style>
</head>
<body>
<h1>Scraper status</h1>
<div class="subtitle">maru-hleda-byt</div>
<div id="content">
<div class="loader-wrap">
<div class="spinner"></div>
<div class="loader-text">Nacitam status...</div>
</div>
</div>
<div class="link-row"><a href="mapa_bytu.html">Otevrit mapu</a></div>
<script>
// Bar colour for each source, keyed by the lowercased source name.
// render() falls back to '#999' for names that are not listed here.
var COLORS = {
sreality: '#1976D2',
realingo: '#7B1FA2',
bezrealitky: '#E65100',
idnes: '#C62828',
psn: '#2E7D32',
cityhome: '#00838F',
};
/**
 * Describe how long ago a timestamp was, as a short (ASCII) Czech phrase.
 *
 * @param {string|number|Date} dateStr - Any value accepted by `new Date()`.
 * @returns {string} 'prave ted' under a minute, then whole minutes, hours or
 *   days ago; an empty string when the timestamp is missing or unparseable.
 */
function timeAgo(dateStr) {
  // `new Date(null)` is the 1970 epoch, not an invalid date, so a missing
  // timestamp has to be rejected explicitly.
  if (dateStr == null) return '';

  const elapsedSec = Math.floor((Date.now() - new Date(dateStr).getTime()) / 1000);
  // An unparseable timestamp gives NaN, which would fall through every
  // comparison below and come out as "NaN dni zpet".
  if (Number.isNaN(elapsedSec)) return '';

  if (elapsedSec < 60) return 'prave ted';
  if (elapsedSec < 3600) return Math.floor(elapsedSec / 60) + ' min zpet';
  if (elapsedSec < 86400) return Math.floor(elapsedSec / 3600) + ' hod zpet';
  return Math.floor(elapsedSec / 86400) + ' dni zpet';
}
/**
 * Format a timestamp as "D. <month> YYYY, HH:MM" in the browser's local time,
 * using (ASCII) Czech month names in the genitive.
 *
 * @param {string|number|Date} dateStr - Any value accepted by `new Date()`.
 * @returns {string} The formatted date, or '-' when the timestamp is missing
 *   or cannot be parsed.
 */
function formatDate(dateStr) {
  const months = ['ledna','unora','brezna','dubna','kvetna','cervna',
    'cervence','srpna','zari','rijna','listopadu','prosince'];

  // `new Date(null)` is the 1970 epoch, so treat null/undefined as invalid too.
  const d = dateStr == null ? new Date(NaN) : new Date(dateStr);
  // Without this guard an invalid date renders as "NaN. undefined NaN, NaN:NaN".
  if (Number.isNaN(d.getTime())) return '-';

  const hh = String(d.getHours()).padStart(2, '0');
  const mm = String(d.getMinutes()).padStart(2, '0');
  return `${d.getDate()}. ${months[d.getMonth()]} ${d.getFullYear()}, ${hh}:${mm}`;
}
/**
 * Render the status payload into the #content element.
 *
 * While `data.status === 'running'` a spinner is shown and the status is
 * re-fetched after 30 seconds. Otherwise three cards are rendered: the last
 * scrape time, the accepted/rejected totals, and one bar per source scaled
 * to the source with the most accepted listings.
 *
 * @param {object} data - Parsed status JSON; `sources` is expected to be an
 *   array of `{name, accepted, rejected, error}` objects.
 * @returns {void}
 */
function render(data) {
  const content = document.getElementById('content');
  const report = data || {};

  // Check if scrape is currently running
  if (report.status === 'running') {
    content.innerHTML =
      '<div class="loader-wrap">' +
      '<div class="spinner"></div>' +
      '<div class="loader-text">Scraper prave bezi...</div>' +
      '</div>';
    setTimeout(loadStatus, 30000);
    return;
  }

  // Values from the JSON go into innerHTML, so text must be HTML-escaped.
  const escapeHtml = function (value) {
    return String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;');
  };
  // Missing or non-numeric counts are treated as 0 so they can never produce
  // NaN in the totals or in the bar width.
  const toCount = function (value) {
    return Number(value) || 0;
  };

  const sources = (report.sources || []).map(function (s) {
    const name = String(s.name == null ? '?' : s.name);
    return {
      name: name,
      accepted: toCount(s.accepted),
      rejected: toCount(s.rejected),
      error: s.error,
      color: COLORS[name.toLowerCase()] || '#999',
    };
  });

  let totalOk = 0;
  let totalRej = 0;
  let maxCount = 0;
  sources.forEach(function (s) {
    totalOk += s.accepted;
    totalRej += s.rejected;
    if (s.accepted > maxCount) maxCount = s.accepted;
  });

  const parts = [];

  // Timestamp card
  parts.push('<div class="card">');
  parts.push('<h2>Posledni scrape</h2>');
  parts.push('<div class="timestamp">' + formatDate(report.timestamp) + '</div>');
  parts.push('<div class="timestamp-ago">' + timeAgo(report.timestamp) + '</div>');
  // `!= null` rather than truthiness, so a 0-second run is still shown.
  if (report.duration_sec != null) {
    parts.push('<div class="timestamp-ago">Trvani: ' + Math.round(report.duration_sec) + 's</div>');
  }
  parts.push('</div>');

  // Summary card
  parts.push('<div class="card">');
  parts.push('<h2>Souhrn</h2>');
  parts.push('<div class="summary-row"><span class="summary-label">Vyhovujicich bytu</span><span class="summary-value" style="color:#4CAF50">' + totalOk + '</span></div>');
  parts.push('<div class="summary-row"><span class="summary-label">Vyloucenych</span><span class="summary-value" style="color:#999">' + totalRej + '</span></div>');
  // Skip the row for both undefined and null instead of printing "null".
  if (report.deduplicated != null) {
    parts.push('<div class="summary-row"><span class="summary-label">Po deduplikaci (v mape)</span><span class="summary-value" style="color:#1976D2">' + escapeHtml(report.deduplicated) + '</span></div>');
  }
  parts.push('</div>');

  // Sources card
  parts.push('<div class="card">');
  parts.push('<h2>Zdroje</h2>');
  sources.forEach(function (s) {
    const pct = maxCount > 0 ? Math.round((s.accepted / maxCount) * 100) : 0;
    let badge;
    if (s.error) {
      badge = '<span class="badge badge-err">chyba</span>';
    } else if (s.accepted === 0) {
      badge = '<span class="badge badge-skip">0</span>';
    } else {
      badge = '<span class="badge badge-ok">OK</span>';
    }
    parts.push('<div style="margin-bottom:12px;">');
    parts.push('<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:4px;">');
    parts.push('<span style="font-weight:600;font-size:14px;">' + escapeHtml(s.name) + ' ' + badge + '</span>');
    parts.push('<span style="font-size:12px;color:#999;">' + s.rejected + ' vyloucenych</span>');
    parts.push('</div>');
    parts.push('<div class="bar-row">');
    parts.push('<div class="bar-track"><div class="bar-fill" style="width:' + pct + '%;background:' + s.color + ';"></div></div>');
    parts.push('<span class="bar-count">' + s.accepted + '</span>');
    parts.push('</div>');
    parts.push('</div>');
  });
  parts.push('</div>');

  content.innerHTML = parts.join('');
}
/**
 * Fetch status.json (with a cache-busting timestamp parameter) and pass the
 * parsed payload to render(). Any failure, including a non-2xx response, is
 * shown as an error message in the #content element.
 */
function loadStatus() {
  var url = 'status.json?t=' + Date.now();

  // Turn a non-OK HTTP response into a rejection carrying the status code.
  function parseBody(response) {
    if (response.ok) {
      return response.json();
    }
    throw new Error(response.status);
  }

  function showError(failure) {
    var box = document.getElementById('content');
    box.innerHTML =
      '<div class="error-msg">Status zatim neni k dispozici.<br><small>(' + failure.message + ')</small></div>';
  }

  fetch(url).then(parseBody).then(render).catch(showError);
}
// Start the first status fetch as soon as the script runs.
loadStatus();
</script>
</body>
</html>