Compare commits
11 Commits
6f49533c94
...
fix/scrape
| Author | SHA1 | Date | |
|---|---|---|---|
| 8c052840cd | |||
| 39e4b9ce2a | |||
|
|
fd3991f8d6 | ||
|
|
27a7834eb6 | ||
| 57a9f6f21a | |||
| 0ea31d3013 | |||
|
|
4304a42776 | ||
| 23d208a5b7 | |||
|
|
00c9144010 | ||
|
|
44c02b45b4 | ||
|
|
5fb3b984b6 |
@@ -1,31 +0,0 @@
|
|||||||
{
|
|
||||||
"permissions": {
|
|
||||||
"allow": [
|
|
||||||
"WebFetch(domain:github.com)",
|
|
||||||
"WebFetch(domain:www.sreality.cz)",
|
|
||||||
"WebFetch(domain:webscraping.pro)",
|
|
||||||
"WebFetch(domain:raw.githubusercontent.com)",
|
|
||||||
"Bash(python3:*)",
|
|
||||||
"Bash(open:*)",
|
|
||||||
"WebFetch(domain:www.realingo.cz)",
|
|
||||||
"WebFetch(domain:api.realingo.cz)",
|
|
||||||
"Bash(curl:*)",
|
|
||||||
"Bash(grep:*)",
|
|
||||||
"WebFetch(domain:www.realitni-pes.cz)",
|
|
||||||
"WebFetch(domain:www.bezrealitky.cz)",
|
|
||||||
"WebFetch(domain:apify.com)",
|
|
||||||
"WebFetch(domain:www.bezrealitky.com)",
|
|
||||||
"WebFetch(domain:reality.idnes.cz)",
|
|
||||||
"Bash(# Final checks: robots.txt and response time for rate limiting clues curl -s -L -H \"\"User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/120.0.0.0 Safari/537.36\"\" \"\"https://reality.idnes.cz/robots.txt\"\")",
|
|
||||||
"WebFetch(domain:www.cityhome.cz)",
|
|
||||||
"WebFetch(domain:www.psn.cz)",
|
|
||||||
"WebFetch(domain:www.city-home.cz)",
|
|
||||||
"WebFetch(domain:psn.cz)",
|
|
||||||
"WebFetch(domain:api.psn.cz)",
|
|
||||||
"Bash(done)",
|
|
||||||
"Bash(# Final summary: count total units across all projects\n# Get the total count from the unitsCountData we already extracted\necho \"\"From unitsCountData on /prodej page:\"\"\necho \"\" type_id 0 \\(Prodej bytů a ateliérů\\): 146\"\"\necho \"\" type_id 1 \\(Prodej komerčních nemovitostí\\): 14\"\"\necho \"\" type_id 2 \\(Pronájem bytů\\): 3\"\"\necho \"\" type_id 3 \\(Pronájem komerčních nemovitostí\\): 48\"\"\necho \"\"\"\"\necho \"\"Total for-sale projects: 19\"\"\necho \"\"\"\"\necho \"\"Disposition counts from the data:\"\"\npython3 << 'PYEOF'\n# Extract disposition counts from prodej page\nimport re\n\nwith open\\('/tmp/psn_prodej_p1.html', 'r', encoding='utf-8'\\) as f:\n html = f.read\\(\\)\n\n# Find disposition data\nidx = html.find\\('\\\\\\\\\"disposition\\\\\\\\\":['\\)\nif idx >= 0:\n chunk = html[idx:idx+2000].replace\\('\\\\\\\\\"', '\"'\\)\n # Extract name and count pairs\n import re\n pairs = re.findall\\(r'\"name\":\"\\([^\"]+\\)\",\"count\":\\(\\\\d+\\)', chunk\\)\n for name, count in pairs:\n print\\(f\" {name}: {count}\"\\)\nPYEOF)",
|
|
||||||
"Bash(ls:*)",
|
|
||||||
"Bash(chmod:*)"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
5
.gitignore
vendored
5
.gitignore
vendored
@@ -1,3 +1,8 @@
|
|||||||
.vscode/
|
.vscode/
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
.DS_Store
|
||||||
byty_*.json
|
byty_*.json
|
||||||
|
*.json
|
||||||
|
*.log
|
||||||
|
mapa_bytu.html
|
||||||
|
|
||||||
|
|||||||
30
Makefile
30
Makefile
@@ -3,9 +3,13 @@ CONTAINER_NAME := maru-hleda-byt
|
|||||||
VOLUME_NAME := maru-hleda-byt-data
|
VOLUME_NAME := maru-hleda-byt-data
|
||||||
VALIDATION_CONTAINER := maru-hleda-byt-validation
|
VALIDATION_CONTAINER := maru-hleda-byt-validation
|
||||||
VALIDATION_VOLUME := maru-hleda-byt-validation-data
|
VALIDATION_VOLUME := maru-hleda-byt-validation-data
|
||||||
|
DEBUG_CONTAINER := maru-hleda-byt-debug
|
||||||
|
DEBUG_VOLUME := maru-hleda-byt-debug-data
|
||||||
|
DEBUG_PORT ?= 8082
|
||||||
PORT := 8080
|
PORT := 8080
|
||||||
|
SERVER_PORT ?= 8080
|
||||||
|
|
||||||
.PHONY: build run stop logs scrape restart clean help validation validation-local validation-stop validation-local-debug
|
.PHONY: build run stop logs scrape restart clean help serve validation validation-local validation-stop validation-local-debug debug debug-stop
|
||||||
|
|
||||||
help:
|
help:
|
||||||
@echo "Available targets:"
|
@echo "Available targets:"
|
||||||
@@ -20,6 +24,9 @@ help:
|
|||||||
@echo " validation-local-debug - Run validation locally with DEBUG logging"
|
@echo " validation-local-debug - Run validation locally with DEBUG logging"
|
||||||
@echo " restart - Restart the container (stop and run again)"
|
@echo " restart - Restart the container (stop and run again)"
|
||||||
@echo " clean - Stop container and remove the Docker image"
|
@echo " clean - Stop container and remove the Docker image"
|
||||||
|
@echo " serve - Start server.py locally on port 8080"
|
||||||
|
@echo " debug - Build and run debug Docker container with limited scrape (port $(DEBUG_PORT))"
|
||||||
|
@echo " debug-stop - Stop and remove the debug Docker container"
|
||||||
@echo " help - Show this help message"
|
@echo " help - Show this help message"
|
||||||
|
|
||||||
build:
|
build:
|
||||||
@@ -59,6 +66,27 @@ validation-stop:
|
|||||||
@docker rm $(VALIDATION_CONTAINER) 2>/dev/null || true
|
@docker rm $(VALIDATION_CONTAINER) 2>/dev/null || true
|
||||||
@echo "Validation container stopped and removed"
|
@echo "Validation container stopped and removed"
|
||||||
|
|
||||||
|
debug: build
|
||||||
|
@docker stop $(DEBUG_CONTAINER) 2>/dev/null || true
|
||||||
|
@docker rm $(DEBUG_CONTAINER) 2>/dev/null || true
|
||||||
|
docker run -d --name $(DEBUG_CONTAINER) \
|
||||||
|
-p $(DEBUG_PORT):8080 \
|
||||||
|
-v $(DEBUG_VOLUME):/app/data \
|
||||||
|
-e LOG_LEVEL=DEBUG \
|
||||||
|
$(IMAGE_NAME)
|
||||||
|
@sleep 2
|
||||||
|
docker exec $(DEBUG_CONTAINER) bash /app/run_all.sh --max-pages 1 --max-properties 10
|
||||||
|
@echo "Debug app at http://localhost:$(DEBUG_PORT)/mapa_bytu.html"
|
||||||
|
@echo "Debug status at http://localhost:$(DEBUG_PORT)/scrapers-status"
|
||||||
|
|
||||||
|
debug-stop:
|
||||||
|
@docker stop $(DEBUG_CONTAINER) 2>/dev/null || true
|
||||||
|
@docker rm $(DEBUG_CONTAINER) 2>/dev/null || true
|
||||||
|
@echo "Debug container stopped and removed"
|
||||||
|
|
||||||
|
serve:
|
||||||
|
DATA_DIR=. SERVER_PORT=$(SERVER_PORT) python3 server.py
|
||||||
|
|
||||||
validation-local:
|
validation-local:
|
||||||
./run_all.sh --max-pages 1 --max-properties 10
|
./run_all.sh --max-pages 1 --max-properties 10
|
||||||
|
|
||||||
|
|||||||
11
README.md
11
README.md
@@ -83,10 +83,6 @@ Merges all `byty_*.json` files into `byty_merged.json` and generates `mapa_bytu.
|
|||||||
|
|
||||||
**Deduplication logic:** Two listings are considered duplicates if they share the same normalized street name + price + area. PSN and CityHome have priority during dedup (loaded first), so their listings are kept over duplicates from other portals.
|
**Deduplication logic:** Two listings are considered duplicates if they share the same normalized street name + price + area. PSN and CityHome have priority during dedup (loaded first), so their listings are kept over duplicates from other portals.
|
||||||
|
|
||||||
### `regen_map.py`
|
|
||||||
|
|
||||||
Regenerates the map from existing `byty_sreality.json` data without re-scraping. Fetches missing area values from the Sreality API, fixes URLs, and re-applies the area filter. Useful for tweaking map output after data has already been collected.
|
|
||||||
|
|
||||||
## Interactive map (`mapa_bytu.html`)
|
## Interactive map (`mapa_bytu.html`)
|
||||||
|
|
||||||
The generated map is a standalone HTML file using Leaflet.js with CARTO basemap tiles. Features:
|
The generated map is a standalone HTML file using Leaflet.js with CARTO basemap tiles. Features:
|
||||||
@@ -151,7 +147,7 @@ The project includes a Docker setup for unattended operation with a cron-based s
|
|||||||
│ PID 1: python3 -m http.server :8080 │
|
│ PID 1: python3 -m http.server :8080 │
|
||||||
│ serves /app/data/ │
|
│ serves /app/data/ │
|
||||||
│ │
|
│ │
|
||||||
│ crond: runs run_all.sh at 06:00/18:00 │
|
│ crond: runs run_all.sh every 4 hours │
|
||||||
│ Europe/Prague timezone │
|
│ Europe/Prague timezone │
|
||||||
│ │
|
│ │
|
||||||
│ /app/ -- scripts (.py, .sh) │
|
│ /app/ -- scripts (.py, .sh) │
|
||||||
@@ -160,7 +156,7 @@ The project includes a Docker setup for unattended operation with a cron-based s
|
|||||||
└─────────────────────────────────────────┘
|
└─────────────────────────────────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
On startup, the HTTP server starts immediately. The initial scrape runs in the background. Subsequent cron runs update data in-place twice daily at 06:00 and 18:00 CET/CEST.
|
On startup, the HTTP server starts immediately. The initial scrape runs in the background. Subsequent cron runs update data in-place every 4 hours.
|
||||||
|
|
||||||
### Quick start
|
### Quick start
|
||||||
|
|
||||||
@@ -201,14 +197,13 @@ Validation targets run scrapers with `--max-pages 1 --max-properties 10` for a f
|
|||||||
├── scrape_psn.py # PSN scraper
|
├── scrape_psn.py # PSN scraper
|
||||||
├── scrape_cityhome.py # CityHome scraper
|
├── scrape_cityhome.py # CityHome scraper
|
||||||
├── merge_and_map.py # Merge all sources + generate final map
|
├── merge_and_map.py # Merge all sources + generate final map
|
||||||
├── regen_map.py # Regenerate map from cached Sreality data
|
|
||||||
├── run_all.sh # Orchestrator script (runs all scrapers + merge)
|
├── run_all.sh # Orchestrator script (runs all scrapers + merge)
|
||||||
├── mapa_bytu.html # Generated interactive map (output)
|
├── mapa_bytu.html # Generated interactive map (output)
|
||||||
├── Makefile # Docker management + validation shortcuts
|
├── Makefile # Docker management + validation shortcuts
|
||||||
├── build/
|
├── build/
|
||||||
│ ├── Dockerfile # Container image definition (python:3.13-alpine)
|
│ ├── Dockerfile # Container image definition (python:3.13-alpine)
|
||||||
│ ├── entrypoint.sh # Container entrypoint (HTTP server + cron + initial scrape)
|
│ ├── entrypoint.sh # Container entrypoint (HTTP server + cron + initial scrape)
|
||||||
│ ├── crontab # Cron schedule (06:00 and 18:00 CET)
|
│ ├── crontab # Cron schedule (every 4 hours)
|
||||||
│ └── CONTAINER.md # Container-specific documentation
|
│ └── CONTAINER.md # Container-specific documentation
|
||||||
└── .gitignore # Ignores byty_*.json, __pycache__, .vscode
|
└── .gitignore # Ignores byty_*.json, __pycache__, .vscode
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -5,12 +5,14 @@ RUN apk add --no-cache curl bash tzdata \
|
|||||||
&& echo "Europe/Prague" > /etc/timezone
|
&& echo "Europe/Prague" > /etc/timezone
|
||||||
|
|
||||||
ENV PYTHONUNBUFFERED=1
|
ENV PYTHONUNBUFFERED=1
|
||||||
|
ENV DATA_DIR=/app/data
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \
|
COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \
|
||||||
scrape_idnes.py scrape_psn.py scrape_cityhome.py \
|
scrape_idnes.py scrape_psn.py scrape_cityhome.py \
|
||||||
merge_and_map.py regen_map.py run_all.sh ratings_server.py ./
|
merge_and_map.py generate_status.py scraper_stats.py \
|
||||||
|
run_all.sh server.py ./
|
||||||
|
|
||||||
COPY build/crontab /etc/crontabs/root
|
COPY build/crontab /etc/crontabs/root
|
||||||
COPY build/entrypoint.sh /entrypoint.sh
|
COPY build/entrypoint.sh /entrypoint.sh
|
||||||
@@ -18,7 +20,7 @@ RUN chmod +x /entrypoint.sh run_all.sh
|
|||||||
|
|
||||||
RUN mkdir -p /app/data
|
RUN mkdir -p /app/data
|
||||||
|
|
||||||
EXPOSE 8080 8081
|
EXPOSE 8080
|
||||||
|
|
||||||
HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \
|
HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \
|
||||||
CMD wget -q -O /dev/null http://localhost:8080/ || exit 1
|
CMD wget -q -O /dev/null http://localhost:8080/ || exit 1
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
0 6,18 * * * cd /app && bash /app/run_all.sh >> /proc/1/fd/1 2>> /proc/1/fd/2
|
0 */4 * * * cd /app && bash /app/run_all.sh >> /proc/1/fd/1 2>> /proc/1/fd/2
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
DATA_DIR="/app/data"
|
export DATA_DIR="/app/data"
|
||||||
|
|
||||||
# Create symlinks so scripts (which write to /app/) persist data to the volume
|
# Create symlinks so scripts (which write to /app/) persist data to the volume
|
||||||
for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \
|
for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \
|
||||||
@@ -18,8 +18,5 @@ crond -b -l 2
|
|||||||
echo "[entrypoint] Starting initial scrape in background..."
|
echo "[entrypoint] Starting initial scrape in background..."
|
||||||
bash /app/run_all.sh &
|
bash /app/run_all.sh &
|
||||||
|
|
||||||
echo "[entrypoint] Starting ratings API server on port 8081..."
|
echo "[entrypoint] Starting server on port 8080..."
|
||||||
DATA_DIR="$DATA_DIR" python3 /app/ratings_server.py &
|
exec python3 /app/server.py
|
||||||
|
|
||||||
echo "[entrypoint] Starting HTTP server on port 8080..."
|
|
||||||
exec python3 -m http.server 8080 --directory "$DATA_DIR"
|
|
||||||
|
|||||||
@@ -1,427 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"hash_id": 990183,
|
|
||||||
"name": "Prodej bytu 3+kk 86 m²",
|
|
||||||
"price": 10385000,
|
|
||||||
"price_formatted": "10 385 000 Kč",
|
|
||||||
"locality": "Ke Tvrzi, Praha - Královice",
|
|
||||||
"lat": 50.0390519,
|
|
||||||
"lon": 14.63862,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 2,
|
|
||||||
"area": 86,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/990183-nabidka-prodej-bytu-ke-tvrzi-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 989862,
|
|
||||||
"name": "Prodej bytu 3+kk 73 m²",
|
|
||||||
"price": 12790000,
|
|
||||||
"price_formatted": "12 790 000 Kč",
|
|
||||||
"locality": "Vrázova, Praha - Smíchov",
|
|
||||||
"lat": 50.0711312,
|
|
||||||
"lon": 14.4076652,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 3,
|
|
||||||
"area": 73,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/989862-nabidka-prodej-bytu-vrazova-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 981278,
|
|
||||||
"name": "Prodej bytu 3+kk 70 m²",
|
|
||||||
"price": 11890000,
|
|
||||||
"price_formatted": "11 890 000 Kč",
|
|
||||||
"locality": "Argentinská, Praha - Holešovice",
|
|
||||||
"lat": 50.1026043,
|
|
||||||
"lon": 14.4435365,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 3,
|
|
||||||
"area": 70,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/981278-nabidka-prodej-bytu-argentinska-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 989817,
|
|
||||||
"name": "Prodej bytu 3+kk 88 m²",
|
|
||||||
"price": 13490000,
|
|
||||||
"price_formatted": "13 490 000 Kč",
|
|
||||||
"locality": "Miroslava Hajna, Praha - Letňany",
|
|
||||||
"lat": 50.1406487,
|
|
||||||
"lon": 14.5207541,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 2,
|
|
||||||
"area": 88,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/989817-nabidka-prodej-bytu-miroslava-hajna-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 970257,
|
|
||||||
"name": "Prodej bytu 3+1 106 m²",
|
|
||||||
"price": 12950000,
|
|
||||||
"price_formatted": "12 950 000 Kč",
|
|
||||||
"locality": "Novákových, Praha - Libeň",
|
|
||||||
"lat": 50.1034771,
|
|
||||||
"lon": 14.4758735,
|
|
||||||
"disposition": "3+1",
|
|
||||||
"floor": 5,
|
|
||||||
"area": 106,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/970257-nabidka-prodej-bytu-novakovych-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 972406,
|
|
||||||
"name": "Prodej bytu 3+kk 83 m²",
|
|
||||||
"price": 10490000,
|
|
||||||
"price_formatted": "10 490 000 Kč",
|
|
||||||
"locality": "Na Výrovně, Praha - Stodůlky",
|
|
||||||
"lat": 50.0396067,
|
|
||||||
"lon": 14.3167022,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 2,
|
|
||||||
"area": 83,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/972406-nabidka-prodej-bytu-na-vyrovne",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 967142,
|
|
||||||
"name": "Prodej bytu 3+kk 78 m²",
|
|
||||||
"price": 11648000,
|
|
||||||
"price_formatted": "11 648 000 Kč",
|
|
||||||
"locality": "Na Míčánkách, Praha - Vršovice",
|
|
||||||
"lat": 50.0713284,
|
|
||||||
"lon": 14.4638722,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 6,
|
|
||||||
"area": 78,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/967142-nabidka-prodej-bytu-na-micankach",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 955977,
|
|
||||||
"name": "Prodej bytu 4+kk 75 m²",
|
|
||||||
"price": 10363000,
|
|
||||||
"price_formatted": "10 363 000 Kč",
|
|
||||||
"locality": "Karla Guta, Praha - Uhříněves",
|
|
||||||
"lat": 50.03017,
|
|
||||||
"lon": 14.5940072,
|
|
||||||
"disposition": "4+kk",
|
|
||||||
"floor": 4,
|
|
||||||
"area": 75,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/955977-nabidka-prodej-bytu-karla-guta",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 974557,
|
|
||||||
"name": "Prodej bytu 4+kk 94 m²",
|
|
||||||
"price": 13499900,
|
|
||||||
"price_formatted": "13 499 900 Kč",
|
|
||||||
"locality": "V Dolině, Praha - Michle",
|
|
||||||
"lat": 50.0579963,
|
|
||||||
"lon": 14.4682887,
|
|
||||||
"disposition": "4+kk",
|
|
||||||
"floor": 8,
|
|
||||||
"area": 94,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/974557-nabidka-prodej-bytu-v-doline-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 988498,
|
|
||||||
"name": "Prodej bytu 3+1 75 m²",
|
|
||||||
"price": 11400000,
|
|
||||||
"price_formatted": "11 400 000 Kč",
|
|
||||||
"locality": "5. května, Praha - Nusle",
|
|
||||||
"lat": 50.0604096,
|
|
||||||
"lon": 14.4326302,
|
|
||||||
"disposition": "3+1",
|
|
||||||
"floor": 4,
|
|
||||||
"area": 75,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/988498-nabidka-prodej-bytu-5-kvetna-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 985285,
|
|
||||||
"name": "Prodej bytu 3+kk 70 m²",
|
|
||||||
"price": 12200000,
|
|
||||||
"price_formatted": "12 200 000 Kč",
|
|
||||||
"locality": "Klausova, Praha - Stodůlky",
|
|
||||||
"lat": 50.0370204,
|
|
||||||
"lon": 14.3432643,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 5,
|
|
||||||
"area": 70,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/985285-nabidka-prodej-bytu-klausova-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 965526,
|
|
||||||
"name": "Prodej bytu 3+kk 77 m²",
|
|
||||||
"price": 11890000,
|
|
||||||
"price_formatted": "11 890 000 Kč",
|
|
||||||
"locality": "Vinohradská, Praha - Strašnice",
|
|
||||||
"lat": 50.0776726,
|
|
||||||
"lon": 14.4870072,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 16,
|
|
||||||
"area": 77,
|
|
||||||
"building_type": "Smíšená",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/965526-nabidka-prodej-bytu-vinohradska-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 924811,
|
|
||||||
"name": "Prodej bytu 3+kk 75 m²",
|
|
||||||
"price": 13390000,
|
|
||||||
"price_formatted": "13 390 000 Kč",
|
|
||||||
"locality": "Waltariho, Praha - Hloubětín",
|
|
||||||
"lat": 50.1076717,
|
|
||||||
"lon": 14.5248559,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 4,
|
|
||||||
"area": 75,
|
|
||||||
"building_type": "Smíšená",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/924811-nabidka-prodej-bytu-waltariho-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 985859,
|
|
||||||
"name": "Prodej bytu 3+1 80 m²",
|
|
||||||
"price": 9000000,
|
|
||||||
"price_formatted": "9 000 000 Kč",
|
|
||||||
"locality": "Staňkova, Praha - Háje",
|
|
||||||
"lat": 50.0377128,
|
|
||||||
"lon": 14.5311557,
|
|
||||||
"disposition": "3+1",
|
|
||||||
"floor": 2,
|
|
||||||
"area": 80,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/985859-nabidka-prodej-bytu-stankova-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 985583,
|
|
||||||
"name": "Prodej bytu 3+kk 76 m²",
|
|
||||||
"price": 10850000,
|
|
||||||
"price_formatted": "10 850 000 Kč",
|
|
||||||
"locality": "Boloňská, Praha - Horní Měcholupy",
|
|
||||||
"lat": 50.047328,
|
|
||||||
"lon": 14.5565277,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 4,
|
|
||||||
"area": 76,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/985583-nabidka-prodej-bytu-bolonska-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 981178,
|
|
||||||
"name": "Prodej bytu 4+kk 86 m²",
|
|
||||||
"price": 11990000,
|
|
||||||
"price_formatted": "11 990 000 Kč",
|
|
||||||
"locality": "Sušilova, Praha - Uhříněves",
|
|
||||||
"lat": 50.032081,
|
|
||||||
"lon": 14.5885148,
|
|
||||||
"disposition": "4+kk",
|
|
||||||
"floor": 2,
|
|
||||||
"area": 86,
|
|
||||||
"building_type": "SKELET",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/981178-nabidka-prodej-bytu-susilova-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 973216,
|
|
||||||
"name": "Prodej bytu 4+1 82 m²",
|
|
||||||
"price": 11357000,
|
|
||||||
"price_formatted": "11 357 000 Kč",
|
|
||||||
"locality": "Nad Kapličkou, Praha - Strašnice",
|
|
||||||
"lat": 50.0839509,
|
|
||||||
"lon": 14.4904493,
|
|
||||||
"disposition": "4+1",
|
|
||||||
"floor": 2,
|
|
||||||
"area": 82,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/973216-nabidka-prodej-bytu-nad-kaplickou-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 868801,
|
|
||||||
"name": "Prodej bytu 3+kk 109 m²",
|
|
||||||
"price": 7299000,
|
|
||||||
"price_formatted": "7 299 000 Kč",
|
|
||||||
"locality": "Pod Karlovem, Praha - Vinohrady",
|
|
||||||
"lat": 50.0676313,
|
|
||||||
"lon": 14.432498,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 5,
|
|
||||||
"area": 109,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Družstevní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/868801-nabidka-prodej-bytu-pod-karlovem-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 868795,
|
|
||||||
"name": "Prodej bytu 3+kk 106 m²",
|
|
||||||
"price": 6299000,
|
|
||||||
"price_formatted": "6 299 000 Kč",
|
|
||||||
"locality": "Pod Karlovem, Praha - Vinohrady",
|
|
||||||
"lat": 50.0676313,
|
|
||||||
"lon": 14.432498,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 2,
|
|
||||||
"area": 106,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Družstevní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/868795-nabidka-prodej-bytu-pod-karlovem-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 981890,
|
|
||||||
"name": "Prodej bytu 3+1 84 m²",
|
|
||||||
"price": 12980000,
|
|
||||||
"price_formatted": "12 980 000 Kč",
|
|
||||||
"locality": "Novákových, Praha - Libeň",
|
|
||||||
"lat": 50.103273,
|
|
||||||
"lon": 14.4746894,
|
|
||||||
"disposition": "3+1",
|
|
||||||
"floor": 2,
|
|
||||||
"area": 84,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/981890-nabidka-prodej-bytu-novakovych-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 976276,
|
|
||||||
"name": "Prodej bytu 3+kk 75 m²",
|
|
||||||
"price": 13490000,
|
|
||||||
"price_formatted": "13 490 000 Kč",
|
|
||||||
"locality": "Svornosti, Praha - Smíchov",
|
|
||||||
"lat": 50.0673284,
|
|
||||||
"lon": 14.4095087,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 2,
|
|
||||||
"area": 75,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/976276-nabidka-prodej-bytu-svornosti-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 950787,
|
|
||||||
"name": "Prodej bytu 3+kk 70 m²",
|
|
||||||
"price": 9999000,
|
|
||||||
"price_formatted": "9 999 000 Kč",
|
|
||||||
"locality": "Sečská, Praha - Strašnice",
|
|
||||||
"lat": 50.071191,
|
|
||||||
"lon": 14.5035501,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 3,
|
|
||||||
"area": 70,
|
|
||||||
"building_type": "Smíšená",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/950787-nabidka-prodej-bytu-secska-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 978045,
|
|
||||||
"name": "Prodej bytu 3+kk 76 m²",
|
|
||||||
"price": 11133000,
|
|
||||||
"price_formatted": "11 133 000 Kč",
|
|
||||||
"locality": "K Vinoři, Praha - Kbely",
|
|
||||||
"lat": 50.1329656,
|
|
||||||
"lon": 14.5618499,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 2,
|
|
||||||
"area": 76,
|
|
||||||
"building_type": "Smíšená",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/978045-nabidka-prodej-bytu-k-vinori",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 974552,
|
|
||||||
"name": "Prodej bytu 3+1 75 m²",
|
|
||||||
"price": 11000000,
|
|
||||||
"price_formatted": "11 000 000 Kč",
|
|
||||||
"locality": "Vejražkova, Praha - Košíře",
|
|
||||||
"lat": 50.0637808,
|
|
||||||
"lon": 14.3612275,
|
|
||||||
"disposition": "3+1",
|
|
||||||
"floor": 2,
|
|
||||||
"area": 75,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/974552-nabidka-prodej-bytu-vejrazkova-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"hash_id": 955010,
|
|
||||||
"name": "Prodej bytu 3+kk 70 m²",
|
|
||||||
"price": 12290000,
|
|
||||||
"price_formatted": "12 290 000 Kč",
|
|
||||||
"locality": "Břeclavská, Praha - Kyje",
|
|
||||||
"lat": 50.0951045,
|
|
||||||
"lon": 14.5454237,
|
|
||||||
"disposition": "3+kk",
|
|
||||||
"floor": 2,
|
|
||||||
"area": 70,
|
|
||||||
"building_type": "Cihlová",
|
|
||||||
"ownership": "Osobní",
|
|
||||||
"url": "https://www.bezrealitky.cz/nemovitosti-byty-domy/955010-nabidka-prodej-bytu-breclavska-hlavni-mesto-praha",
|
|
||||||
"source": "bezrealitky",
|
|
||||||
"image": ""
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
[]
|
|
||||||
5867
byty_idnes.json
5867
byty_idnes.json
File diff suppressed because it is too large
Load Diff
12072
byty_merged.json
12072
byty_merged.json
File diff suppressed because it is too large
Load Diff
@@ -1 +0,0 @@
|
|||||||
[]
|
|
||||||
7091
byty_realingo.json
7091
byty_realingo.json
File diff suppressed because it is too large
Load Diff
5570
byty_sreality.json
5570
byty_sreality.json
File diff suppressed because it is too large
Load Diff
123
docs/validation.md
Normal file
123
docs/validation.md
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
# Validation Recipe
|
||||||
|
|
||||||
|
End-to-end check that scraping, data persistence, history, and the status page all work correctly in Docker.
|
||||||
|
|
||||||
|
## What it verifies
|
||||||
|
|
||||||
|
- All scrapers run and write output to `DATA_DIR` (`/app/data`)
|
||||||
|
- `stats_*.json` land in `/app/data/` (not in `/app/`)
|
||||||
|
- `status.json` and `scraper_history.json` land in `/app/data/`
|
||||||
|
- `/api/status`, `/api/status/history`, and `/scrapers-status` serve correct data
|
||||||
|
- History accumulates across runs
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
|
||||||
|
### 1. Build the image
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make build
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Start a clean validation container
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stop/remove any leftover container and volume from a previous run
|
||||||
|
docker stop maru-hleda-byt-validation 2>/dev/null; docker rm maru-hleda-byt-validation 2>/dev/null
|
||||||
|
docker volume rm maru-hleda-byt-validation-data 2>/dev/null
|
||||||
|
|
||||||
|
docker run -d --name maru-hleda-byt-validation \
|
||||||
|
-p 8081:8080 \
|
||||||
|
-v maru-hleda-byt-validation-data:/app/data \
|
||||||
|
maru-hleda-byt
|
||||||
|
```
|
||||||
|
|
||||||
|
Give the container ~3 seconds to start. The entrypoint launches a background full scrape automatically — suppress it so only controlled runs execute:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sleep 3
|
||||||
|
docker exec maru-hleda-byt-validation pkill -f run_all.sh 2>/dev/null || true
|
||||||
|
docker exec maru-hleda-byt-validation rm -f /app/data/scraper_running.json 2>/dev/null || true
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Run a limited scrape (run 1)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker exec maru-hleda-byt-validation bash /app/run_all.sh --max-pages 1 --max-properties 10
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected output (last few lines):
|
||||||
|
```
|
||||||
|
Status uložen: /app/data/status.json
|
||||||
|
Historie uložena: /app/data/scraper_history.json (1 záznamů)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Verify data files are in `/app/data/`
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker exec maru-hleda-byt-validation ls /app/data/
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected files:
|
||||||
|
```
|
||||||
|
byty_cityhome.json byty_idnes.json byty_merged.json
|
||||||
|
byty_realingo.json byty_sreality.json
|
||||||
|
mapa_bytu.html
|
||||||
|
scraper_history.json
|
||||||
|
stats_bezrealitky.json stats_cityhome.json stats_idnes.json
|
||||||
|
stats_realingo.json stats_sreality.json
|
||||||
|
status.json
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Run a second limited scrape (run 2)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker exec maru-hleda-byt-validation bash /app/run_all.sh --max-pages 1 --max-properties 10
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected last line: `Historie uložena: /app/data/scraper_history.json (2 záznamů)`
|
||||||
|
|
||||||
|
### 6. Verify history via API
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -s http://localhost:8081/api/status/history | python3 -c "
|
||||||
|
import json, sys
|
||||||
|
h = json.load(sys.stdin)
|
||||||
|
print(f'{len(h)} entries:')
|
||||||
|
for i, e in enumerate(h):
|
||||||
|
print(f' [{i}] {e[\"timestamp\"]} total={e[\"total_accepted\"]}')
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: 2 entries with different timestamps.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -s http://localhost:8081/api/status | python3 -c "
|
||||||
|
import json, sys; s=json.load(sys.stdin)
|
||||||
|
print(f'status={s[\"status\"]} total={s[\"total_accepted\"]} ts={s[\"timestamp\"]}')
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: `status=done total=<N> ts=<latest timestamp>`
|
||||||
|
|
||||||
|
### 7. Check the status page
|
||||||
|
|
||||||
|
Open http://localhost:8081/scrapers-status in a browser (or `curl -s http://localhost:8081/scrapers-status | grep -c "clickable-row"` — should print `2`).
|
||||||
|
|
||||||
|
### 8. Clean up
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker stop maru-hleda-byt-validation && docker rm maru-hleda-byt-validation
|
||||||
|
docker volume rm maru-hleda-byt-validation-data
|
||||||
|
```
|
||||||
|
|
||||||
|
Or use the Makefile shortcut:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make validation-stop
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- PSN scraper does not support `--max-pages` and will always fail with this command; `success=False` in history is expected during validation.
|
||||||
|
- Bezrealitky may return 0 results with a 1-page limit; `byty_bezrealitky.json` will be absent from `/app/data/` in that case — this is normal.
|
||||||
|
- `make validation` (the Makefile target) runs the same limited scrape but does not suppress the background startup scrape, so two concurrent runs may occur. Use the manual steps above for a clean controlled test.
|
||||||
@@ -1,16 +1,15 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Generate status.json from scraper JSON outputs and run log."""
|
"""Generate status.json from scraper JSON outputs and per-scraper stats files."""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
HERE = Path(__file__).parent
|
HERE = Path(__file__).parent
|
||||||
|
DATA_DIR = Path(os.environ.get("DATA_DIR", HERE))
|
||||||
|
|
||||||
SOURCE_FILES = {
|
SOURCE_FILES = {
|
||||||
"Sreality": "byty_sreality.json",
|
"Sreality": "byty_sreality.json",
|
||||||
@@ -21,7 +20,17 @@ SOURCE_FILES = {
|
|||||||
"CityHome": "byty_cityhome.json",
|
"CityHome": "byty_cityhome.json",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
STATS_FILES = {
|
||||||
|
"Sreality": "stats_sreality.json",
|
||||||
|
"Realingo": "stats_realingo.json",
|
||||||
|
"Bezrealitky": "stats_bezrealitky.json",
|
||||||
|
"iDNES": "stats_idnes.json",
|
||||||
|
"PSN": "stats_psn.json",
|
||||||
|
"CityHome": "stats_cityhome.json",
|
||||||
|
}
|
||||||
|
|
||||||
MERGED_FILE = "byty_merged.json"
|
MERGED_FILE = "byty_merged.json"
|
||||||
|
HISTORY_FILE = "scraper_history.json"
|
||||||
|
|
||||||
|
|
||||||
def count_source(path: Path) -> dict:
|
def count_source(path: Path) -> dict:
|
||||||
@@ -36,105 +45,51 @@ def count_source(path: Path) -> dict:
|
|||||||
return {"accepted": 0, "error": str(e)}
|
return {"accepted": 0, "error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
def parse_log(log_path: str) -> dict[str, dict]:
|
def read_scraper_stats(path: Path) -> dict:
|
||||||
"""Parse scraper run log and extract per-source statistics.
|
"""Load a per-scraper stats JSON. Returns {} on missing or corrupt file."""
|
||||||
|
if not path.exists():
|
||||||
Scrapers log summary lines like:
|
return {}
|
||||||
✓ Vyhovující byty: 12
|
try:
|
||||||
Vyloučeno (prodáno): 5
|
data = json.loads(path.read_text(encoding="utf-8"))
|
||||||
Staženo stránek: 3
|
return data if isinstance(data, dict) else {}
|
||||||
Staženo inzerátů: 48
|
except Exception:
|
||||||
Celkem bytů v cache: 120
|
|
||||||
and section headers like:
|
|
||||||
[2/6] Realingo
|
|
||||||
"""
|
|
||||||
if not log_path or not os.path.exists(log_path):
|
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
with open(log_path, encoding="utf-8") as f:
|
|
||||||
content = f.read()
|
|
||||||
|
|
||||||
# Split into per-source sections by the [N/6] Step header
|
def append_to_history(status: dict, keep: int) -> None:
|
||||||
# Each section header looks like "[2/6] Realingo\n----..."
|
"""Append the current status entry to scraper_history.json, keeping only `keep` latest."""
|
||||||
section_pattern = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE)
|
history_path = DATA_DIR / HISTORY_FILE
|
||||||
sections_found = list(section_pattern.finditer(content))
|
history: list = []
|
||||||
|
if history_path.exists():
|
||||||
|
try:
|
||||||
|
history = json.loads(history_path.read_text(encoding="utf-8"))
|
||||||
|
if not isinstance(history, list):
|
||||||
|
history = []
|
||||||
|
except Exception:
|
||||||
|
history = []
|
||||||
|
|
||||||
if not sections_found:
|
history.append(status)
|
||||||
return {}
|
|
||||||
|
|
||||||
stats = {}
|
# Keep only the N most recent entries
|
||||||
for i, match in enumerate(sections_found):
|
if keep > 0 and len(history) > keep:
|
||||||
step_name = match.group(2).strip()
|
history = history[-keep:]
|
||||||
start = match.end()
|
|
||||||
end = sections_found[i + 1].start() if i + 1 < len(sections_found) else len(content)
|
|
||||||
section_text = content[start:end]
|
|
||||||
|
|
||||||
# Identify which sources this section covers
|
history_path.write_text(json.dumps(history, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||||
# "PSN + CityHome" covers both
|
print(f"Historie uložena: {history_path} ({len(history)} záznamů)")
|
||||||
source_names = []
|
|
||||||
for name in SOURCE_FILES:
|
|
||||||
if name.lower() in step_name.lower():
|
|
||||||
source_names.append(name)
|
|
||||||
if not source_names:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Parse numeric summary lines
|
|
||||||
def extract(pattern: str) -> Optional[int]:
|
|
||||||
m = re.search(pattern, section_text)
|
|
||||||
return int(m.group(1)) if m else None
|
|
||||||
|
|
||||||
# Lines present in all/most scrapers
|
|
||||||
accepted = extract(r'Vyhovující byty[:\s]+(\d+)')
|
|
||||||
fetched = extract(r'Staženo inzerátů[:\s]+(\d+)')
|
|
||||||
pages = extract(r'Staženo stránek[:\s]+(\d+)')
|
|
||||||
cached = extract(r'Celkem bytů v cache[:\s]+(\d+)')
|
|
||||||
cache_hits = extract(r'Cache hit[:\s]+(\d+)')
|
|
||||||
|
|
||||||
# Rejection reasons — collect all into a dict
|
|
||||||
excluded = {}
|
|
||||||
for m in re.finditer(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)', section_text):
|
|
||||||
excluded[m.group(1)] = int(m.group(2))
|
|
||||||
# Also PSN-style "Vyloučeno (prodáno): N"
|
|
||||||
total_excluded = sum(excluded.values()) if excluded else extract(r'Vyloučen\w*[:\s]+(\d+)')
|
|
||||||
|
|
||||||
entry = {}
|
|
||||||
if accepted is not None:
|
|
||||||
entry["accepted"] = accepted
|
|
||||||
if fetched is not None:
|
|
||||||
entry["fetched"] = fetched
|
|
||||||
if pages is not None:
|
|
||||||
entry["pages"] = pages
|
|
||||||
if cached is not None:
|
|
||||||
entry["cached"] = cached
|
|
||||||
if cache_hits is not None:
|
|
||||||
entry["cache_hits"] = cache_hits
|
|
||||||
if excluded:
|
|
||||||
entry["excluded"] = excluded
|
|
||||||
elif total_excluded is not None:
|
|
||||||
entry["excluded_total"] = total_excluded
|
|
||||||
|
|
||||||
for name in source_names:
|
|
||||||
stats[name] = entry
|
|
||||||
|
|
||||||
return stats
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
start_time = None
|
parser = argparse.ArgumentParser(description="Generate status.json from scraper outputs.")
|
||||||
duration_sec = None
|
parser.add_argument("--start-time", dest="start_time", default=None,
|
||||||
|
help="ISO timestamp of scrape start (default: now)")
|
||||||
|
parser.add_argument("--duration", dest="duration", type=int, default=None,
|
||||||
|
help="Run duration in seconds")
|
||||||
|
parser.add_argument("--keep", dest="keep", type=int, default=20,
|
||||||
|
help="Number of history entries to keep (default: 20, 0=unlimited)")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
if len(sys.argv) >= 3:
|
start_time = args.start_time or datetime.now().isoformat(timespec="seconds")
|
||||||
start_time = sys.argv[1]
|
duration_sec = args.duration
|
||||||
try:
|
|
||||||
duration_sec = int(sys.argv[2])
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if not start_time:
|
|
||||||
start_time = datetime.now().isoformat(timespec="seconds")
|
|
||||||
|
|
||||||
log_path = sys.argv[3] if len(sys.argv) >= 4 else None
|
|
||||||
log_stats = parse_log(log_path)
|
|
||||||
|
|
||||||
sources = []
|
sources = []
|
||||||
for name, filename in SOURCE_FILES.items():
|
for name, filename in SOURCE_FILES.items():
|
||||||
@@ -142,14 +97,12 @@ def main():
|
|||||||
info = count_source(path)
|
info = count_source(path)
|
||||||
info["name"] = name
|
info["name"] = name
|
||||||
|
|
||||||
# Merge log stats
|
# Merge in stats from the per-scraper stats file (authoritative for run data)
|
||||||
ls = log_stats.get(name, {})
|
stats = read_scraper_stats(DATA_DIR / STATS_FILES[name])
|
||||||
for k in ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total"):
|
for key in ("accepted", "fetched", "pages", "cache_hits", "excluded", "excluded_total",
|
||||||
if k in ls:
|
"success", "duration_sec", "error"):
|
||||||
info[k] = ls[k]
|
if key in stats:
|
||||||
# Override accepted from log if available (log is authoritative for latest run)
|
info[key] = stats[key]
|
||||||
if "accepted" in ls:
|
|
||||||
info["accepted"] = ls["accepted"]
|
|
||||||
|
|
||||||
sources.append(info)
|
sources.append(info)
|
||||||
|
|
||||||
@@ -168,17 +121,21 @@ def main():
|
|||||||
|
|
||||||
duplicates_removed = total_accepted - deduplicated if deduplicated else 0
|
duplicates_removed = total_accepted - deduplicated if deduplicated else 0
|
||||||
|
|
||||||
|
# Top-level success: True if no source has an error
|
||||||
|
success = not any("error" in s for s in sources)
|
||||||
|
|
||||||
status = {
|
status = {
|
||||||
"status": "done",
|
"status": "done",
|
||||||
"timestamp": start_time,
|
"timestamp": start_time,
|
||||||
"duration_sec": duration_sec,
|
"duration_sec": duration_sec,
|
||||||
|
"success": success,
|
||||||
"total_accepted": total_accepted,
|
"total_accepted": total_accepted,
|
||||||
"deduplicated": deduplicated,
|
"deduplicated": deduplicated,
|
||||||
"duplicates_removed": duplicates_removed,
|
"duplicates_removed": duplicates_removed,
|
||||||
"sources": sources,
|
"sources": sources,
|
||||||
}
|
}
|
||||||
|
|
||||||
out = HERE / "status.json"
|
out = DATA_DIR / "status.json"
|
||||||
out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8")
|
out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||||
print(f"Status uložen: {out}")
|
print(f"Status uložen: {out}")
|
||||||
print(f" Celkem bytů (před dedup): {total_accepted}")
|
print(f" Celkem bytů (před dedup): {total_accepted}")
|
||||||
@@ -197,6 +154,8 @@ def main():
|
|||||||
parts.append(f"[CHYBA: {err}]")
|
parts.append(f"[CHYBA: {err}]")
|
||||||
print(" " + " ".join(parts))
|
print(" " + " ".join(parts))
|
||||||
|
|
||||||
|
append_to_history(status, args.keep)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
1245
mapa_bytu.html
1245
mapa_bytu.html
File diff suppressed because it is too large
Load Diff
@@ -9,6 +9,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
import unicodedata
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from scrape_and_map import generate_map, format_price
|
from scrape_and_map import generate_map, format_price
|
||||||
@@ -19,14 +20,8 @@ def normalize_street(locality: str) -> str:
|
|||||||
# "Studentská, Praha 6 - Dejvice" → "studentska"
|
# "Studentská, Praha 6 - Dejvice" → "studentska"
|
||||||
# "Rýnská, Praha" → "rynska"
|
# "Rýnská, Praha" → "rynska"
|
||||||
street = locality.split(",")[0].strip().lower()
|
street = locality.split(",")[0].strip().lower()
|
||||||
# Remove diacritics (simple Czech)
|
# Remove diacritics using Unicode decomposition (handles all Czech characters)
|
||||||
replacements = {
|
street = unicodedata.normalize("NFKD", street).encode("ascii", "ignore").decode("ascii")
|
||||||
"á": "a", "č": "c", "ď": "d", "é": "e", "ě": "e",
|
|
||||||
"í": "i", "ň": "n", "ó": "o", "ř": "r", "š": "s",
|
|
||||||
"ť": "t", "ú": "u", "ů": "u", "ý": "y", "ž": "z",
|
|
||||||
}
|
|
||||||
for src, dst in replacements.items():
|
|
||||||
street = street.replace(src, dst)
|
|
||||||
# Remove non-alphanumeric
|
# Remove non-alphanumeric
|
||||||
street = re.sub(r"[^a-z0-9]", "", street)
|
street = re.sub(r"[^a-z0-9]", "", street)
|
||||||
return street
|
return street
|
||||||
@@ -79,6 +74,10 @@ def main():
|
|||||||
if key in seen_keys:
|
if key in seen_keys:
|
||||||
dupes += 1
|
dupes += 1
|
||||||
existing = seen_keys[key]
|
existing = seen_keys[key]
|
||||||
|
# Preserve earliest first_seen across sources
|
||||||
|
dup_fs = e.get("first_seen", "")
|
||||||
|
if dup_fs and (not existing.get("first_seen") or dup_fs < existing["first_seen"]):
|
||||||
|
existing["first_seen"] = dup_fs
|
||||||
# Log it
|
# Log it
|
||||||
print(f" Duplikát: {e['locality']} | {format_price(e['price'])} | {e.get('area', '?')} m² "
|
print(f" Duplikát: {e['locality']} | {format_price(e['price'])} | {e.get('area', '?')} m² "
|
||||||
f"({e.get('source', '?')} vs {existing.get('source', '?')})")
|
f"({e.get('source', '?')} vs {existing.get('source', '?')})")
|
||||||
|
|||||||
@@ -1,116 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Minimal HTTP API server for persisting apartment ratings.
|
|
||||||
|
|
||||||
GET /api/ratings → returns ratings.json contents
|
|
||||||
POST /api/ratings → saves entire ratings object
|
|
||||||
GET /api/ratings/export → same as GET, but with download header
|
|
||||||
|
|
||||||
Ratings file: /app/data/ratings.json (or ./ratings.json locally)
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
PORT = int(os.environ.get("RATINGS_PORT", 8081))
|
|
||||||
DATA_DIR = Path(os.environ.get("DATA_DIR", "."))
|
|
||||||
RATINGS_FILE = DATA_DIR / "ratings.json"
|
|
||||||
|
|
||||||
logging.basicConfig(
|
|
||||||
level=logging.INFO,
|
|
||||||
format="%(asctime)s [ratings] %(levelname)s %(message)s",
|
|
||||||
datefmt="%Y-%m-%dT%H:%M:%S",
|
|
||||||
)
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def load_ratings() -> dict:
|
|
||||||
try:
|
|
||||||
if RATINGS_FILE.exists():
|
|
||||||
return json.loads(RATINGS_FILE.read_text(encoding="utf-8"))
|
|
||||||
except Exception as e:
|
|
||||||
log.error("Failed to load ratings: %s", e)
|
|
||||||
return {}
|
|
||||||
|
|
||||||
|
|
||||||
def save_ratings(data: dict) -> None:
|
|
||||||
RATINGS_FILE.write_text(
|
|
||||||
json.dumps(data, ensure_ascii=False, indent=2),
|
|
||||||
encoding="utf-8",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class RatingsHandler(BaseHTTPRequestHandler):
|
|
||||||
def log_message(self, format, *args):
|
|
||||||
# Suppress default HTTP access log (we use our own)
|
|
||||||
pass
|
|
||||||
|
|
||||||
def _send_json(self, status: int, body: dict, extra_headers=None):
|
|
||||||
payload = json.dumps(body, ensure_ascii=False).encode("utf-8")
|
|
||||||
self.send_response(status)
|
|
||||||
self.send_header("Content-Type", "application/json; charset=utf-8")
|
|
||||||
self.send_header("Content-Length", str(len(payload)))
|
|
||||||
self.send_header("Access-Control-Allow-Origin", "*")
|
|
||||||
self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
|
|
||||||
self.send_header("Access-Control-Allow-Headers", "Content-Type")
|
|
||||||
if extra_headers:
|
|
||||||
for k, v in extra_headers.items():
|
|
||||||
self.send_header(k, v)
|
|
||||||
self.end_headers()
|
|
||||||
self.wfile.write(payload)
|
|
||||||
|
|
||||||
def do_OPTIONS(self):
|
|
||||||
# CORS preflight
|
|
||||||
self.send_response(204)
|
|
||||||
self.send_header("Access-Control-Allow-Origin", "*")
|
|
||||||
self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
|
|
||||||
self.send_header("Access-Control-Allow-Headers", "Content-Type")
|
|
||||||
self.end_headers()
|
|
||||||
|
|
||||||
def do_GET(self):
|
|
||||||
if self.path in ("/api/ratings", "/api/ratings/export"):
|
|
||||||
ratings = load_ratings()
|
|
||||||
extra = None
|
|
||||||
if self.path == "/api/ratings/export":
|
|
||||||
extra = {"Content-Disposition": 'attachment; filename="ratings.json"'}
|
|
||||||
log.info("GET %s → %d ratings", self.path, len(ratings))
|
|
||||||
self._send_json(200, ratings, extra)
|
|
||||||
else:
|
|
||||||
self._send_json(404, {"error": "not found"})
|
|
||||||
|
|
||||||
def do_POST(self):
|
|
||||||
if self.path == "/api/ratings":
|
|
||||||
length = int(self.headers.get("Content-Length", 0))
|
|
||||||
if length == 0:
|
|
||||||
self._send_json(400, {"error": "empty body"})
|
|
||||||
return
|
|
||||||
try:
|
|
||||||
raw = self.rfile.read(length)
|
|
||||||
data = json.loads(raw.decode("utf-8"))
|
|
||||||
except Exception as e:
|
|
||||||
log.warning("Bad request body: %s", e)
|
|
||||||
self._send_json(400, {"error": "invalid JSON"})
|
|
||||||
return
|
|
||||||
if not isinstance(data, dict):
|
|
||||||
self._send_json(400, {"error": "expected JSON object"})
|
|
||||||
return
|
|
||||||
save_ratings(data)
|
|
||||||
log.info("POST /api/ratings → saved %d ratings", len(data))
|
|
||||||
self._send_json(200, {"ok": True, "count": len(data)})
|
|
||||||
else:
|
|
||||||
self._send_json(404, {"error": "not found"})
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
log.info("Ratings server starting on port %d, data dir: %s", PORT, DATA_DIR)
|
|
||||||
log.info("Ratings file: %s", RATINGS_FILE)
|
|
||||||
server = HTTPServer(("0.0.0.0", PORT), RatingsHandler)
|
|
||||||
try:
|
|
||||||
server.serve_forever()
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
log.info("Stopped.")
|
|
||||||
sys.exit(0)
|
|
||||||
114
regen_map.py
114
regen_map.py
@@ -1,114 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Přegeneruje mapu z již stažených dat (byty_sreality.json).
|
|
||||||
Doplní chybějící plochy ze Sreality API, opraví URL, aplikuje filtry.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import time
|
|
||||||
import urllib.request
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from scrape_and_map import (
|
|
||||||
generate_map, format_price, MIN_AREA, HEADERS, DETAIL_API
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def api_get(url: str) -> dict:
|
|
||||||
req = urllib.request.Request(url, headers=HEADERS)
|
|
||||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
|
||||||
return json.loads(resp.read().decode("utf-8"))
|
|
||||||
|
|
||||||
|
|
||||||
def fix_sreality_url(estate: dict) -> str:
|
|
||||||
"""Fix the Sreality URL to include disposition segment (only if missing)."""
|
|
||||||
disp = estate.get("disposition", "")
|
|
||||||
slug_map = {
|
|
||||||
"1+kk": "1+kk", "1+1": "1+1", "2+kk": "2+kk", "2+1": "2+1",
|
|
||||||
"3+kk": "3+kk", "3+1": "3+1", "4+kk": "4+kk", "4+1": "4+1",
|
|
||||||
"5+kk": "5+kk", "5+1": "5+1", "6+": "6-a-vice", "Atypický": "atypicky",
|
|
||||||
}
|
|
||||||
slug = slug_map.get(disp, "byt")
|
|
||||||
old_url = estate.get("url", "")
|
|
||||||
parts = old_url.split("/")
|
|
||||||
try:
|
|
||||||
byt_idx = parts.index("byt")
|
|
||||||
# Only insert if disposition slug is not already there
|
|
||||||
if byt_idx + 1 < len(parts) and parts[byt_idx + 1] == slug:
|
|
||||||
return old_url # already correct
|
|
||||||
parts.insert(byt_idx + 1, slug)
|
|
||||||
return "/".join(parts)
|
|
||||||
except ValueError:
|
|
||||||
return old_url
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_area(hash_id: int) -> int | None:
|
|
||||||
"""Fetch area from detail API."""
|
|
||||||
try:
|
|
||||||
url = DETAIL_API.format(hash_id)
|
|
||||||
detail = api_get(url)
|
|
||||||
for item in detail.get("items", []):
|
|
||||||
name = item.get("name", "")
|
|
||||||
if "žitná ploch" in name or "zitna ploch" in name.lower():
|
|
||||||
return int(item["value"])
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
json_path = Path("byty_sreality.json")
|
|
||||||
if not json_path.exists():
|
|
||||||
print("Soubor byty_sreality.json nenalezen. Nejprve spusť scrape_and_map.py")
|
|
||||||
return
|
|
||||||
|
|
||||||
estates = json.loads(json_path.read_text(encoding="utf-8"))
|
|
||||||
print(f"Načteno {len(estates)} bytů z byty_sreality.json")
|
|
||||||
|
|
||||||
# Step 1: Fetch missing areas
|
|
||||||
missing_area = [e for e in estates if e.get("area") is None]
|
|
||||||
print(f"Doplňuji plochu u {len(missing_area)} bytů...")
|
|
||||||
|
|
||||||
for i, e in enumerate(missing_area):
|
|
||||||
time.sleep(0.3)
|
|
||||||
area = fetch_area(e["hash_id"])
|
|
||||||
if area is not None:
|
|
||||||
e["area"] = area
|
|
||||||
if (i + 1) % 50 == 0:
|
|
||||||
print(f" {i + 1}/{len(missing_area)} ...")
|
|
||||||
|
|
||||||
# Count results
|
|
||||||
with_area = sum(1 for e in estates if e.get("area") is not None)
|
|
||||||
print(f"Plocha doplněna: {with_area}/{len(estates)}")
|
|
||||||
|
|
||||||
# Step 2: Fix URLs
|
|
||||||
for e in estates:
|
|
||||||
e["url"] = fix_sreality_url(e)
|
|
||||||
|
|
||||||
# Step 3: Filter by min area
|
|
||||||
filtered = []
|
|
||||||
excluded = 0
|
|
||||||
for e in estates:
|
|
||||||
area = e.get("area")
|
|
||||||
if area is not None and area < MIN_AREA:
|
|
||||||
excluded += 1
|
|
||||||
continue
|
|
||||||
filtered.append(e)
|
|
||||||
|
|
||||||
print(f"Vyloučeno (< {MIN_AREA} m²): {excluded}")
|
|
||||||
print(f"Zbývá: {len(filtered)} bytů")
|
|
||||||
|
|
||||||
# Save updated data
|
|
||||||
filtered_path = Path("byty_sreality.json")
|
|
||||||
filtered_path.write_text(
|
|
||||||
json.dumps(filtered, ensure_ascii=False, indent=2),
|
|
||||||
encoding="utf-8",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Generate map
|
|
||||||
generate_map(filtered)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
21
run_all.sh
21
run_all.sh
@@ -20,8 +20,10 @@ START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S")
|
|||||||
START_EPOCH=$(date +%s)
|
START_EPOCH=$(date +%s)
|
||||||
LOG_FILE="$(pwd)/scrape_run.log"
|
LOG_FILE="$(pwd)/scrape_run.log"
|
||||||
|
|
||||||
# Mark status as running
|
# Mark scraper as running; cleaned up on exit (even on error/kill)
|
||||||
echo '{"status":"running"}' > status.json
|
LOCK_FILE="${DATA_DIR:-.}/scraper_running.json"
|
||||||
|
echo '{"running":true,"started_at":"'"$START_TIME"'"}' > "$LOCK_FILE"
|
||||||
|
trap 'rm -f "$LOCK_FILE"' EXIT
|
||||||
|
|
||||||
show_help() {
|
show_help() {
|
||||||
echo "Usage: ./run_all.sh [OPTIONS]"
|
echo "Usage: ./run_all.sh [OPTIONS]"
|
||||||
@@ -32,16 +34,19 @@ show_help() {
|
|||||||
echo " --max-pages N Maximální počet stránek ke stažení z každého zdroje"
|
echo " --max-pages N Maximální počet stránek ke stažení z každého zdroje"
|
||||||
echo " --max-properties N Maximální počet nemovitostí ke stažení z každého zdroje"
|
echo " --max-properties N Maximální počet nemovitostí ke stažení z každého zdroje"
|
||||||
echo " --log-level LEVEL Úroveň logování (DEBUG, INFO, WARNING, ERROR)"
|
echo " --log-level LEVEL Úroveň logování (DEBUG, INFO, WARNING, ERROR)"
|
||||||
|
echo " --keep N Počet běhů v historii (výchozí: 5, 0=neomezeno)"
|
||||||
echo " -h, --help Zobrazí tuto nápovědu"
|
echo " -h, --help Zobrazí tuto nápovědu"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Examples:"
|
echo "Examples:"
|
||||||
echo " ./run_all.sh # plný běh"
|
echo " ./run_all.sh # plný běh"
|
||||||
echo " ./run_all.sh --max-pages 1 --max-properties 10 # rychlý test"
|
echo " ./run_all.sh --max-pages 1 --max-properties 10 # rychlý test"
|
||||||
echo " ./run_all.sh --log-level DEBUG # s debug logováním"
|
echo " ./run_all.sh --log-level DEBUG # s debug logováním"
|
||||||
|
echo " ./run_all.sh --keep 10 # uchovej 10 běhů v historii"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Parse arguments
|
# Parse arguments
|
||||||
SCRAPER_ARGS=""
|
SCRAPER_ARGS=""
|
||||||
|
KEEP_ARG=""
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case $1 in
|
case $1 in
|
||||||
-h|--help)
|
-h|--help)
|
||||||
@@ -52,6 +57,10 @@ while [[ $# -gt 0 ]]; do
|
|||||||
SCRAPER_ARGS="$SCRAPER_ARGS $1 $2"
|
SCRAPER_ARGS="$SCRAPER_ARGS $1 $2"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
|
--keep)
|
||||||
|
KEEP_ARG="--keep $2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Unknown argument: $1"
|
echo "Unknown argument: $1"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -75,9 +84,6 @@ exec > >(tee -a "$LOG_FILE") 2>&1
|
|||||||
step "Sreality"
|
step "Sreality"
|
||||||
python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); }
|
python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); }
|
||||||
|
|
||||||
step "Realingo"
|
|
||||||
python3 scrape_realingo.py $SCRAPER_ARGS || { echo -e "${RED}✗ Realingo selhalo${NC}"; FAILED=$((FAILED + 1)); }
|
|
||||||
|
|
||||||
step "Bezrealitky"
|
step "Bezrealitky"
|
||||||
python3 scrape_bezrealitky.py $SCRAPER_ARGS || { echo -e "${RED}✗ Bezrealitky selhalo${NC}"; FAILED=$((FAILED + 1)); }
|
python3 scrape_bezrealitky.py $SCRAPER_ARGS || { echo -e "${RED}✗ Bezrealitky selhalo${NC}"; FAILED=$((FAILED + 1)); }
|
||||||
|
|
||||||
@@ -92,6 +98,9 @@ PID_CH=$!
|
|||||||
wait $PID_PSN || { echo -e "${RED}✗ PSN selhalo${NC}"; FAILED=$((FAILED + 1)); }
|
wait $PID_PSN || { echo -e "${RED}✗ PSN selhalo${NC}"; FAILED=$((FAILED + 1)); }
|
||||||
wait $PID_CH || { echo -e "${RED}✗ CityHome selhalo${NC}"; FAILED=$((FAILED + 1)); }
|
wait $PID_CH || { echo -e "${RED}✗ CityHome selhalo${NC}"; FAILED=$((FAILED + 1)); }
|
||||||
|
|
||||||
|
step "Realingo"
|
||||||
|
python3 scrape_realingo.py $SCRAPER_ARGS || { echo -e "${RED}✗ Realingo selhalo${NC}"; FAILED=$((FAILED + 1)); }
|
||||||
|
|
||||||
# ── Sloučení + mapa ──────────────────────────────────────────
|
# ── Sloučení + mapa ──────────────────────────────────────────
|
||||||
|
|
||||||
step "Sloučení dat a generování mapy"
|
step "Sloučení dat a generování mapy"
|
||||||
@@ -103,7 +112,7 @@ python3 merge_and_map.py || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((F
|
|||||||
|
|
||||||
END_EPOCH=$(date +%s)
|
END_EPOCH=$(date +%s)
|
||||||
DURATION=$((END_EPOCH - START_EPOCH))
|
DURATION=$((END_EPOCH - START_EPOCH))
|
||||||
python3 generate_status.py "$START_TIME" "$DURATION" "$LOG_FILE"
|
python3 generate_status.py --start-time "$START_TIME" --duration "$DURATION" $KEEP_ARG
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "============================================================"
|
echo "============================================================"
|
||||||
|
|||||||
@@ -13,8 +13,11 @@ import math
|
|||||||
import time
|
import time
|
||||||
import urllib.request
|
import urllib.request
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from datetime import datetime
|
from datetime import datetime, timedelta
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from scraper_stats import write_stats, validate_listing
|
||||||
|
|
||||||
|
STATS_FILE = "stats_sreality.json"
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -42,9 +45,9 @@ HEADERS = {
|
|||||||
|
|
||||||
|
|
||||||
def api_get(url: str) -> dict:
|
def api_get(url: str) -> dict:
|
||||||
"""Fetch JSON from Sreality API."""
|
"""Fetch JSON from Sreality API with retry."""
|
||||||
logger.debug(f"HTTP GET request: {url}")
|
for attempt in range(3):
|
||||||
logger.debug(f"Headers: {HEADERS}")
|
logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}")
|
||||||
req = urllib.request.Request(url, headers=HEADERS)
|
req = urllib.request.Request(url, headers=HEADERS)
|
||||||
try:
|
try:
|
||||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
with urllib.request.urlopen(req, timeout=30) as resp:
|
||||||
@@ -52,8 +55,15 @@ def api_get(url: str) -> dict:
|
|||||||
logger.debug(f"HTTP response: status={resp.status}, size={len(response_data)} bytes")
|
logger.debug(f"HTTP response: status={resp.status}, size={len(response_data)} bytes")
|
||||||
logger.debug(f"Response preview: {response_data[:200]}")
|
logger.debug(f"Response preview: {response_data[:200]}")
|
||||||
return json.loads(response_data)
|
return json.loads(response_data)
|
||||||
|
except urllib.error.HTTPError:
|
||||||
|
raise
|
||||||
except (urllib.error.URLError, ConnectionError, OSError) as e:
|
except (urllib.error.URLError, ConnectionError, OSError) as e:
|
||||||
logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
|
if attempt < 2:
|
||||||
|
wait = (attempt + 1) * 2
|
||||||
|
logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}")
|
||||||
|
time.sleep(wait)
|
||||||
|
else:
|
||||||
|
logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
@@ -209,6 +219,8 @@ def load_cache(json_path: str = "byty_sreality.json") -> dict[int, dict]:
|
|||||||
|
|
||||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||||
"""Main scraping function. Returns list of filtered estates."""
|
"""Main scraping function. Returns list of filtered estates."""
|
||||||
|
_run_start = time.time()
|
||||||
|
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||||
all_estates_raw = []
|
all_estates_raw = []
|
||||||
cache = load_cache()
|
cache = load_cache()
|
||||||
|
|
||||||
@@ -348,7 +360,11 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
|||||||
"url": sreality_url(hash_id, seo),
|
"url": sreality_url(hash_id, seo),
|
||||||
"image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""),
|
"image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""),
|
||||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||||
|
"first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
|
||||||
|
"last_changed": datetime.now().strftime("%Y-%m-%d"),
|
||||||
}
|
}
|
||||||
|
if not validate_listing(result, "sreality"):
|
||||||
|
continue
|
||||||
results.append(result)
|
results.append(result)
|
||||||
details_fetched += 1
|
details_fetched += 1
|
||||||
|
|
||||||
@@ -366,6 +382,21 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
|||||||
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
||||||
logger.info(f"{'=' * 60}")
|
logger.info(f"{'=' * 60}")
|
||||||
|
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "Sreality",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - _run_start, 1),
|
||||||
|
"success": True,
|
||||||
|
"accepted": len(results),
|
||||||
|
"fetched": len(unique_estates),
|
||||||
|
"cache_hits": cache_hits,
|
||||||
|
"excluded": {
|
||||||
|
"panel/síd": excluded_panel,
|
||||||
|
"<69 m²": excluded_small,
|
||||||
|
"bez GPS": excluded_no_gps,
|
||||||
|
"bez detailu": excluded_no_detail,
|
||||||
|
},
|
||||||
|
})
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
@@ -409,18 +440,30 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
|||||||
]
|
]
|
||||||
for bcolor, blabel in bands:
|
for bcolor, blabel in bands:
|
||||||
price_legend_items += (
|
price_legend_items += (
|
||||||
f'<div style="display:flex;align-items:center;gap:6px;margin:2px 0;">'
|
f'<div class="price-band" data-color="{bcolor}" onclick="toggleColorFilter(\'{bcolor}\')" '
|
||||||
|
f'style="display:flex;align-items:center;gap:6px;margin:2px 0;padding:2px 4px;'
|
||||||
|
f'border-radius:4px;border:2px solid transparent;">'
|
||||||
f'<span style="width:14px;height:14px;border-radius:50%;background:{bcolor};'
|
f'<span style="width:14px;height:14px;border-radius:50%;background:{bcolor};'
|
||||||
f'display:inline-block;border:2px solid white;box-shadow:0 1px 3px rgba(0,0,0,0.3);flex-shrink:0;"></span>'
|
f'display:inline-block;border:2px solid white;box-shadow:0 1px 3px rgba(0,0,0,0.3);flex-shrink:0;"></span>'
|
||||||
f'<span>{blabel}</span></div>'
|
f'<span>{blabel}</span></div>'
|
||||||
)
|
)
|
||||||
|
price_legend_items += (
|
||||||
|
'<div id="price-filter-reset" style="display:none;margin:3px 0 0 4px;">'
|
||||||
|
'<a href="#" onclick="resetColorFilter();return false;" '
|
||||||
|
'style="font-size:11px;color:#1976D2;text-decoration:none;">✕ Zobrazit všechny ceny</a>'
|
||||||
|
'</div>'
|
||||||
|
)
|
||||||
# New marker indicator — bigger dot, no extra border
|
# New marker indicator — bigger dot, no extra border
|
||||||
price_legend_items += (
|
price_legend_items += (
|
||||||
'<div style="display:flex;align-items:center;gap:6px;margin:6px 0 0 0;'
|
'<div style="display:flex;align-items:center;gap:6px;margin:6px 0 0 0;'
|
||||||
'padding-top:6px;border-top:1px solid #eee;">'
|
'padding-top:6px;border-top:1px solid #eee;">'
|
||||||
'<span style="width:18px;height:18px;border-radius:50%;background:#66BB6A;'
|
'<span style="display:inline-flex;align-items:center;gap:3px;flex-shrink:0;">'
|
||||||
'display:inline-block;box-shadow:0 1px 4px rgba(0,0,0,0.35);flex-shrink:0;"></span>'
|
'<span style="width:14px;height:14px;border-radius:50%;background:#66BB6A;'
|
||||||
'<span>Nové (z dnešního scrapu) — větší</span></div>'
|
'display:inline-block;box-shadow:0 1px 3px rgba(0,0,0,0.3);"></span>'
|
||||||
|
'<span style="font-size:8px;font-weight:700;background:#FFD600;color:#333;'
|
||||||
|
'padding:1px 3px;border-radius:2px;">NEW</span>'
|
||||||
|
'</span>'
|
||||||
|
'<span>Nové (≤ 1 den)</span></div>'
|
||||||
)
|
)
|
||||||
|
|
||||||
markers_js = ""
|
markers_js = ""
|
||||||
@@ -442,18 +485,32 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
|||||||
source_label = source_labels.get(source, source)
|
source_label = source_labels.get(source, source)
|
||||||
source_color = source_colors.get(source, "#999")
|
source_color = source_colors.get(source, "#999")
|
||||||
|
|
||||||
hash_id = e.get("hash_id", "")
|
hash_id = f"{source}_{e.get('hash_id', '')}"
|
||||||
|
|
||||||
scraped_at = e.get("scraped_at", "")
|
first_seen = e.get("first_seen", "")
|
||||||
is_new = scraped_at == datetime.now().strftime("%Y-%m-%d")
|
last_changed = e.get("last_changed", "")
|
||||||
|
today = datetime.now().strftime("%Y-%m-%d")
|
||||||
|
yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
|
||||||
|
is_new = first_seen in (today, yesterday)
|
||||||
|
|
||||||
new_badge = (
|
new_badge = (
|
||||||
'<span style="margin-left:6px;font-size:11px;background:#FFD600;color:#333;'
|
'<span style="margin-left:6px;font-size:11px;background:#FFD600;color:#333;'
|
||||||
'padding:1px 6px;border-radius:3px;font-weight:bold;">NOVÉ</span>'
|
'padding:1px 6px;border-radius:3px;font-weight:bold;">NOVÉ</span>'
|
||||||
if is_new else ""
|
if is_new else ""
|
||||||
)
|
)
|
||||||
|
|
||||||
|
date_parts = []
|
||||||
|
if first_seen:
|
||||||
|
date_parts.append(f'Přidáno: {first_seen}')
|
||||||
|
if last_changed and last_changed != first_seen:
|
||||||
|
date_parts.append(f'Změněno: {last_changed}')
|
||||||
|
date_row = (
|
||||||
|
f'<span style="font-size:11px;color:#888;">{" · ".join(date_parts)}</span><br>'
|
||||||
|
if date_parts else ""
|
||||||
|
)
|
||||||
|
|
||||||
popup = (
|
popup = (
|
||||||
f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}">'
|
f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}" data-first-seen="{first_seen}" data-last-changed="{last_changed}">'
|
||||||
f'<b style="font-size:14px;">{format_price(e["price"])}</b>'
|
f'<b style="font-size:14px;">{format_price(e["price"])}</b>'
|
||||||
f'<span style="margin-left:8px;font-size:11px;background:{source_color};color:white;'
|
f'<span style="margin-left:8px;font-size:11px;background:{source_color};color:white;'
|
||||||
f'padding:1px 6px;border-radius:3px;">{source_label}</span>{new_badge}<br>'
|
f'padding:1px 6px;border-radius:3px;">{source_label}</span>{new_badge}<br>'
|
||||||
@@ -461,7 +518,9 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
|||||||
f'{floor_note}<br><br>'
|
f'{floor_note}<br><br>'
|
||||||
f'<b>{e["locality"]}</b><br>'
|
f'<b>{e["locality"]}</b><br>'
|
||||||
f'Stavba: {building_text}<br>'
|
f'Stavba: {building_text}<br>'
|
||||||
f'Vlastnictví: {ownership_text}<br><br>'
|
f'Vlastnictví: {ownership_text}<br>'
|
||||||
|
f'{date_row}'
|
||||||
|
f'<br>'
|
||||||
f'<a href="{e["url"]}" target="_blank" '
|
f'<a href="{e["url"]}" target="_blank" '
|
||||||
f'style="color:{source_color};text-decoration:none;font-weight:bold;">'
|
f'style="color:{source_color};text-decoration:none;font-weight:bold;">'
|
||||||
f'→ Otevřít na {source_label}</a>'
|
f'→ Otevřít na {source_label}</a>'
|
||||||
@@ -493,7 +552,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
|||||||
else:
|
else:
|
||||||
marker_fn = "addMarker"
|
marker_fn = "addMarker"
|
||||||
markers_js += (
|
markers_js += (
|
||||||
f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}');\n"
|
f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}', '{first_seen}', '{last_changed}');\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Build legend — price per m² bands + disposition counts
|
# Build legend — price per m² bands + disposition counts
|
||||||
@@ -559,12 +618,12 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
|||||||
.heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }}
|
.heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }}
|
||||||
.heart-icon-rej {{ opacity: 0.4 !important; filter: grayscale(1); }}
|
.heart-icon-rej {{ opacity: 0.4 !important; filter: grayscale(1); }}
|
||||||
.reject-overlay {{ background: none !important; border: none !important; pointer-events: none !important; }}
|
.reject-overlay {{ background: none !important; border: none !important; pointer-events: none !important; }}
|
||||||
@keyframes pulse-new {{
|
.new-badge-icon {{ background: none !important; border: none !important; pointer-events: none !important; }}
|
||||||
0% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
|
.new-badge {{
|
||||||
50% {{ stroke-opacity: 0.4; stroke-width: 6px; r: 12; }}
|
font-size: 9px; font-weight: 700; color: #333; background: #FFD600;
|
||||||
100% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
|
padding: 1px 4px; border-radius: 3px; white-space: nowrap;
|
||||||
|
box-shadow: 0 1px 3px rgba(0,0,0,0.3); letter-spacing: 0.5px;
|
||||||
}}
|
}}
|
||||||
.marker-new {{ animation: pulse-new 2s ease-in-out infinite; }}
|
|
||||||
.info-panel {{
|
.info-panel {{
|
||||||
position: absolute; top: 10px; right: 10px; z-index: 1000;
|
position: absolute; top: 10px; right: 10px; z-index: 1000;
|
||||||
background: white; padding: 16px; border-radius: 10px;
|
background: white; padding: 16px; border-radius: 10px;
|
||||||
@@ -597,6 +656,10 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
|||||||
.info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }}
|
.info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }}
|
||||||
.filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }}
|
.filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }}
|
||||||
.filter-section label {{ display: flex; align-items: center; gap: 6px; margin: 3px 0; cursor: pointer; }}
|
.filter-section label {{ display: flex; align-items: center; gap: 6px; margin: 3px 0; cursor: pointer; }}
|
||||||
|
.price-band {{ cursor: pointer; transition: background 0.12s; }}
|
||||||
|
.price-band:hover {{ background: #f0f0f0; }}
|
||||||
|
.price-band.active {{ border-color: #333 !important; background: #e8f0fe; }}
|
||||||
|
.price-band.dimmed {{ opacity: 0.35; }}
|
||||||
.filter-section input[type="checkbox"] {{ accent-color: #1976D2; }}
|
.filter-section input[type="checkbox"] {{ accent-color: #1976D2; }}
|
||||||
#floor-filter {{ margin-top: 8px; }}
|
#floor-filter {{ margin-top: 8px; }}
|
||||||
#floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }}
|
#floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }}
|
||||||
@@ -635,11 +698,23 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
|||||||
</div>
|
</div>
|
||||||
<div style="margin-top:6px;">
|
<div style="margin-top:6px;">
|
||||||
<label>Max cena:
|
<label>Max cena:
|
||||||
<select id="max-price" onchange="applyFilters()">
|
<input type="number" id="max-price" value="13500000" max="14000000" step="500000"
|
||||||
<option value="13500000">13 500 000 Kč</option>
|
style="width:130px;padding:2px 4px;border:1px solid #ccc;border-radius:3px;"
|
||||||
<option value="12000000">12 000 000 Kč</option>
|
onchange="applyFilters()" onkeyup="applyFilters()"> Kč
|
||||||
<option value="10000000">10 000 000 Kč</option>
|
</label>
|
||||||
<option value="8000000">8 000 000 Kč</option>
|
</div>
|
||||||
|
<div style="margin-top:6px;">
|
||||||
|
<label>Přidáno / změněno:
|
||||||
|
<select id="days-filter" onchange="applyFilters()" style="width:100%;padding:4px;border-radius:4px;border:1px solid #ccc;">
|
||||||
|
<option value="0">Vše</option>
|
||||||
|
<option value="1">za 1 den</option>
|
||||||
|
<option value="2">za 2 dny</option>
|
||||||
|
<option value="3">za 3 dny</option>
|
||||||
|
<option value="4">za 4 dny</option>
|
||||||
|
<option value="5">za 5 dní</option>
|
||||||
|
<option value="7">za 7 dní</option>
|
||||||
|
<option value="14">za 14 dní</option>
|
||||||
|
<option value="30">za 30 dní</option>
|
||||||
</select>
|
</select>
|
||||||
</label>
|
</label>
|
||||||
</div>
|
</div>
|
||||||
@@ -653,7 +728,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
|
|||||||
Skrýt zamítnuté
|
Skrýt zamítnuté
|
||||||
</label>
|
</label>
|
||||||
</div>
|
</div>
|
||||||
<div class="status-link"><a href="status.html">Scraper status</a></div>
|
<div class="status-link"><a href="/scrapers-status">Scraper status</a></div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
@@ -673,9 +748,39 @@ L.tileLayer('https://{{s}}.basemaps.cartocdn.com/light_only_labels/{{z}}/{{x}}/{
|
|||||||
pane: 'shadowPane',
|
pane: 'shadowPane',
|
||||||
}}).addTo(map);
|
}}).addTo(map);
|
||||||
|
|
||||||
|
var selectedColors = [];
|
||||||
|
|
||||||
|
function toggleColorFilter(color) {{
|
||||||
|
var idx = selectedColors.indexOf(color);
|
||||||
|
if (idx >= 0) selectedColors.splice(idx, 1);
|
||||||
|
else selectedColors.push(color);
|
||||||
|
document.querySelectorAll('.price-band').forEach(function(el) {{
|
||||||
|
var c = el.getAttribute('data-color');
|
||||||
|
if (selectedColors.length === 0) {{
|
||||||
|
el.classList.remove('active', 'dimmed');
|
||||||
|
}} else if (selectedColors.indexOf(c) >= 0) {{
|
||||||
|
el.classList.add('active'); el.classList.remove('dimmed');
|
||||||
|
}} else {{
|
||||||
|
el.classList.add('dimmed'); el.classList.remove('active');
|
||||||
|
}}
|
||||||
|
}});
|
||||||
|
document.getElementById('price-filter-reset').style.display =
|
||||||
|
selectedColors.length > 0 ? 'block' : 'none';
|
||||||
|
applyFilters();
|
||||||
|
}}
|
||||||
|
|
||||||
|
function resetColorFilter() {{
|
||||||
|
selectedColors = [];
|
||||||
|
document.querySelectorAll('.price-band').forEach(function(el) {{
|
||||||
|
el.classList.remove('active', 'dimmed');
|
||||||
|
}});
|
||||||
|
document.getElementById('price-filter-reset').style.display = 'none';
|
||||||
|
applyFilters();
|
||||||
|
}}
|
||||||
|
|
||||||
var allMarkers = [];
|
var allMarkers = [];
|
||||||
|
|
||||||
function addMarker(lat, lon, color, popup, hashId) {{
|
function addMarker(lat, lon, color, popup, hashId, firstSeen, lastChanged) {{
|
||||||
var marker = L.circleMarker([lat, lon], {{
|
var marker = L.circleMarker([lat, lon], {{
|
||||||
radius: 8,
|
radius: 8,
|
||||||
fillColor: color,
|
fillColor: color,
|
||||||
@@ -684,26 +789,35 @@ function addMarker(lat, lon, color, popup, hashId) {{
|
|||||||
opacity: 1,
|
opacity: 1,
|
||||||
fillOpacity: 0.85,
|
fillOpacity: 0.85,
|
||||||
}}).bindPopup(popup);
|
}}).bindPopup(popup);
|
||||||
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId }};
|
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, firstSeen: firstSeen || '', lastChanged: lastChanged || '' }};
|
||||||
allMarkers.push(marker);
|
allMarkers.push(marker);
|
||||||
marker.addTo(map);
|
marker.addTo(map);
|
||||||
}}
|
}}
|
||||||
|
|
||||||
function addNewMarker(lat, lon, color, popup, hashId) {{
|
function addNewMarker(lat, lon, color, popup, hashId, firstSeen, lastChanged) {{
|
||||||
var marker = L.circleMarker([lat, lon], {{
|
var marker = L.circleMarker([lat, lon], {{
|
||||||
radius: 12,
|
radius: 8,
|
||||||
fillColor: color,
|
fillColor: color,
|
||||||
color: color,
|
color: '#fff',
|
||||||
weight: 4,
|
weight: 2,
|
||||||
opacity: 0.35,
|
opacity: 1,
|
||||||
fillOpacity: 0.95,
|
fillOpacity: 0.85,
|
||||||
}}).bindPopup(popup);
|
}}).bindPopup(popup);
|
||||||
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isNew: true }};
|
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isNew: true, firstSeen: firstSeen || '', lastChanged: lastChanged || '' }};
|
||||||
allMarkers.push(marker);
|
allMarkers.push(marker);
|
||||||
marker.addTo(map);
|
marker.addTo(map);
|
||||||
marker.on('add', function() {{
|
var badge = L.marker([lat, lon], {{
|
||||||
if (marker._path) marker._path.classList.add('marker-new');
|
icon: L.divIcon({{
|
||||||
|
className: 'new-badge-icon',
|
||||||
|
html: '<span class="new-badge">NEW</span>',
|
||||||
|
iconSize: [32, 14],
|
||||||
|
iconAnchor: [-6, 7],
|
||||||
|
}}),
|
||||||
|
interactive: false,
|
||||||
|
pane: 'markerPane',
|
||||||
}});
|
}});
|
||||||
|
badge.addTo(map);
|
||||||
|
marker._newBadge = badge;
|
||||||
}}
|
}}
|
||||||
|
|
||||||
function heartIcon(color) {{
|
function heartIcon(color) {{
|
||||||
@@ -736,11 +850,11 @@ function starIcon() {{
|
|||||||
}});
|
}});
|
||||||
}}
|
}}
|
||||||
|
|
||||||
function addHeartMarker(lat, lon, color, popup, hashId) {{
|
function addHeartMarker(lat, lon, color, popup, hashId, firstSeen, lastChanged) {{
|
||||||
var marker = L.marker([lat, lon], {{
|
var marker = L.marker([lat, lon], {{
|
||||||
icon: heartIcon(color),
|
icon: heartIcon(color),
|
||||||
}}).bindPopup(popup);
|
}}).bindPopup(popup);
|
||||||
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isHeart: true }};
|
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isHeart: true, firstSeen: firstSeen || '', lastChanged: lastChanged || '' }};
|
||||||
allMarkers.push(marker);
|
allMarkers.push(marker);
|
||||||
marker.addTo(map);
|
marker.addTo(map);
|
||||||
}}
|
}}
|
||||||
@@ -759,6 +873,11 @@ function loadRatings() {{
|
|||||||
|
|
||||||
function saveRatings(ratings) {{
|
function saveRatings(ratings) {{
|
||||||
localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings));
|
localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings));
|
||||||
|
fetch('/api/ratings', {{
|
||||||
|
method: 'POST',
|
||||||
|
headers: {{'Content-Type': 'application/json'}},
|
||||||
|
body: JSON.stringify(ratings)
|
||||||
|
}}).catch(function() {{}});
|
||||||
}}
|
}}
|
||||||
|
|
||||||
function addRejectStrike(marker) {{
|
function addRejectStrike(marker) {{
|
||||||
@@ -806,6 +925,7 @@ function applyMarkerStyle(marker, status) {{
|
|||||||
}} else {{
|
}} else {{
|
||||||
if (status === 'fav') {{
|
if (status === 'fav') {{
|
||||||
removeRejectStrike(marker);
|
removeRejectStrike(marker);
|
||||||
|
if (marker._newBadge && map.hasLayer(marker._newBadge)) map.removeLayer(marker._newBadge);
|
||||||
if (!marker._data._origCircle) marker._data._origCircle = true;
|
if (!marker._data._origCircle) marker._data._origCircle = true;
|
||||||
var popup = marker.getPopup();
|
var popup = marker.getPopup();
|
||||||
var popupContent = popup ? popup.getContent() : '';
|
var popupContent = popup ? popup.getContent() : '';
|
||||||
@@ -829,6 +949,7 @@ function applyMarkerStyle(marker, status) {{
|
|||||||
}}
|
}}
|
||||||
// Add strikethrough line over the marker
|
// Add strikethrough line over the marker
|
||||||
addRejectStrike(marker);
|
addRejectStrike(marker);
|
||||||
|
if (marker._newBadge && map.hasLayer(marker._newBadge)) map.removeLayer(marker._newBadge);
|
||||||
}} else {{
|
}} else {{
|
||||||
if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{
|
if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{
|
||||||
revertToCircle(marker, {{ radius: 8, fillColor: marker._data.color, color: '#fff', weight: 2, fillOpacity: 0.85 }});
|
revertToCircle(marker, {{ radius: 8, fillColor: marker._data.color, color: '#fff', weight: 2, fillOpacity: 0.85 }});
|
||||||
@@ -841,6 +962,7 @@ function applyMarkerStyle(marker, status) {{
|
|||||||
}}
|
}}
|
||||||
if (marker._path) marker._path.classList.remove('marker-rejected');
|
if (marker._path) marker._path.classList.remove('marker-rejected');
|
||||||
removeRejectStrike(marker);
|
removeRejectStrike(marker);
|
||||||
|
if (marker._newBadge && !map.hasLayer(marker._newBadge)) marker._newBadge.addTo(map);
|
||||||
}}
|
}}
|
||||||
}}
|
}}
|
||||||
}}
|
}}
|
||||||
@@ -996,11 +1118,21 @@ map.on('popupopen', function(e) {{
|
|||||||
// ── Filters ────────────────────────────────────────────────────
|
// ── Filters ────────────────────────────────────────────────────
|
||||||
function applyFilters() {{
|
function applyFilters() {{
|
||||||
var minFloor = parseInt(document.getElementById('min-floor').value);
|
var minFloor = parseInt(document.getElementById('min-floor').value);
|
||||||
var maxPrice = parseInt(document.getElementById('max-price').value);
|
var maxPriceEl = document.getElementById('max-price');
|
||||||
|
var maxPrice = parseInt(maxPriceEl.value) || 14000000;
|
||||||
|
if (maxPrice > 14000000) {{ maxPrice = 14000000; maxPriceEl.value = 14000000; }}
|
||||||
var hideRejected = document.getElementById('hide-rejected').checked;
|
var hideRejected = document.getElementById('hide-rejected').checked;
|
||||||
|
var daysFilter = parseInt(document.getElementById('days-filter').value) || 0;
|
||||||
var ratings = loadRatings();
|
var ratings = loadRatings();
|
||||||
var visible = 0;
|
var visible = 0;
|
||||||
|
|
||||||
|
var cutoff = null;
|
||||||
|
if (daysFilter > 0) {{
|
||||||
|
cutoff = new Date();
|
||||||
|
cutoff.setDate(cutoff.getDate() - daysFilter);
|
||||||
|
cutoff.setHours(0, 0, 0, 0);
|
||||||
|
}}
|
||||||
|
|
||||||
allMarkers.forEach(function(m) {{
|
allMarkers.forEach(function(m) {{
|
||||||
var popup = m.getPopup().getContent();
|
var popup = m.getPopup().getContent();
|
||||||
var floorMatch = popup.match(/(\\d+)\\. NP/);
|
var floorMatch = popup.match(/(\\d+)\\. NP/);
|
||||||
@@ -1013,6 +1145,14 @@ function applyFilters() {{
|
|||||||
if (floor !== null && floor < minFloor) show = false;
|
if (floor !== null && floor < minFloor) show = false;
|
||||||
if (price > maxPrice) show = false;
|
if (price > maxPrice) show = false;
|
||||||
|
|
||||||
|
if (cutoff) {{
|
||||||
|
var fs = m._data.firstSeen ? new Date(m._data.firstSeen) : null;
|
||||||
|
var lc = m._data.lastChanged ? new Date(m._data.lastChanged) : null;
|
||||||
|
if (!((fs && fs >= cutoff) || (lc && lc >= cutoff))) show = false;
|
||||||
|
}}
|
||||||
|
|
||||||
|
if (selectedColors.length > 0 && selectedColors.indexOf(m._data.color) < 0) show = false;
|
||||||
|
|
||||||
var r = ratings[m._data.hashId];
|
var r = ratings[m._data.hashId];
|
||||||
if (hideRejected && r && r.status === 'reject') show = false;
|
if (hideRejected && r && r.status === 'reject') show = false;
|
||||||
|
|
||||||
@@ -1021,10 +1161,12 @@ function applyFilters() {{
|
|||||||
visible++;
|
visible++;
|
||||||
// Show strike line if rejected and visible
|
// Show strike line if rejected and visible
|
||||||
if (m._rejectStrike && !map.hasLayer(m._rejectStrike)) m._rejectStrike.addTo(map);
|
if (m._rejectStrike && !map.hasLayer(m._rejectStrike)) m._rejectStrike.addTo(map);
|
||||||
|
if (m._newBadge && !map.hasLayer(m._newBadge)) m._newBadge.addTo(map);
|
||||||
}} else {{
|
}} else {{
|
||||||
if (map.hasLayer(m)) map.removeLayer(m);
|
if (map.hasLayer(m)) map.removeLayer(m);
|
||||||
// Hide strike line when marker hidden
|
// Hide strike line when marker hidden
|
||||||
if (m._rejectStrike && map.hasLayer(m._rejectStrike)) map.removeLayer(m._rejectStrike);
|
if (m._rejectStrike && map.hasLayer(m._rejectStrike)) map.removeLayer(m._rejectStrike);
|
||||||
|
if (m._newBadge && map.hasLayer(m._newBadge)) map.removeLayer(m._newBadge);
|
||||||
}}
|
}}
|
||||||
}});
|
}});
|
||||||
|
|
||||||
@@ -1039,8 +1181,25 @@ function applyFilters() {{
|
|||||||
document.getElementById('visible-count').textContent = visible;
|
document.getElementById('visible-count').textContent = visible;
|
||||||
}}
|
}}
|
||||||
|
|
||||||
// Initialize ratings on load
|
// Initialize ratings: load from server, merge with localStorage, then restore
|
||||||
|
function initRatings() {{
|
||||||
|
var local = loadRatings();
|
||||||
|
fetch('/api/ratings')
|
||||||
|
.then(function(r) {{ return r.ok ? r.json() : null; }})
|
||||||
|
.then(function(server) {{
|
||||||
|
if (server && typeof server === 'object') {{
|
||||||
|
var merged = Object.assign({{}}, local, server);
|
||||||
|
localStorage.setItem(RATINGS_KEY, JSON.stringify(merged));
|
||||||
|
}}
|
||||||
restoreRatings();
|
restoreRatings();
|
||||||
|
updateRatingCounts();
|
||||||
|
}})
|
||||||
|
.catch(function() {{
|
||||||
|
restoreRatings();
|
||||||
|
updateRatingCounts();
|
||||||
|
}});
|
||||||
|
}}
|
||||||
|
initRatings();
|
||||||
|
|
||||||
// ── Panel toggle ──────────────────────────────────────────────
|
// ── Panel toggle ──────────────────────────────────────────────
|
||||||
function togglePanel() {{
|
function togglePanel() {{
|
||||||
@@ -1089,8 +1248,22 @@ if __name__ == "__main__":
|
|||||||
handlers=[logging.StreamHandler()]
|
handlers=[logging.StreamHandler()]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
try:
|
||||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Scraper failed: {e}", exc_info=True)
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "Sreality",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - start, 1),
|
||||||
|
"success": False,
|
||||||
|
"accepted": 0,
|
||||||
|
"fetched": 0,
|
||||||
|
"error": str(e),
|
||||||
|
})
|
||||||
|
raise
|
||||||
|
|
||||||
if estates:
|
if estates:
|
||||||
# Save raw data as JSON backup
|
# Save raw data as JSON backup
|
||||||
|
|||||||
@@ -15,6 +15,9 @@ import re
|
|||||||
import time
|
import time
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from scraper_stats import write_stats, validate_listing
|
||||||
|
|
||||||
|
STATS_FILE = "stats_bezrealitky.json"
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -68,19 +71,35 @@ HEADERS = {
|
|||||||
BASE_URL = "https://www.bezrealitky.cz"
|
BASE_URL = "https://www.bezrealitky.cz"
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_url(url: str, retries: int = 3) -> str:
|
||||||
|
"""Fetch URL and return HTML string with retry on transient errors."""
|
||||||
|
for attempt in range(retries):
|
||||||
|
try:
|
||||||
|
logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
|
||||||
|
req = urllib.request.Request(url, headers=HEADERS)
|
||||||
|
resp = urllib.request.urlopen(req, timeout=30)
|
||||||
|
html = resp.read().decode("utf-8")
|
||||||
|
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
|
||||||
|
return html
|
||||||
|
except urllib.error.HTTPError:
|
||||||
|
raise
|
||||||
|
except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e:
|
||||||
|
if attempt < retries - 1:
|
||||||
|
wait = (attempt + 1) * 2
|
||||||
|
logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
|
||||||
|
time.sleep(wait)
|
||||||
|
else:
|
||||||
|
logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
def fetch_page(page: int) -> tuple[list[dict], int]:
|
def fetch_page(page: int) -> tuple[list[dict], int]:
|
||||||
"""
|
"""
|
||||||
Fetch a listing page from Bezrealitky.
|
Fetch a listing page from Bezrealitky.
|
||||||
Returns (list of advert dicts from Apollo cache, total count).
|
Returns (list of advert dicts from Apollo cache, total count).
|
||||||
"""
|
"""
|
||||||
url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
|
url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
|
||||||
logger.debug(f"HTTP GET request: {url}")
|
html = fetch_url(url)
|
||||||
logger.debug(f"Headers: {HEADERS}")
|
|
||||||
req = urllib.request.Request(url, headers=HEADERS)
|
|
||||||
try:
|
|
||||||
resp = urllib.request.urlopen(req, timeout=30)
|
|
||||||
html = resp.read().decode("utf-8")
|
|
||||||
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
|
|
||||||
|
|
||||||
match = re.search(
|
match = re.search(
|
||||||
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
||||||
@@ -110,20 +129,13 @@ def fetch_page(page: int) -> tuple[list[dict], int]:
|
|||||||
|
|
||||||
logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}")
|
logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}")
|
||||||
return adverts, total
|
return adverts, total
|
||||||
except (urllib.error.URLError, ConnectionError, OSError) as e:
|
|
||||||
logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_detail(uri: str) -> dict | None:
|
def fetch_detail(uri: str) -> dict | None:
|
||||||
"""Fetch detail page for a listing."""
|
"""Fetch detail page for a listing."""
|
||||||
try:
|
try:
|
||||||
url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
|
url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
|
||||||
logger.debug(f"HTTP GET request: {url}")
|
html = fetch_url(url)
|
||||||
req = urllib.request.Request(url, headers=HEADERS)
|
|
||||||
resp = urllib.request.urlopen(req, timeout=30)
|
|
||||||
html = resp.read().decode("utf-8")
|
|
||||||
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
|
|
||||||
|
|
||||||
match = re.search(
|
match = re.search(
|
||||||
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
||||||
@@ -171,6 +183,8 @@ def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
|
|||||||
|
|
||||||
|
|
||||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||||
|
_run_start = time.time()
|
||||||
|
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||||
cache = load_cache()
|
cache = load_cache()
|
||||||
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
@@ -357,7 +371,11 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
|||||||
"source": "bezrealitky",
|
"source": "bezrealitky",
|
||||||
"image": "",
|
"image": "",
|
||||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||||
|
"first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
|
||||||
|
"last_changed": datetime.now().strftime("%Y-%m-%d"),
|
||||||
}
|
}
|
||||||
|
if not validate_listing(result, "bezrealitky"):
|
||||||
|
continue
|
||||||
results.append(result)
|
results.append(result)
|
||||||
properties_fetched += 1
|
properties_fetched += 1
|
||||||
|
|
||||||
@@ -374,6 +392,25 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
|||||||
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
||||||
logger.info(f"{'=' * 60}")
|
logger.info(f"{'=' * 60}")
|
||||||
|
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "Bezrealitky",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - _run_start, 1),
|
||||||
|
"success": True,
|
||||||
|
"accepted": len(results),
|
||||||
|
"fetched": len(all_adverts),
|
||||||
|
"pages": page - 1,
|
||||||
|
"cache_hits": cache_hits,
|
||||||
|
"excluded": {
|
||||||
|
"dispozice": excluded_disp,
|
||||||
|
"cena": excluded_price,
|
||||||
|
"plocha": excluded_area,
|
||||||
|
"bez GPS": excluded_no_gps,
|
||||||
|
"panel/síd": excluded_panel,
|
||||||
|
"patro": excluded_floor,
|
||||||
|
"bez detailu": excluded_detail,
|
||||||
|
},
|
||||||
|
})
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
@@ -394,8 +431,22 @@ if __name__ == "__main__":
|
|||||||
handlers=[logging.StreamHandler()]
|
handlers=[logging.StreamHandler()]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
try:
|
||||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Scraper failed: {e}", exc_info=True)
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "Bezrealitky",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - start, 1),
|
||||||
|
"success": False,
|
||||||
|
"accepted": 0,
|
||||||
|
"fetched": 0,
|
||||||
|
"error": str(e),
|
||||||
|
})
|
||||||
|
raise
|
||||||
|
|
||||||
if estates:
|
if estates:
|
||||||
json_path = Path("byty_bezrealitky.json")
|
json_path = Path("byty_bezrealitky.json")
|
||||||
|
|||||||
@@ -14,6 +14,9 @@ import time
|
|||||||
import urllib.request
|
import urllib.request
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from scraper_stats import write_stats, validate_listing
|
||||||
|
|
||||||
|
STATS_FILE = "stats_cityhome.json"
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -203,6 +206,8 @@ def extract_project_gps(html: str) -> tuple[float, float] | None:
|
|||||||
|
|
||||||
|
|
||||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||||
|
_run_start = time.time()
|
||||||
|
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
|
logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
|
||||||
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
||||||
@@ -250,6 +255,16 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
|||||||
else:
|
else:
|
||||||
logger.info(f"✗ {slug}: GPS nenalezeno")
|
logger.info(f"✗ {slug}: GPS nenalezeno")
|
||||||
|
|
||||||
|
# Load previous output for first_seen/last_changed tracking
|
||||||
|
_prev_cache: dict[str, dict] = {}
|
||||||
|
_prev_path = Path("byty_cityhome.json")
|
||||||
|
if _prev_path.exists():
|
||||||
|
try:
|
||||||
|
for _item in json.loads(_prev_path.read_text(encoding="utf-8")):
|
||||||
|
_prev_cache[str(_item["hash_id"])] = _item
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Step 3: Filter listings
|
# Step 3: Filter listings
|
||||||
logger.info(f"\nFáze 3: Filtrování...")
|
logger.info(f"\nFáze 3: Filtrování...")
|
||||||
results = []
|
results = []
|
||||||
@@ -357,7 +372,11 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
|||||||
"source": "cityhome",
|
"source": "cityhome",
|
||||||
"image": "",
|
"image": "",
|
||||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||||
|
"first_seen": _prev_cache.get(f"cityhome_{slug}_{listing['unit_name']}", {}).get("first_seen", datetime.now().strftime("%Y-%m-%d")),
|
||||||
|
"last_changed": datetime.now().strftime("%Y-%m-%d") if _prev_cache.get(f"cityhome_{slug}_{listing['unit_name']}", {}).get("price") != price else _prev_cache[f"cityhome_{slug}_{listing['unit_name']}"].get("last_changed", datetime.now().strftime("%Y-%m-%d")),
|
||||||
}
|
}
|
||||||
|
if not validate_listing(result, "cityhome"):
|
||||||
|
continue
|
||||||
results.append(result)
|
results.append(result)
|
||||||
properties_fetched += 1
|
properties_fetched += 1
|
||||||
|
|
||||||
@@ -374,6 +393,23 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
|||||||
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
||||||
logger.info(f"{'=' * 60}")
|
logger.info(f"{'=' * 60}")
|
||||||
|
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "CityHome",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - _run_start, 1),
|
||||||
|
"success": True,
|
||||||
|
"accepted": len(results),
|
||||||
|
"fetched": len(all_listings),
|
||||||
|
"excluded": {
|
||||||
|
"prodáno": excluded_sold,
|
||||||
|
"typ": excluded_type,
|
||||||
|
"dispozice": excluded_disp,
|
||||||
|
"cena": excluded_price,
|
||||||
|
"plocha": excluded_area,
|
||||||
|
"patro": excluded_floor,
|
||||||
|
"bez GPS": excluded_no_gps,
|
||||||
|
},
|
||||||
|
})
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
@@ -394,8 +430,22 @@ if __name__ == "__main__":
|
|||||||
handlers=[logging.StreamHandler()]
|
handlers=[logging.StreamHandler()]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
try:
|
||||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Scraper failed: {e}", exc_info=True)
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "CityHome",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - start, 1),
|
||||||
|
"success": False,
|
||||||
|
"accepted": 0,
|
||||||
|
"fetched": 0,
|
||||||
|
"error": str(e),
|
||||||
|
})
|
||||||
|
raise
|
||||||
|
|
||||||
if estates:
|
if estates:
|
||||||
json_path = Path("byty_cityhome.json")
|
json_path = Path("byty_cityhome.json")
|
||||||
|
|||||||
@@ -15,8 +15,10 @@ import re
|
|||||||
import time
|
import time
|
||||||
import urllib.request
|
import urllib.request
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from html.parser import HTMLParser
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from scraper_stats import write_stats, validate_listing
|
||||||
|
|
||||||
|
STATS_FILE = "stats_idnes.json"
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -279,6 +281,8 @@ def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
|
|||||||
|
|
||||||
|
|
||||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||||
|
_run_start = time.time()
|
||||||
|
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||||
cache = load_cache()
|
cache = load_cache()
|
||||||
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
@@ -460,7 +464,11 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
|||||||
"source": "idnes",
|
"source": "idnes",
|
||||||
"image": "",
|
"image": "",
|
||||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||||
|
"first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
|
||||||
|
"last_changed": datetime.now().strftime("%Y-%m-%d"),
|
||||||
}
|
}
|
||||||
|
if not validate_listing(result, "idnes"):
|
||||||
|
continue
|
||||||
results.append(result)
|
results.append(result)
|
||||||
properties_fetched += 1
|
properties_fetched += 1
|
||||||
|
|
||||||
@@ -478,6 +486,25 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
|||||||
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
||||||
logger.info(f"{'=' * 60}")
|
logger.info(f"{'=' * 60}")
|
||||||
|
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "iDNES",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - _run_start, 1),
|
||||||
|
"success": True,
|
||||||
|
"accepted": len(results),
|
||||||
|
"fetched": len(all_listings),
|
||||||
|
"pages": page,
|
||||||
|
"cache_hits": cache_hits,
|
||||||
|
"excluded": {
|
||||||
|
"cena": excluded_price,
|
||||||
|
"plocha": excluded_area,
|
||||||
|
"dispozice": excluded_disp,
|
||||||
|
"panel/síd": excluded_panel,
|
||||||
|
"patro": excluded_floor,
|
||||||
|
"bez GPS": excluded_no_gps,
|
||||||
|
"bez detailu": excluded_detail,
|
||||||
|
},
|
||||||
|
})
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
@@ -498,8 +525,22 @@ if __name__ == "__main__":
|
|||||||
handlers=[logging.StreamHandler()]
|
handlers=[logging.StreamHandler()]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
try:
|
||||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Scraper failed: {e}", exc_info=True)
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "iDNES",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - start, 1),
|
||||||
|
"success": False,
|
||||||
|
"accepted": 0,
|
||||||
|
"fetched": 0,
|
||||||
|
"error": str(e),
|
||||||
|
})
|
||||||
|
raise
|
||||||
|
|
||||||
if estates:
|
if estates:
|
||||||
json_path = Path("byty_idnes.json")
|
json_path = Path("byty_idnes.json")
|
||||||
|
|||||||
@@ -15,6 +15,9 @@ import time
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
|
from scraper_stats import write_stats, validate_listing
|
||||||
|
|
||||||
|
STATS_FILE = "stats_psn.json"
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -35,9 +38,10 @@ BASE_URL = "https://psn.cz"
|
|||||||
UNITS_API = f"{BASE_URL}/api/units-list"
|
UNITS_API = f"{BASE_URL}/api/units-list"
|
||||||
|
|
||||||
|
|
||||||
def fetch_json(url: str) -> dict:
|
def fetch_json(url: str, retries: int = 3) -> dict:
|
||||||
"""Fetch JSON via curl (urllib SSL may fail on Cloudflare)."""
|
"""Fetch JSON via curl (urllib SSL may fail on Cloudflare) with retry."""
|
||||||
logger.debug(f"HTTP GET: {url}")
|
for attempt in range(retries):
|
||||||
|
logger.debug(f"HTTP GET (attempt {attempt + 1}/{retries}): {url}")
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
["curl", "-s", "-L", "--max-time", "30",
|
["curl", "-s", "-L", "--max-time", "30",
|
||||||
"-H", f"User-Agent: {UA}",
|
"-H", f"User-Agent: {UA}",
|
||||||
@@ -45,9 +49,14 @@ def fetch_json(url: str) -> dict:
|
|||||||
url],
|
url],
|
||||||
capture_output=True, text=True, timeout=60
|
capture_output=True, text=True, timeout=60
|
||||||
)
|
)
|
||||||
if result.returncode != 0:
|
if result.returncode == 0:
|
||||||
raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
|
|
||||||
return json.loads(result.stdout)
|
return json.loads(result.stdout)
|
||||||
|
if attempt < retries - 1:
|
||||||
|
wait = (attempt + 1) * 2
|
||||||
|
logger.warning(f"curl failed (retry {attempt + 1}/{retries} after {wait}s): {result.stderr[:200]}")
|
||||||
|
time.sleep(wait)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"curl failed after {retries} attempts ({result.returncode}): {result.stderr[:200]}")
|
||||||
|
|
||||||
|
|
||||||
def fix_gps(lat, lng):
|
def fix_gps(lat, lng):
|
||||||
@@ -67,6 +76,8 @@ def format_price(price: int) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def scrape(max_properties: int | None = None):
|
def scrape(max_properties: int | None = None):
|
||||||
|
_run_start = time.time()
|
||||||
|
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("Stahuji inzeráty z PSN.cz")
|
logger.info("Stahuji inzeráty z PSN.cz")
|
||||||
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
||||||
@@ -93,11 +104,30 @@ def scrape(max_properties: int | None = None):
|
|||||||
data = fetch_json(url)
|
data = fetch_json(url)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Chyba při stahování: {e}", exc_info=True)
|
logger.error(f"Chyba při stahování: {e}", exc_info=True)
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "PSN",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - _run_start, 1),
|
||||||
|
"success": False,
|
||||||
|
"accepted": 0,
|
||||||
|
"fetched": 0,
|
||||||
|
"error": str(e),
|
||||||
|
})
|
||||||
return []
|
return []
|
||||||
|
|
||||||
all_units = data.get("units", {}).get("data", [])
|
all_units = data.get("units", {}).get("data", [])
|
||||||
logger.info(f"Staženo jednotek celkem: {len(all_units)}")
|
logger.info(f"Staženo jednotek celkem: {len(all_units)}")
|
||||||
|
|
||||||
|
# Load previous output for first_seen/last_changed tracking
|
||||||
|
_prev_cache: dict[str, dict] = {}
|
||||||
|
_prev_path = Path("byty_psn.json")
|
||||||
|
if _prev_path.exists():
|
||||||
|
try:
|
||||||
|
for _item in json.loads(_prev_path.read_text(encoding="utf-8")):
|
||||||
|
_prev_cache[str(_item["hash_id"])] = _item
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Filtrování
|
# Filtrování
|
||||||
results = []
|
results = []
|
||||||
excluded = {
|
excluded = {
|
||||||
@@ -228,7 +258,11 @@ def scrape(max_properties: int | None = None):
|
|||||||
"source": "psn",
|
"source": "psn",
|
||||||
"image": "",
|
"image": "",
|
||||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||||
|
"first_seen": _prev_cache.get(str(unit_id), {}).get("first_seen", datetime.now().strftime("%Y-%m-%d")),
|
||||||
|
"last_changed": datetime.now().strftime("%Y-%m-%d") if _prev_cache.get(str(unit_id), {}).get("price") != int(price) else _prev_cache[str(unit_id)].get("last_changed", datetime.now().strftime("%Y-%m-%d")),
|
||||||
}
|
}
|
||||||
|
if not validate_listing(result, "psn"):
|
||||||
|
continue
|
||||||
results.append(result)
|
results.append(result)
|
||||||
properties_fetched += 1
|
properties_fetched += 1
|
||||||
|
|
||||||
@@ -241,6 +275,15 @@ def scrape(max_properties: int | None = None):
|
|||||||
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
||||||
logger.info(f"{'=' * 60}")
|
logger.info(f"{'=' * 60}")
|
||||||
|
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "PSN",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - _run_start, 1),
|
||||||
|
"success": True,
|
||||||
|
"accepted": len(results),
|
||||||
|
"fetched": len(all_units),
|
||||||
|
"excluded": excluded,
|
||||||
|
})
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
@@ -259,8 +302,22 @@ if __name__ == "__main__":
|
|||||||
handlers=[logging.StreamHandler()]
|
handlers=[logging.StreamHandler()]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
try:
|
||||||
estates = scrape(max_properties=args.max_properties)
|
estates = scrape(max_properties=args.max_properties)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Scraper failed: {e}", exc_info=True)
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "PSN",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - start, 1),
|
||||||
|
"success": False,
|
||||||
|
"accepted": 0,
|
||||||
|
"fetched": 0,
|
||||||
|
"error": str(e),
|
||||||
|
})
|
||||||
|
raise
|
||||||
|
|
||||||
if estates:
|
if estates:
|
||||||
json_path = Path("byty_psn.json")
|
json_path = Path("byty_psn.json")
|
||||||
|
|||||||
@@ -15,6 +15,9 @@ import re
|
|||||||
import time
|
import time
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from scraper_stats import write_stats, validate_listing
|
||||||
|
|
||||||
|
STATS_FILE = "stats_realingo.json"
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -53,6 +56,28 @@ HEADERS = {
|
|||||||
BASE_URL = "https://www.realingo.cz"
|
BASE_URL = "https://www.realingo.cz"
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_url(url: str, retries: int = 3) -> str:
|
||||||
|
"""Fetch URL and return HTML string with retry on transient errors."""
|
||||||
|
for attempt in range(retries):
|
||||||
|
try:
|
||||||
|
logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
|
||||||
|
req = urllib.request.Request(url, headers=HEADERS)
|
||||||
|
resp = urllib.request.urlopen(req, timeout=30)
|
||||||
|
html = resp.read().decode("utf-8")
|
||||||
|
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
|
||||||
|
return html
|
||||||
|
except urllib.error.HTTPError:
|
||||||
|
raise
|
||||||
|
except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e:
|
||||||
|
if attempt < retries - 1:
|
||||||
|
wait = (attempt + 1) * 2
|
||||||
|
logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
|
||||||
|
time.sleep(wait)
|
||||||
|
else:
|
||||||
|
logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
|
def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
|
||||||
"""Fetch a page of Prague listings. Returns (items, total_count)."""
|
"""Fetch a page of Prague listings. Returns (items, total_count)."""
|
||||||
if page == 1:
|
if page == 1:
|
||||||
@@ -60,14 +85,7 @@ def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
|
|||||||
else:
|
else:
|
||||||
url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/"
|
url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/"
|
||||||
|
|
||||||
logger.debug(f"HTTP GET request: {url}")
|
html = fetch_url(url)
|
||||||
logger.debug(f"Headers: {HEADERS}")
|
|
||||||
req = urllib.request.Request(url, headers=HEADERS)
|
|
||||||
try:
|
|
||||||
resp = urllib.request.urlopen(req, timeout=30)
|
|
||||||
html = resp.read().decode("utf-8")
|
|
||||||
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
|
|
||||||
|
|
||||||
match = re.search(
|
match = re.search(
|
||||||
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
||||||
html, re.DOTALL
|
html, re.DOTALL
|
||||||
@@ -80,21 +98,13 @@ def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
|
|||||||
offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
|
offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
|
||||||
logger.debug(f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}")
|
logger.debug(f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}")
|
||||||
return offer_list["data"], offer_list["total"]
|
return offer_list["data"], offer_list["total"]
|
||||||
except (urllib.error.URLError, ConnectionError, OSError) as e:
|
|
||||||
logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_detail(listing_url: str) -> dict | None:
|
def fetch_detail(listing_url: str) -> dict | None:
|
||||||
"""Fetch detail page for a listing to get floor, building type, etc."""
|
"""Fetch detail page for a listing to get floor, building type, etc."""
|
||||||
try:
|
try:
|
||||||
url = f"{BASE_URL}{listing_url}"
|
url = f"{BASE_URL}{listing_url}"
|
||||||
logger.debug(f"HTTP GET request: {url}")
|
html = fetch_url(url)
|
||||||
req = urllib.request.Request(url, headers=HEADERS)
|
|
||||||
resp = urllib.request.urlopen(req, timeout=30)
|
|
||||||
html = resp.read().decode("utf-8")
|
|
||||||
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
|
|
||||||
|
|
||||||
match = re.search(
|
match = re.search(
|
||||||
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
||||||
html, re.DOTALL
|
html, re.DOTALL
|
||||||
@@ -136,6 +146,8 @@ def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
|
|||||||
|
|
||||||
|
|
||||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||||
|
_run_start = time.time()
|
||||||
|
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||||
cache = load_cache()
|
cache = load_cache()
|
||||||
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
@@ -316,7 +328,11 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
|||||||
"source": "realingo",
|
"source": "realingo",
|
||||||
"image": "",
|
"image": "",
|
||||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||||
|
"first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
|
||||||
|
"last_changed": datetime.now().strftime("%Y-%m-%d"),
|
||||||
}
|
}
|
||||||
|
if not validate_listing(result, "realingo"):
|
||||||
|
continue
|
||||||
results.append(result)
|
results.append(result)
|
||||||
properties_fetched += 1
|
properties_fetched += 1
|
||||||
|
|
||||||
@@ -333,6 +349,25 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
|||||||
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
||||||
logger.info(f"{'=' * 60}")
|
logger.info(f"{'=' * 60}")
|
||||||
|
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "Realingo",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - _run_start, 1),
|
||||||
|
"success": True,
|
||||||
|
"accepted": len(results),
|
||||||
|
"fetched": len(all_listings),
|
||||||
|
"pages": page - 1,
|
||||||
|
"cache_hits": cache_hits,
|
||||||
|
"excluded": {
|
||||||
|
"dispozice": excluded_category,
|
||||||
|
"cena": excluded_price,
|
||||||
|
"plocha": excluded_area,
|
||||||
|
"bez GPS": excluded_no_gps,
|
||||||
|
"panel/síd": excluded_panel,
|
||||||
|
"patro": excluded_floor,
|
||||||
|
"bez detailu": excluded_detail,
|
||||||
|
},
|
||||||
|
})
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
@@ -353,8 +388,22 @@ if __name__ == "__main__":
|
|||||||
handlers=[logging.StreamHandler()]
|
handlers=[logging.StreamHandler()]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
try:
|
||||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Scraper failed: {e}", exc_info=True)
|
||||||
|
write_stats(STATS_FILE, {
|
||||||
|
"source": "Realingo",
|
||||||
|
"timestamp": _run_ts,
|
||||||
|
"duration_sec": round(time.time() - start, 1),
|
||||||
|
"success": False,
|
||||||
|
"accepted": 0,
|
||||||
|
"fetched": 0,
|
||||||
|
"error": str(e),
|
||||||
|
})
|
||||||
|
raise
|
||||||
|
|
||||||
if estates:
|
if estates:
|
||||||
json_path = Path("byty_realingo.json")
|
json_path = Path("byty_realingo.json")
|
||||||
|
|||||||
55
scraper_stats.py
Normal file
55
scraper_stats.py
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
"""Shared utilities for scraper run statistics and listing validation."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
HERE = Path(__file__).parent
|
||||||
|
DATA_DIR = Path(os.environ.get("DATA_DIR", HERE))
|
||||||
|
|
||||||
|
_val_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_REQUIRED_FIELDS = ("hash_id", "price", "locality", "lat", "lon", "url", "source")
|
||||||
|
|
||||||
|
|
||||||
|
def validate_listing(listing: dict, context: str = "") -> bool:
|
||||||
|
"""
|
||||||
|
Validate a listing dict before it is written to the output JSON.
|
||||||
|
Returns True if valid, False if the listing should be skipped.
|
||||||
|
Logs a warning for each invalid listing.
|
||||||
|
"""
|
||||||
|
prefix = f"[{context}] " if context else ""
|
||||||
|
|
||||||
|
for field in _REQUIRED_FIELDS:
|
||||||
|
val = listing.get(field)
|
||||||
|
if val is None or val == "":
|
||||||
|
_val_log.warning(f"{prefix}Skipping listing — missing field '{field}': {listing.get('hash_id', '?')}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
price = listing.get("price")
|
||||||
|
if not isinstance(price, (int, float)) or price <= 0:
|
||||||
|
_val_log.warning(f"{prefix}Skipping listing — invalid price={price!r}: {listing.get('hash_id', '?')}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
lat, lon = listing.get("lat"), listing.get("lon")
|
||||||
|
if not isinstance(lat, (int, float)) or not isinstance(lon, (int, float)):
|
||||||
|
_val_log.warning(f"{prefix}Skipping listing — non-numeric GPS lat={lat!r} lon={lon!r}: {listing.get('hash_id', '?')}")
|
||||||
|
return False
|
||||||
|
if not (47.0 <= lat <= 52.0) or not (12.0 <= lon <= 19.0):
|
||||||
|
_val_log.warning(f"{prefix}Skipping listing — GPS outside Czech Republic lat={lat} lon={lon}: {listing.get('hash_id', '?')}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
area = listing.get("area")
|
||||||
|
if area is not None and (not isinstance(area, (int, float)) or area <= 0):
|
||||||
|
_val_log.warning(f"{prefix}Skipping listing — invalid area={area!r}: {listing.get('hash_id', '?')}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def write_stats(filename: str, stats: dict) -> None:
|
||||||
|
"""Write scraper run stats dict to the data directory."""
|
||||||
|
path = DATA_DIR / filename
|
||||||
|
path.write_text(json.dumps(stats, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||||
477
server.py
Normal file
477
server.py
Normal file
@@ -0,0 +1,477 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
General-purpose HTTP server for maru-hleda-byt.
|
||||||
|
|
||||||
|
Serves static files from DATA_DIR and additionally handles:
|
||||||
|
GET /scrapers-status → SSR scraper status page
|
||||||
|
GET /api/ratings → ratings.json contents
|
||||||
|
POST /api/ratings → save entire ratings object
|
||||||
|
GET /api/ratings/export → same as GET, with download header
|
||||||
|
GET /api/status → status.json contents (JSON)
|
||||||
|
GET /api/status/history → scraper_history.json contents (JSON)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import functools
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from http.server import HTTPServer, SimpleHTTPRequestHandler
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
PORT = int(os.environ.get("SERVER_PORT", 8080))
|
||||||
|
DATA_DIR = Path(os.environ.get("DATA_DIR", "."))
|
||||||
|
RATINGS_FILE = DATA_DIR / "ratings.json"
|
||||||
|
_LOG_LEVEL = getattr(logging, os.environ.get("LOG_LEVEL", "INFO").upper(), logging.INFO)
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=_LOG_LEVEL,
|
||||||
|
format="%(asctime)s [server] %(levelname)s %(message)s",
|
||||||
|
datefmt="%Y-%m-%dT%H:%M:%S",
|
||||||
|
)
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ── Helpers ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
COLORS = {
|
||||||
|
"sreality": "#1976D2",
|
||||||
|
"realingo": "#7B1FA2",
|
||||||
|
"bezrealitky": "#E65100",
|
||||||
|
"idnes": "#C62828",
|
||||||
|
"psn": "#2E7D32",
|
||||||
|
"cityhome": "#00838F",
|
||||||
|
}
|
||||||
|
|
||||||
|
MONTHS_CZ = [
|
||||||
|
"ledna", "února", "března", "dubna", "května", "června",
|
||||||
|
"července", "srpna", "září", "října", "listopadu", "prosince",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _load_json(path: Path, default=None):
|
||||||
|
"""Read and parse JSON file; return default on missing or parse error."""
|
||||||
|
log.debug("_load_json: %s", path.resolve())
|
||||||
|
try:
|
||||||
|
if path.exists():
|
||||||
|
return json.loads(path.read_text(encoding="utf-8"))
|
||||||
|
except Exception as e:
|
||||||
|
log.warning("Failed to load %s: %s", path, e)
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_date(iso_str: str) -> str:
|
||||||
|
"""Format ISO timestamp as Czech date string."""
|
||||||
|
try:
|
||||||
|
d = datetime.fromisoformat(iso_str)
|
||||||
|
return f"{d.day}. {MONTHS_CZ[d.month - 1]} {d.year}, {d.hour:02d}:{d.minute:02d}"
|
||||||
|
except Exception:
|
||||||
|
return iso_str
|
||||||
|
|
||||||
|
|
||||||
|
def load_ratings() -> dict:
|
||||||
|
return _load_json(RATINGS_FILE, default={})
|
||||||
|
|
||||||
|
|
||||||
|
def save_ratings(data: dict) -> None:
|
||||||
|
RATINGS_FILE.write_text(
|
||||||
|
json.dumps(data, ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── SSR status page ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_CSS = """\
|
||||||
|
* { margin: 0; padding: 0; box-sizing: border-box; }
|
||||||
|
body {
|
||||||
|
font-family: system-ui, -apple-system, sans-serif;
|
||||||
|
background: #f5f5f5; color: #333;
|
||||||
|
padding: 24px; max-width: 640px; margin: 0 auto;
|
||||||
|
}
|
||||||
|
h1 { font-size: 22px; margin-bottom: 4px; }
|
||||||
|
.subtitle { color: #888; font-size: 13px; margin-bottom: 24px; }
|
||||||
|
.card {
|
||||||
|
background: white; border-radius: 12px; padding: 20px;
|
||||||
|
box-shadow: 0 1px 4px rgba(0,0,0,0.08); margin-bottom: 16px;
|
||||||
|
}
|
||||||
|
.card h2 { font-size: 15px; margin-bottom: 12px; color: #555; }
|
||||||
|
.timestamp { font-size: 28px; font-weight: 700; color: #1976D2; }
|
||||||
|
.timestamp-sub { font-size: 13px; color: #999; margin-top: 2px; }
|
||||||
|
.summary-row {
|
||||||
|
display: flex; justify-content: space-between; align-items: center;
|
||||||
|
padding: 10px 0; border-bottom: 1px solid #f0f0f0;
|
||||||
|
}
|
||||||
|
.summary-row:last-child { border-bottom: none; }
|
||||||
|
.summary-label { font-size: 13px; color: #666; }
|
||||||
|
.summary-value { font-size: 18px; font-weight: 700; }
|
||||||
|
.badge {
|
||||||
|
display: inline-block; padding: 2px 8px; border-radius: 4px;
|
||||||
|
font-size: 11px; font-weight: 600; color: white;
|
||||||
|
}
|
||||||
|
.badge-ok { background: #4CAF50; }
|
||||||
|
.badge-err { background: #F44336; }
|
||||||
|
.badge-skip { background: #FF9800; }
|
||||||
|
.bar-row { display: flex; align-items: center; gap: 8px; margin: 4px 0; }
|
||||||
|
.bar-track { flex: 1; height: 20px; background: #f0f0f0; border-radius: 4px; overflow: hidden; }
|
||||||
|
.bar-fill { height: 100%; border-radius: 4px; }
|
||||||
|
.bar-count { font-size: 12px; width: 36px; font-variant-numeric: tabular-nums; }
|
||||||
|
.loader-wrap {
|
||||||
|
display: flex; flex-direction: column; align-items: center;
|
||||||
|
justify-content: center; padding: 60px 0;
|
||||||
|
}
|
||||||
|
.spinner {
|
||||||
|
width: 40px; height: 40px; border: 4px solid #e0e0e0;
|
||||||
|
border-top-color: #1976D2; border-radius: 50%;
|
||||||
|
animation: spin 0.8s linear infinite;
|
||||||
|
}
|
||||||
|
@keyframes spin { to { transform: rotate(360deg); } }
|
||||||
|
.loader-text { margin-top: 16px; color: #999; font-size: 14px; }
|
||||||
|
.link-row { text-align: center; margin-top: 8px; }
|
||||||
|
.link-row a { color: #1976D2; text-decoration: none; font-size: 14px; }
|
||||||
|
.history-table { width: 100%; border-collapse: collapse; font-size: 12px; }
|
||||||
|
.history-table th {
|
||||||
|
text-align: left; font-weight: 600; color: #999; font-size: 11px;
|
||||||
|
padding: 4px 6px 8px 6px; border-bottom: 2px solid #f0f0f0;
|
||||||
|
}
|
||||||
|
.history-table td { padding: 7px 6px; border-bottom: 1px solid #f5f5f5; vertical-align: middle; }
|
||||||
|
.history-table tr:last-child td { border-bottom: none; }
|
||||||
|
.history-table tr.latest td { background: #f8fbff; font-weight: 600; }
|
||||||
|
.src-nums { display: flex; gap: 4px; flex-wrap: wrap; }
|
||||||
|
.src-chip {
|
||||||
|
display: inline-block; padding: 1px 5px; border-radius: 3px;
|
||||||
|
font-size: 10px; color: white; font-variant-numeric: tabular-nums;
|
||||||
|
}
|
||||||
|
.clickable-row { cursor: pointer; }
|
||||||
|
.clickable-row:hover td { background: #f0f7ff !important; }
|
||||||
|
/* Modal */
|
||||||
|
#md-overlay {
|
||||||
|
position: fixed; inset: 0; background: rgba(0,0,0,0.45);
|
||||||
|
display: flex; align-items: flex-start; justify-content: center;
|
||||||
|
z-index: 1000; padding: 40px 16px; overflow-y: auto;
|
||||||
|
}
|
||||||
|
#md-box {
|
||||||
|
background: white; border-radius: 12px; padding: 24px;
|
||||||
|
width: 100%; max-width: 620px; position: relative;
|
||||||
|
box-shadow: 0 8px 32px rgba(0,0,0,0.24); margin: auto;
|
||||||
|
}
|
||||||
|
#md-close {
|
||||||
|
position: absolute; top: 10px; right: 14px;
|
||||||
|
background: none; border: none; font-size: 26px; cursor: pointer;
|
||||||
|
color: #aaa; line-height: 1;
|
||||||
|
}
|
||||||
|
#md-close:hover { color: #333; }
|
||||||
|
#md-box h3 { font-size: 15px; margin-bottom: 14px; padding-right: 24px; }
|
||||||
|
.md-summary { display: flex; gap: 20px; flex-wrap: wrap; font-size: 13px; margin-bottom: 16px; color: #555; }
|
||||||
|
.md-summary b { color: #333; }
|
||||||
|
.detail-table { width: 100%; border-collapse: collapse; font-size: 12px; }
|
||||||
|
.detail-table th {
|
||||||
|
text-align: left; color: #999; font-size: 11px; font-weight: 600;
|
||||||
|
padding: 4px 8px 6px 0; border-bottom: 2px solid #f0f0f0; white-space: nowrap;
|
||||||
|
}
|
||||||
|
.detail-table td { padding: 6px 8px 6px 0; border-bottom: 1px solid #f5f5f5; vertical-align: top; }
|
||||||
|
.detail-table tr:last-child td { border-bottom: none; }
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Canonical display order of scraper sources in the history table; names must
# match the "name" field emitted by each scraper in status/history JSON.
_SOURCE_ORDER = ["Sreality", "Realingo", "Bezrealitky", "iDNES", "PSN", "CityHome"]
# Short chip labels, index-aligned with _SOURCE_ORDER (zipped together below).
_SOURCE_ABBR = ["Sre", "Rea", "Bez", "iDN", "PSN", "CH"]
|
||||||
|
|
||||||
|
|
||||||
|
def _sources_html(sources: list) -> str:
    """Render the "Zdroje" card: one labelled horizontal bar per scraper source.

    Each source dict may carry "name", "accepted", "error", and either an
    "excluded" breakdown dict or a flat "excluded_total".  Returns "" when
    there are no sources at all.
    """
    if not sources:
        return ""
    # Bars are scaled relative to the best-performing source; the trailing
    # "or 1" guards against a 0 maximum so the division below is always safe.
    top = max((item.get("accepted", 0) for item in sources), default=1) or 1
    chunks = ['<div class="card"><h2>Zdroje</h2>']
    for item in sources:
        name = item.get("name", "?")
        accepted = item.get("accepted", 0)
        excluded = item.get("excluded", {})
        # Prefer summing the per-reason breakdown; fall back to the flat total.
        if isinstance(excluded, dict):
            excluded_total = sum(excluded.values())
        else:
            excluded_total = item.get("excluded_total", 0)
        bar_color = COLORS.get(name.lower(), "#999")
        width_pct = round(accepted / top * 100) if top else 0
        if item.get("error"):
            badge = '<span class="badge badge-err">chyba</span>'
        elif accepted == 0:
            badge = '<span class="badge badge-skip">0</span>'
        else:
            badge = '<span class="badge badge-ok">OK</span>'
        chunks.append(
            '<div style="margin-bottom:12px;">'
            '<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:4px;">'
            f'<span style="font-weight:600;font-size:14px;">{name} {badge}</span>'
            f'<span style="font-size:12px;color:#999;">{excluded_total} vyloučených</span>'
            '</div>'
            '<div class="bar-row">'
            f'<div class="bar-track"><div class="bar-fill" style="width:{width_pct}%;background:{bar_color};"></div></div>'
            f'<span class="bar-count">{accepted}</span>'
            '</div></div>'
        )
    chunks.append("</div>")
    return "".join(chunks)
|
||||||
|
|
||||||
|
|
||||||
|
def _history_html(history: list) -> str:
    """Render the clickable run-history table, newest run first.

    Each row carries data-idx so the inline modal script can look the run up
    in its own (also reversed) copy of the history.  Returns "" when the
    history is empty.
    """
    if not history:
        return ""
    newest_first = list(reversed(history))
    out = [
        '<div class="card">'
        '<h2>Historie běhů <span style="font-size:11px;font-weight:400;color:#bbb;">– klikni pro detaily</span></h2>',
        '<table class="history-table"><thead><tr>',
        '<th>Datum</th><th>Trvání</th><th>Přijato / Dedup</th><th>Zdroje</th><th>OK</th>',
        '</tr></thead><tbody>',
    ]
    for idx, run in enumerate(newest_first):
        # Row 0 is the most recent run and gets the highlighted style.
        css = ' class="latest clickable-row"' if idx == 0 else ' class="clickable-row"'
        by_name = {s["name"]: s for s in run.get("sources", []) if "name" in s}
        # One small colored chip per known source; red when that source errored,
        # "-" count when the source is missing from this run entirely.
        chip_bits = []
        for name, abbr in zip(_SOURCE_ORDER, _SOURCE_ABBR):
            info = by_name.get(name) or {}
            chip_color = "#F44336" if info.get("error") else COLORS.get(name.lower(), "#999")
            count = by_name[name].get("accepted", 0) if name in by_name else "-"
            chip_bits.append(
                f'<span class="src-chip" style="background:{chip_color}" title="{name}">'
                f'{abbr} {count}</span>'
            )
        if run.get("success") is False:
            ok_badge = '<span class="badge badge-err">chyba</span>'
        else:
            ok_badge = '<span class="badge badge-ok">OK</span>'
        duration = f'{run["duration_sec"]}s' if run.get("duration_sec") is not None else "-"
        out.append(
            f'<tr{css} data-idx="{idx}">'
            f'<td>{_fmt_date(run.get("timestamp", ""))}</td>'
            f'<td>{duration}</td>'
            f'<td>{run.get("total_accepted", "-")} / {run.get("deduplicated", "-")}</td>'
            f'<td><div class="src-nums">{"".join(chip_bits)}</div></td>'
            f'<td>{ok_badge}</td>'
            f'</tr>'
        )
    out.append("</tbody></table></div>")
    return "".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def _modal_script(rows_json: str) -> str:
|
||||||
|
"""Return the modal overlay HTML + JS for the history detail popup."""
|
||||||
|
return (
|
||||||
|
'<div id="md-overlay" style="display:none">'
|
||||||
|
'<div id="md-box"><button id="md-close">×</button>'
|
||||||
|
'<div id="md-body"></div></div></div>\n'
|
||||||
|
'<script>\n(function(){\n'
|
||||||
|
f'var H={rows_json};\n'
|
||||||
|
'var C={"sreality":"#1976D2","realingo":"#7B1FA2","bezrealitky":"#E65100","idnes":"#C62828","psn":"#2E7D32","cityhome":"#00838F"};\n'
|
||||||
|
'var MN=["ledna","února","března","dubna","května","června","července","srpna","září","října","listopadu","prosince"];\n'
|
||||||
|
'function fd(s){var d=new Date(s);return d.getDate()+". "+MN[d.getMonth()]+" "+d.getFullYear()+", "+String(d.getHours()).padStart(2,"0")+":"+String(d.getMinutes()).padStart(2,"0");}\n'
|
||||||
|
'function openModal(idx){\n'
|
||||||
|
' var e=H[idx],src=e.sources||[];\n'
|
||||||
|
' var h="<h3>Detaily b\u011bhu \u2013 "+fd(e.timestamp)+"</h3>";\n'
|
||||||
|
' h+="<div class=\\"md-summary\\">";\n'
|
||||||
|
' if(e.duration_sec!=null) h+="<span><b>Trvání:</b> "+e.duration_sec+"s</span>";\n'
|
||||||
|
' if(e.total_accepted!=null) h+="<span><b>Přijato:</b> "+e.total_accepted+"</span>";\n'
|
||||||
|
' if(e.deduplicated!=null) h+="<span><b>Po dedup:</b> "+e.deduplicated+"</span>";\n'
|
||||||
|
' h+="</div>";\n'
|
||||||
|
' h+="<table class=\\"detail-table\\"><thead><tr>";\n'
|
||||||
|
' h+="<th>Zdroj</th><th>Přijato</th><th>Staženo</th><th>Stránky</th><th>Cache</th><th>Vyloučeno</th><th>Čas</th><th>OK</th>";\n'
|
||||||
|
' h+="</tr></thead><tbody>";\n'
|
||||||
|
' src.forEach(function(s){\n'
|
||||||
|
' var nm=s.name||"?",col=C[nm.toLowerCase()]||"#999";\n'
|
||||||
|
' var exc=s.excluded||{};\n'
|
||||||
|
' var excStr=Object.entries(exc).filter(function(kv){return kv[1]>0;}).map(function(kv){return kv[0]+": "+kv[1];}).join(", ")||"\u2013";\n'
|
||||||
|
' var ok=s.error?"<span class=\\"badge badge-err\\" title=\\""+s.error+"\\">chyba</span>":"<span class=\\"badge badge-ok\\">OK</span>";\n'
|
||||||
|
' var dot="<span style=\\"display:inline-block;width:8px;height:8px;border-radius:50%;background:"+col+";margin-right:5px;\\"></span>";\n'
|
||||||
|
' h+="<tr>";\n'
|
||||||
|
' h+="<td>"+dot+nm+"</td>";\n'
|
||||||
|
' h+="<td>"+(s.accepted!=null?s.accepted:"\u2013")+"</td>";\n'
|
||||||
|
' h+="<td>"+(s.fetched!=null?s.fetched:"\u2013")+"</td>";\n'
|
||||||
|
' h+="<td>"+(s.pages!=null?s.pages:"\u2013")+"</td>";\n'
|
||||||
|
' h+="<td>"+(s.cache_hits!=null?s.cache_hits:"\u2013")+"</td>";\n'
|
||||||
|
' h+="<td style=\\"font-size:11px;color:#666;\\">"+excStr+"</td>";\n'
|
||||||
|
' h+="<td>"+(s.duration_sec!=null?s.duration_sec+"s":"\u2013")+"</td>";\n'
|
||||||
|
' h+="<td>"+ok+"</td></tr>";\n'
|
||||||
|
' });\n'
|
||||||
|
' h+="</tbody></table>";\n'
|
||||||
|
' document.getElementById("md-body").innerHTML=h;\n'
|
||||||
|
' document.getElementById("md-overlay").style.display="flex";\n'
|
||||||
|
'}\n'
|
||||||
|
'function closeModal(){document.getElementById("md-overlay").style.display="none";}\n'
|
||||||
|
'var tb=document.querySelector(".history-table tbody");\n'
|
||||||
|
'if(tb)tb.addEventListener("click",function(e){var tr=e.target.closest("tr[data-idx]");if(tr)openModal(parseInt(tr.dataset.idx,10));});\n'
|
||||||
|
'document.getElementById("md-close").addEventListener("click",closeModal);\n'
|
||||||
|
'document.getElementById("md-overlay").addEventListener("click",function(e){if(e.target===this)closeModal();});\n'
|
||||||
|
'document.addEventListener("keydown",function(e){if(e.key==="Escape")closeModal();});\n'
|
||||||
|
'})();\n</script>'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _render_status_html(status: dict | None, history: list, is_running: bool = False) -> str:
    """Generate the complete HTML page for /scrapers-status.

    Args:
        status: parsed status.json, or None when it does not exist (yet).
        history: parsed scraper_history.json list (oldest first).
        is_running: True while the scraper's running-marker file exists.

    Returns the full HTML document for one of three states: running
    (spinner + 30 s auto-refresh), no status available, or the finished
    dashboard (summary cards, per-source bars, clickable history + modal).
    """
    head_open = (
        '<!DOCTYPE html>\n<html lang="cs">\n<head>\n'
        '<meta charset="UTF-8">\n'
        '<meta name="viewport" content="width=device-width, initial-scale=1.0">\n'
        f'<title>Scraper status</title>\n<style>{_CSS}</style>\n'
    )
    page_header = '<h1>Scraper status</h1>\n<div class="subtitle">maru-hleda-byt</div>\n'
    footer = '<div class="link-row"><a href="/mapa_bytu.html">Otevřít mapu</a></div>'

    # BUGFIX: the running state must be checked BEFORE the "no status" state.
    # On the very first scrape, status.json does not exist yet while the
    # scraper is already running; the old order showed a dead error page
    # instead of the auto-refreshing spinner.
    if is_running:
        return (
            head_open
            + '<meta http-equiv="refresh" content="30">\n'
            + '</head>\n<body>\n' + page_header
            + '<div class="loader-wrap"><div class="spinner"></div>'
            + '<div class="loader-text">Scraper právě běží…</div></div>\n'
            + footer + '\n</body>\n</html>'
        )

    if status is None:
        return (
            head_open + '</head>\n<body>\n' + page_header
            + '<div class="card"><p style="color:#F44336">Status není k dispozici.</p></div>\n'
            + footer + '\n</body>\n</html>'
        )

    # ── Done state ────────────────────────────────────────────────────────────
    ts = status.get("timestamp", "")
    duration = status.get("duration_sec")
    total_accepted = status.get("total_accepted", 0)
    deduplicated = status.get("deduplicated")

    # "Last scrape" card: timestamp plus optional duration line.
    ts_card = (
        '<div class="card"><h2>Poslední scrape</h2>'
        f'<div class="timestamp">{_fmt_date(ts)}</div>'
        + (f'<div class="timestamp-sub">Trvání: {round(duration)}s</div>' if duration is not None else "")
        + '</div>'
    )

    # Summary card: accepted count, plus deduplicated count when available.
    sum_card = (
        '<div class="card"><h2>Souhrn</h2>'
        f'<div class="summary-row"><span class="summary-label">Vyhovujících bytů</span>'
        f'<span class="summary-value" style="color:#4CAF50">{total_accepted}</span></div>'
        + (
            f'<div class="summary-row"><span class="summary-label">Po deduplikaci (v mapě)</span>'
            f'<span class="summary-value" style="color:#1976D2">{deduplicated}</span></div>'
            if deduplicated is not None else ""
        )
        + '</div>'
    )

    # The modal script needs the history newest-first, matching the table's
    # data-idx attributes produced by _history_html.
    rows_for_js = list(reversed(history))
    body = (
        page_header
        + ts_card + "\n"
        + sum_card + "\n"
        + _sources_html(status.get("sources", [])) + "\n"
        + _history_html(history) + "\n"
        + footer
    )
    modal = _modal_script(json.dumps(rows_for_js, ensure_ascii=False))
    return head_open + '</head>\n<body>\n' + body + '\n' + modal + '\n</body>\n</html>'
|
||||||
|
|
||||||
|
|
||||||
|
# ── HTTP handler ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class Handler(SimpleHTTPRequestHandler):
    """HTTP handler: static files from the served directory, a small JSON
    API under /api/, and the /scrapers-status HTML dashboard."""

    def log_message(self, format, *args):
        # Silence the default per-request access log; targeted `log` calls
        # are made where they are actually useful.
        pass

    def _send_json(self, status: int, body, extra_headers=None):
        """Serialise *body* as UTF-8 JSON and send it with CORS headers.

        extra_headers, when given, is a dict of additional headers appended
        after the standard set.
        """
        data = json.dumps(body, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        for name, value in (
            ("Content-Type", "application/json; charset=utf-8"),
            ("Content-Length", str(len(data))),
            ("Access-Control-Allow-Origin", "*"),
            ("Access-Control-Allow-Methods", "GET, POST, OPTIONS"),
            ("Access-Control-Allow-Headers", "Content-Type"),
        ):
            self.send_header(name, value)
        for name, value in (extra_headers or {}).items():
            self.send_header(name, value)
        self.end_headers()
        self.wfile.write(data)

    def do_OPTIONS(self):
        # CORS preflight: empty 204 response carrying only the allow headers.
        self.send_response(204)
        for name, value in (
            ("Access-Control-Allow-Origin", "*"),
            ("Access-Control-Allow-Methods", "GET, POST, OPTIONS"),
            ("Access-Control-Allow-Headers", "Content-Type"),
        ):
            self.send_header(name, value)
        self.end_headers()

    def do_GET(self):
        # Dispatch: API endpoints, the status dashboard, then static files.
        if self.path.startswith("/api/"):
            self._handle_api_get()
            return
        if self.path.rstrip("/") == "/scrapers-status":
            self._serve_status_page()
            return
        log.debug("GET %s → static file: %s", self.path, self.translate_path(self.path))
        super().do_GET()

    def _handle_api_get(self):
        """Route GET /api/* requests; unknown paths get a JSON 404."""
        path = self.path
        if path in ("/api/ratings", "/api/ratings/export"):
            ratings = load_ratings()
            # The export variant only differs by forcing a file download.
            extra = (
                {"Content-Disposition": 'attachment; filename="ratings.json"'}
                if path == "/api/ratings/export"
                else None
            )
            log.info("GET %s → %d ratings", path, len(ratings))
            self._send_json(200, ratings, extra)
        elif path == "/api/status":
            data = _load_json(DATA_DIR / "status.json")
            if data is None:
                self._send_json(404, {"error": "status not available"})
                return
            log.info("GET /api/status → ok")
            self._send_json(200, data)
        elif path == "/api/status/history":
            data = _load_json(DATA_DIR / "scraper_history.json", default=[])
            # Defensive: a corrupted history file must not break the endpoint.
            if not isinstance(data, list):
                data = []
            log.info("GET /api/status/history → %d entries", len(data))
            self._send_json(200, data)
        else:
            self._send_json(404, {"error": "not found"})

    def _serve_status_page(self):
        """Render and send the /scrapers-status HTML dashboard."""
        status = _load_json(DATA_DIR / "status.json")
        history = _load_json(DATA_DIR / "scraper_history.json", default=[])
        if not isinstance(history, list):
            history = []
        # Presence of the marker file means a scrape is in flight right now.
        running = (DATA_DIR / "scraper_running.json").exists()
        page = _render_status_html(status, history, running).encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(page)))
        self.end_headers()
        self.wfile.write(page)

    def do_POST(self):
        # Only /api/ratings accepts POSTs (a full replacement ratings dict).
        if self.path != "/api/ratings":
            self._send_json(404, {"error": "not found"})
            return
        length = int(self.headers.get("Content-Length", 0))
        if length == 0:
            self._send_json(400, {"error": "empty body"})
            return
        try:
            data = json.loads(self.rfile.read(length).decode("utf-8"))
        except Exception as e:
            log.warning("Bad request body: %s", e)
            self._send_json(400, {"error": "invalid JSON"})
            return
        if not isinstance(data, dict):
            self._send_json(400, {"error": "expected JSON object"})
            return
        save_ratings(data)
        log.info("POST /api/ratings → saved %d ratings", len(data))
        self._send_json(200, {"ok": True, "count": len(data)})
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Serve DATA_DIR on all interfaces; Ctrl-C shuts down cleanly.
    log.info("Server starting on port %d, data dir: %s", PORT, DATA_DIR)
    handler = functools.partial(Handler, directory=str(DATA_DIR))
    server = HTTPServer(("0.0.0.0", PORT), handler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        log.info("Stopped.")
        sys.exit(0)
    finally:
        # FIX: release the listening socket on any exit path (the original
        # left it open, which can delay port reuse on quick restarts).
        server.server_close()
|
||||||
204
status.html
204
status.html
@@ -1,204 +0,0 @@
|
|||||||
<!DOCTYPE html>
|
|
||||||
<html lang="cs">
|
|
||||||
<head>
|
|
||||||
<meta charset="UTF-8">
|
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
||||||
<title>Scraper status</title>
|
|
||||||
<style>
|
|
||||||
* { margin: 0; padding: 0; box-sizing: border-box; }
|
|
||||||
body {
|
|
||||||
font-family: system-ui, -apple-system, sans-serif;
|
|
||||||
background: #f5f5f5; color: #333;
|
|
||||||
padding: 24px; max-width: 640px; margin: 0 auto;
|
|
||||||
}
|
|
||||||
h1 { font-size: 22px; margin-bottom: 4px; }
|
|
||||||
.subtitle { color: #888; font-size: 13px; margin-bottom: 24px; }
|
|
||||||
.card {
|
|
||||||
background: white; border-radius: 12px; padding: 20px;
|
|
||||||
box-shadow: 0 1px 4px rgba(0,0,0,0.08); margin-bottom: 16px;
|
|
||||||
}
|
|
||||||
.card h2 { font-size: 15px; margin-bottom: 12px; color: #555; }
|
|
||||||
.timestamp {
|
|
||||||
font-size: 28px; font-weight: 700; color: #1976D2;
|
|
||||||
}
|
|
||||||
.timestamp-ago { font-size: 13px; color: #999; margin-top: 2px; }
|
|
||||||
|
|
||||||
/* Source table */
|
|
||||||
.source-table { width: 100%; border-collapse: collapse; }
|
|
||||||
.source-table td { padding: 8px 0; border-bottom: 1px solid #f0f0f0; font-size: 14px; }
|
|
||||||
.source-table tr:last-child td { border-bottom: none; }
|
|
||||||
.source-table .name { font-weight: 600; }
|
|
||||||
.source-table .count { text-align: right; font-variant-numeric: tabular-nums; }
|
|
||||||
.source-table .rejected { text-align: right; color: #999; font-size: 12px; }
|
|
||||||
.badge {
|
|
||||||
display: inline-block; padding: 2px 8px; border-radius: 4px;
|
|
||||||
font-size: 11px; font-weight: 600; color: white;
|
|
||||||
}
|
|
||||||
.badge-ok { background: #4CAF50; }
|
|
||||||
.badge-err { background: #F44336; }
|
|
||||||
.badge-skip { background: #FF9800; }
|
|
||||||
|
|
||||||
/* Summary bar */
|
|
||||||
.summary-row {
|
|
||||||
display: flex; justify-content: space-between; align-items: center;
|
|
||||||
padding: 10px 0; border-bottom: 1px solid #f0f0f0;
|
|
||||||
}
|
|
||||||
.summary-row:last-child { border-bottom: none; }
|
|
||||||
.summary-label { font-size: 13px; color: #666; }
|
|
||||||
.summary-value { font-size: 18px; font-weight: 700; }
|
|
||||||
|
|
||||||
/* Source bar chart */
|
|
||||||
.bar-row { display: flex; align-items: center; gap: 8px; margin: 4px 0; }
|
|
||||||
.bar-label { width: 90px; font-size: 12px; text-align: right; color: #666; }
|
|
||||||
.bar-track { flex: 1; height: 20px; background: #f0f0f0; border-radius: 4px; overflow: hidden; position: relative; }
|
|
||||||
.bar-fill { height: 100%; border-radius: 4px; transition: width 0.5s ease; }
|
|
||||||
.bar-count { font-size: 12px; width: 36px; font-variant-numeric: tabular-nums; }
|
|
||||||
|
|
||||||
/* Loader */
|
|
||||||
.loader-wrap {
|
|
||||||
display: flex; flex-direction: column; align-items: center;
|
|
||||||
justify-content: center; padding: 60px 0;
|
|
||||||
}
|
|
||||||
.spinner {
|
|
||||||
width: 40px; height: 40px; border: 4px solid #e0e0e0;
|
|
||||||
border-top-color: #1976D2; border-radius: 50%;
|
|
||||||
animation: spin 0.8s linear infinite;
|
|
||||||
}
|
|
||||||
@keyframes spin { to { transform: rotate(360deg); } }
|
|
||||||
.loader-text { margin-top: 16px; color: #999; font-size: 14px; }
|
|
||||||
|
|
||||||
.error-msg { color: #F44336; padding: 40px 0; text-align: center; }
|
|
||||||
.link-row { text-align: center; margin-top: 8px; }
|
|
||||||
.link-row a { color: #1976D2; text-decoration: none; font-size: 14px; }
|
|
||||||
</style>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<h1>Scraper status</h1>
|
|
||||||
<div class="subtitle">maru-hleda-byt</div>
|
|
||||||
|
|
||||||
<div id="content">
|
|
||||||
<div class="loader-wrap">
|
|
||||||
<div class="spinner"></div>
|
|
||||||
<div class="loader-text">Nacitam status...</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="link-row"><a href="mapa_bytu.html">Otevrit mapu</a></div>
|
|
||||||
|
|
||||||
<script>
|
|
||||||
// Brand color per scraper source; keys are the source names lower-cased
// (looked up via s.name.toLowerCase() in render()).
var COLORS = {
sreality: '#1976D2',
realingo: '#7B1FA2',
bezrealitky: '#E65100',
idnes: '#C62828',
psn: '#2E7D32',
cityhome: '#00838F',
};
|
|
||||||
|
|
||||||
// Human-readable "how long ago" label (Czech, deliberately ASCII-only)
// for a parseable date string, relative to the current clock.
function timeAgo(dateStr) {
  var elapsed = Math.floor((new Date() - new Date(dateStr)) / 1000);
  if (elapsed < 60) {
    return 'prave ted';
  }
  if (elapsed < 3600) {
    return Math.floor(elapsed / 60) + ' min zpet';
  }
  if (elapsed < 86400) {
    return Math.floor(elapsed / 3600) + ' hod zpet';
  }
  return Math.floor(elapsed / 86400) + ' dni zpet';
}
|
|
||||||
|
|
||||||
// Format a date string as e.g. "5. brezna 2024, 14:07" — Czech genitive
// month names (ASCII-only) with zero-padded local time.
function formatDate(dateStr) {
  var months = ['ledna','unora','brezna','dubna','kvetna','cervna',
                'cervence','srpna','zari','rijna','listopadu','prosince'];
  var d = new Date(dateStr);
  var pad = function (n) { return String(n).padStart(2, '0'); };
  return d.getDate() + '. ' + months[d.getMonth()] + ' ' + d.getFullYear() +
         ', ' + pad(d.getHours()) + ':' + pad(d.getMinutes());
}
|
|
||||||
|
|
||||||
// Build and inject the dashboard markup for a status payload.  While a
// scrape is in flight ("running"), shows a spinner and re-polls in 30 s.
function render(data) {
  if (data.status === 'running') {
    document.getElementById('content').innerHTML =
      '<div class="loader-wrap">' +
      '<div class="spinner"></div>' +
      '<div class="loader-text">Scraper prave bezi...</div>' +
      '</div>';
    setTimeout(loadStatus, 30000);
    return;
  }

  var sources = data.sources || [];
  var totalOk = 0;
  var totalRej = 0;
  var maxCount = 0;
  // Totals for the summary card; maxCount scales the source bars.
  sources.forEach(function (s) {
    totalOk += s.accepted || 0;
    totalRej += s.rejected || 0;
    if (s.accepted > maxCount) maxCount = s.accepted;
  });

  var parts = [];

  // Timestamp card
  parts.push('<div class="card">');
  parts.push('<h2>Posledni scrape</h2>');
  parts.push('<div class="timestamp">' + formatDate(data.timestamp) + '</div>');
  parts.push('<div class="timestamp-ago">' + timeAgo(data.timestamp) + '</div>');
  if (data.duration_sec) {
    parts.push('<div class="timestamp-ago">Trvani: ' + Math.round(data.duration_sec) + 's</div>');
  }
  parts.push('</div>');

  // Summary card
  parts.push('<div class="card">');
  parts.push('<h2>Souhrn</h2>');
  parts.push('<div class="summary-row"><span class="summary-label">Vyhovujicich bytu</span><span class="summary-value" style="color:#4CAF50">' + totalOk + '</span></div>');
  parts.push('<div class="summary-row"><span class="summary-label">Vyloucenych</span><span class="summary-value" style="color:#999">' + totalRej + '</span></div>');
  if (data.deduplicated !== undefined) {
    parts.push('<div class="summary-row"><span class="summary-label">Po deduplikaci (v mape)</span><span class="summary-value" style="color:#1976D2">' + data.deduplicated + '</span></div>');
  }
  parts.push('</div>');

  // Sources card: one labelled bar per source, scaled to the biggest one.
  parts.push('<div class="card">');
  parts.push('<h2>Zdroje</h2>');
  sources.forEach(function (s) {
    var color = COLORS[s.name.toLowerCase()] || '#999';
    var pct = maxCount > 0 ? Math.round((s.accepted / maxCount) * 100) : 0;
    var badge;
    if (s.error) {
      badge = '<span class="badge badge-err">chyba</span>';
    } else if (s.accepted === 0) {
      badge = '<span class="badge badge-skip">0</span>';
    } else {
      badge = '<span class="badge badge-ok">OK</span>';
    }
    parts.push('<div style="margin-bottom:12px;">');
    parts.push('<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:4px;">');
    parts.push('<span style="font-weight:600;font-size:14px;">' + s.name + ' ' + badge + '</span>');
    parts.push('<span style="font-size:12px;color:#999;">' + (s.rejected || 0) + ' vyloucenych</span>');
    parts.push('</div>');
    parts.push('<div class="bar-row">');
    parts.push('<div class="bar-track"><div class="bar-fill" style="width:' + pct + '%;background:' + color + ';"></div></div>');
    parts.push('<span class="bar-count">' + (s.accepted || 0) + '</span>');
    parts.push('</div>');
    parts.push('</div>');
  });
  parts.push('</div>');

  document.getElementById('content').innerHTML = parts.join('');
}
|
|
||||||
|
|
||||||
// Fetch status.json (cache-busted with a timestamp query) and render it;
// any network/HTTP/parse failure shows a "not available yet" message.
function loadStatus() {
  var url = 'status.json?t=' + Date.now();
  fetch(url)
    .then(function (resp) {
      if (!resp.ok) {
        throw new Error(resp.status);
      }
      return resp.json();
    })
    .then(render)
    .catch(function (err) {
      var msg = '<div class="error-msg">Status zatim neni k dispozici.<br><small>(' + err.message + ')</small></div>';
      document.getElementById('content').innerHTML = msg;
    });
}
|
|
||||||
|
|
||||||
loadStatus();
|
|
||||||
</script>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
Reference in New Issue
Block a user