diff --git a/.gitea/workflows/build.yaml b/.gitea/workflows/build.yaml
new file mode 100644
index 0000000..9563caa
--- /dev/null
+++ b/.gitea/workflows/build.yaml
@@ -0,0 +1,35 @@
+name: Build and Push
+
+on:
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: 'Image tag'
+        required: true
+        default: 'latest'
+  push:
+    tags:
+      - '*'
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Login to Gitea registry
+        run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login -u ${{ github.actor }} --password-stdin gitea.home.hrajfrisbee.cz
+
+      - name: Build and push
+        run: |
+          TAG=${{ github.ref_name }}
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            TAG=${{ inputs.tag }}
+          fi
+          IMAGE=gitea.home.hrajfrisbee.cz/${{ github.repository }}:$TAG
+          docker build -f build/Dockerfile -t $IMAGE .
+          docker push $IMAGE
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..885cbd0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.vscode/
+__pycache__/
+byty_*.json
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..c0fd73f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,71 @@
+IMAGE_NAME := maru-hleda-byt
+CONTAINER_NAME := maru-hleda-byt
+VOLUME_NAME := maru-hleda-byt-data
+VALIDATION_CONTAINER := maru-hleda-byt-validation
+VALIDATION_VOLUME := maru-hleda-byt-validation-data
+PORT := 8080
+
+.PHONY: build run stop logs scrape restart clean help validation validation-local validation-stop validation-local-debug
+
+help:
+	@echo "Available targets:"
+	@echo "  build                  - Build the Docker image"
+	@echo "  run                    - Build and run the Docker container in the background"
+	@echo "  stop                   - Stop and remove the running container"
+	@echo "  logs                   - Show live container logs"
+	@echo "  scrape                 - Run the scraping script inside the container"
+	@echo "  validation             - Run scraping with limits (1 page, 10 properties) in Docker container"
+	@echo "  validation-stop        - Stop the validation Docker container"
+	@echo "  validation-local       - Run scraping with limits (1 page, 10 properties) locally with Python"
+	@echo "  validation-local-debug - Run validation locally with DEBUG logging"
+	@echo "  restart                - Restart the container (stop and run again)"
+	@echo "  clean                  - Stop container and remove the Docker image"
+	@echo "  help                   - Show this help message"
+
+build:
+	docker build -f build/Dockerfile -t $(IMAGE_NAME) .
+
+run: build
+	docker run -d --name $(CONTAINER_NAME) \
+		-p $(PORT):8080 \
+		-v $(VOLUME_NAME):/app/data \
+		--restart unless-stopped \
+		$(IMAGE_NAME)
+	@echo "Map will be at http://localhost:$(PORT)/mapa_bytu.html"
+
+stop:
+	docker stop $(CONTAINER_NAME) && docker rm $(CONTAINER_NAME)
+
+logs:
+	docker logs -f $(CONTAINER_NAME)
+
+scrape:
+	docker exec $(CONTAINER_NAME) bash /app/run_all.sh
+
+validation: build
+	@docker stop $(VALIDATION_CONTAINER) 2>/dev/null || true
+	@docker rm $(VALIDATION_CONTAINER) 2>/dev/null || true
+	docker run -d --name $(VALIDATION_CONTAINER) \
+		-p 8081:8080 \
+		-v $(VALIDATION_VOLUME):/app/data \
+		--restart unless-stopped \
+		$(IMAGE_NAME)
+	@sleep 2
+	docker exec $(VALIDATION_CONTAINER) bash /app/run_all.sh --max-pages 1 --max-properties 10
+	@echo "Validation map will be at http://localhost:8081/mapa_bytu.html"
+
+validation-stop:
+	@docker stop $(VALIDATION_CONTAINER) 2>/dev/null || true
+	@docker rm $(VALIDATION_CONTAINER) 2>/dev/null || true
+	@echo "Validation container stopped and removed"
+
+validation-local:
+	./run_all.sh --max-pages 1 --max-properties 10
+
+validation-local-debug:
+	./run_all.sh --max-pages 1 --max-properties 10 --log-level DEBUG
+
+restart: stop run
+
+clean: stop
+	docker rmi $(IMAGE_NAME)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..cd7e52f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,244 @@
+# Maru hleda byt
+
+Apartment search aggregator for Prague. Scrapes listings from 6 Czech real estate portals, filters them by configurable criteria, deduplicates across sources, and generates a single interactive map with all matching apartments.
+
+Built for a specific use case: finding a 3+kk or larger apartment in Prague, excluding panel construction ("panelak") and housing estates ("sidliste"), with personal rating support.
+
+## How it works
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                         run_all.sh                          │
+│   Orchestrates all scrapers, then merges results into map   │
+├─────────┬──────────┬──────────┬────────┬────────┬───────────┤
+│Sreality │Realingo  │Bezreal.  │iDNES   │PSN     │CityHome   │
+│ (API)   │ (HTML)   │ (HTML)   │ (HTML) │ (HTML) │ (HTML)    │
+├─────────┴──────────┴──────────┴────────┴────────┴───────────┤
+│                      merge_and_map.py                       │
+│   Loads all byty_*.json, deduplicates, generates HTML map   │
+├─────────────────────────────────────────────────────────────┤
+│                       mapa_bytu.html                        │
+│      Interactive Leaflet.js map with filters & ratings      │
+└─────────────────────────────────────────────────────────────┘
+```
+
+### Pipeline
+
+1. **Scraping** -- Each scraper independently fetches listings from its portal, applies filters, and saves results to a JSON file (`byty_<source>.json`).
+2. **Merging** -- `merge_and_map.py` loads all 6 JSON files, deduplicates listings (by street name + price + area), and generates the final `mapa_bytu.html`.
+3. **Serving** -- The HTML map can be opened locally as a file, or served via Docker with a built-in HTTP server.
+
+### Execution order in `run_all.sh`
+
+Scrapers run sequentially (to avoid overwhelming any single portal), except PSN and CityHome which run in parallel (different sites). If a scraper fails, it is logged but does not abort the pipeline -- remaining scrapers continue.
+
+```
+1. scrape_and_map.py      (Sreality)
+2. scrape_realingo.py     (Realingo)
+3. scrape_bezrealitky.py  (Bezrealitky)
+4. scrape_idnes.py        (iDNES Reality)
+5. scrape_psn.py + scrape_cityhome.py   (parallel)
+6. merge_and_map.py       (merge + map generation)
+```
+
+## Scrapers
+
+All scrapers share the same CLI interface and a consistent two-phase approach:
+
+1. **Phase 1** -- Fetch listing pages (paginated) to get a list of all available apartments.
+2. **Phase 2** -- Fetch detail pages for each listing to get floor, construction type, and other data needed for filtering.
+
+Each scraper uses a **JSON file cache**: if a listing's `hash_id` and `price` haven't changed since the last run, the cached data is reused and the detail page is not re-fetched. This significantly reduces runtime on subsequent runs.
+
+### Source details
+
+| Scraper | Portal | Data source | Output file | Notes |
+|---------|--------|-------------|-------------|-------|
+| `scrape_and_map.py` | [Sreality.cz](https://sreality.cz) | REST API (JSON) | `byty_sreality.json` | Main scraper. Also contains `generate_map()` used by all other scripts. |
+| `scrape_realingo.py` | [Realingo.cz](https://realingo.cz) | `__NEXT_DATA__` JSON in HTML | `byty_realingo.json` | Next.js app, data extracted from server-side props. |
+| `scrape_bezrealitky.py` | [Bezrealitky.cz](https://bezrealitky.cz) | `__NEXT_DATA__` Apollo cache in HTML | `byty_bezrealitky.json` | Next.js app with Apollo GraphQL cache in page source. |
+| `scrape_idnes.py` | [Reality iDNES](https://reality.idnes.cz) | HTML parsing (regex) | `byty_idnes.json` | Traditional HTML site. GPS extracted from `dataLayer.push()`. Retry logic with 5 attempts and exponential backoff. |
+| `scrape_psn.py` | [PSN.cz](https://psn.cz) | RSC (React Server Components) escaped JSON in HTML | `byty_psn.json` | Uses `curl` instead of `urllib` due to Cloudflare SSL issues. Hardcoded list of Prague projects with GPS coordinates. |
+| `scrape_cityhome.py` | [city-home.cz](https://city-home.cz) | HTML table parsing (data attributes on `<tr>`) | `byty_cityhome.json` | CityHome/SATPO developer projects. GPS fetched from project locality pages. |
+
+### Scraper filter criteria
+
+All scrapers apply the same core filters (with minor per-source variations):
+
+| Filter | Value | Notes |
+|--------|-------|-------|
+| **Max price** | 13 500 000 CZK | PSN and CityHome use 14 000 000 CZK |
+| **Min area** | 69 m^2 | |
+| **Min floor** | 2. NP (2nd floor) | 2nd floor apartments are included but flagged on the map |
+| **Dispositions** | 3+kk, 3+1, 4+kk, 4+1, 5+kk, 5+1, 6+ | |
+| **Region** | Praha | |
+| **Construction** | Excludes panel ("panelak") | |
+| **Location** | Excludes housing estates ("sidliste") | |
+
+## Utility scripts
+
+### `merge_and_map.py`
+
+Merges all `byty_*.json` files into `byty_merged.json` and generates `mapa_bytu.html`.
+
+**Deduplication logic:** Two listings are considered duplicates if they share the same normalized street name + price + area. PSN and CityHome have priority during dedup (loaded first), so their listings are kept over duplicates from other portals.
+
+### `regen_map.py`
+
+Regenerates the map from existing `byty_sreality.json` data without re-scraping. Fetches missing area values from the Sreality API, fixes URLs, and re-applies the area filter. Useful for tweaking map output after data has already been collected.
+
+## Interactive map (`mapa_bytu.html`)
+
+The generated map is a standalone HTML file using Leaflet.js with CARTO basemap tiles. Features:
+
+- **Color-coded markers** by disposition (3+kk = blue, 3+1 = green, 4+kk = orange, etc.)
+- **Heart-shaped markers** for PSN and CityHome listings (developer favorites)
+- **Source badge** in each popup (Sreality, Realingo, Bezrealitky, iDNES, PSN, CityHome)
+- **Client-side filters:** minimum floor, maximum price, hide rejected
+- **Rating system** (persisted in `localStorage`):
+  - Star -- mark as favorite (enlarged marker with pulsing glow)
+  - Reject -- dim the marker, optionally hide it
+  - Notes -- free-text notes per listing
+- **2nd floor warning** -- listings on 2. NP show an orange warning in the popup
+- **Statistics panel** -- total count, price range, average price, disposition breakdown
+
+## CLI arguments
+
+All scrapers accept the same arguments. When run via `run_all.sh`, these arguments are forwarded to every scraper.
+
+```
+--max-pages N        Maximum number of listing pages to scrape per source.
+                     Limits the breadth of the initial listing fetch.
+                     (For PSN: max pages per project)
+
+--max-properties N   Maximum number of properties to fetch details for per source.
+                     Limits the depth of the detail-fetching phase.
+
+--log-level LEVEL    Logging verbosity. One of: DEBUG, INFO, WARNING, ERROR.
+                     Default: INFO.
+                     DEBUG shows HTTP request/response details, filter decisions
+                     for every single listing, and cache hit/miss info.
+
+-h, --help           Show help message (run_all.sh only).
+```
+
+### Examples
+
+```bash
+# Full scrape (all pages, all properties)
+./run_all.sh
+
+# Quick validation run (1 page per source, max 10 properties each)
+./run_all.sh --max-pages 1 --max-properties 10
+
+# Full scrape with debug logging
+./run_all.sh --log-level DEBUG
+
+# Run a single scraper
+python3 scrape_bezrealitky.py --max-pages 2 --max-properties 5 --log-level DEBUG
+```
+
+## Running with Docker
+
+The project includes a Docker setup for unattended operation with a cron-based schedule.
+
+### Container architecture
+
+```
+┌─────────────────────────────────────────┐
+│ Container (python:3.13-alpine)          │
+│                                         │
+│ PID 1: python3 -m http.server :8080     │
+│        serves /app/data/                │
+│                                         │
+│ crond: runs run_all.sh at 06:00/18:00   │
+│        Europe/Prague timezone           │
+│                                         │
+│ /app/      -- scripts (.py, .sh)        │
+│ /app/data/ -- volume (JSON + HTML)      │
+│     ^ symlinked from /app/byty_*        │
+└─────────────────────────────────────────┘
+```
+
+On startup, the HTTP server starts immediately. The initial scrape runs in the background. Subsequent cron runs update data in-place twice daily at 06:00 and 18:00 CET/CEST.
+
+### Quick start
+
+```bash
+make run   # Build image + start container on port 8080
+# Map available at http://localhost:8080/mapa_bytu.html
+```
+
+### Makefile targets
+
+| Target | Description |
+|--------|-------------|
+| `make help` | Show all available targets |
+| `make build` | Build the Docker image |
+| `make run` | Build and run the container (port 8080) |
+| `make stop` | Stop and remove the container |
+| `make logs` | Tail container logs |
+| `make scrape` | Trigger a manual scrape inside the running container |
+| `make restart` | Stop and re-run the container |
+| `make clean` | Stop container and remove the Docker image |
+| `make validation` | Run a limited scrape in a separate Docker container (port 8081) |
+| `make validation-stop` | Stop the validation container |
+| `make validation-local` | Run a limited scrape locally (1 page, 10 properties) |
+| `make validation-local-debug` | Same as above with `--log-level DEBUG` |
+
+### Validation mode
+
+Validation targets run scrapers with `--max-pages 1 --max-properties 10` for a fast smoke test (~30 seconds instead of several minutes). The Docker validation target runs on port 8081 in a separate container so it doesn't interfere with production data.
+
+## Project structure
+
+```
+.
+├── scrape_and_map.py      # Sreality scraper + map generator (generate_map())
+├── scrape_realingo.py     # Realingo scraper
+├── scrape_bezrealitky.py  # Bezrealitky scraper
+├── scrape_idnes.py        # iDNES Reality scraper
+├── scrape_psn.py          # PSN scraper
+├── scrape_cityhome.py     # CityHome scraper
+├── merge_and_map.py       # Merge all sources + generate final map
+├── regen_map.py           # Regenerate map from cached Sreality data
+├── run_all.sh             # Orchestrator script (runs all scrapers + merge)
+├── mapa_bytu.html         # Generated interactive map (output)
+├── Makefile               # Docker management + validation shortcuts
+├── build/
+│   ├── Dockerfile         # Container image definition (python:3.13-alpine)
+│   ├── entrypoint.sh      # Container entrypoint (HTTP server + cron + initial scrape)
+│   ├── crontab            # Cron schedule (06:00 and 18:00 CET)
+│   └── CONTAINER.md       # Container-specific documentation
+└── .gitignore             # Ignores byty_*.json, __pycache__, .vscode
+```
+
+## Dependencies
+
+**None.** All scrapers use only the Python standard library (`urllib`, `json`, `re`, `argparse`, `logging`, `html.parser`). The only external tool required is `curl` (used by `scrape_psn.py` for Cloudflare TLS compatibility).
+
+The Docker image is based on `python:3.13-alpine` (~70 MB) with `curl`, `bash`, and `tzdata` added.
+
+## Caching behavior
+
+Each scraper maintains a JSON file cache (`byty_<source>.json`). On each run:
+
+1. The previous JSON file is loaded and indexed by `hash_id`.
+2. For each listing found in the current run, if the `hash_id` exists in cache **and** the price is unchanged, the cached record is reused without fetching the detail page.
+3. New or changed listings trigger a detail page fetch.
+4. The JSON file is overwritten with the fresh results at the end.
+
+This means the first run is slow (fetches every detail page with rate-limiting delays), but subsequent runs are much faster as they only fetch details for new or changed listings.
+
+## Rate limiting
+
+Each scraper includes polite delays between requests:
+
+| Scraper | Delay between requests |
+|---------|----------------------|
+| Sreality | 0.3s (details), 0.5s (pages) |
+| Realingo | 0.3s (details), 0.5s (pages) |
+| Bezrealitky | 0.4s (details), 0.5s (pages) |
+| iDNES | 0.4s (details), 1.0s (pages) + retry backoff (3/6/9/12s) |
+| PSN | 0.5s (per project page) |
+| CityHome | 0.5s (per project GPS fetch) |
diff --git a/build/.dockerignore b/build/.dockerignore
new file mode 100644
index 0000000..285b8b5
--- /dev/null
+++ b/build/.dockerignore
@@ -0,0 +1,5 @@
+.git
+mapa_bytu.html
+byty_*.json
+*.pyc
+__pycache__
diff --git a/build/CONTAINER.md b/build/CONTAINER.md
new file mode 100644
index 0000000..dbb3820
--- /dev/null
+++ b/build/CONTAINER.md
@@ -0,0 +1,100 @@
+# Container Setup
+
+OCI container image for the apartment finder. Runs two processes:
+
+1. **Web server** (`python3 -m http.server`) serving `mapa_bytu.html` on port 8080
+2. **Cron job** running `run_all.sh` (all 6 scrapers + merge) every 12 hours
+
+## Architecture
+
+```
+┌─────────────────────────────────────────┐
+│ Container (python:3.13-alpine)          │
+│                                         │
+│ PID 1: python3 -m http.server :8080     │
+│        serves /app/data/                │
+│                                         │
+│ crond: runs run_all.sh at 06:00/18:00   │
+│        Europe/Prague timezone           │
+│                                         │
+│ /app/      ← scripts (.py, .sh)         │
+│ /app/data/ ← volume (JSON + HTML)       │
+│     ↑ symlinked from /app/byty_*        │
+└─────────────────────────────────────────┘
+```
+
+On startup, the web server starts immediately. The initial scrape runs in the background and populates data as it completes. Subsequent cron runs update the data in-place.
+
+## Build and Run
+
+```bash
+# Build the image
+docker build -t maru-hleda-byt .
+
+# Run with persistent data volume
+docker run -d --name maru-hleda-byt \
+  -p 8080:8080 \
+  -v maru-hleda-byt-data:/app/data \
+  --restart unless-stopped \
+  maru-hleda-byt
+```
+
+Access the map at **http://localhost:8080/mapa_bytu.html**
+
+## Volume Persistence
+
+A named volume `maru-hleda-byt-data` stores:
+
+- `byty_*.json` — cached scraper data (6 source files + 1 merged)
+- `mapa_bytu.html` — the generated interactive map
+
+The JSON cache is important: each scraper skips re-fetching properties that haven't changed. Without the volume, every container restart triggers a full re-scrape of all 6 portals (several minutes with rate limiting).
+
+## Cron Schedule
+
+Scrapers run at **06:00** and **18:00 Europe/Prague time** (CET/CEST).
+
+Cron output is forwarded to the container's stdout/stderr, visible via `docker logs`.
+
+## Operations
+
+```bash
+# View logs (including cron and scraper output)
+docker logs -f maru-hleda-byt
+
+# Check cron schedule
+docker exec maru-hleda-byt crontab -l
+
+# Trigger a manual scrape
+docker exec maru-hleda-byt bash /app/run_all.sh
+
+# Stop / start (data persists in volume)
+docker stop maru-hleda-byt
+docker start maru-hleda-byt
+
+# Rebuild after code changes
+docker stop maru-hleda-byt && docker rm maru-hleda-byt
+docker build -t maru-hleda-byt .
+docker run -d --name maru-hleda-byt \
+  -p 8080:8080 \
+  -v maru-hleda-byt-data:/app/data \
+  --restart unless-stopped \
+  maru-hleda-byt
+```
+
+## Troubleshooting
+
+**Map shows 404**: The initial background scrape hasn't finished yet. Check `docker logs` for progress. First run takes a few minutes due to rate-limited API calls.
+
+**SSL errors from PSN scraper**: The `scrape_psn.py` uses `curl` (not Python urllib) specifically for Cloudflare SSL compatibility. Alpine's curl includes modern TLS via OpenSSL, so this should work. If not, check that `ca-certificates` is installed (`apk add ca-certificates`).
+
+**Health check failing**: The health check has a 5-minute start period to allow the initial scrape to complete. If it still fails, verify the HTTP server is running: `docker exec maru-hleda-byt wget -q -O /dev/null http://localhost:8080/`.
+
+**Timezone verification**: `docker exec maru-hleda-byt date` should show Czech time.
+
+## Image Details
+
+- **Base**: `python:3.13-alpine` (~55 MB)
+- **Added packages**: `curl`, `bash`, `tzdata` (~10 MB)
+- **No pip packages** — all scrapers use Python standard library only
+- **Approximate image size**: ~70 MB
diff --git a/build/Dockerfile b/build/Dockerfile
new file mode 100644
index 0000000..f672cee
--- /dev/null
+++ b/build/Dockerfile
@@ -0,0 +1,26 @@
+FROM python:3.13-alpine
+
+RUN apk add --no-cache curl bash tzdata \
+    && cp /usr/share/zoneinfo/Europe/Prague /etc/localtime \
+    && echo "Europe/Prague" > /etc/timezone
+
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \
+     scrape_idnes.py scrape_psn.py scrape_cityhome.py \
+     merge_and_map.py regen_map.py run_all.sh ./
+
+COPY build/crontab /etc/crontabs/root
+COPY build/entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh run_all.sh
+
+RUN mkdir -p /app/data
+
+EXPOSE 8080
+
+HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \
+    CMD wget -q -O /dev/null http://localhost:8080/ || exit 1
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/build/Makefile b/build/Makefile
new file mode 100644
index 0000000..a429a61
--- /dev/null
+++ b/build/Makefile
@@ -0,0 +1,31 @@
+IMAGE_NAME := maru-hleda-byt
+CONTAINER_NAME := maru-hleda-byt
+VOLUME_NAME := maru-hleda-byt-data
+PORT := 8080
+
+.PHONY: build run stop logs scrape restart clean
+
+build:
+	docker build -f build/Dockerfile -t $(IMAGE_NAME) .
+
+run: build
+	docker run -d --name $(CONTAINER_NAME) \
+		-p $(PORT):8080 \
+		-v $(VOLUME_NAME):/app/data \
+		--restart unless-stopped \
+		$(IMAGE_NAME)
+	@echo "Map will be at http://localhost:$(PORT)/mapa_bytu.html"
+
+stop:
+	docker stop $(CONTAINER_NAME) && docker rm $(CONTAINER_NAME)
+
+logs:
+	docker logs -f $(CONTAINER_NAME)
+
+scrape:
+	docker exec $(CONTAINER_NAME) bash /app/run_all.sh
+
+restart: stop run
+
+clean: stop
+	docker rmi $(IMAGE_NAME)
diff --git a/build/crontab b/build/crontab
new file mode 100644
index 0000000..1b3dfd8
--- /dev/null
+++ b/build/crontab
@@ -0,0 +1 @@
+0 6,18 * * * cd /app && bash /app/run_all.sh >> /proc/1/fd/1 2>> /proc/1/fd/2
diff --git a/build/entrypoint.sh b/build/entrypoint.sh
new file mode 100644
index 0000000..032afe5
--- /dev/null
+++ b/build/entrypoint.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -euo pipefail
+
+DATA_DIR="/app/data"
+
+# Create symlinks so scripts (which write to /app/) persist data to the volume
+for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \
+         byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \
+         mapa_bytu.html; do
+    # Remove real file if it exists (e.g. baked into image)
+    [ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f"
+    ln -sf "$DATA_DIR/$f" "/app/$f"
+done
+
+echo "[entrypoint] Starting crond..."
+crond -b -l 2
+
+echo "[entrypoint] Starting initial scrape in background..."
+bash /app/run_all.sh &
+
+echo "[entrypoint] Starting HTTP server on port 8080..."
+exec python3 -m http.server 8080 --directory "$DATA_DIR"
diff --git a/mapa_bytu.html b/mapa_bytu.html
index fc6888c..2bb58eb 100644
--- a/mapa_bytu.html
+++ b/mapa_bytu.html
@@ -3,7 +3,7 @@
 
 
 
-Byty v Praze — mapa (710 bytů)
+Byty v Praze — mapa (62 bytů)
 
 
 