From 8373e5e9107aa4b1ea9638b4840486a14ccbf18d Mon Sep 17 00:00:00 2001
From: Jan Novak
Date: Sat, 14 Feb 2026 22:18:02 +0100
Subject: [PATCH 1/4] add docker build, makefile, and some more shit before we move forward

---
 .gitignore          |   3 ++
 Makefile            |  42 +++++++++++++++++++
 build/.dockerignore |   5 +++
 build/CONTAINER.md  | 100 ++++++++++++++++++++++++++++++++++++++++++++
 build/Dockerfile    |  26 ++++++++++++
 build/Makefile      |  31 ++++++++++++++
 build/crontab       |   1 +
 build/entrypoint.sh |  22 ++++++++++
 mapa_bytu.html      |  51 +++++++++++-----------
 project/todo.md     |  14 +++++++
 run_all.sh          |   2 +-
 11 files changed, 271 insertions(+), 26 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 build/.dockerignore
 create mode 100644 build/CONTAINER.md
 create mode 100644 build/Dockerfile
 create mode 100644 build/Makefile
 create mode 100644 build/crontab
 create mode 100644 build/entrypoint.sh
 create mode 100644 project/todo.md
 mode change 100644 => 100755 run_all.sh

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..885cbd0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.vscode/
+__pycache__/
+byty_*.json
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..b7d8d02
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,42 @@
+IMAGE_NAME := maru-hleda-byt
+CONTAINER_NAME := maru-hleda-byt
+VOLUME_NAME := maru-hleda-byt-data
+PORT := 8080
+
+.PHONY: build run stop logs scrape restart clean help
+
+help: # first rule in the file, so a bare "make" prints this list
+	@echo "Available targets:"
+	@echo " build - Build the Docker image"
+	@echo " run - Build and run the Docker container in the background"
+	@echo " stop - Stop and remove the running container"
+	@echo " logs - Show live container logs"
+	@echo " scrape - Run the scraping script inside the container"
+	@echo " restart - Restart the container (stop and run again)"
+	@echo " clean - Stop container and remove the Docker image"
+	@echo " help - Show this help message"
+
+build: # build context is the repo root; the Dockerfile itself lives in build/
+	docker build -f build/Dockerfile -t $(IMAGE_NAME) .
+
+run: build # (re)build the image, then start the container detached
+	docker run -d --name $(CONTAINER_NAME) \
+		-p $(PORT):8080 \
+		-v $(VOLUME_NAME):/app/data \
+		--restart unless-stopped \
+		$(IMAGE_NAME)
+	@echo "Map will be at http://localhost:$(PORT)/mapa_bytu.html"
+
+stop: # leading "-": a missing container must not fail stop, restart or clean
+	-docker stop $(CONTAINER_NAME) && docker rm $(CONTAINER_NAME)
+
+logs: # follow the container's stdout/stderr
+	docker logs -f $(CONTAINER_NAME)
+
+scrape: # run the scrapers once inside the already-running container
+	docker exec $(CONTAINER_NAME) bash /app/run_all.sh
+# "run" goes through a sub-make so it cannot start before "stop" finishes (make -j)
+restart: stop ; $(MAKE) run
+
+clean: stop # remove the container first, then the image
+	docker rmi $(IMAGE_NAME)
diff --git a/build/.dockerignore b/build/.dockerignore
new file mode 100644
index 0000000..285b8b5
--- /dev/null
+++ b/build/.dockerignore
@@ -0,0 +1,5 @@
+.git
+mapa_bytu.html
+byty_*.json
+*.pyc
+__pycache__
diff --git a/build/CONTAINER.md b/build/CONTAINER.md
new file mode 100644
index 0000000..dbb3820
--- /dev/null
+++ b/build/CONTAINER.md
@@ -0,0 +1,100 @@
+# Container Setup
+
+OCI container image for the apartment finder. Runs two processes:
+
+1. **Web server** (`python3 -m http.server`) serving `mapa_bytu.html` on port 8080
+2. **Cron job** running `run_all.sh` (all 6 scrapers + merge) every 12 hours
+
+## Architecture
+
+```
+┌─────────────────────────────────────────┐
+│  Container (python:3.13-alpine)         │
+│                                         │
+│  PID 1: python3 -m http.server :8080    │
+│         serves /app/data/               │
+│                                         │
+│  crond: runs run_all.sh at 06:00/18:00  │
+│         Europe/Prague timezone          │
+│                                         │
+│  /app/       ← scripts (.py, .sh)       │
+│  /app/data/  ← volume (JSON + HTML)     │
+│        ↑ symlinked from /app/byty_*     │
+└─────────────────────────────────────────┘
+```
+
+On startup, the web server starts immediately. The initial scrape runs in the background and populates data as it completes. Subsequent cron runs update the data in-place.
+
+## Build and Run
+
+```bash
+# Build the image (run from the repository root)
+docker build -f build/Dockerfile -t maru-hleda-byt .
+
+# Run with persistent data volume
+docker run -d --name maru-hleda-byt \
+  -p 8080:8080 \
+  -v maru-hleda-byt-data:/app/data \
+  --restart unless-stopped \
+  maru-hleda-byt
+```
+
+Access the map at **http://localhost:8080/mapa_bytu.html**
+
+## Volume Persistence
+
+A named volume `maru-hleda-byt-data` stores:
+
+- `byty_*.json` — cached scraper data (6 source files + 1 merged)
+- `mapa_bytu.html` — the generated interactive map
+
+The JSON cache is important: each scraper skips re-fetching properties that haven't changed. Without the volume, every container restart triggers a full re-scrape of all 6 portals (several minutes with rate limiting).
+
+## Cron Schedule
+
+Scrapers run at **06:00** and **18:00 Europe/Prague time** (CET/CEST).
+
+Cron output is forwarded to the container's stdout/stderr, visible via `docker logs`.
+
+## Operations
+
+```bash
+# View logs (including cron and scraper output)
+docker logs -f maru-hleda-byt
+
+# Check cron schedule
+docker exec maru-hleda-byt crontab -l
+
+# Trigger a manual scrape
+docker exec maru-hleda-byt bash /app/run_all.sh
+
+# Stop / start (data persists in volume)
+docker stop maru-hleda-byt
+docker start maru-hleda-byt
+
+# Rebuild after code changes (run from the repository root)
+docker stop maru-hleda-byt && docker rm maru-hleda-byt
+docker build -f build/Dockerfile -t maru-hleda-byt .
+docker run -d --name maru-hleda-byt \
+  -p 8080:8080 \
+  -v maru-hleda-byt-data:/app/data \
+  --restart unless-stopped \
+  maru-hleda-byt
+```
+
+## Troubleshooting
+
+**Map shows 404**: The initial background scrape hasn't finished yet. Check `docker logs` for progress. First run takes a few minutes due to rate-limited API calls.
+
+**SSL errors from PSN scraper**: The `scrape_psn.py` uses `curl` (not Python urllib) specifically for Cloudflare SSL compatibility. Alpine's curl includes modern TLS via OpenSSL, so this should work. If not, check that `ca-certificates` is installed (`apk add ca-certificates`).
+
+**Health check failing**: The health check has a 5-minute start period to allow the initial scrape to complete. If it still fails, verify the HTTP server is running: `docker exec maru-hleda-byt wget -q -O /dev/null http://localhost:8080/`.
+
+**Timezone verification**: `docker exec maru-hleda-byt date` should show Czech time.
+
+## Image Details
+
+- **Base**: `python:3.13-alpine` (~55 MB)
+- **Added packages**: `curl`, `bash`, `tzdata` (~10 MB)
+- **No pip packages** — all scrapers use Python standard library only
+- **Approximate image size**: ~70 MB
diff --git a/build/Dockerfile b/build/Dockerfile
new file mode 100644
index 0000000..f672cee
--- /dev/null
+++ b/build/Dockerfile
@@ -0,0 +1,26 @@
+FROM python:3.13-alpine
+
+RUN apk add --no-cache curl bash tzdata \
+    && cp /usr/share/zoneinfo/Europe/Prague /etc/localtime \
+    && echo "Europe/Prague" > /etc/timezone
+
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \
+     scrape_idnes.py scrape_psn.py scrape_cityhome.py \
+     merge_and_map.py regen_map.py run_all.sh ./
+
+COPY build/crontab /etc/crontabs/root
+COPY build/entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh run_all.sh
+
+RUN mkdir -p /app/data
+
+EXPOSE 8080
+
+HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \
+    CMD wget -q -O /dev/null http://localhost:8080/ || exit 1
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/build/Makefile b/build/Makefile
new file mode 100644
index 0000000..a429a61
--- /dev/null
+++ b/build/Makefile
@@ -0,0 +1,31 @@
+IMAGE_NAME := maru-hleda-byt
+CONTAINER_NAME := maru-hleda-byt
+VOLUME_NAME := maru-hleda-byt-data
+PORT := 8080
+BUILD_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+.PHONY: build run stop logs scrape restart clean
+
+build: # paths are anchored to this Makefile so "make -C build" and "make -f build/Makefile" both use the repo root as context
+	docker build -f $(BUILD_DIR)Dockerfile -t $(IMAGE_NAME) $(BUILD_DIR)..
+
+run: build # (re)build the image, then start the container detached
+	docker run -d --name $(CONTAINER_NAME) \
+		-p $(PORT):8080 \
+		-v $(VOLUME_NAME):/app/data \
+		--restart unless-stopped \
+		$(IMAGE_NAME)
+	@echo "Map will be at http://localhost:$(PORT)/mapa_bytu.html"
+
+stop: # leading "-": a missing container must not fail stop, restart or clean
+	-docker stop $(CONTAINER_NAME) && docker rm $(CONTAINER_NAME)
+
+logs: # follow the container's stdout/stderr
+	docker logs -f $(CONTAINER_NAME)
+
+scrape: # run the scrapers once inside the already-running container
+	docker exec $(CONTAINER_NAME) bash /app/run_all.sh
+# "run" goes through a sub-make so it cannot start before "stop" finishes (make -j)
+restart: stop ; $(MAKE) run
+
+clean: stop # remove the container first, then the image
+	docker rmi $(IMAGE_NAME)
diff --git a/build/crontab b/build/crontab
new file mode 100644
index 0000000..1b3dfd8
--- /dev/null
+++ b/build/crontab
@@ -0,0 +1 @@
+0 6,18 * * * cd /app && bash /app/run_all.sh >> /proc/1/fd/1 2>> /proc/1/fd/2
diff --git a/build/entrypoint.sh b/build/entrypoint.sh
new file mode 100644
index 0000000..032afe5
--- /dev/null
+++ b/build/entrypoint.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -euo pipefail
+
+DATA_DIR="/app/data"
+
+# Create symlinks so scripts (which write to /app/) persist data to the volume
+for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \
+         byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \
+         mapa_bytu.html; do
+    # Remove real file if it exists (e.g. baked into image)
+    [ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f"
+    ln -sf "$DATA_DIR/$f" "/app/$f"
+done
+
+echo "[entrypoint] Starting crond..."
+crond -b -l 2
+
+echo "[entrypoint] Starting initial scrape in background..."
+bash /app/run_all.sh &
+
+echo "[entrypoint] Starting HTTP server on port 8080..."
+exec python3 -m http.server 8080 --directory "$DATA_DIR"
diff --git a/mapa_bytu.html b/mapa_bytu.html
index fc6888c..9b0e5ac 100644
--- a/mapa_bytu.html
+++ b/mapa_bytu.html
@@ -3,7 +3,7 @@
-Byty v Praze — mapa (710 bytů)
+Byty v Praze — mapa (711 bytů)