diff --git a/build/crontab b/build/crontab index 1b3dfd8..f822865 100644 --- a/build/crontab +++ b/build/crontab @@ -1 +1 @@ -0 6,18 * * * cd /app && bash /app/run_all.sh >> /proc/1/fd/1 2>> /proc/1/fd/2 +0 6,18 * * * cd /app && bash /app/run_all.sh --data-dir /app/data >> /proc/1/fd/1 2>> /proc/1/fd/2 diff --git a/build/entrypoint.sh b/build/entrypoint.sh index 032afe5..786526c 100644 --- a/build/entrypoint.sh +++ b/build/entrypoint.sh @@ -3,20 +3,11 @@ set -euo pipefail DATA_DIR="/app/data" -# Create symlinks so scripts (which write to /app/) persist data to the volume -for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \ - byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \ - mapa_bytu.html; do - # Remove real file if it exists (e.g. baked into image) - [ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f" - ln -sf "$DATA_DIR/$f" "/app/$f" -done - echo "[entrypoint] Starting crond..." crond -b -l 2 echo "[entrypoint] Starting initial scrape in background..." -bash /app/run_all.sh & +bash /app/run_all.sh --data-dir "$DATA_DIR" & echo "[entrypoint] Starting HTTP server on port 8080..." exec python3 -m http.server 8080 --directory "$DATA_DIR" diff --git a/build/run.sh b/build/run.sh new file mode 100644 index 0000000..38f2234 --- /dev/null +++ b/build/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +docker rm -f maru-hleda-byt + +# gitea registry login with kacerr / token +docker run -d --name maru-hleda-byt \ + -p 8080:8080 \ + -v /srv/maru-hleda-byt/data:/app/data \ + gitea.home.hrajfrisbee.cz/littlemeat/maru-hleda-byt:0.01 \ No newline at end of file diff --git a/merge_and_map.py b/merge_and_map.py index 1a6a50e..ecc2ca7 100644 --- a/merge_and_map.py +++ b/merge_and_map.py @@ -7,6 +7,7 @@ PSN a CityHome mají při deduplikaci prioritu (načtou se první). """ from __future__ import annotations +import argparse import json import re from pathlib import Path @@ -40,7 +41,7 @@ def dedup_key(estate: dict) -> str: return f"{street}_{price}_{area}" -def main(): +def main(data_dir: str = "."): # Definice zdrojů — PSN a CityHome jako první (mají prioritu při deduplikaci) sources = [ ("PSN", "byty_psn.json"), @@ -51,10 +52,11 @@ def main(): ("iDNES", "byty_idnes.json"), ] + data_path = Path(data_dir) all_estates = [] for label, filename in sources: - path = Path(filename) + path = data_path / filename if path.exists(): data = json.loads(path.read_text(encoding="utf-8")) # Ensure source is set (Sreality legacy) @@ -111,7 +113,7 @@ def main(): print(f" {src}: {count}") # Save merged data - merged_path = Path("byty_merged.json") + merged_path = data_path / "byty_merged.json" merged_path.write_text( json.dumps(deduplicated, ensure_ascii=False, indent=2), encoding="utf-8", @@ -119,8 +121,12 @@ def main(): print(f"\n✓ Sloučená data: {merged_path.resolve()}") # Generate map - generate_map(deduplicated) + generate_map(deduplicated, output_path=str(data_path / "mapa_bytu.html")) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser(description="Merge scraped data and generate map") + parser.add_argument("--data-dir", type=str, default=".", + help="Directory for reading/writing data files (default: current dir)") + args = parser.parse_args() + main(data_dir=args.data_dir) diff --git a/regen_map.py b/regen_map.py index 0d8f6f3..a4fbcbe 100644 --- a/regen_map.py +++ b/regen_map.py @@ -5,6 +5,7 @@ Doplní chybějící plochy ze Sreality API, opraví URL, aplikuje filtry. """ from __future__ import annotations +import argparse import json import time import urllib.request @@ -57,8 +58,9 @@ def fetch_area(hash_id: int) -> int | None: return None -def main(): - json_path = Path("byty_sreality.json") +def main(data_dir: str = "."): + data_path = Path(data_dir) + json_path = data_path / "byty_sreality.json" if not json_path.exists(): print("Soubor byty_sreality.json nenalezen. Nejprve spusť scrape_and_map.py") return @@ -100,15 +102,19 @@ def main(): print(f"Zbývá: {len(filtered)} bytů") # Save updated data - filtered_path = Path("byty_sreality.json") + filtered_path = data_path / "byty_sreality.json" filtered_path.write_text( json.dumps(filtered, ensure_ascii=False, indent=2), encoding="utf-8", ) # Generate map - generate_map(filtered) + generate_map(filtered, output_path=str(data_path / "mapa_bytu.html")) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser(description="Regenerate map from existing data") + parser.add_argument("--data-dir", type=str, default=".", + help="Directory for reading/writing data files (default: current dir)") + args = parser.parse_args() + main(data_dir=args.data_dir) diff --git a/run_all.sh b/run_all.sh index 1e347ec..b81d167 100755 --- a/run_all.sh +++ b/run_all.sh @@ -4,6 +4,7 @@ # Použití: ./run_all.sh # Nebo s limity: ./run_all.sh --max-pages 1 --max-properties 10 # Nebo s logováním: ./run_all.sh --log-level DEBUG +# S vlastním adresářem: ./run_all.sh --data-dir /app/data # ============================================================ set -euo pipefail cd "$(dirname "$0")" @@ -26,16 +27,19 @@ show_help() { echo " --max-pages N Maximální počet stránek ke stažení z každého zdroje" echo " --max-properties N Maximální počet nemovitostí ke stažení z každého zdroje" echo " --log-level LEVEL Úroveň logování (DEBUG, INFO, WARNING, ERROR)" + echo " --data-dir DIR Adresář pro čtení/zápis datových souborů (default: .)" echo " -h, --help Zobrazí tuto nápovědu" echo "" echo "Examples:" echo " ./run_all.sh # plný běh" echo " ./run_all.sh --max-pages 1 --max-properties 10 # rychlý test" echo " ./run_all.sh --log-level DEBUG # s debug logováním" + echo " ./run_all.sh --data-dir /app/data # Docker produkce" } # Parse arguments SCRAPER_ARGS="" +DATA_DIR="." while [[ $# -gt 0 ]]; do case $1 in -h|--help) @@ -46,6 +50,10 @@ while [[ $# -gt 0 ]]; do SCRAPER_ARGS="$SCRAPER_ARGS $1 $2" shift 2 ;; + --data-dir) + DATA_DIR="$2" + shift 2 + ;; *) echo "Unknown argument: $1" echo "" @@ -55,6 +63,8 @@ while [[ $# -gt 0 ]]; do esac done +SCRAPER_ARGS="$SCRAPER_ARGS --data-dir $DATA_DIR" + step() { CURRENT=$((CURRENT + 1)) echo "" @@ -87,7 +97,7 @@ wait $PID_CH || { echo -e "${RED}✗ CityHome selhalo${NC}"; FAILED=$((FAILED + # ── Sloučení + mapa ────────────────────────────────────────── step "Sloučení dat a generování mapy" -python3 merge_and_map.py || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((FAILED + 1)); } +python3 merge_and_map.py --data-dir "$DATA_DIR" || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((FAILED + 1)); } # ── Otevření mapy ──────────────────────────────────────────── @@ -100,4 +110,4 @@ else fi echo "============================================================" -command -v open &>/dev/null && open mapa_bytu.html || true +command -v open &>/dev/null && open "$DATA_DIR/mapa_bytu.html" || true diff --git a/scrape_and_map.py b/scrape_and_map.py index 046ec0e..e295e5b 100644 --- a/scrape_and_map.py +++ b/scrape_and_map.py @@ -207,10 +207,10 @@ def load_cache(json_path: str = "byty_sreality.json") -> dict[int, dict]: return {} -def scrape(max_pages: int | None = None, max_properties: int | None = None): +def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."): """Main scraping function. Returns list of filtered estates.""" all_estates_raw = [] - cache = load_cache() + cache = load_cache(str(Path(data_dir) / "byty_sreality.json")) logger.info("=" * 60) logger.info("Stahuji inzeráty ze Sreality.cz") @@ -939,6 +939,8 @@ if __name__ == "__main__": parser.add_argument("--max-properties", type=int, help="Maximum number of properties to fetch details for") parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], help="Logging level (default: INFO)") + parser.add_argument("--data-dir", type=str, default=".", + help="Directory for reading/writing data files (default: current dir)") args = parser.parse_args() # Configure logging @@ -948,12 +950,13 @@ if __name__ == "__main__": handlers=[logging.StreamHandler()] ) + data_dir = Path(args.data_dir) start = time.time() - estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) + estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir) if estates: # Save raw data as JSON backup - json_path = Path("byty_sreality.json") + json_path = data_dir / "byty_sreality.json" json_path.write_text( json.dumps(estates, ensure_ascii=False, indent=2), encoding="utf-8", @@ -961,7 +964,7 @@ if __name__ == "__main__": logger.info(f"✓ Data uložena: {json_path.resolve()}") # Generate map - map_path = generate_map(estates) + map_path = generate_map(estates, output_path=str(data_dir / "mapa_bytu.html")) elapsed = time.time() - start logger.info(f"\n⏱ Celkový čas: {elapsed:.0f} s") logger.info(f"\nOtevři v prohlížeči:\n file://{map_path}") diff --git a/scrape_bezrealitky.py b/scrape_bezrealitky.py index febe194..2533630 100644 --- a/scrape_bezrealitky.py +++ b/scrape_bezrealitky.py @@ -170,8 +170,8 @@ def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]: return {} -def scrape(max_pages: int | None = None, max_properties: int | None = None): - cache = load_cache() +def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."): + cache = load_cache(str(Path(data_dir) / "byty_bezrealitky.json")) logger.info("=" * 60) logger.info("Stahuji inzeráty z Bezrealitky.cz") @@ -395,6 +395,8 @@ if __name__ == "__main__": help="Maximum number of properties to fetch details for") parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], help="Logging level (default: INFO)") + parser.add_argument("--data-dir", type=str, default=".", + help="Directory for reading/writing data files (default: current dir)") args = parser.parse_args() # Configure logging @@ -404,11 +406,12 @@ if __name__ == "__main__": handlers=[logging.StreamHandler()] ) + data_dir = Path(args.data_dir) start = time.time() - estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) + estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir) if estates: - json_path = Path("byty_bezrealitky.json") + json_path = data_dir / "byty_bezrealitky.json" json_path.write_text( json.dumps(estates, ensure_ascii=False, indent=2), encoding="utf-8", diff --git a/scrape_cityhome.py b/scrape_cityhome.py index 4361b81..264a800 100644 --- a/scrape_cityhome.py +++ b/scrape_cityhome.py @@ -194,8 +194,8 @@ def load_previous(json_path: str = "byty_cityhome.json") -> dict[str, str]: return {} -def scrape(max_pages: int | None = None, max_properties: int | None = None): - previous_first_seen = load_previous() +def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."): + previous_first_seen = load_previous(str(Path(data_dir) / "byty_cityhome.json")) logger.info("=" * 60) logger.info("Stahuji inzeráty z CityHome (city-home.cz)") logger.info(f"Cena: do {format_price(MAX_PRICE)}") @@ -367,6 +367,8 @@ if __name__ == "__main__": help="Maximum number of properties to include in results") parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], help="Logging level (default: INFO)") + parser.add_argument("--data-dir", type=str, default=".", + help="Directory for reading/writing data files (default: current dir)") args = parser.parse_args() # Configure logging @@ -376,11 +378,12 @@ if __name__ == "__main__": handlers=[logging.StreamHandler()] ) + data_dir = Path(args.data_dir) start = time.time() - estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) + estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir) if estates: - json_path = Path("byty_cityhome.json") + json_path = data_dir / "byty_cityhome.json" json_path.write_text( json.dumps(estates, ensure_ascii=False, indent=2), encoding="utf-8", diff --git a/scrape_idnes.py b/scrape_idnes.py index c686a36..6d49138 100644 --- a/scrape_idnes.py +++ b/scrape_idnes.py @@ -278,8 +278,8 @@ def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]: return {} -def scrape(max_pages: int | None = None, max_properties: int | None = None): - cache = load_cache() +def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."): + cache = load_cache(str(Path(data_dir) / "byty_idnes.json")) logger.info("=" * 60) logger.info("Stahuji inzeráty z Reality iDNES") @@ -499,6 +499,8 @@ if __name__ == "__main__": help="Maximum number of properties to fetch details for") parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], help="Logging level (default: INFO)") + parser.add_argument("--data-dir", type=str, default=".", + help="Directory for reading/writing data files (default: current dir)") args = parser.parse_args() # Configure logging @@ -508,11 +510,12 @@ if __name__ == "__main__": handlers=[logging.StreamHandler()] ) + data_dir = Path(args.data_dir) start = time.time() - estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) + estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir) if estates: - json_path = Path("byty_idnes.json") + json_path = data_dir / "byty_idnes.json" json_path.write_text( json.dumps(estates, ensure_ascii=False, indent=2), encoding="utf-8", diff --git a/scrape_psn.py b/scrape_psn.py index 3bfef1d..a51ace1 100644 --- a/scrape_psn.py +++ b/scrape_psn.py @@ -122,8 +122,8 @@ def load_previous(json_path: str = "byty_psn.json") -> dict[str, str]: return {} -def scrape(max_pages: int | None = None, max_properties: int | None = None): - previous_first_seen = load_previous() +def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."): + previous_first_seen = load_previous(str(Path(data_dir) / "byty_psn.json")) logger.info("=" * 60) logger.info("Stahuji inzeráty z PSN.cz") logger.info(f"Cena: do {format_price(MAX_PRICE)}") @@ -346,6 +346,8 @@ if __name__ == "__main__": help="Maximum number of properties to include in results") parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], help="Logging level (default: INFO)") + parser.add_argument("--data-dir", type=str, default=".", + help="Directory for reading/writing data files (default: current dir)") args = parser.parse_args() # Configure logging @@ -355,11 +357,12 @@ if __name__ == "__main__": handlers=[logging.StreamHandler()] ) + data_dir = Path(args.data_dir) start = time.time() - estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) + estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir) if estates: - json_path = Path("byty_psn.json") + json_path = data_dir / "byty_psn.json" json_path.write_text( json.dumps(estates, ensure_ascii=False, indent=2), encoding="utf-8", diff --git a/scrape_realingo.py b/scrape_realingo.py index c6b201e..f39ede3 100644 --- a/scrape_realingo.py +++ b/scrape_realingo.py @@ -135,8 +135,8 @@ def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]: return {} -def scrape(max_pages: int | None = None, max_properties: int | None = None): - cache = load_cache() +def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."): + cache = load_cache(str(Path(data_dir) / "byty_realingo.json")) logger.info("=" * 60) logger.info("Stahuji inzeráty z Realingo.cz") @@ -354,6 +354,8 @@ if __name__ == "__main__": help="Maximum number of properties to fetch details for") parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], help="Logging level (default: INFO)") + parser.add_argument("--data-dir", type=str, default=".", + help="Directory for reading/writing data files (default: current dir)") args = parser.parse_args() # Configure logging @@ -363,11 +365,12 @@ if __name__ == "__main__": handlers=[logging.StreamHandler()] ) + data_dir = Path(args.data_dir) start = time.time() - estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) + estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir) if estates: - json_path = Path("byty_realingo.json") + json_path = data_dir / "byty_realingo.json" json_path.write_text( json.dumps(estates, ensure_ascii=False, indent=2), encoding="utf-8",