1 commit (0.01...0.02)

Jan Novak
a09876d749 Add --data-dir CLI argument to replace symlink-based data persistence
All checks were successful: Build and Push / build (push) in 7s
The Docker entrypoint previously created symlinks from /app/ to /app/data/
so that scripts writing to relative paths would persist their output to the
mounted volume.
This caused symlink loops in production when stale symlinks leaked into the
host data directory.
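For reference, the loop is easy to reproduce in isolation. A minimal sketch
with throwaway paths standing in for /app and the mounted volume (not the
exact production sequence):

    tmp=$(mktemp -d)
    mkdir -p "$tmp/app/data"
    # A stale symlink leaked into the data dir by an earlier run:
    ln -s "$tmp/app/byty_merged.json" "$tmp/app/data/byty_merged.json"
    # The old entrypoint logic then links the app path back into the data dir:
    ln -sf "$tmp/app/data/byty_merged.json" "$tmp/app/byty_merged.json"
    cat "$tmp/app/byty_merged.json"   # fails: Too many levels of symbolic links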

Instead, all scrapers, merge_and_map.py, regen_map.py, and run_all.sh now
accept a --data-dir argument (default: ".") that controls where data files
are read from and written to. The entrypoint and crontab pass
--data-dir /app/data, eliminating the need for symlinks entirely.
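Concretely (paths illustrative):

    ./run_all.sh                                   # local run, files land in the repo dir
    ./run_all.sh --data-dir /app/data              # what the entrypoint and crontab now pass
    python3 merge_and_map.py --data-dir /app/data  # every script also accepts it directly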

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 22:56:41 +01:00
12 changed files with 88 additions and 48 deletions

View File

@@ -1 +1 @@
-0 6,18 * * * cd /app && bash /app/run_all.sh >> /proc/1/fd/1 2>> /proc/1/fd/2
+0 6,18 * * * cd /app && bash /app/run_all.sh --data-dir /app/data >> /proc/1/fd/1 2>> /proc/1/fd/2
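The >> /proc/1/fd/1 redirection sends cron output to PID 1's stdout, which is
what docker logs reads. One way to sanity-check that wiring, assuming the
container from build/run.sh below is running:

    docker exec maru-hleda-byt sh -c 'echo cron-log-test >> /proc/1/fd/1'
    docker logs --tail 1 maru-hleda-byt   # should print "cron-log-test"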

View File

@@ -3,20 +3,11 @@ set -euo pipefail
 DATA_DIR="/app/data"

-# Create symlinks so scripts (which write to /app/) persist data to the volume
-for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \
-         byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \
-         mapa_bytu.html; do
-    # Remove real file if it exists (e.g. baked into image)
-    [ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f"
-    ln -sf "$DATA_DIR/$f" "/app/$f"
-done
-
 echo "[entrypoint] Starting crond..."
 crond -b -l 2

 echo "[entrypoint] Starting initial scrape in background..."
-bash /app/run_all.sh &
+bash /app/run_all.sh --data-dir "$DATA_DIR" &

 echo "[entrypoint] Starting HTTP server on port 8080..."
 exec python3 -m http.server 8080 --directory "$DATA_DIR"
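The new flow can also be dry-run outside Docker; a sketch assuming a scratch
directory and the limit flags run_all.sh already supports:

    DATA_DIR=$(mktemp -d)
    bash run_all.sh --data-dir "$DATA_DIR" --max-pages 1 --max-properties 10
    python3 -m http.server 8080 --directory "$DATA_DIR" &
    curl -s http://localhost:8080/mapa_bytu.html | head -n 1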

build/run.sh (new file, 9 lines)
View File

@@ -0,0 +1,9 @@
+#!/bin/bash
+docker rm -f maru-hleda-byt
+# gitea registry login with kacerr / token
+docker run -d --name maru-hleda-byt \
+    -p 8080:8080 \
+    -v /srv/maru-hleda-byt/data:/app/data \
+    gitea.home.hrajfrisbee.cz/littlemeat/maru-hleda-byt:0.01
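The commented login step presumably expands to something like the following;
GITEA_TOKEN is a placeholder, not a value from this repo:

    echo "$GITEA_TOKEN" | docker login gitea.home.hrajfrisbee.cz -u kacerr --password-stdin
    docker pull gitea.home.hrajfrisbee.cz/littlemeat/maru-hleda-byt:0.01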

View File

@@ -7,6 +7,7 @@ PSN a CityHome mají při deduplikaci prioritu (načtou se první).
 """
 from __future__ import annotations
+import argparse
 import json
 import re
 from pathlib import Path
@@ -40,7 +41,7 @@ def dedup_key(estate: dict) -> str:
     return f"{street}_{price}_{area}"

-def main():
+def main(data_dir: str = "."):
     # Definice zdrojů — PSN a CityHome jako první (mají prioritu při deduplikaci)
     sources = [
         ("PSN", "byty_psn.json"),
@@ -51,10 +52,11 @@ def main():
         ("iDNES", "byty_idnes.json"),
     ]

+    data_path = Path(data_dir)
     all_estates = []
     for label, filename in sources:
-        path = Path(filename)
+        path = data_path / filename
         if path.exists():
             data = json.loads(path.read_text(encoding="utf-8"))
             # Ensure source is set (Sreality legacy)
@@ -111,7 +113,7 @@ def main():
         print(f" {src}: {count}")

     # Save merged data
-    merged_path = Path("byty_merged.json")
+    merged_path = data_path / "byty_merged.json"
     merged_path.write_text(
         json.dumps(deduplicated, ensure_ascii=False, indent=2),
         encoding="utf-8",
@@ -119,8 +121,12 @@ def main():
     print(f"\n✓ Sloučená data: {merged_path.resolve()}")

     # Generate map
-    generate_map(deduplicated)
+    generate_map(deduplicated, output_path=str(data_path / "mapa_bytu.html"))

 if __name__ == "__main__":
-    main()
+    parser = argparse.ArgumentParser(description="Merge scraped data and generate map")
+    parser.add_argument("--data-dir", type=str, default=".",
+                        help="Directory for reading/writing data files (default: current dir)")
+    args = parser.parse_args()
+    main(data_dir=args.data_dir)
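With the directory passed explicitly, the merge/map step can be re-run on its
own against an existing volume; an illustrative invocation using the host path
from build/run.sh:

    python3 merge_and_map.py --data-dir /srv/maru-hleda-byt/data
    python3 -c 'import json,sys; print(len(json.load(open(sys.argv[1]))))' \
        /srv/maru-hleda-byt/data/byty_merged.json   # rough count of merged listings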

View File

@@ -5,6 +5,7 @@ Doplní chybějící plochy ze Sreality API, opraví URL, aplikuje filtry.
 """
 from __future__ import annotations
+import argparse
 import json
 import time
 import urllib.request
@@ -57,8 +58,9 @@ def fetch_area(hash_id: int) -> int | None:
     return None

-def main():
-    json_path = Path("byty_sreality.json")
+def main(data_dir: str = "."):
+    data_path = Path(data_dir)
+    json_path = data_path / "byty_sreality.json"
     if not json_path.exists():
         print("Soubor byty_sreality.json nenalezen. Nejprve spusť scrape_and_map.py")
         return
@@ -100,15 +102,19 @@ def main():
     print(f"Zbývá: {len(filtered)} bytů")

     # Save updated data
-    filtered_path = Path("byty_sreality.json")
+    filtered_path = data_path / "byty_sreality.json"
     filtered_path.write_text(
         json.dumps(filtered, ensure_ascii=False, indent=2),
         encoding="utf-8",
     )

     # Generate map
-    generate_map(filtered)
+    generate_map(filtered, output_path=str(data_path / "mapa_bytu.html"))

 if __name__ == "__main__":
-    main()
+    parser = argparse.ArgumentParser(description="Regenerate map from existing data")
+    parser.add_argument("--data-dir", type=str, default=".",
+                        help="Directory for reading/writing data files (default: current dir)")
+    args = parser.parse_args()
+    main(data_dir=args.data_dir)
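This makes rebuilding the map from the already-scraped JSON a one-liner
against the mounted volume, e.g.:

    python3 regen_map.py --data-dir /app/data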

View File

@@ -4,6 +4,7 @@
 # Použití:          ./run_all.sh
 # Nebo s limity:    ./run_all.sh --max-pages 1 --max-properties 10
 # Nebo s logováním: ./run_all.sh --log-level DEBUG
+# S vlastním adresářem: ./run_all.sh --data-dir /app/data
 # ============================================================
 set -euo pipefail
 cd "$(dirname "$0")"
@@ -26,16 +27,19 @@ show_help() {
     echo "  --max-pages N        Maximální počet stránek ke stažení z každého zdroje"
     echo "  --max-properties N   Maximální počet nemovitostí ke stažení z každého zdroje"
     echo "  --log-level LEVEL    Úroveň logování (DEBUG, INFO, WARNING, ERROR)"
+    echo "  --data-dir DIR       Adresář pro čtení/zápis datových souborů (default: .)"
     echo "  -h, --help           Zobrazí tuto nápovědu"
     echo ""
     echo "Examples:"
     echo "  ./run_all.sh                                   # plný běh"
     echo "  ./run_all.sh --max-pages 1 --max-properties 10 # rychlý test"
     echo "  ./run_all.sh --log-level DEBUG                 # s debug logováním"
+    echo "  ./run_all.sh --data-dir /app/data              # Docker produkce"
 }

 # Parse arguments
 SCRAPER_ARGS=""
+DATA_DIR="."
 while [[ $# -gt 0 ]]; do
     case $1 in
         -h|--help)
@@ -46,6 +50,10 @@ while [[ $# -gt 0 ]]; do
             SCRAPER_ARGS="$SCRAPER_ARGS $1 $2"
             shift 2
             ;;
+        --data-dir)
+            DATA_DIR="$2"
+            shift 2
+            ;;
         *)
             echo "Unknown argument: $1"
             echo ""
@@ -55,6 +63,8 @@ while [[ $# -gt 0 ]]; do
     esac
 done

+SCRAPER_ARGS="$SCRAPER_ARGS --data-dir $DATA_DIR"
+
 step() {
     CURRENT=$((CURRENT + 1))
     echo ""
@@ -87,7 +97,7 @@ wait $PID_CH || { echo -e "${RED}✗ CityHome selhalo${NC}"; FAILED=$((FAILED +

 # ── Sloučení + mapa ──────────────────────────────────────────
 step "Sloučení dat a generování mapy"
-python3 merge_and_map.py || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((FAILED + 1)); }
+python3 merge_and_map.py --data-dir "$DATA_DIR" || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((FAILED + 1)); }

 # ── Otevření mapy ────────────────────────────────────────────
@@ -100,4 +110,4 @@ else
 fi
 echo "============================================================"
-command -v open &>/dev/null && open mapa_bytu.html || true
+command -v open &>/dev/null && open "$DATA_DIR/mapa_bytu.html" || true
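A quick smoke test of the whole pipeline with the new flag, combining the
existing limits with a throwaway directory (path illustrative):

    ./run_all.sh --max-pages 1 --max-properties 10 --data-dir /tmp/byty-test
    ls /tmp/byty-test   # expect the byty_*.json files plus mapa_bytu.html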

View File

@@ -207,10 +207,10 @@ def load_cache(json_path: str = "byty_sreality.json") -> dict[int, dict]:
         return {}

-def scrape(max_pages: int | None = None, max_properties: int | None = None):
+def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
     """Main scraping function. Returns list of filtered estates."""
     all_estates_raw = []
-    cache = load_cache()
+    cache = load_cache(str(Path(data_dir) / "byty_sreality.json"))

     logger.info("=" * 60)
     logger.info("Stahuji inzeráty ze Sreality.cz")
@@ -939,6 +939,8 @@ if __name__ == "__main__":
     parser.add_argument("--max-properties", type=int, help="Maximum number of properties to fetch details for")
     parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                         help="Logging level (default: INFO)")
+    parser.add_argument("--data-dir", type=str, default=".",
+                        help="Directory for reading/writing data files (default: current dir)")
     args = parser.parse_args()

     # Configure logging
@@ -948,12 +950,13 @@ if __name__ == "__main__":
         handlers=[logging.StreamHandler()]
     )

+    data_dir = Path(args.data_dir)
     start = time.time()
-    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
+    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)

     if estates:
         # Save raw data as JSON backup
-        json_path = Path("byty_sreality.json")
+        json_path = data_dir / "byty_sreality.json"
         json_path.write_text(
             json.dumps(estates, ensure_ascii=False, indent=2),
             encoding="utf-8",
@@ -961,7 +964,7 @@ if __name__ == "__main__":
         logger.info(f"✓ Data uložena: {json_path.resolve()}")

         # Generate map
-        map_path = generate_map(estates)
+        map_path = generate_map(estates, output_path=str(data_dir / "mapa_bytu.html"))

         elapsed = time.time() - start
         logger.info(f"\n⏱ Celkový čas: {elapsed:.0f} s")
         logger.info(f"\nOtevři v prohlížeči:\n file://{map_path}")
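Scrapers can still be driven individually; assuming this file is
scrape_and_map.py (the name regen_map.py points users at):

    python3 scrape_and_map.py --data-dir /tmp/byty-test --max-pages 1 --log-level DEBUG
    ls /tmp/byty-test/byty_sreality.json   # written into the data dir instead of the CWD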

View File

@@ -170,8 +170,8 @@ def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
         return {}

-def scrape(max_pages: int | None = None, max_properties: int | None = None):
-    cache = load_cache()
+def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
+    cache = load_cache(str(Path(data_dir) / "byty_bezrealitky.json"))

     logger.info("=" * 60)
     logger.info("Stahuji inzeráty z Bezrealitky.cz")
@@ -395,6 +395,8 @@ if __name__ == "__main__":
                         help="Maximum number of properties to fetch details for")
     parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                         help="Logging level (default: INFO)")
+    parser.add_argument("--data-dir", type=str, default=".",
+                        help="Directory for reading/writing data files (default: current dir)")
     args = parser.parse_args()

     # Configure logging
@@ -404,11 +406,12 @@ if __name__ == "__main__":
         handlers=[logging.StreamHandler()]
     )

+    data_dir = Path(args.data_dir)
     start = time.time()
-    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
+    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)

     if estates:
-        json_path = Path("byty_bezrealitky.json")
+        json_path = data_dir / "byty_bezrealitky.json"
         json_path.write_text(
             json.dumps(estates, ensure_ascii=False, indent=2),
             encoding="utf-8",

View File

@@ -194,8 +194,8 @@ def load_previous(json_path: str = "byty_cityhome.json") -> dict[str, str]:
         return {}

-def scrape(max_pages: int | None = None, max_properties: int | None = None):
-    previous_first_seen = load_previous()
+def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
+    previous_first_seen = load_previous(str(Path(data_dir) / "byty_cityhome.json"))

     logger.info("=" * 60)
     logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
     logger.info(f"Cena: do {format_price(MAX_PRICE)}")
@@ -367,6 +367,8 @@ if __name__ == "__main__":
                         help="Maximum number of properties to include in results")
     parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                         help="Logging level (default: INFO)")
+    parser.add_argument("--data-dir", type=str, default=".",
+                        help="Directory for reading/writing data files (default: current dir)")
     args = parser.parse_args()

     # Configure logging
@@ -376,11 +378,12 @@ if __name__ == "__main__":
         handlers=[logging.StreamHandler()]
     )

+    data_dir = Path(args.data_dir)
     start = time.time()
-    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
+    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)

     if estates:
-        json_path = Path("byty_cityhome.json")
+        json_path = data_dir / "byty_cityhome.json"
         json_path.write_text(
             json.dumps(estates, ensure_ascii=False, indent=2),
             encoding="utf-8",

View File

@@ -278,8 +278,8 @@ def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
         return {}

-def scrape(max_pages: int | None = None, max_properties: int | None = None):
-    cache = load_cache()
+def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
+    cache = load_cache(str(Path(data_dir) / "byty_idnes.json"))

     logger.info("=" * 60)
     logger.info("Stahuji inzeráty z Reality iDNES")
@@ -499,6 +499,8 @@ if __name__ == "__main__":
                         help="Maximum number of properties to fetch details for")
     parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                         help="Logging level (default: INFO)")
+    parser.add_argument("--data-dir", type=str, default=".",
+                        help="Directory for reading/writing data files (default: current dir)")
     args = parser.parse_args()

     # Configure logging
@@ -508,11 +510,12 @@ if __name__ == "__main__":
         handlers=[logging.StreamHandler()]
     )

+    data_dir = Path(args.data_dir)
     start = time.time()
-    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
+    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)

     if estates:
-        json_path = Path("byty_idnes.json")
+        json_path = data_dir / "byty_idnes.json"
         json_path.write_text(
             json.dumps(estates, ensure_ascii=False, indent=2),
             encoding="utf-8",

View File

@@ -122,8 +122,8 @@ def load_previous(json_path: str = "byty_psn.json") -> dict[str, str]:
         return {}

-def scrape(max_pages: int | None = None, max_properties: int | None = None):
-    previous_first_seen = load_previous()
+def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
+    previous_first_seen = load_previous(str(Path(data_dir) / "byty_psn.json"))

     logger.info("=" * 60)
     logger.info("Stahuji inzeráty z PSN.cz")
     logger.info(f"Cena: do {format_price(MAX_PRICE)}")
@@ -346,6 +346,8 @@ if __name__ == "__main__":
                         help="Maximum number of properties to include in results")
     parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                         help="Logging level (default: INFO)")
+    parser.add_argument("--data-dir", type=str, default=".",
+                        help="Directory for reading/writing data files (default: current dir)")
     args = parser.parse_args()

     # Configure logging
@@ -355,11 +357,12 @@ if __name__ == "__main__":
         handlers=[logging.StreamHandler()]
     )

+    data_dir = Path(args.data_dir)
     start = time.time()
-    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
+    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)

     if estates:
-        json_path = Path("byty_psn.json")
+        json_path = data_dir / "byty_psn.json"
         json_path.write_text(
             json.dumps(estates, ensure_ascii=False, indent=2),
             encoding="utf-8",

View File

@@ -135,8 +135,8 @@ def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
         return {}

-def scrape(max_pages: int | None = None, max_properties: int | None = None):
-    cache = load_cache()
+def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
+    cache = load_cache(str(Path(data_dir) / "byty_realingo.json"))

     logger.info("=" * 60)
     logger.info("Stahuji inzeráty z Realingo.cz")
@@ -354,6 +354,8 @@ if __name__ == "__main__":
                         help="Maximum number of properties to fetch details for")
     parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                         help="Logging level (default: INFO)")
+    parser.add_argument("--data-dir", type=str, default=".",
+                        help="Directory for reading/writing data files (default: current dir)")
     args = parser.parse_args()

     # Configure logging
@@ -363,11 +365,12 @@ if __name__ == "__main__":
         handlers=[logging.StreamHandler()]
     )

+    data_dir = Path(args.data_dir)
     start = time.time()
-    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
+    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)

     if estates:
-        json_path = Path("byty_realingo.json")
+        json_path = data_dir / "byty_realingo.json"
         json_path.write_text(
             json.dumps(estates, ensure_ascii=False, indent=2),
             encoding="utf-8",