Add --data-dir CLI argument to replace symlink-based data persistence
All checks were successful
Build and Push / build (push) Successful in 7s

The Docker entrypoint previously created symlinks from /app/ to /app/data/
so that scripts writing relative paths would persist to the mounted volume.
This caused symlink loops in production when stale symlinks leaked into the
host data directory.

Instead, all scrapers, merge_and_map.py, regen_map.py, and run_all.sh now
accept a --data-dir argument (default: ".") that controls where data files
are read from and written to. The entrypoint and crontab pass
--data-dir /app/data, eliminating the need for symlinks entirely.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jan Novak
2026-02-15 22:56:41 +01:00
parent a1212c6312
commit a09876d749
12 changed files with 88 additions and 48 deletions

View File

@@ -207,10 +207,10 @@ def load_cache(json_path: str = "byty_sreality.json") -> dict[int, dict]:
return {}
def scrape(max_pages: int | None = None, max_properties: int | None = None):
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
"""Main scraping function. Returns list of filtered estates."""
all_estates_raw = []
cache = load_cache()
cache = load_cache(str(Path(data_dir) / "byty_sreality.json"))
logger.info("=" * 60)
logger.info("Stahuji inzeráty ze Sreality.cz")
@@ -939,6 +939,8 @@ if __name__ == "__main__":
parser.add_argument("--max-properties", type=int, help="Maximum number of properties to fetch details for")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)")
parser.add_argument("--data-dir", type=str, default=".",
help="Directory for reading/writing data files (default: current dir)")
args = parser.parse_args()
# Configure logging
@@ -948,12 +950,13 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()]
)
data_dir = Path(args.data_dir)
start = time.time()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)
if estates:
# Save raw data as JSON backup
json_path = Path("byty_sreality.json")
json_path = data_dir / "byty_sreality.json"
json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8",
@@ -961,7 +964,7 @@ if __name__ == "__main__":
logger.info(f"✓ Data uložena: {json_path.resolve()}")
# Generate map
map_path = generate_map(estates)
map_path = generate_map(estates, output_path=str(data_dir / "mapa_bytu.html"))
elapsed = time.time() - start
logger.info(f"\n⏱ Celkový čas: {elapsed:.0f} s")
logger.info(f"\nOtevři v prohlížeči:\n file://{map_path}")