Add --data-dir CLI argument to replace symlink-based data persistence
All checks were successful
Build and Push / build (push) Successful in 7s
All checks were successful
Build and Push / build (push) Successful in 7s
The Docker entrypoint previously created symlinks from /app/ to /app/data/ so that scripts writing relative paths would persist to the mounted volume. This caused symlink loops in production when stale symlinks leaked into the host data directory.

Instead, all scrapers, merge_and_map.py, regen_map.py, and run_all.sh now accept a --data-dir argument (default: ".") that controls where data files are read from and written to. The entrypoint and crontab pass --data-dir /app/data, eliminating the need for symlinks entirely.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1 +1 @@
|
|||||||
0 6,18 * * * cd /app && bash /app/run_all.sh >> /proc/1/fd/1 2>> /proc/1/fd/2
|
0 6,18 * * * cd /app && bash /app/run_all.sh --data-dir /app/data >> /proc/1/fd/1 2>> /proc/1/fd/2
|
||||||
|
|||||||
@@ -3,20 +3,11 @@ set -euo pipefail
|
|||||||
|
|
||||||
DATA_DIR="/app/data"
|
DATA_DIR="/app/data"
|
||||||
|
|
||||||
# Create symlinks so scripts (which write to /app/) persist data to the volume
|
|
||||||
for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \
|
|
||||||
byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \
|
|
||||||
mapa_bytu.html; do
|
|
||||||
# Remove real file if it exists (e.g. baked into image)
|
|
||||||
[ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f"
|
|
||||||
ln -sf "$DATA_DIR/$f" "/app/$f"
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "[entrypoint] Starting crond..."
|
echo "[entrypoint] Starting crond..."
|
||||||
crond -b -l 2
|
crond -b -l 2
|
||||||
|
|
||||||
echo "[entrypoint] Starting initial scrape in background..."
|
echo "[entrypoint] Starting initial scrape in background..."
|
||||||
bash /app/run_all.sh &
|
bash /app/run_all.sh --data-dir "$DATA_DIR" &
|
||||||
|
|
||||||
echo "[entrypoint] Starting HTTP server on port 8080..."
|
echo "[entrypoint] Starting HTTP server on port 8080..."
|
||||||
exec python3 -m http.server 8080 --directory "$DATA_DIR"
|
exec python3 -m http.server 8080 --directory "$DATA_DIR"
|
||||||
|
|||||||
9
build/run.sh
Normal file
9
build/run.sh
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
docker rm -f maru-hleda-byt
|
||||||
|
|
||||||
|
# Prerequisite: log in to the Gitea registry first (user: kacerr, password: access token)
|
||||||
|
docker run -d --name maru-hleda-byt \
|
||||||
|
-p 8080:8080 \
|
||||||
|
-v /srv/maru-hleda-byt/data:/app/data \
|
||||||
|
gitea.home.hrajfrisbee.cz/littlemeat/maru-hleda-byt:0.01
|
||||||
@@ -7,6 +7,7 @@ PSN a CityHome mají při deduplikaci prioritu (načtou se první).
|
|||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -40,7 +41,7 @@ def dedup_key(estate: dict) -> str:
|
|||||||
return f"{street}_{price}_{area}"
|
return f"{street}_{price}_{area}"
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main(data_dir: str = "."):
|
||||||
# Definice zdrojů — PSN a CityHome jako první (mají prioritu při deduplikaci)
|
# Definice zdrojů — PSN a CityHome jako první (mají prioritu při deduplikaci)
|
||||||
sources = [
|
sources = [
|
||||||
("PSN", "byty_psn.json"),
|
("PSN", "byty_psn.json"),
|
||||||
@@ -51,10 +52,11 @@ def main():
|
|||||||
("iDNES", "byty_idnes.json"),
|
("iDNES", "byty_idnes.json"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
data_path = Path(data_dir)
|
||||||
all_estates = []
|
all_estates = []
|
||||||
|
|
||||||
for label, filename in sources:
|
for label, filename in sources:
|
||||||
path = Path(filename)
|
path = data_path / filename
|
||||||
if path.exists():
|
if path.exists():
|
||||||
data = json.loads(path.read_text(encoding="utf-8"))
|
data = json.loads(path.read_text(encoding="utf-8"))
|
||||||
# Ensure source is set (Sreality legacy)
|
# Ensure source is set (Sreality legacy)
|
||||||
@@ -111,7 +113,7 @@ def main():
|
|||||||
print(f" {src}: {count}")
|
print(f" {src}: {count}")
|
||||||
|
|
||||||
# Save merged data
|
# Save merged data
|
||||||
merged_path = Path("byty_merged.json")
|
merged_path = data_path / "byty_merged.json"
|
||||||
merged_path.write_text(
|
merged_path.write_text(
|
||||||
json.dumps(deduplicated, ensure_ascii=False, indent=2),
|
json.dumps(deduplicated, ensure_ascii=False, indent=2),
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
@@ -119,8 +121,12 @@ def main():
|
|||||||
print(f"\n✓ Sloučená data: {merged_path.resolve()}")
|
print(f"\n✓ Sloučená data: {merged_path.resolve()}")
|
||||||
|
|
||||||
# Generate map
|
# Generate map
|
||||||
generate_map(deduplicated)
|
generate_map(deduplicated, output_path=str(data_path / "mapa_bytu.html"))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
parser = argparse.ArgumentParser(description="Merge scraped data and generate map")
|
||||||
|
parser.add_argument("--data-dir", type=str, default=".",
|
||||||
|
help="Directory for reading/writing data files (default: current dir)")
|
||||||
|
args = parser.parse_args()
|
||||||
|
main(data_dir=args.data_dir)
|
||||||
|
|||||||
16
regen_map.py
16
regen_map.py
@@ -5,6 +5,7 @@ Doplní chybějící plochy ze Sreality API, opraví URL, aplikuje filtry.
|
|||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
import urllib.request
|
import urllib.request
|
||||||
@@ -57,8 +58,9 @@ def fetch_area(hash_id: int) -> int | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main(data_dir: str = "."):
|
||||||
json_path = Path("byty_sreality.json")
|
data_path = Path(data_dir)
|
||||||
|
json_path = data_path / "byty_sreality.json"
|
||||||
if not json_path.exists():
|
if not json_path.exists():
|
||||||
print("Soubor byty_sreality.json nenalezen. Nejprve spusť scrape_and_map.py")
|
print("Soubor byty_sreality.json nenalezen. Nejprve spusť scrape_and_map.py")
|
||||||
return
|
return
|
||||||
@@ -100,15 +102,19 @@ def main():
|
|||||||
print(f"Zbývá: {len(filtered)} bytů")
|
print(f"Zbývá: {len(filtered)} bytů")
|
||||||
|
|
||||||
# Save updated data
|
# Save updated data
|
||||||
filtered_path = Path("byty_sreality.json")
|
filtered_path = data_path / "byty_sreality.json"
|
||||||
filtered_path.write_text(
|
filtered_path.write_text(
|
||||||
json.dumps(filtered, ensure_ascii=False, indent=2),
|
json.dumps(filtered, ensure_ascii=False, indent=2),
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate map
|
# Generate map
|
||||||
generate_map(filtered)
|
generate_map(filtered, output_path=str(data_path / "mapa_bytu.html"))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
parser = argparse.ArgumentParser(description="Regenerate map from existing data")
|
||||||
|
parser.add_argument("--data-dir", type=str, default=".",
|
||||||
|
help="Directory for reading/writing data files (default: current dir)")
|
||||||
|
args = parser.parse_args()
|
||||||
|
main(data_dir=args.data_dir)
|
||||||
|
|||||||
14
run_all.sh
14
run_all.sh
@@ -4,6 +4,7 @@
|
|||||||
# Použití: ./run_all.sh
|
# Použití: ./run_all.sh
|
||||||
# Nebo s limity: ./run_all.sh --max-pages 1 --max-properties 10
|
# Nebo s limity: ./run_all.sh --max-pages 1 --max-properties 10
|
||||||
# Nebo s logováním: ./run_all.sh --log-level DEBUG
|
# Nebo s logováním: ./run_all.sh --log-level DEBUG
|
||||||
|
# S vlastním adresářem: ./run_all.sh --data-dir /app/data
|
||||||
# ============================================================
|
# ============================================================
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
cd "$(dirname "$0")"
|
cd "$(dirname "$0")"
|
||||||
@@ -26,16 +27,19 @@ show_help() {
|
|||||||
echo " --max-pages N Maximální počet stránek ke stažení z každého zdroje"
|
echo " --max-pages N Maximální počet stránek ke stažení z každého zdroje"
|
||||||
echo " --max-properties N Maximální počet nemovitostí ke stažení z každého zdroje"
|
echo " --max-properties N Maximální počet nemovitostí ke stažení z každého zdroje"
|
||||||
echo " --log-level LEVEL Úroveň logování (DEBUG, INFO, WARNING, ERROR)"
|
echo " --log-level LEVEL Úroveň logování (DEBUG, INFO, WARNING, ERROR)"
|
||||||
|
echo " --data-dir DIR Adresář pro čtení/zápis datových souborů (default: .)"
|
||||||
echo " -h, --help Zobrazí tuto nápovědu"
|
echo " -h, --help Zobrazí tuto nápovědu"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Examples:"
|
echo "Examples:"
|
||||||
echo " ./run_all.sh # plný běh"
|
echo " ./run_all.sh # plný běh"
|
||||||
echo " ./run_all.sh --max-pages 1 --max-properties 10 # rychlý test"
|
echo " ./run_all.sh --max-pages 1 --max-properties 10 # rychlý test"
|
||||||
echo " ./run_all.sh --log-level DEBUG # s debug logováním"
|
echo " ./run_all.sh --log-level DEBUG # s debug logováním"
|
||||||
|
echo " ./run_all.sh --data-dir /app/data # Docker produkce"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Parse arguments
|
# Parse arguments
|
||||||
SCRAPER_ARGS=""
|
SCRAPER_ARGS=""
|
||||||
|
DATA_DIR="."
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case $1 in
|
case $1 in
|
||||||
-h|--help)
|
-h|--help)
|
||||||
@@ -46,6 +50,10 @@ while [[ $# -gt 0 ]]; do
|
|||||||
SCRAPER_ARGS="$SCRAPER_ARGS $1 $2"
|
SCRAPER_ARGS="$SCRAPER_ARGS $1 $2"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
|
--data-dir)
|
||||||
|
DATA_DIR="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Unknown argument: $1"
|
echo "Unknown argument: $1"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -55,6 +63,8 @@ while [[ $# -gt 0 ]]; do
|
|||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
|
SCRAPER_ARGS="$SCRAPER_ARGS --data-dir $DATA_DIR"
|
||||||
|
|
||||||
step() {
|
step() {
|
||||||
CURRENT=$((CURRENT + 1))
|
CURRENT=$((CURRENT + 1))
|
||||||
echo ""
|
echo ""
|
||||||
@@ -87,7 +97,7 @@ wait $PID_CH || { echo -e "${RED}✗ CityHome selhalo${NC}"; FAILED=$((FAILED +
|
|||||||
# ── Sloučení + mapa ──────────────────────────────────────────
|
# ── Sloučení + mapa ──────────────────────────────────────────
|
||||||
|
|
||||||
step "Sloučení dat a generování mapy"
|
step "Sloučení dat a generování mapy"
|
||||||
python3 merge_and_map.py || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((FAILED + 1)); }
|
python3 merge_and_map.py --data-dir "$DATA_DIR" || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((FAILED + 1)); }
|
||||||
|
|
||||||
# ── Otevření mapy ────────────────────────────────────────────
|
# ── Otevření mapy ────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -100,4 +110,4 @@ else
|
|||||||
fi
|
fi
|
||||||
echo "============================================================"
|
echo "============================================================"
|
||||||
|
|
||||||
command -v open &>/dev/null && open mapa_bytu.html || true
|
command -v open &>/dev/null && open "$DATA_DIR/mapa_bytu.html" || true
|
||||||
|
|||||||
@@ -207,10 +207,10 @@ def load_cache(json_path: str = "byty_sreality.json") -> dict[int, dict]:
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
|
||||||
"""Main scraping function. Returns list of filtered estates."""
|
"""Main scraping function. Returns list of filtered estates."""
|
||||||
all_estates_raw = []
|
all_estates_raw = []
|
||||||
cache = load_cache()
|
cache = load_cache(str(Path(data_dir) / "byty_sreality.json"))
|
||||||
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("Stahuji inzeráty ze Sreality.cz")
|
logger.info("Stahuji inzeráty ze Sreality.cz")
|
||||||
@@ -939,6 +939,8 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument("--max-properties", type=int, help="Maximum number of properties to fetch details for")
|
parser.add_argument("--max-properties", type=int, help="Maximum number of properties to fetch details for")
|
||||||
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||||
help="Logging level (default: INFO)")
|
help="Logging level (default: INFO)")
|
||||||
|
parser.add_argument("--data-dir", type=str, default=".",
|
||||||
|
help="Directory for reading/writing data files (default: current dir)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
@@ -948,12 +950,13 @@ if __name__ == "__main__":
|
|||||||
handlers=[logging.StreamHandler()]
|
handlers=[logging.StreamHandler()]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
data_dir = Path(args.data_dir)
|
||||||
start = time.time()
|
start = time.time()
|
||||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)
|
||||||
|
|
||||||
if estates:
|
if estates:
|
||||||
# Save raw data as JSON backup
|
# Save raw data as JSON backup
|
||||||
json_path = Path("byty_sreality.json")
|
json_path = data_dir / "byty_sreality.json"
|
||||||
json_path.write_text(
|
json_path.write_text(
|
||||||
json.dumps(estates, ensure_ascii=False, indent=2),
|
json.dumps(estates, ensure_ascii=False, indent=2),
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
@@ -961,7 +964,7 @@ if __name__ == "__main__":
|
|||||||
logger.info(f"✓ Data uložena: {json_path.resolve()}")
|
logger.info(f"✓ Data uložena: {json_path.resolve()}")
|
||||||
|
|
||||||
# Generate map
|
# Generate map
|
||||||
map_path = generate_map(estates)
|
map_path = generate_map(estates, output_path=str(data_dir / "mapa_bytu.html"))
|
||||||
elapsed = time.time() - start
|
elapsed = time.time() - start
|
||||||
logger.info(f"\n⏱ Celkový čas: {elapsed:.0f} s")
|
logger.info(f"\n⏱ Celkový čas: {elapsed:.0f} s")
|
||||||
logger.info(f"\nOtevři v prohlížeči:\n file://{map_path}")
|
logger.info(f"\nOtevři v prohlížeči:\n file://{map_path}")
|
||||||
|
|||||||
@@ -170,8 +170,8 @@ def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
|
||||||
cache = load_cache()
|
cache = load_cache(str(Path(data_dir) / "byty_bezrealitky.json"))
|
||||||
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("Stahuji inzeráty z Bezrealitky.cz")
|
logger.info("Stahuji inzeráty z Bezrealitky.cz")
|
||||||
@@ -395,6 +395,8 @@ if __name__ == "__main__":
|
|||||||
help="Maximum number of properties to fetch details for")
|
help="Maximum number of properties to fetch details for")
|
||||||
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||||
help="Logging level (default: INFO)")
|
help="Logging level (default: INFO)")
|
||||||
|
parser.add_argument("--data-dir", type=str, default=".",
|
||||||
|
help="Directory for reading/writing data files (default: current dir)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
@@ -404,11 +406,12 @@ if __name__ == "__main__":
|
|||||||
handlers=[logging.StreamHandler()]
|
handlers=[logging.StreamHandler()]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
data_dir = Path(args.data_dir)
|
||||||
start = time.time()
|
start = time.time()
|
||||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)
|
||||||
|
|
||||||
if estates:
|
if estates:
|
||||||
json_path = Path("byty_bezrealitky.json")
|
json_path = data_dir / "byty_bezrealitky.json"
|
||||||
json_path.write_text(
|
json_path.write_text(
|
||||||
json.dumps(estates, ensure_ascii=False, indent=2),
|
json.dumps(estates, ensure_ascii=False, indent=2),
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
|
|||||||
@@ -194,8 +194,8 @@ def load_previous(json_path: str = "byty_cityhome.json") -> dict[str, str]:
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
|
||||||
previous_first_seen = load_previous()
|
previous_first_seen = load_previous(str(Path(data_dir) / "byty_cityhome.json"))
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
|
logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
|
||||||
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
||||||
@@ -367,6 +367,8 @@ if __name__ == "__main__":
|
|||||||
help="Maximum number of properties to include in results")
|
help="Maximum number of properties to include in results")
|
||||||
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||||
help="Logging level (default: INFO)")
|
help="Logging level (default: INFO)")
|
||||||
|
parser.add_argument("--data-dir", type=str, default=".",
|
||||||
|
help="Directory for reading/writing data files (default: current dir)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
@@ -376,11 +378,12 @@ if __name__ == "__main__":
|
|||||||
handlers=[logging.StreamHandler()]
|
handlers=[logging.StreamHandler()]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
data_dir = Path(args.data_dir)
|
||||||
start = time.time()
|
start = time.time()
|
||||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)
|
||||||
|
|
||||||
if estates:
|
if estates:
|
||||||
json_path = Path("byty_cityhome.json")
|
json_path = data_dir / "byty_cityhome.json"
|
||||||
json_path.write_text(
|
json_path.write_text(
|
||||||
json.dumps(estates, ensure_ascii=False, indent=2),
|
json.dumps(estates, ensure_ascii=False, indent=2),
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
|
|||||||
@@ -278,8 +278,8 @@ def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
|
||||||
cache = load_cache()
|
cache = load_cache(str(Path(data_dir) / "byty_idnes.json"))
|
||||||
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("Stahuji inzeráty z Reality iDNES")
|
logger.info("Stahuji inzeráty z Reality iDNES")
|
||||||
@@ -499,6 +499,8 @@ if __name__ == "__main__":
|
|||||||
help="Maximum number of properties to fetch details for")
|
help="Maximum number of properties to fetch details for")
|
||||||
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||||
help="Logging level (default: INFO)")
|
help="Logging level (default: INFO)")
|
||||||
|
parser.add_argument("--data-dir", type=str, default=".",
|
||||||
|
help="Directory for reading/writing data files (default: current dir)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
@@ -508,11 +510,12 @@ if __name__ == "__main__":
|
|||||||
handlers=[logging.StreamHandler()]
|
handlers=[logging.StreamHandler()]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
data_dir = Path(args.data_dir)
|
||||||
start = time.time()
|
start = time.time()
|
||||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)
|
||||||
|
|
||||||
if estates:
|
if estates:
|
||||||
json_path = Path("byty_idnes.json")
|
json_path = data_dir / "byty_idnes.json"
|
||||||
json_path.write_text(
|
json_path.write_text(
|
||||||
json.dumps(estates, ensure_ascii=False, indent=2),
|
json.dumps(estates, ensure_ascii=False, indent=2),
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
|
|||||||
@@ -122,8 +122,8 @@ def load_previous(json_path: str = "byty_psn.json") -> dict[str, str]:
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
|
||||||
previous_first_seen = load_previous()
|
previous_first_seen = load_previous(str(Path(data_dir) / "byty_psn.json"))
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("Stahuji inzeráty z PSN.cz")
|
logger.info("Stahuji inzeráty z PSN.cz")
|
||||||
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
||||||
@@ -346,6 +346,8 @@ if __name__ == "__main__":
|
|||||||
help="Maximum number of properties to include in results")
|
help="Maximum number of properties to include in results")
|
||||||
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||||
help="Logging level (default: INFO)")
|
help="Logging level (default: INFO)")
|
||||||
|
parser.add_argument("--data-dir", type=str, default=".",
|
||||||
|
help="Directory for reading/writing data files (default: current dir)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
@@ -355,11 +357,12 @@ if __name__ == "__main__":
|
|||||||
handlers=[logging.StreamHandler()]
|
handlers=[logging.StreamHandler()]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
data_dir = Path(args.data_dir)
|
||||||
start = time.time()
|
start = time.time()
|
||||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)
|
||||||
|
|
||||||
if estates:
|
if estates:
|
||||||
json_path = Path("byty_psn.json")
|
json_path = data_dir / "byty_psn.json"
|
||||||
json_path.write_text(
|
json_path.write_text(
|
||||||
json.dumps(estates, ensure_ascii=False, indent=2),
|
json.dumps(estates, ensure_ascii=False, indent=2),
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
|
|||||||
@@ -135,8 +135,8 @@ def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = "."):
|
||||||
cache = load_cache()
|
cache = load_cache(str(Path(data_dir) / "byty_realingo.json"))
|
||||||
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("Stahuji inzeráty z Realingo.cz")
|
logger.info("Stahuji inzeráty z Realingo.cz")
|
||||||
@@ -354,6 +354,8 @@ if __name__ == "__main__":
|
|||||||
help="Maximum number of properties to fetch details for")
|
help="Maximum number of properties to fetch details for")
|
||||||
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||||
help="Logging level (default: INFO)")
|
help="Logging level (default: INFO)")
|
||||||
|
parser.add_argument("--data-dir", type=str, default=".",
|
||||||
|
help="Directory for reading/writing data files (default: current dir)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
@@ -363,11 +365,12 @@ if __name__ == "__main__":
|
|||||||
handlers=[logging.StreamHandler()]
|
handlers=[logging.StreamHandler()]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
data_dir = Path(args.data_dir)
|
||||||
start = time.time()
|
start = time.time()
|
||||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)
|
||||||
|
|
||||||
if estates:
|
if estates:
|
||||||
json_path = Path("byty_realingo.json")
|
json_path = data_dir / "byty_realingo.json"
|
||||||
json_path.write_text(
|
json_path.write_text(
|
||||||
json.dumps(estates, ensure_ascii=False, indent=2),
|
json.dumps(estates, ensure_ascii=False, indent=2),
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
|
|||||||
Reference in New Issue
Block a user