Upload files to "/"
v1 scrapery
This commit is contained in:
328
scrape_cityhome.py
Normal file
328
scrape_cityhome.py
Normal file
@@ -0,0 +1,328 @@
|
||||
#!/usr/bin/env python3
"""
CityHome (city-home.cz) scraper.

Downloads flats for sale in Prague from CityHome/SATPO projects.

Output: byty_cityhome.json
"""
from __future__ import annotations

import json
import re
import time
import urllib.request
from pathlib import Path

# ── Configuration ───────────────────────────────────────────────────────────

MAX_PRICE = 14_000_000  # maximum price in CZK (listings above this are dropped)
MIN_AREA = 69  # minimum floor area in m²
MIN_FLOOR = 2  # minimum above-ground floor ("NP"); underground ("PP") parses as negative

# Accepted flat layouts (Czech disposition codes: rooms + kitchen/kitchenette).
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}

# Browser-like request headers sent with every fetch (see fetch_url);
# presumably needed so the site serves the normal HTML pages — confirm.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}

BASE_URL = "https://www.city-home.cz"
||||
def fetch_url(url: str, headers: dict | None = None) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    Retries up to 3 times on transient network errors with linear backoff
    (2 s, then 4 s); the final failure is re-raised to the caller.

    Args:
        url: Absolute URL to fetch.
        headers: Optional request headers; defaults to the module-level
            browser-like HEADERS when omitted (backward compatible).

    Returns:
        The response body as a UTF-8 decoded string.

    Raises:
        ConnectionError / urllib.error.URLError: when all 3 attempts fail.
    """
    # Explicit import: relying on `import urllib.request` to bind
    # urllib.error as a side effect is fragile.
    import urllib.error

    request_headers = HEADERS if headers is None else headers

    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers=request_headers)
            # Context manager closes the connection even if read/decode
            # fails (the original leaked the response object).
            with urllib.request.urlopen(req, timeout=30) as resp:
                return resp.read().decode("utf-8")
        # ConnectionResetError is a subclass of ConnectionError, so the
        # original's explicit mention of it was redundant.
        except (ConnectionError, urllib.error.URLError) as e:
            if attempt < 2:
                print(f" Retry {attempt + 1}: {e}")
                time.sleep((attempt + 1) * 2)
            else:
                raise
||||
def format_price(price: int) -> str:
    """Format a non-negative price as a Czech-style string.

    Example: 14_000_000 -> "14 000 000 Kč".

    Args:
        price: Price in CZK.

    Returns:
        The price with space-separated thousands groups and a "Kč" suffix.
    """
    # The ',' format spec does the thousands grouping; swap commas for
    # the space separator used in Czech number formatting. This replaces
    # the original hand-rolled slicing loop.
    return f"{price:,}".replace(",", " ") + " Kč"
||||
def parse_filter_page(html: str) -> list[dict]:
    """Parse all listing rows from the filter page.

    Each unit is a <tr> carrying data-* attributes (price, area, unit
    type, availability, project, transaction, disposition, location).
    Attribute order on the site is not guaranteed, so the attribute
    string is captured whole and each attribute is extracted separately.

    The original contained two dead earlier attempts (an unused compiled
    `row_pattern` and a findall loop ending in `pass`) plus an unused
    `floor_text` local — all removed; the working finditer pass is kept
    unchanged.

    Args:
        html: Full HTML of the filter page.

    Returns:
        One dict per row that has a data-cena (price) attribute.
    """
    listings = []

    # Find each <tr> tag with a data-cena attribute; group(1) is the raw
    # attribute string, group(2) the row body.
    for match in re.finditer(r'<tr\s+([^>]*data-cena="[^"]*"[^>]*)>(.*?)</tr>', html, re.DOTALL):
        attrs_str = match.group(1)
        row_content = match.group(2)

        # Extract all data attributes independently of their order.
        cena = re.search(r'data-cena="(\d+)"', attrs_str)
        plocha = re.search(r'data-plocha="([\d.]+)"', attrs_str)
        unittype = re.search(r'data-unittype="(\d+)"', attrs_str)
        free = re.search(r'data-free="(yes|no)"', attrs_str)
        project = re.search(r'data-project="(\d+)"', attrs_str)
        transaction = re.search(r'data-transaction="([^"]*)"', attrs_str)
        dispozition = re.search(r'data-dispozition="([^"]*)"', attrs_str)
        location = re.search(r'data-location="([^"]*)"', attrs_str)

        # A row without a price is not a listing row.
        if not cena:
            continue

        # Extract detail URL and unit name from the first link in the row.
        link_match = re.search(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', row_content, re.DOTALL)
        detail_url = link_match.group(1).strip() if link_match else ""
        unit_name = re.sub(r'<[^>]+>', '', link_match.group(2)).strip() if link_match else ""

        # Site links are relative; make them absolute.
        if detail_url and not detail_url.startswith("http"):
            detail_url = BASE_URL + detail_url

        # Extract floor from cells — "3.NP" = 3rd floor, "2.PP" = 2nd basement.
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
        floor = None
        project_name = ""

        for cell in cells:
            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
            np_match = re.search(r'(\d+)\.\s*NP', cell_text)
            pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
            if np_match:
                floor = int(np_match.group(1))
            elif pp_match:
                floor = -int(pp_match.group(1))  # Underground

        # Project name — first textual cell that is not a bare number,
        # a floor, an area, a price, or the unit name itself.
        for cell in cells:
            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
            if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "m²" not in cell_text and "Kč" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text:
                if len(cell_text) > 3 and cell_text != unit_name:
                    project_name = cell_text
                    break

        listing = {
            "price": int(cena.group(1)),
            "area": float(plocha.group(1)) if plocha else 0,
            "unittype": int(unittype.group(1)) if unittype else 0,
            "free": free.group(1) if free else "no",
            "project_id": project.group(1) if project else "",
            "transaction": transaction.group(1) if transaction else "",
            "disposition": dispozition.group(1) if dispozition else "",
            "location": location.group(1) if location else "",
            "url": detail_url,
            "unit_name": unit_name,
            "floor": floor,
            "project_name": project_name,
        }
        listings.append(listing)

    return listings
||||
def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
    """Extract project GPS coordinates from a locality page.

    The coordinates live inside a JS array literal of the shape
    ['<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name'].

    Args:
        html: Raw HTML/JS of the locality page.

    Returns:
        Mapping of project name -> (lat, lon); empty when no marker
        literal is present. Duplicate names keep the last occurrence.
    """
    marker_re = re.compile(
        r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'"
    )
    return {
        hit.group(1).strip(): (float(hit.group(2)), float(hit.group(3)))
        for hit in marker_re.finditer(html)
    }
||||
def scrape():
    """Run the full CityHome scrape: download, geocode and filter units.

    Three phases:
      1. Fetch the filter page and parse every unit row.
      2. Derive project slugs from detail URLs and fetch GPS coordinates
         from each project's /lokalita page (best effort, errors logged).
      3. Filter to available apartments for sale matching the configured
         disposition / price / area / floor criteria, dropping units
         whose project has no GPS fix.

    Returns:
        A list of result dicts (one per matching flat) ready to be
        serialized to byty_cityhome.json.
    """
    print("=" * 60)
    print("Stahuji inzeráty z CityHome (city-home.cz)")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print("=" * 60)

    # Step 1: Fetch the main filter page
    print("\nFáze 1: Stahování seznamu bytů...")
    html = fetch_url(f"{BASE_URL}/filtr-nemovitosti1")
    all_listings = parse_filter_page(html)
    print(f" Nalezeno: {len(all_listings)} jednotek")

    # Step 2: Collect unique project slugs from detail URLs to fetch GPS
    print("\nFáze 2: Stahování GPS souřadnic projektů...")
    project_slugs = set()
    for listing in all_listings:
        url = listing.get("url", "")
        # e.g. /projekty/zateckych-14/nabidka-nemovitosti/byt-a31
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        if slug_match:
            project_slugs.add(slug_match.group(1))

    # Fetch GPS for each project from locality pages.
    # A failed/missing lookup is logged, not fatal — the affected units
    # are later counted under excluded_no_gps.
    project_gps = {}
    for slug in sorted(project_slugs):
        time.sleep(0.5)  # polite delay between requests
        try:
            locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
            loc_html = fetch_url(locality_url)
            gps = extract_project_gps(loc_html)
            if gps:
                # Take first entry (the project itself)
                first_name, (lat, lon) = next(iter(gps.items()))
                project_gps[slug] = (lat, lon)
                print(f" ✓ {slug}: {lat}, {lon}")
            else:
                print(f" ✗ {slug}: GPS nenalezeno")
        except Exception as e:
            print(f" ✗ {slug}: chyba ({e})")

    # Step 3: Filter listings, tallying every exclusion reason for the
    # summary printed below.
    print(f"\nFáze 3: Filtrování...")
    results = []
    excluded_sold = 0
    excluded_type = 0
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_floor = 0
    excluded_no_gps = 0

    for listing in all_listings:
        # Only available units
        if listing["free"] != "yes":
            excluded_sold += 1
            continue

        # Only apartments (unittype=2)
        if listing["unittype"] != 2:
            excluded_type += 1
            continue

        # Only sales (shares the "type" counter with the unittype check)
        if listing["transaction"] != "prodej":
            excluded_type += 1
            continue

        # Disposition
        disp = listing["disposition"]
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue

        # Price (zero/negative means price not listed)
        price = listing["price"]
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            continue

        # Area
        area = listing["area"]
        if area < MIN_AREA:
            excluded_area += 1
            continue

        # Floor — unknown floor (None) is deliberately allowed through
        floor = listing["floor"]
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue

        # GPS from project
        url = listing.get("url", "")
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        slug = slug_match.group(1) if slug_match else ""
        gps = project_gps.get(slug)

        if not gps:
            excluded_no_gps += 1
            continue

        lat, lon = gps

        result = {
            "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
            "name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
            "price": price,
            "price_formatted": format_price(price),
            "locality": f"{listing['project_name']}, Praha",
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": "Cihlová",  # CityHome renovates brick buildings
            "ownership": "neuvedeno",
            "url": url,
            "source": "cityhome",
            "image": "",
        }
        results.append(result)

    print(f"\n{'=' * 60}")
    print(f"Výsledky CityHome:")
    print(f" Celkem jednotek: {len(all_listings)}")
    print(f" Vyloučeno (prodáno): {excluded_sold}")
    print(f" Vyloučeno (typ): {excluded_type}")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results
||||
if __name__ == "__main__":
    # Script entry point: run the scrape and persist results as JSON.
    started_at = time.time()
    found = scrape()

    if not found:
        print("\nŽádné byty z CityHome neodpovídají kritériím :(")
    else:
        json_path = Path("byty_cityhome.json")
        payload = json.dumps(found, ensure_ascii=False, indent=2)
        json_path.write_text(payload, encoding="utf-8")
        elapsed = time.time() - started_at
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
Reference in New Issue
Block a user