Why can't I extract information from the listing? (Python)

Post by Guest »

I'm trying to extract the EPC rating from the individual listings. The rating only becomes visible once you click through to a listing. Every time I run my script, the extraction fails; what is going wrong? Even though I tried increasing the wait until the main content has loaded, I still ran into the same problem. Could it be that the headless browser fails to load the page?
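
One way to rule out the headless-browser theory is to dump what the browser actually rendered right after the page loads. This is only a diagnostic sketch, separate from the scraper itself; the output file names are arbitrary:

Code: Select all

from undetected_chromedriver import Chrome

# Diagnostic: capture what the (headless) browser actually rendered.
with Chrome() as driver:
    driver.get("https://www.zoopla.co.uk/house-prices/england/")
    driver.save_screenshot("page_state.png")  # visual state of the page
    with open("page_source.html", "w", encoding="utf-8") as f:
        f.write(driver.page_source)  # raw HTML as the browser sees it

If the screenshot shows a consent wall or a bot check instead of listings, the selectors will have nothing to match. The full script that shows the problem: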

Code: Select all

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from undetected_chromedriver import Chrome
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from typing import Iterator
import pandas as pd

# Constants
URL = "https://www.zoopla.co.uk/house-prices/england/?new_homes=include&q=england+&orig_q=united+kingdom&view_type=list&pn=1"
TIMEOUT = 5

# Helper function to extract text from a WebElement
def etext(e: WebElement) -> str:
    if e:
        if t := e.text.strip():
            return t
        if (p := e.get_property("textContent")) and isinstance(p, str):
            return p.strip()
    return ""

# Click a WebElement
def click(driver: WebDriver, e: WebElement) -> None:
    ActionChains(driver).click(e).perform()

# Get all WebElements that match the given CSS selector
def get_all(driver: WebDriver, css: str) -> Iterator[WebElement]:
    wait = WebDriverWait(driver, TIMEOUT)
    sel = (By.CSS_SELECTOR, css)
    try:
        yield from wait.until(EC.presence_of_all_elements_located(sel))
    except TimeoutException:
        pass  # Yield nothing if no element appears within TIMEOUT

# Click the "Next" button for pagination
def click_next(driver: WebDriver) -> None:
    for a in get_all(driver, "a[aria-live=polite] > div > div:nth-child(2)"):
        if etext(a) == "Next":
            click(driver, a)
            break

# Handle the cookie consent popup, which lives inside a shadow DOM
def click_through(driver: WebDriver) -> None:
    try:
        shadow_root = driver.find_element(By.ID, "usercentrics-root").shadow_root
        # element_to_be_clickable() only searches the main document, so the
        # button has to be polled through the shadow root instead.
        button = WebDriverWait(driver, TIMEOUT).until(
            lambda _: shadow_root.find_element(
                By.CSS_SELECTOR, "button[data-testid=uc-deny-all-button]"
            )
        )
        click(driver, button)
    except Exception:
        pass  # Ignore if the cookie popup isn't present

# Scrape the EPC rating from an individual listing page
def get_epc_rating(driver: WebDriver, listing_url: str) -> str:
    driver.get(listing_url)  # Open the property details page
    try:
        epc_element = WebDriverWait(driver, TIMEOUT).until(
            EC.presence_of_element_located((
                By.CSS_SELECTOR,
                ".main-content .z3kgis3 ._1vhryas0 ._8lgu4x1 div:nth-child(3) div",
            ))
        )
        return etext(epc_element)  # Extract the EPC rating text
    except TimeoutException:
        return "N/A"  # No EPC rating found (or the page did not load in time)

# Scrape data from the search results page
def scrape_page(driver: WebDriver) -> list[dict]:
    results_url = driver.current_url  # Remember the results page for later
    result = []
    for house in get_all(driver, "div[data-testid=result-item]"):
        try:
            listing_url = house.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
            address = etext(house.find_element(By.CSS_SELECTOR, "h2"))
            date_sold = etext(house.find_element(By.CSS_SELECTOR, "._1hzil3o9._1hzil3o8._194zg6t7"))
            house_type = etext(house.find_element(By.CSS_SELECTOR, "div._1pbf8i52 p"))
            num_rooms = etext(house.find_element(By.CSS_SELECTOR, "._1pbf8i51 div:nth-child(2) p"))
            tenure = etext(house.find_element(By.CSS_SELECTOR, ".agepcz0 div:nth-child(1) div"))
            square_foot = etext(house.find_element(By.CSS_SELECTOR, ".agepcz0 div:nth-child(2) div"))

            result.append({
                "Address": address,
                "Date Last Sold": date_sold,
                "Property Type": house_type,
                "Number of Rooms": num_rooms,
                "Tenure": tenure,
                "Square Foot": square_foot,
                "EPC Rating": "N/A",  # Filled in below
                "Listing URL": listing_url,
            })
        except NoSuchElementException:
            continue  # Skip result cards with missing elements

    # Visit the listing pages only *after* all result cards have been read:
    # driver.get() replaces the DOM, so any WebElement still held in the loop
    # above would otherwise go stale and raise StaleElementReferenceException.
    for row in result:
        row["EPC Rating"] = get_epc_rating(driver, row["Listing URL"])

    driver.get(results_url)  # Go back so pagination can continue
    return result

# Main script execution
if __name__ == "__main__":
    with Chrome() as driver:
        driver.get(URL)
        click_through(driver)  # Handle cookies

        all_results = []
        prev_url = ""
        npages = 0

        # Stop once the URL no longer changes (pagination exhausted or
        # blocked, e.g. by Cloudflare)
        while prev_url != driver.current_url:
            prev_url = driver.current_url
            all_results.extend(scrape_page(driver))
            click_next(driver)
            npages += 1

        # Convert results to a DataFrame
        df = pd.DataFrame(all_results)

        # Display results
        print(df)
        print(f"Processed {npages} pages")

        # Save to CSV
        df.to_csv("zoopla_data.csv", index=False)
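
If the rating still comes back as "N/A", it can help to test the detail-page selector in isolation before running the full crawl. A minimal sketch as a separate throwaway script, assuming the functions above are available; the listing URL is a placeholder and must be replaced with a real one copied from the results page:

Code: Select all

# Isolation test: run get_epc_rating() against a single listing page.
if __name__ == "__main__":
    with Chrome() as driver:
        # Placeholder URL: substitute a real listing copied from the site.
        test_url = "https://www.zoopla.co.uk/house-prices/details/PLACEHOLDER"
        print("EPC rating:", get_epc_rating(driver, test_url))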
