I have been trying to scrape a website with Selenium, but I was unsuccessful because I am unable to detect its web elements
Posted: 02 Feb 2025, 07:02
Problem description: I have been trying to scrape a website with Selenium, but I have had no success because I am unable to locate the web elements. I need to collect product information from 6 to 10 different websites every day. This is the website: https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products
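One thing I am not sure about is whether the product cards are rendered inside an embedded iframe (the dtche parameter in the URL looks like it might come from a third-party menu embed), in which case find_element calls against the top-level document would never see them. The sketch below is only a diagnostic; the selectors in it are assumptions that would need to be confirmed in the browser's DevTools:

Code:

from selenium import webdriver
from selenium.webdriver.common.by import By

# Assumes Selenium 4.6+, so Selenium Manager can resolve the Edge driver by itself.
driver = webdriver.Edge()
driver.get("https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products")

# List every iframe on the page; if the product cards live inside one of them,
# the top-level document will not contain the card elements at all.
for frame in driver.find_elements(By.TAG_NAME, "iframe"):
    print(frame.get_attribute("id"), frame.get_attribute("src"))

# If a menu iframe shows up, switch into it before locating the cards:
#   driver.switch_to.frame(frame)
# and switch back with driver.switch_to.default_content() when done.

driver.quit()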
Code snippet:
Code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
import os    # To check whether the CSV file already exists
import time  # To manage delays
# Set up Selenium WebDriver (using Microsoft Edge)
edge_options = Options()
# Initialize WebDriver
service = Service("C:\\Users\\iyush\\Documents\\VS Code\\Selenium\\msedgedriver.exe")
driver = webdriver.Edge(service=service, options=edge_options)
# URL of the website to scrape
url = "https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products"
driver.get(url)
# WebDriver Wait
wait = WebDriverWait(driver, 30)
# Handle ad close button
try:
    close_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "terpli-close")))
    close_button.click()
except TimeoutException:
    print("Ad close button not found. Continuing...")

# Handle cookie consent
try:
    accept_button = wait.until(EC.element_to_be_clickable((By.ID, "wt-cli-accept-all-btn")))
    accept_button.click()
except TimeoutException:
    print("Cookie consent button not found. Continuing...")
# Initialize storage
product_list = []
current_page = 1
# Lists to store scraped data
names = []
brand_names = []
brand_links = []
strains = []
potencys = []
prices = []
effects = []
while True:  # Loop through pages
    products = []

    # Get all product elements on the current page
    product_elements = driver.find_elements(By.CLASS_NAME, "full-card_Wrapper-sc-11z5u35-0")
    for product in product_elements:
        try:
            product_url = product.find_element(By.TAG_NAME, "a").get_attribute("href")
            products.append(product_url)
        except Exception as e:
            print(f"Error extracting product link: {e}")
            continue

    # Open each product in a new tab, scrape details, then close the tab
    for product_url in products:
        driver.execute_script(f"window.open('{product_url}', '_blank');")
        driver.switch_to.window(driver.window_handles[-1])  # Switch to new tab
        try:
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))  # Ensure page loads
            product_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Scrape the product data
            name = driver.find_element(By.CSS_SELECTOR, "h1[data-testid='product-name']")
            names.append(name.text.strip())

            brand_element = driver.find_element(By.CSS_SELECTOR, "a[href]")  # First link on the page
            brand = driver.find_element(By.CSS_SELECTOR, ".typography__Brand-sc-1q7gvs8-2.fyoohd")
            brand_names.append(brand.text.strip())
            brand_links.append(brand_element.get_attribute('href'))

            strain = driver.find_element(By.CSS_SELECTOR, "span[data-testid='info-chip']")
            strains.append(strain.text.strip())

            potencies = driver.find_elements(By.CSS_SELECTOR, "span.info-chip__InfoChipText-sc-11n9ujc-0")
            # Extract the text after ':' from each chip, then keep the second chip's value
            potency_values = [p.text.split(":")[-1].strip() for p in potencies]
            potency_text = potency_values[1]
            potencys.append(potency_text)

            price = driver.find_element(By.CSS_SELECTOR, "div.price__PriceText-sc-diymzm-2")
            prices.append(price.text.strip())

            effect_elements = driver.find_elements(By.CSS_SELECTOR, "span.effect-tile__Text-sc-1as4rkm-1")
            effects.append(", ".join([e.text.strip() for e in effect_elements]))

            # Store this product as one row (scalar values, not the whole lists)
            product_data = {
                "name": names[-1],
                "brand_name": brand_names[-1],
                "brand_link": brand_links[-1],
                "strain": strains[-1],
                "potency": potencys[-1],
                "price": prices[-1],
                "effects": effects[-1],
            }
            product_list.append(product_data)
        except Exception as e:
            print(f"Error scraping product details: {e}")
        finally:
            driver.close()
            driver.switch_to.window(driver.window_handles[0])  # Switch back to main tab

    print(f"Page {current_page} scraped successfully.")

    # Click the next page button if available
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, 'button[aria-label*="next page"]')
        next_button.click()
        current_page += 1
        time.sleep(5)  # Allow the next page to load
    except NoSuchElementException:
        print("No more pages found. Exiting loop.")
        break
# Check if data was scraped
if product_list:
    # Build a DataFrame with one row per product
    df = pd.DataFrame(product_list)
    # Save DataFrame to CSV (append mode; write the header only if the file does not exist yet)
    df.to_csv("thriveil_products.csv", mode='a', header=not os.path.exists("thriveil_products.csv"), index=False)
    print("Scraping completed. Data saved to 'thriveil_products.csv'.")
else:
    print("No data to save.")

# Save DataFrame to Excel
#df = pd.DataFrame(product_list)
#df.to_excel("thriveil_products.xlsx", index=False)
#print("Scraping completed. Data saved to 'thriveil_products.xlsx'.")
driver.quit()
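A side note on the pagination step: the fixed time.sleep(5) could be replaced by an explicit wait that watches the product grid change. This is only a rough sketch, assuming the next page tears down the old cards and re-renders them with the same class name as above:

Code:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_next_page(driver, old_first_card, timeout=30):
    """Wait until the previous page's first card goes stale and new cards appear."""
    wait = WebDriverWait(driver, timeout)
    wait.until(EC.staleness_of(old_first_card))  # old grid removed from the DOM
    wait.until(EC.presence_of_all_elements_located(
        (By.CLASS_NAME, "full-card_Wrapper-sc-11z5u35-0")))  # new grid rendered

Here old_first_card would be something like product_elements[0], captured before clicking the next-page button.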