I have been trying to scrape a website with Selenium, but I was unsuccessful because I am unable to detect its web elements
Posted: 02 Feb 2025, 07:02
Problem description: I have been trying to scrape a website with Selenium, but I have had no success because I am unable to locate the web elements. I need to collect product information from 6 to 10 different websites every day. This is the website: https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products
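One thing I am not sure about is whether the product cards are rendered inside an embedded iframe (the dtche parameter in the URL looks like it might come from a third-party menu embed), in which case find_element calls against the top-level document would never see them. The sketch below is only a diagnostic; the selectors in it are assumptions that would need to be confirmed in the browser's DevTools:

Code:

from selenium import webdriver
from selenium.webdriver.common.by import By

# Assumes Selenium 4.6+, so Selenium Manager can resolve the Edge driver by itself.
driver = webdriver.Edge()
driver.get("https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products")

# List every iframe on the page; if the product cards live inside one of them,
# the top-level document will not contain the card elements at all.
for frame in driver.find_elements(By.TAG_NAME, "iframe"):
    print(frame.get_attribute("id"), frame.get_attribute("src"))

# If a menu iframe shows up, switch into it before locating the cards:
#   driver.switch_to.frame(frame)
# and switch back with driver.switch_to.default_content() when done.

driver.quit()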
Code snippet:
Code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
import os    # To check whether the CSV file already exists
import time  # To manage delays
# Set up Selenium WebDriver (using Microsoft Edge)
edge_options = Options()
# Initialize WebDriver
service = Service("C:\\Users\\iyush\\Documents\\VS Code\\Selenium\\msedgedriver.exe")
driver = webdriver.Edge(service=service, options=edge_options)
# URL of the website to scrape
url = "https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products"
driver.get(url)
# WebDriver Wait
wait = WebDriverWait(driver, 30)
# Handle ad close button
try:
    close_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "terpli-close")))
    close_button.click()
except TimeoutException:
    print("Ad close button not found. Continuing...")

# Handle cookie consent
try:
    accept_button = wait.until(EC.element_to_be_clickable((By.ID, "wt-cli-accept-all-btn")))
    accept_button.click()
except TimeoutException:
    print("Cookie consent button not found. Continuing...")
# Initialize storage
product_list = []
current_page = 1
# Lists to store scraped data
names = []
brand_names = []
brand_links = []
strains = []
potencys = []
prices = []
effects = []
while True:  # Loop through pages
    products = []

    # Get all product elements on the current page
    product_elements = driver.find_elements(By.CLASS_NAME, "full-card_Wrapper-sc-11z5u35-0")
    for product in product_elements:
        try:
            product_url = product.find_element(By.TAG_NAME, "a").get_attribute("href")
            products.append(product_url)
        except Exception as e:
            print(f"Error extracting product link: {e}")
            continue

    # Open each product in a new tab, scrape details, then close the tab
    for product_url in products:
        driver.execute_script(f"window.open('{product_url}', '_blank');")
        driver.switch_to.window(driver.window_handles[-1])  # Switch to new tab
        try:
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))  # Ensure page loads
            product_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Scrape the product data
            name = driver.find_element(By.CSS_SELECTOR, "h1[data-testid='product-name']")
            names.append(name.text.strip())

            brand_element = driver.find_element(By.CSS_SELECTOR, "a[href]")  # First link on the page
            brand = driver.find_element(By.CSS_SELECTOR, ".typography__Brand-sc-1q7gvs8-2.fyoohd")
            brand_names.append(brand.text.strip())
            brand_links.append(brand_element.get_attribute('href'))

            strain = driver.find_element(By.CSS_SELECTOR, "span[data-testid='info-chip']")
            strains.append(strain.text.strip())

            potencies = driver.find_elements(By.CSS_SELECTOR, "span.info-chip__InfoChipText-sc-11n9ujc-0")
            # Extract the text after ':' from each chip, then keep the second chip's value
            potency_values = [p.text.split(":")[-1].strip() for p in potencies]
            potency_text = potency_values[1]
            potencys.append(potency_text)

            price = driver.find_element(By.CSS_SELECTOR, "div.price__PriceText-sc-diymzm-2")
            prices.append(price.text.strip())

            effect_elements = driver.find_elements(By.CSS_SELECTOR, "span.effect-tile__Text-sc-1as4rkm-1")
            effects.append(", ".join([e.text.strip() for e in effect_elements]))

            # Store this product as one row (scalar values, not the whole lists)
            product_data = {
                "name": names[-1],
                "brand_name": brand_names[-1],
                "brand_link": brand_links[-1],
                "strain": strains[-1],
                "potency": potencys[-1],
                "price": prices[-1],
                "effects": effects[-1],
            }
            product_list.append(product_data)
        except Exception as e:
            print(f"Error scraping product details: {e}")
        finally:
            driver.close()
            driver.switch_to.window(driver.window_handles[0])  # Switch back to main tab

    print(f"Page {current_page} scraped successfully.")

    # Click the next page button if available
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, 'button[aria-label*="next page"]')
        next_button.click()
        current_page += 1
        time.sleep(5)  # Allow the next page to load
    except NoSuchElementException:
        print("No more pages found. Exiting loop.")
        break
# Check if data was scraped
if product_list:
    # Build a DataFrame with one row per product
    df = pd.DataFrame(product_list)
    # Save DataFrame to CSV (append mode; write the header only if the file does not exist yet)
    df.to_csv("thriveil_products.csv", mode='a', header=not os.path.exists("thriveil_products.csv"), index=False)
    print("Scraping completed. Data saved to 'thriveil_products.csv'.")
else:
    print("No data to save.")

# Save DataFrame to Excel
#df = pd.DataFrame(product_list)
#df.to_excel("thriveil_products.xlsx", index=False)
#print("Scraping completed. Data saved to 'thriveil_products.xlsx'.")
driver.quit()
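A side note on the pagination step: the fixed time.sleep(5) could be replaced by an explicit wait that watches the product grid change. This is only a rough sketch, assuming the next page tears down the old cards and re-renders them with the same class name as above:

Code:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_next_page(driver, old_first_card, timeout=30):
    """Wait until the previous page's first card goes stale and new cards appear."""
    wait = WebDriverWait(driver, timeout)
    wait.until(EC.staleness_of(old_first_card))  # old grid removed from the DOM
    wait.until(EC.presence_of_all_elements_located(
        (By.CLASS_NAME, "full-card_Wrapper-sc-11z5u35-0")))  # new grid rendered

Here old_first_card would be something like product_elements[0], captured before clicking the next-page button.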