I tried to scrape a website with Selenium, but I was unsuccessful because I am unable to detect its web elements

Post by Anonymous »

Problem description: I have tried to scrape a website with Selenium, but I was not successful because I was unable to detect the web elements. I need to collect product information from 6 to 10 different websites daily. This is the website: https://threil.com/casey-rec-menu/?dtch ... D=Products.
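
One thing I have not been able to confirm is whether the product menu is rendered inside an embedded iframe (the dtche parameter in the URL looks like a third-party menu embed). If it is, the driver would not see any product elements until it switches into that frame. A minimal sketch of that check, where the iframe src filter is only a guess on my part and would need to match the real embed:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Edge()
driver.get("https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products")
wait = WebDriverWait(driver, 30)

# List every iframe on the page to see whether the menu lives inside one.
for frame in driver.find_elements(By.TAG_NAME, "iframe"):
    print(frame.get_attribute("src"))

# If it does, switch into the frame before locating product cards.
# The selector below is hypothetical; it must match the actual embed URL.
wait.until(EC.frame_to_be_available_and_switch_to_it(
    (By.CSS_SELECTOR, "iframe[src*='menu']")))
cards = wait.until(EC.presence_of_all_elements_located(
    (By.CSS_SELECTOR, "[class*='full-card']")))
print(f"Found {len(cards)} product cards inside the frame")

# Return to the top-level document afterwards.
driver.switch_to.default_content()
driver.quit()
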
Code snippet:


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
import time  # To manage delays

# Set up Selenium WebDriver (using Microsoft Edge)
edge_options = Options()

# Initialize WebDriver
service = Service("C:\\Users\\iyush\\Documents\\VS Code\\Selenium\\msedgedriver.exe")
driver = webdriver.Edge(service=service, options=edge_options)

# URL of the website to scrape
url = "https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products"
driver.get(url)

# WebDriver Wait
wait = WebDriverWait(driver, 30)

# Handle ad close button
try:
    close_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "terpli-close")))
    close_button.click()
except TimeoutException:
    print("Ad close button not found. Continuing...")

# Handle cookie consent
try:
    accept_button = wait.until(EC.element_to_be_clickable((By.ID, "wt-cli-accept-all-btn")))
    accept_button.click()
except TimeoutException:
    print("Cookie consent button not found. Continuing...")

# Initialize storage
product_list = []
current_page = 1

# Lists to store scraped data
names = []
brand_names = []
brand_links = []
strains = []
potencys = []
prices = []
effects = []

while True:  # Loop through pages
    products = []

    # Get all product elements
    product_elements = driver.find_elements(By.CLASS_NAME, "full-card_Wrapper-sc-11z5u35-0")

    for product in product_elements:
        try:
            product_url = product.find_element(By.TAG_NAME, "a").get_attribute("href")
            products.append(product_url)
        except Exception as e:
            print(f"Error extracting product link: {e}")
            continue

    # Open each product in a new tab, scrape details, then close tab
    for product_url in products:
        driver.execute_script(f"window.open('{product_url}', '_blank');")
        driver.switch_to.window(driver.window_handles[-1])  # Switch to new tab

        try:
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))  # Ensure page loads
            product_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Scrape the product data
            name = driver.find_element(By.CSS_SELECTOR, "h1[data-testid='product-name']")
            names.append(name.text.strip())

            brand_element = driver.find_element(By.CSS_SELECTOR, "a[href]")
            brand = driver.find_element(By.CSS_SELECTOR, ".typography__Brand-sc-1q7gvs8-2.fyoohd")
            brand_names.append(brand.text.strip())
            brand_links.append(brand_element.get_attribute('href'))

            strain = driver.find_element(By.CSS_SELECTOR, "span[data-testid='info-chip']")
            strains.append(strain.text.strip())

            potencies = driver.find_elements(By.CSS_SELECTOR, "span.info-chip__InfoChipText-sc-11n9ujc-0")
            # Extract text and remove anything before ':'
            potency_values = [p.text.split(":")[-1].strip() for p in potencies]
            # Join them as a single string (optional, useful for CSV)
            potency_text = potency_values[1]
            potencys.append(potency_text)

            price = driver.find_element(By.CSS_SELECTOR, "div.price__PriceText-sc-diymzm-2")
            prices.append(price.text.strip())

            effect_elements = driver.find_elements(By.CSS_SELECTOR, "span.effect-tile__Text-sc-1as4rkm-1")
            effects.append(", ".join([e.text.strip() for e in effect_elements]))

            product_data = {
                "name": names,
                "brand_name": brand_names,
                "brand_link": brand_links,
                "strain": strains,
                "potency": potencys,
                "price": prices,
                "effects": effects
            }
            product_list.append(product_data)

        except Exception as e:
            print(f"Error scraping product details: {e}")

        driver.close()
        driver.switch_to.window(driver.window_handles[0])  # Switch back to main tab

    print(f"Page {current_page} scraped successfully.")

    # Click the next page button if available
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, 'button[aria-label*="next page"]')
        next_button.click()
        current_page += 1
        time.sleep(5)  # Allow the next page to load
    except NoSuchElementException:
        print("No more pages found. Exiting loop.")
        break
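
# (Sketch, not part of the current run: instead of the fixed time.sleep(5)
# above, the page change could be awaited explicitly by waiting for one of
# the old product cards to go stale after clicking the next-page button:
#
#     old_card = product_elements[0]
#     next_button.click()
#     wait.until(EC.staleness_of(old_card))
#     current_page += 1
#
# EC.staleness_of is part of selenium.webdriver.support.expected_conditions.)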

# Check if data was scraped
if product_list:
    # Initialize DataFrame with the scraped data
    df = pd.DataFrame(product_list)  # Wrap the dict in a list

    # Save DataFrame to CSV (append mode if the file exists)
    df.to_csv("thriveil_products.csv", mode='a', header=not pd.io.common.file_exists("thriveil_products.csv"), index=False)
    print("Scraping completed. Data saved to 'thriveil_products.csv'.")
else:
    print("No data to save.")

# Save DataFrame to Excel
#df = pd.DataFrame(product_list)
#df.to_excel("thriveil_products.xlsx", index=False)
#print("Scraping completed.  Data saved to 'thriveil_products.xlsx'.")

driver.quit()
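
While re-reading the code I also noticed that product_data is built from the running lists (names, brand_names, and so on), so every dict appended to product_list repeats everything scraped so far, and pd.DataFrame(product_list) ends up with whole lists inside each cell. A sketch of what a flat, one-record-per-product collection could look like, reusing the same selectors as above (the helper name is mine):

import pandas as pd
from selenium.webdriver.common.by import By

def scrape_open_product(driver):
    """Return one flat record for the product page currently open in the driver."""
    return {
        "name": driver.find_element(By.CSS_SELECTOR, "h1[data-testid='product-name']").text.strip(),
        "price": driver.find_element(By.CSS_SELECTOR, "div.price__PriceText-sc-diymzm-2").text.strip(),
        "effects": ", ".join(
            e.text.strip()
            for e in driver.find_elements(By.CSS_SELECTOR, "span.effect-tile__Text-sc-1as4rkm-1")
        ),
    }

# Inside the product-tab loop: product_list.append(scrape_open_product(driver))
# After the loop: pd.DataFrame(product_list).to_csv("thriveil_products.csv", index=False)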