Problem description: I tried to scrape a website with Selenium, but I was not successful because I could not locate the web elements. I need to collect product information from 6 to 10 different websites daily. This is the website: https://threil.com/casey-rec-menu/?dtch ... D=Products.
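If the menu is rendered inside an embedded iframe (an assumption suggested by the dtche parameter, not something verified here), Selenium cannot locate its elements until the driver switches into that frame. A minimal diagnostic sketch, separate from the full script below, with the frame and product-card selectors as placeholders:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Edge()
driver.get("https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products")
wait = WebDriverWait(driver, 30)

# Print every iframe's src to see whether the menu is embedded in one
for frame in driver.find_elements(By.TAG_NAME, "iframe"):
    print(frame.get_attribute("src"))

# If a menu iframe exists, switch into it before locating product elements
# (the selectors below are placeholders, not the site's confirmed markup)
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe[src*='menu']")))
cards = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[class*='full-card']")))
print(f"Found {len(cards)} product cards")
driver.switch_to.default_content()  # return to the top-level page afterwards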
Code snippet:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
import time # To manage delays
# Set up Selenium WebDriver (using Microsoft Edge)
edge_options = Options()
# Initialize WebDriver
service = Service("C:\\Users\\iyush\\Documents\\VS Code\\Selenium\\msedgedriver.exe")
driver = webdriver.Edge(service=service, options=edge_options)
# URL of the website to scrape
url = "https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products"
driver.get(url)
# WebDriver Wait
wait = WebDriverWait(driver, 30)
# Handle ad close button
try:
    close_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "terpli-close")))
    close_button.click()
except TimeoutException:
    print("Ad close button not found. Continuing...")
# Handle cookie consent
try:
    accept_button = wait.until(EC.element_to_be_clickable((By.ID, "wt-cli-accept-all-btn")))
    accept_button.click()
except TimeoutException:
    print("Cookie consent button not found. Continuing...")
# Initialize storage
product_list = []
current_page = 1
# Lists to store scraped data
names = []
brand_names = []
brand_links = []
strains = []
potencys = []
prices = []
effects = []
while True:  # Loop through pages
    products = []
    # Get all product elements
    product_elements = driver.find_elements(By.CLASS_NAME, "full-card_Wrapper-sc-11z5u35-0")
    for product in product_elements:
        try:
            product_url = product.find_element(By.TAG_NAME, "a").get_attribute("href")
            products.append(product_url)
        except Exception as e:
            print(f"Error extracting product link: {e}")
            continue
    # Open each product in a new tab, scrape details, then close tab
    for product_url in products:
        driver.execute_script(f"window.open('{product_url}', '_blank');")
        driver.switch_to.window(driver.window_handles[-1])  # Switch to new tab
        try:
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))  # Ensure page loads
            product_soup = BeautifulSoup(driver.page_source, 'html.parser')
            # Scrape the product data
            name = driver.find_element(By.CSS_SELECTOR, "h1[data-testid='product-name']")
            names.append(name.text.strip())
            brand_element = driver.find_element(By.CSS_SELECTOR, "a[href]")
            brand = driver.find_element(By.CSS_SELECTOR, ".typography__Brand-sc-1q7gvs8-2.fyoohd")
            brand_names.append(brand.text.strip())
            brand_links.append(brand_element.get_attribute('href'))
            strain = driver.find_element(By.CSS_SELECTOR, "span[data-testid='info-chip']")
            strains.append(strain.text.strip())
            potencies = driver.find_elements(By.CSS_SELECTOR, "span.info-chip__InfoChipText-sc-11n9ujc-0")
            # Extract text and remove anything before ':'
            potency_values = [p.text.split(":")[-1].strip() for p in potencies]
            # Keep only the second chip's value; adjust the index if the chip order differs
            potency_text = potency_values[1]
            potencys.append(potency_text)
            price = driver.find_element(By.CSS_SELECTOR, "div.price__PriceText-sc-diymzm-2")
            prices.append(price.text.strip())
            effect_elements = driver.find_elements(By.CSS_SELECTOR, "span.effect-tile__Text-sc-1as4rkm-1")
            effects.append(", ".join([e.text.strip() for e in effect_elements]))
            # Store one row per product (scalar values, not the running lists)
            product_data = {
                "name": name.text.strip(),
                "brand_name": brand.text.strip(),
                "brand_link": brand_element.get_attribute('href'),
                "strain": strain.text.strip(),
                "potency": potency_text,
                "price": price.text.strip(),
                "effects": ", ".join([e.text.strip() for e in effect_elements])
            }
            product_list.append(product_data)
        except Exception as e:
            print(f"Error scraping product details: {e}")
        driver.close()
        driver.switch_to.window(driver.window_handles[0])  # Switch back to main tab
print(f"Page {current_page} scraped successfully.")
# Click the next page button if available
try:
next_button = driver.find_element(By.CSS_SELECTOR, 'button[aria-label*="next page"]')
next_button.click()
current_page += 1
time.sleep(5) # Allow the next page to load
except NoSuchElementException:
print("No more pages found. Exiting loop.")
break
# Check if data was scraped
if product_list:
    # Build a DataFrame with one row per product
    df = pd.DataFrame(product_list)
    # Save DataFrame to CSV (append mode; write the header only if the file does not exist yet)
    df.to_csv("thriveil_products.csv", mode='a', header=not pd.io.common.file_exists("thriveil_products.csv"), index=False)
    print("Scraping completed. Data saved to 'thriveil_products.csv'.")
else:
    print("No data to save.")
# Save DataFrame to Excel
#df = pd.DataFrame(product_list)
#df.to_excel("thriveil_products.xlsx", index=False)
#print("Scraping completed. Data saved to 'thriveil_products.xlsx'.")
driver.quit()
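For reference, the row-per-product pattern used above can be exercised on its own. This is a minimal sketch with made-up sample rows and a hypothetical output path, independent of Selenium:

import os
import pandas as pd

# Hypothetical sample rows; in the scraper each dict comes from one product page
rows = [
    {"name": "Product A", "brand_name": "Brand X", "strain": "Hybrid",
     "potency": "20%", "price": "$25.00", "effects": "Relaxed, Happy"},
    {"name": "Product B", "brand_name": "Brand Y", "strain": "Sativa",
     "potency": "18%", "price": "$30.00", "effects": "Energetic"},
]

df = pd.DataFrame(rows)  # one row per dict, columns taken from the keys
out_path = "sample_products.csv"
# Append on later runs; write the header only when the file does not exist yet
df.to_csv(out_path, mode="a", header=not os.path.exists(out_path), index=False)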