Scraping von Amazon-Verkäuferinformationen mit Selenium: Geschäftsname/Telefonnummer können nicht extrahiert werden [geschlossen]Python

Python-Programme
Anonymous
 Scraping von Amazon-Verkäuferinformationen mit Selenium: Geschäftsname/Telefonnummer können nicht extrahiert werden [geschlossen]

Post by Anonymous »

Problembeschreibung:
Ich versuche, den Firmennamen und die Telefonnummer des Verkäufers von Amazon -Produktseiten mit Selenium und BeautifulSoup zu extrahieren. Mein Code navigiert zum Verkäuferprofil, aber es kann den Firmennamen und die Telefonnummer nicht abrufen.

Code: Select all

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException  # Import TimeoutException
from bs4 import BeautifulSoup

# ScraperAPI proxy configuration
# PROXY = "scraperapi.follow_redirect=false.output_format=csv.autoparse=true.country_code=us:f@proxy-server.scraperapi.com:8001"

def get_page_content(url, proxy=None):
    """Load an Amazon search results page in headless Chrome and parse it.

    Args:
        url: The search results page URL to fetch.
        proxy: Optional "host:port" proxy string. The original code read a
            module-level PROXY constant that is commented out, which raises
            NameError; the proxy is now an optional parameter instead and the
            argument is only added when one is supplied.

    Returns:
        A BeautifulSoup of the rendered page, or None when the product
        elements never appeared (timeout / CAPTCHA).
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    if proxy:
        options.add_argument(f'--proxy-server=https://{proxy}')
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)

        # Wait (up to 20 s) until the product link elements are loaded.
        WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'a.a-link-normal.s-line-clamp-2.s-link-style.a-text-normal')
            )
        )

        return BeautifulSoup(driver.page_source, 'html.parser')
    except TimeoutException:
        print(f"Timeout occurred while loading {url}.")
        # Amazon frequently serves a CAPTCHA page to headless scrapers;
        # detect that case so the user knows why the wait timed out.
        if "captcha" in driver.page_source.lower():
            print("CAPTCHA detected. You may need to solve it manually or use a CAPTCHA-solving service.")
        return None
    finally:
        # Always release the browser process, success or failure.
        driver.quit()

def _find_labeled_value(section, label):
    """Return the stripped text of the span that follows the bold label span
    containing *label*, or None when the label or its sibling is absent.

    Guarding the sibling lookup matters: ``find_next_sibling`` returns None
    when the value is laid out differently, and calling ``.text`` on it was
    the silent AttributeError in the original implementation.
    """
    for span in section.find_all('span', class_='a-text-bold'):
        if label in span.text:
            sibling = span.find_next_sibling('span')
            return sibling.text.strip() if sibling else None
    return None


def extract_seller_info(driver, product_url):
    """Open a product page, follow its "sold by" link, and scrape the
    seller's business name and phone number from the seller profile.

    Args:
        driver: A running selenium WebDriver (reused across products).
        product_url: The Amazon product page URL.

    Returns:
        Tuple ``(business_name, phone_number)``. ``business_name`` is None
        when missing, ``phone_number`` defaults to ``'N/A'``; ``(None, None)``
        is returned when navigation or parsing fails entirely.
    """
    wait = WebDriverWait(driver, 10)
    driver.get(product_url)

    try:
        # The "sold by" trigger links to the seller profile page.
        sold_by_element = wait.until(
            EC.presence_of_element_located((By.ID, "sellerProfileTriggerId"))
        )
        driver.get(sold_by_element.get_attribute('href'))

        # The detailed seller section is rendered client-side; wait for it
        # before snapshotting the page source.
        wait.until(EC.presence_of_element_located((By.ID, 'page-section-detail-seller-info')))

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        detailed_info_section = soup.find('div', id='page-section-detail-seller-info')

        business_name = None
        phone_number = 'N/A'

        if detailed_info_section:
            business_name = _find_labeled_value(detailed_info_section, 'Business Name:')
            # NOTE(review): these labels are the English ones — on localized
            # Amazon storefronts they differ; verify against the target site.
            phone = _find_labeled_value(detailed_info_section, 'Phone Number:')
            if phone is not None:
                phone_number = phone

        return business_name, phone_number
    except Exception as e:
        # Broad by design: selenium raises several exception types here and
        # one bad product page must not abort the whole crawl.
        print(f"Error extracting seller info: {e}")
        return None, None

def search_amazon(page_limit):
    """Collect product URLs from the first *page_limit* Amazon search pages.

    Args:
        page_limit: Number of search result pages to fetch.

    Returns:
        A de-duplicated list of absolute product URLs in discovery order.
    """
    base_url = "https://www.amazon.com/s?k=earbuds&page={}"
    seen = set()
    product_urls = []

    for page in range(1, page_limit + 1):
        print(f"Fetching page {page}...")
        soup = get_page_content(base_url.format(page))

        if soup is None:
            print(f"Failed to fetch content for page {page}.  Skipping...")
            continue

        for anchor in soup.select('a.a-link-normal.s-line-clamp-2.s-link-style.a-text-normal'):
            # .get() avoids a KeyError on anchors without an href attribute.
            href = anchor.get('href')
            if not href:
                continue
            # hrefs can be relative or absolute depending on the page layout;
            # the original unconditionally prefixed the domain, producing
            # broken "https://www.amazon.comhttps://..." URLs.
            url = href if href.startswith('http') else "https://www.amazon.com" + href
            if url not in seen:
                seen.add(url)
                product_urls.append(url)

    return product_urls

def main():
    """Prompt for a page count, gather product URLs, then print each
    product's seller business name and phone number."""
    page_limit = int(input("Enter the number of pages to scrape: "))
    product_urls = search_amazon(page_limit)

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    # NOTE(review): the original added f'--proxy-server=https://{PROXY}', but
    # PROXY is commented out at module level, so this raised NameError before
    # any scraping started. Re-enable the line below once PROXY is defined:
    # options.add_argument(f'--proxy-server=https://{PROXY}')
    driver = webdriver.Chrome(options=options)

    try:
        for url in product_urls:
            business_name, phone_number = extract_seller_info(driver, url)
            print(f"Product URL: {url}")
            print(f"Business Name: {business_name}")
            print(f"Phone Number: {phone_number}")
            print("-" * 80)
    finally:
        # Ensure the browser is closed even when a product page errors out.
        driver.quit()


if __name__ == "__main__":
    main()
Was passiert:

Das Skript navigiert erfolgreich zur Verkäuferprofilseite, aber Firmenname und Telefonnummer werden nicht extrahiert. Beim manuellen Überprüfen der Verkäuferseite ist zu sehen, dass die Daten vorhanden sind (z. B. im Abschnitt „Geschäftsdetails“).

Wie kann ich die Felder „Firmenname“ und „Telefonnummer“ zuverlässig finden?

Quick Reply

Change Text Case: 
   
  • Similar Topics
    Replies
    Views
    Last post