Python Web Parsing - Vermeiden Sie es, in eine lokale Version der Website umgeleitet zu werdenPython

Python-Programme
Anonymous
 Python Web Parsing - Vermeiden Sie es, in eine lokale Version der Website umgeleitet zu werden

Post by Anonymous »

Ich verwende den folgenden Code, um Websites zu analysieren:

Code: Select all

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests

def get_navigation_links(url, limit=500, wait_time=5):
    """Collect same-domain navigation links (text, href) from *url*.

    Renders the page with headless Chrome first so JavaScript-built menus
    are included; falls back to a plain requests + BeautifulSoup fetch when
    Selenium fails or finds nothing.

    Parameters
    ----------
    url : str
        Target site; a missing scheme is assumed to be https.
    limit : int
        Maximum number of links returned.
    wait_time : int | float
        Seconds to sleep after page load so JavaScript can render.

    Returns
    -------
    list[tuple[str, str]]
        Up to *limit* unique (link text, href) pairs on the same host.

    Raises
    ------
    ValueError
        If *url* cannot be parsed into a usable URL.
    """

    def _validate_url(url_string):
        """Return *url_string* with an https scheme prepended if missing,
        or None when no host can be extracted."""
        try:
            parsed = urlparse(url_string)
            if not parsed.scheme:
                url_string = "https://" + url_string
                parsed = urlparse(url_string)
            return url_string if parsed.netloc else None
        except ValueError:
            return None

    def _collect(pairs, seen, nav_links):
        """Append unseen same-host (text, href) pairs to *nav_links*."""
        for text, href in pairs:
            if not (href and text):
                continue
            # Strip the port so http://host:8080 links still match.
            if urlparse(href).netloc.split(':')[0] != base_netloc:
                continue
            if href not in seen:
                seen.add(href)
                nav_links.append((text, href))

    def _soup_pairs():
        """Fetch the page without JS; yield (text, href) for every <a>."""
        # Ask for English explicitly so locale-redirecting sites are less
        # likely to bounce us to a regional version of the page.
        resp = requests.get(
            validated_url,
            headers={"Accept-Language": "en-US,en;q=0.9"},
            timeout=30,  # never hang forever on a dead host
        )
        soup = BeautifulSoup(resp.text, 'html.parser')
        for a in soup.find_all('a'):
            yield a.get_text(strip=True), a.get('href')

    validated_url = _validate_url(url)
    if not validated_url:
        raise ValueError("Invalid URL")

    # Host only (no port) — used to keep links on the same domain.
    base_netloc = urlparse(validated_url).netloc.split(':')[0]

    seen = set()
    nav_links = []

    # Try JavaScript-rendered version first (Selenium)
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--window-size=1920,1080")
        # Pin the browser locale; many sites use it to choose a regional
        # redirect target, which is exactly what we are trying to avoid.
        chrome_options.add_argument("--lang=en-US")

        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(validated_url)
            time.sleep(wait_time)  # Allow JS to render

            # BUG FIX: the original condition (`base_netloc in current_url`)
            # was true for ordinary same-site loads, so the "redirect"
            # message fired when nothing was wrong — and a real off-domain
            # redirect was never undone. Detect a host change and reload
            # the originally requested URL instead.
            current_url = driver.current_url
            if urlparse(current_url).netloc.split(':')[0] != base_netloc:
                print(f"Redirect detected: {current_url}. Scraping original URL.")
                driver.get(validated_url)
                time.sleep(wait_time)

            pairs = []
            for a in driver.find_elements(By.TAG_NAME, "a"):
                try:
                    pairs.append((a.text.strip(), a.get_attribute("href")))
                except Exception:
                    continue  # stale/detached element — skip it
            _collect(pairs, seen, nav_links)
        finally:
            driver.quit()  # always release the browser, even on error

        # If no navigation links found via Selenium, use BeautifulSoup
        if not nav_links:
            print("No navigation links found via Selenium. Falling back to BeautifulSoup.")
            _collect(_soup_pairs(), seen, nav_links)

        # Return first N links without filtering by keywords
        return nav_links[:limit]

    except Exception as e:
        print(f"[Selenium failed: {e}] Falling back to BeautifulSoup.")
        # Fallback to BeautifulSoup in case of an error with Selenium
        seen = set()
        nav_links = []
        _collect(_soup_pairs(), seen, nav_links)
        return nav_links[:limit]

Das Problem, mit dem ich konfrontiert bin, ist, dass ich, wenn ich eine Site auswähle (z. B. https://www.nike.com), auf eine lokale Version der Website umgeleitet werde.

Quick Reply

Change Text Case: 
   
  • Similar Topics
    Replies
    Views
    Last post