Python Web Parsing - Vermeiden Sie es, in eine lokale Version der Website umgeleitet zu werdenPython

Python-Programme
Anonymous
 Python Web Parsing - Vermeiden Sie es, in eine lokale Version der Website umgeleitet zu werden

Post by Anonymous »

Ich verwende den folgenden Code, um Websites zu analysieren:

Code: Select all

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests

def get_navigation_links(url, limit=500, wait_time=5):
    """Collect same-domain navigation links (text, href) from *url*.

    Renders the page with headless Chrome first so JavaScript-built menus
    are included; falls back to a plain requests + BeautifulSoup fetch when
    Selenium fails or finds nothing.

    Parameters
    ----------
    url : str
        Target site; a missing scheme is assumed to be https.
    limit : int
        Maximum number of links returned.
    wait_time : int | float
        Seconds to sleep after page load so JavaScript can render.

    Returns
    -------
    list[tuple[str, str]]
        Up to *limit* unique (link text, href) pairs on the same host.

    Raises
    ------
    ValueError
        If *url* cannot be parsed into a usable URL.
    """

    def _validate_url(url_string):
        """Return *url_string* with an https scheme prepended if missing,
        or None when no host can be extracted."""
        try:
            parsed = urlparse(url_string)
            if not parsed.scheme:
                url_string = "https://" + url_string
                parsed = urlparse(url_string)
            return url_string if parsed.netloc else None
        except ValueError:
            return None

    def _collect(pairs, seen, nav_links):
        """Append unseen same-host (text, href) pairs to *nav_links*."""
        for text, href in pairs:
            if not (href and text):
                continue
            # Strip the port so http://host:8080 links still match.
            if urlparse(href).netloc.split(':')[0] != base_netloc:
                continue
            if href not in seen:
                seen.add(href)
                nav_links.append((text, href))

    def _soup_pairs():
        """Fetch the page without JS; yield (text, href) for every <a>."""
        # Ask for English explicitly so locale-redirecting sites are less
        # likely to bounce us to a regional version of the page.
        resp = requests.get(
            validated_url,
            headers={"Accept-Language": "en-US,en;q=0.9"},
            timeout=30,  # never hang forever on a dead host
        )
        soup = BeautifulSoup(resp.text, 'html.parser')
        for a in soup.find_all('a'):
            yield a.get_text(strip=True), a.get('href')

    validated_url = _validate_url(url)
    if not validated_url:
        raise ValueError("Invalid URL")

    # Host only (no port) — used to keep links on the same domain.
    base_netloc = urlparse(validated_url).netloc.split(':')[0]

    seen = set()
    nav_links = []

    # Try JavaScript-rendered version first (Selenium)
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--window-size=1920,1080")
        # Pin the browser locale; many sites use it to choose a regional
        # redirect target, which is exactly what we are trying to avoid.
        chrome_options.add_argument("--lang=en-US")

        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(validated_url)
            time.sleep(wait_time)  # Allow JS to render

            # BUG FIX: the original condition (`base_netloc in current_url`)
            # was true for ordinary same-site loads, so the "redirect"
            # message fired when nothing was wrong — and a real off-domain
            # redirect was never undone. Detect a host change and reload
            # the originally requested URL instead.
            current_url = driver.current_url
            if urlparse(current_url).netloc.split(':')[0] != base_netloc:
                print(f"Redirect detected: {current_url}. Scraping original URL.")
                driver.get(validated_url)
                time.sleep(wait_time)

            pairs = []
            for a in driver.find_elements(By.TAG_NAME, "a"):
                try:
                    pairs.append((a.text.strip(), a.get_attribute("href")))
                except Exception:
                    continue  # stale/detached element — skip it
            _collect(pairs, seen, nav_links)
        finally:
            driver.quit()  # always release the browser, even on error

        # If no navigation links found via Selenium, use BeautifulSoup
        if not nav_links:
            print("No navigation links found via Selenium. Falling back to BeautifulSoup.")
            _collect(_soup_pairs(), seen, nav_links)

        # Return first N links without filtering by keywords
        return nav_links[:limit]

    except Exception as e:
        print(f"[Selenium failed: {e}] Falling back to BeautifulSoup.")
        # Fallback to BeautifulSoup in case of an error with Selenium
        seen = set()
        nav_links = []
        _collect(_soup_pairs(), seen, nav_links)
        return nav_links[:limit]

Das Problem, mit dem ich konfrontiert bin, ist, dass ich, wenn ich eine Site auswähle (z. B. https://www.nike.com), auf eine lokale Version der Website umgeleitet werde.

Quick Reply

Change Text Case: 
   
  • Similar Topics
    Replies
    Views
    Last post