from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
# Upper bound (seconds) for the plain-HTTP fallback request so it cannot hang.
_REQUEST_TIMEOUT_SECONDS = 10


def _validate_url(url_string):
    """Return *url_string* (with ``https://`` prepended if it had no scheme).

    Returns ``None`` when the value cannot be parsed or has no network
    location (host) part.
    """
    try:
        result = urlparse(url_string)
        if not result.scheme:
            url_string = "https://" + url_string
            result = urlparse(url_string)
    except (ValueError, TypeError, AttributeError):
        # Malformed URL text or a non-string argument.
        return None
    return url_string if result.netloc else None


def _host_of(url):
    """Return the netloc of *url* without a port suffix, or ``None`` if unparsable."""
    try:
        return urlparse(url).netloc.split(':')[0]
    except ValueError:
        # e.g. a broken IPv6 literal in a scraped href; treat as "no host".
        return None


def _collect_same_host_links(pairs, base_netloc, seen, nav_links):
    """Append each unseen ``(text, href)`` pair whose host equals *base_netloc*."""
    for text, href in pairs:
        if not (href and text):
            continue
        if href in seen or _host_of(href) != base_netloc:
            continue
        seen.add(href)
        nav_links.append((text, href))


def _selenium_pairs(a_tags):
    """Yield ``(text, href)`` for each Selenium element, skipping elements that error."""
    for a in a_tags:
        try:
            href = a.get_attribute("href")
            text = a.text.strip()
        except Exception:
            # Best effort: an element may fail to answer (e.g. it went away
            # after a re-render); skip it instead of aborting the whole scrape.
            continue
        yield text, href


def _static_pairs(page_url):
    """Fetch *page_url* with ``requests`` and yield ``(text, absolute_href)`` pairs."""
    from urllib.parse import urljoin

    response = requests.get(page_url, timeout=_REQUEST_TIMEOUT_SECONDS)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Resolve relative hrefs against the URL that was actually served.
    # NOTE(review): a <base href> element in the page is not honoured.
    base_for_join = response.url or page_url
    for a in soup.find_all('a'):
        href = a.get('href')
        text = a.get_text(strip=True)
        if href:
            try:
                href = urljoin(base_for_join, href)
            except ValueError:
                continue
        yield text, href


def get_navigation_links(url, limit=500, wait_time=5):
    """Collect same-host ``(link text, href)`` pairs from the page at *url*.

    The page is first loaded in headless Chrome via Selenium so that
    JavaScript-generated links are visible. If Selenium fails, or yields no
    links, the page is fetched with ``requests`` and parsed with
    BeautifulSoup instead.

    Args:
        url: Page address; ``https://`` is prepended when no scheme is given.
        limit: Maximum number of links returned.
        wait_time: Seconds to sleep after loading the page in Selenium.

    Returns:
        A list of ``(text, href)`` tuples, de-duplicated by href, in document
        order, truncated to *limit* entries. Only links whose host (netloc
        without port) equals the host of *url* and whose text is non-empty
        are included.

    Raises:
        ValueError: If *url* has no host part or cannot be parsed.

    Exceptions raised by the ``requests`` fallback (including a timeout) are
    not caught here and propagate to the caller.
    """
    validated_url = _validate_url(url)
    if not validated_url:
        raise ValueError("Invalid URL")
    base_netloc = urlparse(validated_url).netloc.split(':')[0]

    seen = set()
    nav_links = []

    # Try JavaScript-rendered version first (Selenium)
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--window-size=1920,1080")
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(validated_url)
            time.sleep(wait_time)  # Allow JS to render

            # Informational only: the page that was actually loaded (after
            # any redirect) is what gets scraped below.
            current_url = driver.current_url
            if base_netloc in current_url and current_url != validated_url:
                print(f"Redirect detected: {current_url}. Scraping original URL.")

            a_tags = driver.find_elements(By.TAG_NAME, "a")
            _collect_same_host_links(
                _selenium_pairs(a_tags), base_netloc, seen, nav_links
            )
        finally:
            # Always release the browser process, even when scraping failed.
            driver.quit()
    except Exception as e:
        print(f"[Selenium failed: {e}] Falling back to BeautifulSoup.")
        # Discard anything collected before the failure.
        seen = set()
        nav_links = []
    else:
        if nav_links:
            # Return first N links without filtering by keywords
            return nav_links[:limit]
        print("No navigation links found via Selenium. Falling back to BeautifulSoup.")

    # Static-HTML fallback, run at most once.
    _collect_same_host_links(
        _static_pairs(validated_url), base_netloc, seen, nav_links
    )
    return nav_links[:limit]
Das Problem, mit dem ich konfrontiert bin, ist, dass ich, wenn ich eine Site auswähle (z. B. https://www.nike.com) …
Ich verwende den folgenden Code, um Websites zu analysieren: < /p> [code]from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options import time from urllib.parse import urlparse from bs4 import BeautifulSoup import requests
def get_navigation_links(url, limit=500, wait_time=5): def validate_url(url_string): try: result = urlparse(url_string) if not result.scheme: url_string = "https://" + url_string result = urlparse(url_string) return url_string if result.netloc else None except: return None
validated_url = validate_url(url) if not validated_url: raise ValueError("Invalid URL")
# Try JavaScript-rendered version first (Selenium) try: chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.add_argument("--disable-gpu") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(options=chrome_options) driver.get(validated_url) time.sleep(wait_time) # Allow JS to render
# Check if the current URL after loading is what you expect current_url = driver.current_url if base_netloc in current_url and current_url != validated_url: print(f"Redirect detected: {current_url}. Scraping original URL.")
# Continue scraping the page only if the URL is as expected a_tags = driver.find_elements(By.TAG_NAME, "a") seen = set() nav_links = []
for a in a_tags: try: href = a.get_attribute("href") text = a.text.strip() if href and text and urlparse(href).netloc.split(':')[0] == base_netloc: if href not in seen: seen.add(href) nav_links.append((text, href)) except: continue
driver.quit()
# If no navigation links found via Selenium, use BeautifulSoup if not nav_links: print("No navigation links found via Selenium. Falling back to BeautifulSoup.") soup = BeautifulSoup(requests.get(validated_url).text, 'html.parser') a_tags = soup.find_all('a') for a in a_tags: href = a.get('href') text = a.get_text(strip=True) if href and text and urlparse(href).netloc.split(':')[0] == base_netloc: if href not in seen: seen.add(href) nav_links.append((text, href))
# Return first N links without filtering by keywords return nav_links[:limit]
except Exception as e: print(f"[Selenium failed: {e}] Falling back to BeautifulSoup.") # Fallback to BeautifulSoup in case of an error with Selenium soup = BeautifulSoup(requests.get(validated_url).text, 'html.parser') a_tags = soup.find_all('a') seen = set() nav_links = []
for a in a_tags: href = a.get('href') text = a.get_text(strip=True) if href and text and urlparse(href).netloc.split(':')[0] == base_netloc: if href not in seen: seen.add(href) nav_links.append((text, href))
return nav_links[:limit]
[/code] Das Problem, mit dem ich konfrontiert bin, ist, dass ich, wenn ich eine Site auswähle (z. B. https://www.nike.com) …
Mit dem neuesten Expo 52.0.32 Update habe ich angefangen, meine App zu aktualisieren. Änderungen.
Fehlen mir neue Einstellungen in Expo?
apply plugin: com.android.application
apply plugin:...
Ich leite von dort, aus einer anderen Anwendung, in meine .NET-8-Anwendung um. Ich erhalte die Benutzer-ID und speichere sie in meiner Sitzung. Ich füge die Benutzer-ID den Claims (Ansprüchen) hinzu. using...
Ich habe eine benutzerdefinierte Laravel-Middleware-Klasse erstellt. Daraus gebe ich eine Umleitung zurück:
return redirect()->route('oauth.establishAuth')->with('url.intended', url()->current());...
Ich leite von dort, aus einer anderen Anwendung, in meine .NET-8-Anwendung um. Ich erhalte die Benutzer-ID und speichere sie in meiner Sitzung. Ich füge die Benutzer-ID den Claims (Ansprüchen) hinzu. using...
Ich möchte zu einer URL mit JSON-Antwort umleiten. Wie kann ich das tun? Zum Beispiel:
Bisher habe ich es mit dem folgenden Code versucht:
return redirect(...