Python-Web-Scraping – Massen-Download verlinkter Dateien von der AAER-Seite der SEC, Fehler 403 (Forbidden)
Posted: 07 Jan 2025, 13:30
Ich habe versucht, 300 verlinkte Dateien von der AAER-Seite der SEC herunterzuladen. Die meisten Links verweisen auf PDFs, aber einige führen zu Webseiten, die ich als PDF speichern müsste, anstatt sie einfach herunterzuladen. Ich bringe mir gerade etwas Python-Web-Scraping bei, und dies schien keine allzu schwierige Aufgabe zu sein, aber ich konnte den 403-Fehler beim Herunterladen nicht überwinden.
Dieser Code funktioniert einwandfrei, um die Links zu den Dateien und den 4-stelligen Code auszulesen, nach dem ich die Dateien benennen möchte:
Wenn ich aber so etwas wie das Folgende versuche, gelingen die Downloads nicht:
An diesem Punkt wäre es schneller gewesen, die Dateien manuell herunterzuladen, aber ich möchte wissen, was ich falsch mache. Ich habe versucht, den User-Agent-Header festzulegen und den Benutzerklick mit Selenium zu simulieren. Vielen Dank für jeden Rat!
Dieser Code funktioniert einwandfrei, um die Links zu den Dateien und den 4-stelligen Code auszulesen, nach dem ich die Dateien benennen möchte:
Code: Select all
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import requests

# Scrape the SEC AAER listing pages with Selenium and collect, for every table
# row, the link to the release and the AAER number used later as file name.
# The result is the module-level list `pdf_data` of
# {'link': ..., 'aaer_number': ...} dictionaries.

# Set up Chrome options to allow direct PDF download (for the download step)
download_path = "C:/Users/taylo/Downloads/sec_aaer_downloads"
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": download_path,  # Specify your preferred download directory
    "download.prompt_for_download": False,  # Disable download prompt
    "plugins.always_open_pdf_externally": True,  # Download PDFs instead of opening them in Chrome's viewer
    "safebrowsing.enabled": False,  # Disable Chrome's safe browsing check that can block downloads
    "profile.default_content_settings.popups": 0,  # Disable popups
})

# Set up the webdriver with options
# NOTE(review): the `executable_path` keyword is not accepted by newer
# Selenium 4 releases (a Service object is used instead) -- confirm against the
# installed Selenium version.
driver = webdriver.Chrome(executable_path="C:/chromedriver/chromedriver", options=chrome_options)

# URLs for pages 1, 2, and 3
# TODO: the page URLs are missing from this listing; add the three AAER
# listing-page URLs here, otherwise the loop below has nothing to scrape.
urls = []

# XPath of the table rows that hold one release each
ROWS_XPATH = '//*[@id="block-uswds-sec-content"]/div/div/div[3]/div/table/tbody/tr'

# Initialize an empty list to store the URLs and AAER numbers
pdf_data = []

try:
    # Loop through each URL (pages 1, 2, and 3)
    for url in urls:
        print(f"Scraping URL: {url}...")
        # Load the page; without this the wait below inspects a blank tab
        driver.get(url)

        # Wait for the table rows containing links to be loaded
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, ROWS_XPATH + '[1]'))
        )

        # Extract the link and AAER number from each row on the current page
        rows = driver.find_elements(By.XPATH, ROWS_XPATH)
        for row in rows:
            try:
                # Link: anchor inside the first <div> of the row's second cell
                link_element = row.find_element(By.XPATH, './/td[2]/div[1]/a')
                link_href = link_element.get_attribute('href')

                # AAER text: second <span> inside the second <div> of the same cell
                aaer_text_element = row.find_element(By.XPATH, './/td[2]/div[2]/span[2]')
                aaer_text = aaer_text_element.text
                # Keep the first whitespace-delimited token after "AAER-"
                aaer_number = aaer_text.split("AAER-")[1].split()[0]

                # Store the data in a list of dictionaries
                pdf_data.append({'link': link_href, 'aaer_number': aaer_number})
            except Exception as e:
                # Best effort: a row without the expected link/number is reported and skipped
                print(f"Error extracting data from row: {e}")
finally:
    # Always close the browser, even if a page fails to load
    driver.quit()

# Print the scraped data (optional for verification)
for entry in pdf_data:
    print(f"Link: {entry['link']}, AAER Number: {entry['aaer_number']}")
Code: Select all
import os
import time
import requests

# Download every file referenced in `pdf_data` (built by the scraping step)
# with `requests` and save it as "<aaer_number>.pdf" in `download_path`.

# Set the download path
download_path = "C:/Users/taylo/Downloads/sec_aaer_downloads"
os.makedirs(download_path, exist_ok=True)

# NOTE(review): this browser-style User-Agent is the one that yields the 403
# responses described in the post. sec.gov reportedly expects automated
# clients to send a User-Agent that identifies the requester -- confirm
# against the SEC's published access guidelines before changing it.
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Loop through each entry in the pdf_data list
for entry in pdf_data:
    # Extract the PDF link and AAER number (outside the try so the error
    # message below can always refer to the current aaer_number)
    link_href = entry['link']
    aaer_number = entry['aaer_number']

    try:
        # Send a GET request to download the PDF; stream=True avoids loading
        # the whole body into memory at once
        pdf_response = requests.get(link_href, stream=True, headers=REQUEST_HEADERS)

        # Check if the request was successful
        if pdf_response.status_code == 200:
            # Save the PDF to the download folder, using the AAER number as the filename
            pdf_file_path = os.path.join(download_path, f"{aaer_number}.pdf")
            with open(pdf_file_path, "wb") as pdf_file:
                for chunk in pdf_response.iter_content(chunk_size=8192):
                    pdf_file.write(chunk)
            print(f"Downloaded: {aaer_number}.pdf")
        else:
            print(f"Failed to download the file from {link_href}, status code: {pdf_response.status_code}")
    except Exception as e:
        # Best effort: report the failure and continue with the next entry
        print(f"Error downloading the PDF for AAER {aaer_number}: {e}")