Probleme beim Herunterladen von Dateien mit Selenium + Scrapy (Python)

Python-Programme
Anonymous
 Probleme beim Herunterladen von Dateien mit Selenium + Scrapy

Post by Anonymous »

Dieser Code soll einige Dokumente herunterladen, die größtenteils über eine Reihe bestimmter Links erreichbar sind. Was könnte das Problem sein?

Code: Select all

class DownloaderSpider(scrapy.Spider):
    """Spider that drives a Selenium-controlled Chrome instance to locate
    and download PDF documents from JavaScript-rendered pages.

    Flow: ``parse`` fuzzy-matches a summary against the rows/divs of the
    rendered page; if the best match carries a direct ``.pdf`` link it is
    downloaded via a Selenium click, otherwise the match's first link is
    followed to ``parse_next_page``, which downloads the first PDF it finds.

    NOTE(review): ``name``/``start_requests``, ``wait_for_stability`` and
    ``contains_matching_date`` are assumed to be defined elsewhere on this
    class — confirm against the full source.
    """

    # Single source of truth for Chrome's download target.  The original
    # repeated this literal in three places, inviting drift.
    DOWNLOAD_DIR = "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads"

    # Minimum fuzzy-match score to accept the best match.  Kept at 0 to
    # preserve the original behavior ("adjust threshold as needed").
    MIN_MATCH_SCORE = 0

    def __init__(self, *args, **kwargs):
        """Create the spider and a Chrome WebDriver configured to save PDFs
        straight into :attr:`DOWNLOAD_DIR` (no viewer, no save-as prompt)."""
        super().__init__(*args, **kwargs)

        options = webdriver.ChromeOptions()
        prefs = {
            "download.default_directory": self.DOWNLOAD_DIR,
            "download.prompt_for_download": False,
            # Download PDFs instead of opening them in Chrome's viewer.
            "plugins.always_open_pdf_externally": True,
        }
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), options=options
        )

    def closed(self, reason):
        """Scrapy end-of-spider hook: quit the browser.

        The original never released the driver, leaking a Chrome process
        every run.
        """
        self.driver.quit()

    def _render(self, url):
        """Load *url* in Selenium and return an :class:`HtmlResponse` built
        from the rendered DOM, or ``None`` if the page never stabilized.

        Waits up to 15 s for ``document.readyState == "complete"`` and then
        for :meth:`wait_for_stability` (defined elsewhere on this class).
        """
        self.driver.get(url)
        WebDriverWait(self.driver, 15).until(
            lambda driver: driver.execute_script("return document.readyState") == "complete"
        )
        if not self.wait_for_stability():
            self.log("Page did not stabilize in time.")
            return None
        return HtmlResponse(
            url=self.driver.current_url,
            body=self.driver.page_source,
            encoding='utf-8',
        )

    def _download_pdf(self, pdf_link, query):
        """Click the anchor matching *pdf_link* and wait for the file.

        This logic was duplicated verbatim in ``parse`` and
        ``parse_next_page``; extracted so both share one implementation.

        NOTE(review): the wait polls for ``<query>.pdf`` in DOWNLOAD_DIR,
        but Chrome saves the file under the *server's* filename — those
        rarely match, so this loop can time out even when the download
        succeeded.  Confirm the expected name, or poll for any new file.
        """
        try:
            # Use Selenium to click the PDF link and trigger the download.
            pdf_element = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, f"//a[contains(@href, '{pdf_link.split('/')[-1]}')]")
                )
            )
            pdf_element.click()

            # Wait for the file to appear in the download directory.
            local_filename = query.replace(' ', '_') + ".pdf"
            local_filepath = os.path.join(self.DOWNLOAD_DIR, local_filename)

            timeout = 30  # seconds
            start_time = time.time()
            while not os.path.exists(local_filepath):
                if time.time() - start_time > timeout:
                    raise Exception("Download timed out.")
                time.sleep(1)

            self.log(f"Downloaded file {local_filepath}")
        except Exception as e:
            # Best-effort: log and continue with the remaining requests.
            self.log(f"Failed to download file from {pdf_link}: {e}")

    def parse(self, response):
        """Locate the element best matching the expected summary and date,
        then download its PDF or follow its link to a detail page."""
        query = response.meta['query']
        summary = response.meta['summary']
        date = response.meta['date']
        deadline = response.meta['deadline']

        response = self._render(response.url)
        if response is None:
            return

        elements = response.xpath("//tr | //div[not(div)]")
        self.log(f"Found {len(elements)} elements containing text.")

        best_match = None
        highest_score = 0
        for element in elements:
            # string(.) may yield None for empty nodes; the original's bare
            # .get().strip() crashed with AttributeError in that case.
            element_text = (element.xpath("string(.)").get() or "").strip()
            score = fuzz.partial_ratio(summary.lower(), element_text.lower())
            # Accept an element only if it also contains a matching date (or
            # deadline) in any recognized format.
            if score > highest_score and (
                self.contains_matching_date(element_text, date)
                or self.contains_matching_date(element_text, deadline)
            ):
                highest_score = score
                best_match = element

        pdf_link = None
        if best_match and highest_score >= self.MIN_MATCH_SCORE:
            self.log(f"Best match found with score {highest_score}")
            pdf_link = best_match.xpath(".//a[contains(@href, '.pdf')]/@href").get()
            if pdf_link:
                self.log(f"Found PDF link: {pdf_link}")

        if pdf_link:
            self._download_pdf(response.urljoin(pdf_link), query)
        else:
            self.log("No direct PDF link found, checking for next page link.")
            next_page = best_match.xpath(".//a/@href").get() if best_match else None
            if next_page:
                next_page = response.urljoin(next_page)
                self.log(f"Following next page link: {next_page}")
                yield scrapy.Request(next_page, self.parse_next_page, meta={'query': query})

    def parse_next_page(self, response):
        """On a follow-up page, download the first PDF link found."""
        query = response.meta['query']

        response = self._render(response.url)
        if response is None:
            return

        pdf_link = response.xpath("//a[contains(@href, '.pdf')]/@href").get()
        if pdf_link:
            self._download_pdf(response.urljoin(pdf_link), query)
        else:
            self.log("No PDF link found on next page.")
< /code>
Die Pipeline lautet wie folgt: Der Code erstellt und konfiguriert den Selenium-Treiber und navigiert zu den Links aus dem .csv-Dokument. Dort sucht er den wahrscheinlichsten Eintrag auf der Seite; ist dies kein Download-Link, navigiert er zur verlinkten Seite, sucht dort den ersten Download-Link und lädt ihn herunter (bzw. versucht es).
Hier sind die Protokolle, die zu dem (fehlgeschlagenen) Download gehören:
2025-03-29 17:31:14 [scrapy.downloadermiddlewares.retry] ERROR: Gave up retrying  (failed 6 times): []
2025-03-29 17:31:14 [scrapy.core.scraper] ERROR: Error downloading 
Traceback (most recent call last):
File "C:\Users\marti\Downloads\Web Scraper\venv\Lib\site-packages\twisted\internet\defer.py", line 2013, in _inlineCallbacks
result = context.run(
cast(Failure, result).throwExceptionIntoGenerator, gen
)
File "C:\Users\marti\Downloads\Web Scraper\venv\Lib\site-packages\twisted\python\failure.py", line 467, in throwExceptionIntoGenerator
return g.throw(self.value.with_traceback(self.tb))
~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\marti\Downloads\Web Scraper\venv\Lib\site-packages\scrapy\core\downloader\middleware.py", line 68, in process_request
return (yield download_func(request, spider))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
twisted.web._newclient.ResponseNeverReceived: []
2025-03-29 17:31:14 [scrapy.core.engine] INFO: Closing spider (finished)
Eine Vorstellung davon, was das Problem sein könnte?

Quick Reply

Change Text Case: 
   
  • Similar Topics
    Replies
    Views
    Last post