Wie erstelle ich ein durchsuchbares PDF mit Python und Selenium? – Python

Python-Programme
Anonymous
 Wie erstelle ich ein durchsuchbares PDF mit Python und Selenium?

Post by Anonymous »

Ich möchte ein Programm wie FireShot (Premium-Version) erstellen, das eine in Chrome (über ChromeDriver) geöffnete Webseite aufnimmt und in ein PDF umwandelt.

Code: Select all

import time
import os
import glob
import base64
from PyPDF2 import PdfMerger
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager

# Set up Selenium WebDriver.
# The options below start Chrome maximized, register "Save as PDF" as the
# recent print destination and pass --kiosk-printing.
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
# The appState value is a JSON string, not a nested dict, so it stays quoted.
prefs = {"printing.print_preview_sticky_settings.appState": '{"recentDestinations":[{"id":"Save as PDF"}]}'}
options.add_experimental_option("prefs", prefs)
options.add_argument("--kiosk-printing")  # Auto confirm print
# The chromedriver path comes from webdriver_manager's ChromeDriverManager().install().
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Load the webpage
driver.get("https://www.coursera.org/?authMode=login")

# Complete the sign-in and you are redirected to another page.
# NOTE(review): nothing here waits for that sign-in; presumably it is done by
# hand while the script is paused or run interactively -- confirm.

# Switch to the second entry in the window-handle list.
# NOTE(review): assumes the sign-in opened a second window/tab; if only one
# handle exists this indexing raises IndexError.
driver.switch_to.window(driver.window_handles[1])

def save_pdf(driver, file_name):
    """Print the driver's current page to a PDF file.

    Sends the ``Page.printToPDF`` DevTools command with portrait orientation
    and a paper size of 8.27 x 11.69 (the command takes inches, i.e. A4),
    then base64-decodes the ``data`` field of the response and writes the
    bytes to ``file_name``.

    Args:
        driver: Object exposing ``execute_cdp_cmd(command, params)``, e.g. a
            Selenium Chrome WebDriver.
        file_name: Path of the PDF file to create (overwritten if present).

    Raises:
        KeyError: If the command response has no ``data`` entry.
        OSError: If ``file_name`` cannot be opened for writing.
    """
    params = {'landscape': False, 'paperWidth': 8.27, 'paperHeight': 11.69}
    data = driver.execute_cdp_cmd("Page.printToPDF", params)
    # The PDF arrives base64-encoded, so decode it before writing raw bytes.
    with open(file_name, 'wb') as file:
        file.write(base64.b64decode(data['data']))

def scroll_and_save(driver, scrollable_xpath, output_prefix):
    """Save one PDF per scroll step of a scrollable element.

    Locates the element at ``scrollable_xpath``, then repeatedly calls
    ``save_pdf`` and advances the element's ``scrollTop`` by one
    ``clientHeight`` until the scroll position no longer changes.

    Args:
        driver: Selenium WebDriver used to find the element, run the scroll
            scripts and (through ``save_pdf``) print the page.
        scrollable_xpath: XPath of the scrollable container element.
        output_prefix: Prefix for the generated ``<prefix>_page_<n>.pdf``
            file names.

    Returns:
        List of the PDF file names written, in capture order.
    """
    scrollable_div = driver.find_element(By.XPATH, scrollable_xpath)
    file_list = []
    page_num = 1
    # Seed with the element's real position instead of a -1 sentinel; with
    # the sentinel, an element that cannot scroll at all was captured twice
    # before the loop noticed that the position was not changing.
    last_scroll_position = driver.execute_script("return arguments[0].scrollTop;", scrollable_div)

    while True:
        file_name = f"{output_prefix}_page_{page_num}.pdf"
        # NOTE(review): Page.printToPDF appears to render the document's print
        # layout rather than the visible viewport, so scrolling an inner
        # element may not change what ends up in each PDF -- confirm against
        # the DevTools protocol documentation.
        save_pdf(driver, file_name)
        file_list.append(file_name)

        driver.execute_script("arguments[0].scrollTop += arguments[0].clientHeight;", scrollable_div)
        time.sleep(2)  # Allow time for new content to load

        new_scroll_position = driver.execute_script("return arguments[0].scrollTop;", scrollable_div)
        if new_scroll_position == last_scroll_position:
            break  # Stop when scrolling reaches the end
        last_scroll_position = new_scroll_position
        page_num += 1

    return file_list

def merge_pdfs(file_list, output_file):
    """Merge the given PDFs into one file and delete the inputs.

    Appends every file in ``file_list`` to a ``PdfMerger`` in order and
    writes the result to ``output_file``. The individual input files are
    removed only after the merged file has been written successfully.

    Args:
        file_list: Paths of the PDF files to merge, in the desired order.
        output_file: Path of the merged PDF to create.

    Raises:
        OSError: If an input file cannot be removed during cleanup.
    """
    merger = PdfMerger()
    try:
        for pdf in file_list:
            merger.append(pdf)
        merger.write(output_file)
    finally:
        # Close the merger even if appending or writing fails.
        merger.close()

    # Clean up individual PDF files
    for pdf in file_list:
        os.remove(pdf)

# Output naming and target element for this capture run.
output_prefix = "practical_quiz_1"
scrollable_xpath = "/html/body/div[5]/div/div/div/div[2]/div[2]/div"

# Capture the per-step PDFs, then merge them into "<output_prefix>.pdf".
file_list = scroll_and_save(
    driver,
    scrollable_xpath,
    output_prefix,
)
merge_pdfs(file_list, f"{output_prefix}.pdf")
Das Problem, auf das ich stoße: data = driver.execute_cdp_cmd("Page.printToPDF", params) speichert nur den oberen Teil der Webseite als PDF, obwohl ich nach unten scrolle. Wie kann ich das lösen und dabei die Textsuche im PDF erhalten?

Quick Reply

Change Text Case: 
   
  • Similar Topics
    Replies
    Views
    Last post