Read from a PDF and copy it into Excel cells exactly as formatted (Python)


Post by Anonymous »

I'm using Python with pdfplumber to read a PDF file and copy its content into an Excel spreadsheet. It works fine, but I want to keep the formatting and indentation and strip the extraneous CR/LFs from the PDF. Here is my script:
# Script to copy PDF sections into Excel tabs using fuzzy matching
import os
import sys
import pdfplumber
from openpyxl import load_workbook
from rapidfuzz import fuzz, process
import re as _re

PDF_DIR = 'files'
EXCEL_PATH = os.path.join(
    PDF_DIR, '_Information Security Policy Documents.xlsx')

# Verbosity flag (enable with --verbose CLI arg)
VERBOSE = '--verbose' in sys.argv

def log(msg):
    """Print debug message when verbose mode is active."""
    if VERBOSE:
        print(msg)

# Section headers to look for
SECTION_HEADERS = [
    'Minimum Requirements',
    'Supporting Guidance',
    'Related Controls',
    'References',
    'Overlay Requirements'
]

CONTROL_ID_PATTERN = r'^[A-Z]{2,3}-\d{2}(?:\([0-9]{2}\))?'

# Normalization utilities for flexible section header detection

def _normalize_header(text: str) -> str:
    """Lowercase, remove punctuation, collapse spaces for comparison."""
    return _re.sub(r'[^a-z0-9]+', ' ', text.lower()).strip()

_SECTION_NORMAL_MAP = {_normalize_header(h): h for h in SECTION_HEADERS}
# Add simple singular variants and common typos
_ADDITIONAL_HEADER_ALIASES = {
    'related control': 'Related Controls',
    'overlay requirement': 'Overlay Requirements',
    'supporting guidances': 'Supporting Guidance',
}
for k, v in _ADDITIONAL_HEADER_ALIASES.items():
    _SECTION_NORMAL_MAP.setdefault(k, v)

def extract_controls_with_sections(pdf_path, section_headers):
    """Parse PDF into mapping: control_id -> title + per-section text."""
    import re
    controls = {}
    current_control = None
    current_section = None
    with pdfplumber.open(pdf_path) as pdf:
        lines = []
        # Skip first 3 pages (table of contents / blanks)
        for page_index, page in enumerate(pdf.pages):
            if page_index < 3:
                continue
            page_text = page.extract_text() or ''
            for raw_line in page_text.splitlines():
                line = raw_line.strip()
                if re.fullmatch(r'\d+', line):  # skip pure page numbers
                    continue
                lines.append(line)

    # Build normalization map fresh in case caller passed different headers
    section_map = {h.lower(): h for h in section_headers}
    norm_map = {_normalize_header(h): h for h in section_headers}
    # Include global aliases
    for k, v in _SECTION_NORMAL_MAP.items():
        norm_map.setdefault(k, v)
    for line in lines:
        if not line:
            continue
        m = re.match(rf'({CONTROL_ID_PATTERN})\s+(.*)', line)
        if m:
            control_id = m.group(1).upper()
            title = m.group(2).strip()
            controls.setdefault(control_id, {'title': title, 'sections': {}})
            current_control = control_id
            current_section = None
            log(f"Detected control {control_id} - {title}")
            continue
        # Prepare a candidate for header detection (support inline forms):
        # e.g., "Supporting Guidance: Per directive" -> header part only
        raw_lower = line.lower()
        split_inline = re.split(r'[:\-]\s+', raw_lower, maxsplit=1)
        possible_header_part = split_inline[0]
        # Remove numbering prefixes (numeric / roman) from the possible header
        candidate = re.sub(
            r'^[0-9ivxld]+[).\-:\s]+', '', possible_header_part,
            flags=re.IGNORECASE
        ).rstrip(':').strip()
        norm_candidate = _normalize_header(candidate)
        header_match = None
        if raw_lower.rstrip(':') in section_map:
            header_match = section_map[raw_lower.rstrip(':')]
        elif norm_candidate in norm_map:
            header_match = norm_map[norm_candidate]
        else:
            # Fallback fuzzy attempt (only if reasonably short to avoid noise)
            if 5 <= len(norm_candidate) <= 40:
                best, score, _ = process.extractOne(
                    norm_candidate, list(norm_map.keys()),
                    scorer=fuzz.token_sort_ratio
                )
                if score >= 90:
                    header_match = norm_map[best]
        if header_match and current_control:
            current_section = header_match
            bucket = controls[current_control]['sections'].setdefault(
                current_section, []
            )
            # Keep any text that followed the header on the same line
            if len(split_inline) > 1:
                inline_text = split_inline[1].strip()
                if inline_text:
                    bucket.append(inline_text)
            log(f" Start section: {current_section} for {current_control}")
            continue
        if current_control and current_section:
            controls[current_control]['sections'][current_section].append(line)

    for cid, data in controls.items():
        for sec, parts in data['sections'].items():
            data['sections'][sec] = '\n'.join(parts).strip()
    return controls

def extract_sections_from_pdf(pdf_path, section_headers):
    """Legacy whole-document section extraction (unused in main flow)."""
    sections = {}
    with pdfplumber.open(pdf_path) as pdf:
        # Skip first 3 pages here as well for consistency
        collected = []
        for idx, page in enumerate(pdf.pages):
            if idx < 3:
                continue
            collected.append(page.extract_text() or '')
        text = "\n".join(collected)
    import re
    lines = text.splitlines()
    header_indices = []
    for idx, line in enumerate(lines):
        for header in section_headers:
            if re.fullmatch(
                rf"{re.escape(header)}\s*[:\-–—]*",
                line.strip(),
                re.IGNORECASE,
            ):
                header_indices.append((idx, header))
    header_indices.sort()
    header_indices.append((len(lines), None))
    for i in range(len(header_indices) - 1):
        start_idx, header = header_indices[i]
        end_idx, _ = header_indices[i + 1]
        if header:
            section_lines = lines[start_idx + 1:end_idx]
            cleaned_lines = [
                ln for ln in section_lines
                if not re.fullmatch(r'\s*\d+\s*', ln)
            ]
            text_block = '\n'.join(cleaned_lines).strip()
            log(f"Legacy extracted {header} length={len(text_block)}")
            sections[header] = text_block
    return sections

def get_excel_sheetnames(wb):
    return wb.sheetnames

def fuzzy_match(name, choices):
    match, score, _ = process.extractOne(
        name, choices, scorer=fuzz.token_sort_ratio
    )
    return match, score

def main():
    wb = load_workbook(EXCEL_PATH)
    sheetnames = get_excel_sheetnames(wb)
    pdf_files = [
        f for f in os.listdir(PDF_DIR) if f.lower().endswith('.pdf')
    ]
    print(f"Found {len(pdf_files)} PDF files.")
    for pdf_file in pdf_files:
        pdf_stem = os.path.splitext(pdf_file)[0]
        best_sheet, score = fuzzy_match(pdf_stem, sheetnames)
        if score < 50:
            print(f"Skip {pdf_file}: low sheet match score {score}")
            continue
        print(
            f"Processing PDF '{pdf_file}' -> sheet '{best_sheet}' "
            f"(score {score})"
        )
        controls = extract_controls_with_sections(
            os.path.join(PDF_DIR, pdf_file), SECTION_HEADERS
        )
        log(f"Parsed {len(controls)} controls from {pdf_file}")
        # Verbose per-control section presence summary
        if VERBOSE and controls:
            log("Control section matrix (✓=found, -=missing):")
            header_keys = SECTION_HEADERS
            for cid, data in sorted(controls.items()):
                flags = []
                for h in header_keys:
                    present = (
                        h in data['sections'] and
                        bool(data['sections'][h].strip())
                    )
                    flags.append('✓' if present else '-')
                log(f" {cid}: {' '.join(flags)}")
            log(f"Legend order: {' | '.join(header_keys)}")
        ws = wb[best_sheet]
        header_row = [cell.value for cell in ws[1]]
        # Ensure section header columns exist
        for header in SECTION_HEADERS:
            if header not in header_row:
                ws.cell(row=1, column=len(header_row) + 1, value=header)
                header_row.append(header)
        # Build column index map
        col_index = {h: header_row.index(h) + 1 for h in SECTION_HEADERS}
        # Build row map: control_id -> row number
        row_map = {}
        for row in ws.iter_rows(min_row=2, max_row=ws.max_row):
            cid = str(row[0].value).strip().upper() if row[0].value else ''
            if cid:
                row_map[cid] = row[0].row
        updated_cells = 0
        matched_controls = 0
        for control_id, data in controls.items():
            row_num = row_map.get(control_id)
            if not row_num:
                continue
            matched_controls += 1
            for section_header, text in data['sections'].items():
                if not text:
                    continue
                col = col_index.get(section_header)
                if not col:
                    continue
                ws.cell(row=row_num, column=col, value=text)
                updated_cells += 1
        print(
            f"Sheet '{best_sheet}': matched {matched_controls} controls; "
            f"wrote {updated_cells} cells from '{pdf_file}'."
        )
    # Save once after all PDFs processed
    updated_path = EXCEL_PATH.replace('.xlsx', '_updated.xlsx')
    wb.save(updated_path)
    print(f"Saved updated workbook to {updated_path}")
    print("Done.")

if __name__ == "__main__":
    main()
So basically I do a fuzzy match of the PDF's file name against the sheet names in the Excel document, and then update the cells that correspond to the headers extracted from the PDF (matching the SECTION_HEADERS list). But I need to keep the formatting from the PDF, which looks like this:

a. this stuff for section a
1. stuff under it here
2. more here
3. more, etc
a. more here
b. even more
a) or could be this
b) or this
* or this
* or even this
The asterisks are bullets. I need to prefix every 1. 2. 3. that comes after an a. with 5 spaces, and any a. b. c. that comes after a 1. or 2. or 3. with 10 spaces, and the same goes for a) b) or bullets. I can't think of an easy way to do it.
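One way to get that indentation back is to post-process each section's text before it is written to the cell: match the list markers with regular expressions and prefix nested items with the desired number of spaces. Below is a minimal sketch of that idea; the marker patterns, the 5/10-space widths, and the reindent_section helper name are assumptions to adapt to your documents, not something taken from the script above.

import re

# Hypothetical marker patterns - adjust them to what your PDFs actually use.
LETTER_ITEM = re.compile(r'^[a-z][.)]\s')   # a.  b.  a)  b)
DIGIT_ITEM = re.compile(r'^\d+[.)]\s')      # 1.  2.  3)
BULLET_ITEM = re.compile(r'^[*\u2022]\s')   # *  or  •

def reindent_section(text, digit_indent=5, letter_indent=10):
    """Prefix nested list markers with spaces (the widths are assumptions).

    Sketch of the rule described above: numbered items and bullets that sit
    under a letter item get digit_indent spaces; letter items that sit under
    a numbered item get letter_indent spaces.
    """
    out = []
    in_numbered = False  # True while we are below a numbered/bullet item
    for raw in text.splitlines():
        line = raw.strip()
        if DIGIT_ITEM.match(line) or BULLET_ITEM.match(line):
            in_numbered = True
            out.append(' ' * digit_indent + line)
        elif LETTER_ITEM.match(line):
            # letter item: deeper indent when it follows a numbered item,
            # otherwise treat it as a new top-level item
            out.append((' ' * letter_indent if in_numbered else '') + line)
        else:
            # plain continuation text: assume the nested list has ended
            in_numbered = False
            out.append(line)
    return '\n'.join(out)

In extract_controls_with_sections this could be applied where the section parts are joined, for example data['sections'][sec] = reindent_section('\n'.join(parts)). Two related things worth checking: openpyxl only displays embedded line breaks if wrap text is enabled on the cell (cell.alignment = Alignment(wrap_text=True), with Alignment imported from openpyxl.styles), and recent pdfplumber releases accept page.extract_text(layout=True), which tries to reproduce the page layout including leading spaces, so some of the original indentation may survive extraction in the first place.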

    Last post