Word zu Excel mit Python – Word-Formatierung (Aufzählungszeichen) und Struktur erhalten

Anonymous · Post by **Anonymous** » 20 Apr 2025, 06:01

Skript zum Umwandeln von Word-Dokumenten in Excel. Es funktioniert gut, behält aber weder die Struktur noch bestimmte Zeichen wie Aufzählungszeichen bei.
import tkinter as tk
from tkinter import filedialog
import re
import os
import subprocess
from docx import Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.text.paragraph import Paragraph
from docx.table import Table
from docx.oxml.ns import qn
from openpyxl import Workbook
import difflib

# --- Extraction switches ---
# Emit ordinary (non-heading) paragraphs when True.
EXTRACT_PARAGRAPHS = False
# Emit tables when True.
EXTRACT_TABLES = True
# When True, an official heading is only output once a data block follows it.
ONLY_INCLUDE_HEADINGS_WITH_DATA = True
# Header texts that always start a section immediately.
CUSTOM_HEADERS = ["Document Info:"]

# --- Per-section overrides ---
# Maps a lower-cased heading text (leading numbering removed) to the set of
# block types that may be extracted inside that section.
CUSTOM_SECTION_RULES = {
    "references": {"Paragraph"},
}

def sanitize_sheet_name(name):
    r"""
    Return *name* made safe for use as an Excel worksheet title.

    - Surrounding whitespace is stripped.
    - The characters : \ / ? * [ ] are removed.
    - The result is limited to 31 characters.
    - If nothing usable remains, "Default" is returned.
    """
    cleaned = re.sub(r'[:\\/*?\[\]]', "", name.strip()).strip()
    # The emptiness check must come AFTER removing invalid characters:
    # a name such as "???" is non-empty on input but empty once sanitized.
    return cleaned[:31] or "Default"

def sanitize_filename(filename):
    r"""
    Return *filename* reduced to characters that are safe in a file name.

    Steps:
    - Strip leading/trailing whitespace.
    - Replace spaces with underscores.
    - Remove every character that is not alphanumeric, an underscore,
      a hyphen, or a period.
    - Remove trailing periods.
    - Fall back to "Default" if nothing is left.

    "Alphanumeric" includes non-ASCII letters and digits (``\w`` on a str
    pattern), so a name such as "Übersicht" keeps its first letter instead
    of being truncated, and names written entirely in a non-Latin script do
    not all collapse to the same "Default" result.
    """
    cleaned = filename.strip().replace(" ", "_")
    # \w covers letters, digits and "_"; "." and "-" are allowed explicitly.
    cleaned = re.sub(r'[^\w.-]', '', cleaned)
    cleaned = cleaned.rstrip(".")
    return cleaned or "Default"

def strip_heading_number(text):
    """
    Remove leading outline numbering from heading text.

    e.g., "4.3 References" and "4. References" both become "References".

    The number must be followed by whitespace to count as numbering, so a
    heading that merely starts with a digit ("3D Models") and a heading
    that consists only of a number ("2024") are returned unchanged.
    """
    # digits, optional ".digits" groups, optional trailing dot, then
    # at least one whitespace character separating it from the title.
    return re.sub(r'^\d+(?:\.\d+)*\.?\s+', '', text)

def iter_block_items(parent):
    """
    Yield each paragraph and table child within *parent* in document order.

    If *parent* has an ``element`` attribute, the children of
    ``parent.element.body`` are walked; otherwise *parent* itself is
    iterated. CT_P children are wrapped as Paragraph, CT_Tbl children as
    Table; any other child is skipped.
    """
    container = parent.element.body if hasattr(parent, 'element') else parent
    for child in container:
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)

def get_paragraph_numbering(paragraph, numbering_counters):
    """
    Return a tuple (numbering_str, ilvl) for a numbered paragraph,
    or (None, None) if the paragraph isn't numbered.

    *numbering_counters* maps a numId to a list of per-level counters and
    is updated in place: the counter of the paragraph's level is
    incremented and all deeper levels are reset to 0. *numbering_str* joins
    the non-zero counters from level 0 up to the paragraph's level with
    ".", e.g. "2.1".
    """
    pPr = paragraph._p.pPr
    numPr = pPr.numPr if pPr is not None else None
    if numPr is None or numPr.numId is None or numPr.ilvl is None:
        return (None, None)

    num_id = int(numPr.numId.val)
    level = int(numPr.ilvl.val)
    # In WordprocessingML a numId of 0 never refers to a list; it marks
    # that numbering has been removed from this paragraph.
    if num_id == 0:
        return (None, None)

    counters = numbering_counters.setdefault(num_id, [0] * 9)
    # Grow the list instead of raising IndexError on an unexpected level.
    if level >= len(counters):
        counters.extend([0] * (level + 1 - len(counters)))

    counters[level] += 1
    # A new item at this level restarts every deeper level.
    counters[level + 1:] = [0] * (len(counters) - level - 1)

    numbering_str = '.'.join(str(count) for count in counters[:level + 1] if count != 0)
    return (numbering_str, level)

def is_official_heading(block):
    """
    Return True if *block* has a style whose name starts with "Heading".

    Always returns a bool; a missing style or a style without a name
    yields False.
    """
    style = block.style
    if not style:
        return False
    name = style.name
    return isinstance(name, str) and name.startswith("Heading")

def is_custom_header(block, threshold=0.8):
    """
    Check if the paragraph text fuzzy matches any custom header.

    The block text is stripped, lower-cased and has trailing colons removed;
    each CUSTOM_HEADERS entry is lower-cased and has trailing colons removed.
    The two are compared with difflib.SequenceMatcher, and the function
    returns True as soon as one similarity ratio is equal to or exceeds
    *threshold*, otherwise False.
    """
    header_text = block.text.strip().lower().rstrip(":")
    return any(
        difflib.SequenceMatcher(None, header_text, custom.lower().rstrip(":")).ratio() >= threshold
        for custom in CUSTOM_HEADERS
    )

def extract_table_as_list(table):
    """
    Extract the table content as a list of lists.

    Each sublist represents one entry of ``table.rows`` and holds the
    whitespace-stripped text of every cell in that row's ``cells``.
    """
    return [[cell.text.strip() for cell in row.cells] for row in table.rows]

def _collect_sections(blocks):
    """
    Group the extractable blocks of one document by section.

    Returns a dict mapping section name -> list of items, where each item
    is a dict with the keys "type" ("Paragraph" or "Table"), "content"
    (paragraph text or list of table rows) and "label" (e.g. "[Table2]").
    """
    # Block types allowed when no custom section rule applies.
    default_allowed = set()
    if EXTRACT_PARAGRAPHS:
        default_allowed.add("Paragraph")
    if EXTRACT_TABLES:
        default_allowed.add("Table")

    numbering_counters = {}
    sections = {}
    allowed = default_allowed.copy()
    current_section = None   # Name of the section currently receiving items.
    pending_heading = None   # Heading waiting for its first data block.

    for block in blocks:
        if isinstance(block, Paragraph):
            text = block.text.strip()
            if is_custom_header(block):
                # Custom headers always open their section immediately.
                pending_heading = None
                current_section = text
                # setdefault: a repeated header must not wipe items that
                # were already collected under the same name.
                sections.setdefault(current_section, [])
                allowed = default_allowed.copy()
                continue
            if is_official_heading(block):
                num_str, _ = get_paragraph_numbering(block, numbering_counters)
                bare_heading = strip_heading_number(text)
                rule_key = bare_heading.lower()
                if rule_key in CUSTOM_SECTION_RULES:
                    allowed = CUSTOM_SECTION_RULES[rule_key]
                    heading = bare_heading
                else:
                    allowed = default_allowed.copy()
                    heading = f"{num_str} {text}" if num_str else text
                if ONLY_INCLUDE_HEADINGS_WITH_DATA:
                    # Defer creating the section until data shows up.
                    pending_heading = heading
                else:
                    current_section = heading
                    sections.setdefault(current_section, [])
                    pending_heading = None
                continue
            if not text:
                continue
            kind = "Paragraph"
        elif isinstance(block, Table):
            kind = "Table"
        else:
            continue

        if kind not in allowed:
            continue

        # First data block after a deferred heading: the heading becomes
        # the current section now.
        if ONLY_INCLUDE_HEADINGS_WITH_DATA and pending_heading is not None:
            current_section = pending_heading
            pending_heading = None
        if current_section is None:
            current_section = "Default"
        items = sections.setdefault(current_section, [])

        content = text if kind == "Paragraph" else extract_table_as_list(block)
        # Number items per type within their section. Counting the items
        # already present keeps labels unique even when two headings with
        # the same name are merged into one section.
        index = sum(1 for item in items if item["type"] == kind) + 1
        items.append({
            "type": kind,
            "content": content,
            "label": f"[{kind}{index}]"
        })
    return sections


def _print_sections(sections):
    """Print every section and its items to the console."""
    print("\nExtracted Output:")
    for section, items in sections.items():
        print(f"\nSection: {section}")
        for item in items:
            if item["type"] == "Paragraph":
                print(f" {item['label']} Paragraph: {item['content']}")
            elif item["type"] == "Table":
                print(f" {item['label']} Table:")
                for row in item["content"]:
                    print(" " + "\t".join(row))


def _write_workbook(sections, output_file):
    """Write one worksheet per section to *output_file* using openpyxl."""
    wb = Workbook()
    wb.remove(wb.active)
    for section, items in sections.items():
        ws = wb.create_sheet(title=sanitize_sheet_name(section))
        row_pointer = 1
        for item in items:
            if item["type"] == "Paragraph":
                ws.cell(row=row_pointer, column=1, value=item["label"])
                ws.cell(row=row_pointer, column=2, value=item["content"])
                row_pointer += 1
            elif item["type"] == "Table":
                for row_index, row in enumerate(item["content"]):
                    # The label goes next to the first table row only.
                    if row_index == 0:
                        ws.cell(row=row_pointer, column=1, value=item["label"])
                    for c_idx, cell_text in enumerate(row, start=2):
                        ws.cell(row=row_pointer, column=c_idx, value=cell_text)
                    row_pointer += 1
                row_pointer += 1  # Blank row after table.
    if not sections:
        # A workbook without any sheet cannot be saved; keep a placeholder
        # sheet so a document with nothing to extract still produces a file.
        wb.create_sheet(title="Default")
    wb.save(output_file)


def process_document(file_path):
    """
    Process a single Word document.

    Extract sections and blocks based on the control flags and custom
    rules, print the result to the console, export it to an Excel workbook
    saved next to the source file (same base name, sanitized, with an
    ".xlsx" extension), and return the number of items generated (total
    blocks across all sections).
    """
    print(f"\nProcessing file: {file_path}")
    doc = Document(file_path)
    sections = _collect_sections(iter_block_items(doc))

    # (Optional) Print extracted output.
    _print_sections(sections)

    directory = os.path.dirname(file_path)
    base = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(directory, f"{sanitize_filename(base)}.xlsx")
    _write_workbook(sections, output_file)
    print(f"\nExtraction complete. Data exported to {output_file}")

    # Total number of items (blocks) generated in this file.
    return sum(len(items) for items in sections.values())

def _group_docx_files(filenames):
    """
    Group .docx file names by normalized base name.

    A trailing version marker such as " (2)" before the extension is not
    part of the base name; its number becomes the version (0 if absent).
    Returns a dict: lower-cased base name -> list of (version, filename).
    """
    pattern = re.compile(r"^(?P<base>.+?)(?:\s*\((?P<version>\d+)\))?\.docx$", re.IGNORECASE)
    grouping = {}
    for filename in filenames:
        # "~$name.docx" files are Word's lock files for open documents,
        # not real documents.
        if filename.startswith("~$"):
            continue
        match = pattern.match(filename)
        if match:
            base = match.group("base").strip().lower()
            version = int(match.group("version")) if match.group("version") else 0
            grouping.setdefault(base, []).append((version, filename))
    return grouping


def main():
    """
    Ask for a folder, convert the newest version of every DOCX file in it
    with process_document, and print a summary report.

    Files that differ only by a "(N)" version marker are treated as
    duplicates; only the one with the highest N is processed. Errors raised
    while processing a file are collected and listed in the report instead
    of aborting the run.
    """
    # Prompt the user to select a folder.
    root = tk.Tk()
    root.withdraw()
    try:
        folder = filedialog.askdirectory(title="Select a Folder Containing Word Documents")
    finally:
        # Release the hidden Tk root once the dialog is closed.
        root.destroy()
    if not folder:
        print("No folder selected.")
        return

    # Sorted listing makes the choice between equal versions deterministic.
    grouping = _group_docx_files(sorted(os.listdir(folder)))

    total_files_found = sum(len(files) for files in grouping.values())
    unique_files_selected = len(grouping)
    duplicates_skipped = total_files_found - unique_files_selected

    # Select the highest version for each base and record duplicate details.
    files_to_process = {}
    duplicate_details = {}  # key: base, value: dict with 'selected' and 'duplicates' list
    for base, files in grouping.items():
        sorted_files = sorted(files, key=lambda entry: entry[0], reverse=True)
        chosen = sorted_files[0]
        files_to_process[base] = chosen
        if len(sorted_files) > 1:
            duplicate_details[base] = {
                "selected": chosen,
                "duplicates": sorted_files[1:]
            }

    files_processed = 0
    total_items_generated = 0
    issues = []

    for _version, filename in files_to_process.values():
        file_path = os.path.join(folder, filename)
        try:
            items_generated = process_document(file_path)
        except Exception as e:
            # Batch boundary: one bad file must not stop the others.
            issues.append(f"Error processing {filename}: {e}")
        else:
            total_items_generated += items_generated
            files_processed += 1

    # Final Report
    print("\n===== FINAL REPORT =====")
    print(f"Total DOCX files found: {total_files_found}")
    print(f"Unique files selected for processing: {unique_files_selected}")
    print(f"Duplicate/skipped files: {duplicates_skipped}")
    if duplicate_details:
        print("\nDetails of duplicate groups:")
        for base, info in duplicate_details.items():
            selected_version, selected_file = info["selected"]
            duplicates_list = ", ".join(f"{fn} (v{ver})" for ver, fn in info["duplicates"])
            print(f" Group '{base}': selected -> {selected_file} (v{selected_version}); duplicates -> {duplicates_list}")
    print(f"\nFiles processed successfully: {files_processed}")
    print(f"Total items generated (blocks extracted): {total_items_generated}")
    if issues:
        print("\nIssues encountered:")
        for issue in issues:
            print(f" - {issue}")
    else:
        print("\nNo issues encountered.")
    print("========================\n")

# Run the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
< /code>
Ist es möglich, beim Extrahieren aus dem Word-Dokument einen Teil der Formatierung und Struktur der einzelnen Blöcke zu erhalten?
Besonders enttäuschend ist, dass die Aufzählungszeichen und die richtigen Abstände fehlen.
excel

Word to Excel mit Python, aber Wortformat (Kugelpunkte) und Struktur erhalten

Word to Excel mit Python, aber Wortformat (Kugelpunkte) und Struktur erhalten ⇐ Python

Quick Reply