Custom Azure skill: PDF chunking with pymupdf4llm


Post by Anonymous »

I want to add a custom skill to my Azure indexer and have written the following Azure Function:
import base64
import fitz  # PyMuPDF
import io
import pymupdf4llm
import azure.functions as func
import json
import logging
import re
from typing import List, Dict, Tuple
import hashlib
import time

def process_chunks(chunks: List[Dict]) -> Tuple[List[Dict], List[List[str]]]:
    processed_chunks = []
    images_per_chunk = []
    image_counter = {}

    pattern = re.compile(
        r"!\[\]\(data:image/(jpg|jpeg|png);base64,([A-Za-z0-9+/=\s]+?)\)", re.IGNORECASE
    )

    for chunk in chunks:
        text = chunk.get("text", "")
        page_number = chunk.get("metadata", {}).get("page", "unknown")

        image_counter.setdefault(page_number, 0)
        base64_images_in_chunk = []

        def replace_func(match):
            base64_data = match.group(2).replace('\n', '').replace('\r', '')
            image_counter[page_number] += 1
            image_id = f"image_{page_number}_{image_counter[page_number]}"
            base64_images_in_chunk.append(base64_data)

            # The following description is intended for future development and for working with
            # images: for now it is generic and will be reworked in the Prompt Flow, where it will
            # be replaced with a GPT-generated description or the output of OCR image processing.
            description = "This is an image with a generic description. The image may contain a table or financial charts. Use the function to retrieve this image from the index only if you determine that it is relevant based on the preceding or following text."

            return f"![]({image_id})\n"

        updated_text = re.sub(pattern, replace_func, text)
        updated_chunk = chunk.copy()
        updated_chunk["text"] = updated_text

        processed_chunks.append(updated_chunk)
        images_per_chunk.append(base64_images_in_chunk)

    return processed_chunks, images_per_chunk

def get_base64_pdf_page(doc, page_number) -> str:
    """
    Extracts a single page from a PDF document and converts it to a base64-encoded string.

    This function takes a PDF document, extracts the specified page, and saves it into a new document.
    The new document is then converted into a byte stream, which is encoded into base64 format.

    Args:
        doc: A `fitz.Document` object representing the PDF document from which the page will be extracted.
        page_number: An integer representing the page number to be extracted (0-indexed).

    Returns:
        str: A base64-encoded string representing the extracted page. If an error occurs, it returns `None`.

    Example:
        base64_pdf = get_base64_pdf_page(doc, 0)
    """
    try:
        single_page_doc = fitz.open()
        single_page_doc.insert_pdf(doc, from_page=page_number, to_page=page_number)

        byte_io = io.BytesIO()
        single_page_doc.save(byte_io)
        single_page_doc.close()

        base64_pdf = base64.b64encode(byte_io.getvalue()).decode('utf-8')
        return base64_pdf
    except Exception as e:
        logging.error(f"Error in get_base64_pdf_page: {e}")
        return None

def generate_parent_id(filename: str) -> str:
    """
    Generates a safe and unique parent ID for indexed documents.

    The function sanitizes the filename by:
    1. Removing any non-alphanumeric characters, hyphens, or underscores at the beginning.
    2. Replacing remaining invalid characters with underscores.
    3. Ensuring that the resulting filename is not empty or starting with a non-alphanumeric character.
       If the sanitized filename is empty, it defaults to "file".

    To ensure uniqueness, the function:
    - Appends a timestamp.
    - Generates a hash based on the filename and timestamp.

    Args:
        filename (str): The name of the file.

    Returns:
        str: A unique and safe parent ID in the format:
            "{safe_filename}_{short_hash}"
        If the sanitized filename is empty, the format becomes:
            "file_{short_hash}"
    """
    # Remove non-alphanumeric characters, hyphens, and underscores at the start of the filename
    safe_filename = re.sub(r'^[^a-zA-Z0-9]+', '', filename)
    # Replace remaining invalid characters with underscores
    safe_filename = re.sub(r'[^a-zA-Z0-9-_]', '_', safe_filename)

    # If the result is empty or still starts with a non-alphanumeric character, use a default value
    if not safe_filename:
        safe_filename = "file"

    # Add a timestamp to ensure the uniqueness of the upload
    timestamp = str(int(time.time()))

    # Generate a hash based on the filename and timestamp
    short_hash = hashlib.md5(f"{filename}_{timestamp}".encode()).hexdigest()[:16]
    return f"{safe_filename}_{short_hash}"

def generate_safe_id(filename: str, page_number: str) -> str:
    """
    Generates a safe and unique ID for indexed documents.

    The function sanitizes the filename by:
    1. Removing any non-alphanumeric characters, hyphens, or underscores at the beginning.
    2. Replacing remaining invalid characters with underscores.

    To ensure uniqueness and consistency, the function:
    - Appends an external timestamp.
    - Uses an MD5 hash of the filename truncated to 8 characters.

    Args:
        filename (str): The name of the file.
        page_number (str): The page number of the document.

    Returns:
        str: A unique and safe ID in the format:
            "{safe_filename}_{short_hash}_{timestamp}_page_{page_number}"
        If the sanitized filename is empty, the format becomes:
            "{short_hash}_{timestamp}_page_{page_number}"
    """
    # Remove non-alphanumeric characters, hyphens, and underscores at the start of the filename
    safe_filename = re.sub(r'^[^a-zA-Z0-9]+', '', filename)
    # Replace remaining invalid characters with underscores
    safe_filename = re.sub(r'[^a-zA-Z0-9-_]', '_', safe_filename)

    # Add the external timestamp to ensure uniqueness and hash for consistency
    timestamp = str(int(time.time()))
    short_hash = hashlib.md5(filename.encode()).hexdigest()[:8]

    return f"{safe_filename}_{short_hash}_{timestamp}_page_{page_number}" if safe_filename else f"{short_hash}_{timestamp}_page_{page_number}"

def process_pdf(pdf_bytes) -> List[Dict]:
    """
    Processes a PDF file (given as bytes) and extracts relevant information, including the text,
    images (encoded in base64), and other data from each page. The function returns a list of dictionaries,
    each containing the processed content for a page.

    This function does the following:
    1. Converts the PDF byte data into a document object.
    2. Extracts text and images, and processes them into markdown format.
    3. Converts each page of the PDF to base64 format.
    4. Returns a list of dictionaries where each dictionary contains:
        - "text": The processed text of the page.
        - "base64_imgs_list": The list of base64-encoded images found in the page.
        - "base64_data": The base64-encoded PDF data of the page.
        - "context": The raw text of the page extracted using PyMuPDF.

    Args:
        pdf_bytes: A byte string representing the PDF file to be processed.

    Returns:
        List[Dict]: A list of dictionaries where each dictionary contains the following keys:
            - "text": The processed text content from the page.
            - "base64_imgs_list": A list of base64-encoded images extracted from the page.
            - "base64_data": The base64-encoded representation of the page PDF.
            - "context": The raw text extracted from the PDF page.

    Example:
        processed_pages = process_pdf(pdf_bytes)
    """
    try:
        with io.BytesIO(pdf_bytes) as pdf_io:
            doc = fitz.open(stream=pdf_io, filetype="pdf")
            documents = pymupdf4llm.to_markdown(doc=doc, page_chunks=True, write_images=False, embed_images=True)
            processed_chunks, images_base64 = process_chunks(chunks=documents)

            # to_markdown(page_chunks=True) returns one dict per page, so each
            # per-page value is written back into the matching list entry.
            for i, page in enumerate(doc):
                base64_pdf_page = get_base64_pdf_page(doc, i)

                documents[i]["text"] = processed_chunks[i]["text"]
                documents[i]["base64_imgs_list"] = images_base64[i]
                documents[i]["base64_data"] = base64_pdf_page
                documents[i]["context"] = page.get_text()

            return documents
    except Exception as e:
        logging.error(f"Error in process_pdf: {e}")
        return None

def main(req: func.HttpRequest) -> func.HttpResponse:
    """
    Azure Function to process PDF files from an HTTP request.

    This function is triggered via an HTTP request and processes one or more PDF files
    provided as base64-encoded strings. The function decodes the PDF data, extracts page
    content, and returns the processed data in JSON format.

    Parameters:
        req (func.HttpRequest): The HTTP request object containing the PDF data
            in a JSON payload. The payload must include a "values" field, which is
            a list of objects. Each object should contain:
            - "recordId" (str, optional): A unique identifier for the record.
            - "data" (str): The base64-encoded PDF content.

    Returns:
        func.HttpResponse: A JSON response containing a list of processed PDF pages.
            Each entry includes:
            - "recordId": The ID of the processed record.
            - "data": A dictionary containing:
                - "complexPages": A list of page data, each containing:
                    - "contentRaw": The raw text content extracted from the page.
                    - "contentMarkdown": The formatted text content in Markdown.
                    - "base64Data": The base64-encoded content of the page.
                    - "pageNumber": The page number within the PDF.
                    - "title": The title of the page (if available).
                    - "base64ImgsList": A list of base64-encoded images extracted from the page.
                    - "parentIdGenerated": A generated identifier for the parent file.
                    - "documentId": A unique identifier for the page.

    Processing Steps:
        1. Validates the input JSON and extracts PDF data.
        2. Decodes the base64 PDF content.
        3. Processes the PDF to extract page content and metadata.
        4. Constructs a JSON response with the extracted data.
        5. Returns the response as an HTTP JSON object.

    Example Usage:
        POST /api/AIProcessPdfFunction
        {
            "values": [
                {
                    "recordId": "12345",
                    "data": "base64_encoded_pdf_string"
                }
            ]
        }

    Errors:
        - If the input JSON is malformed, returns a 400 response with a relevant message.
        - If the PDF data cannot be decoded or processed, an error message is included
          in the response.

    Logs:
        - Information about the request processing and any errors encountered
          are logged with the 'AIProcessPdfFunction' prefix.
    """

    log_suffix = 'AIProcessPdfFunction: '
    logging.info(f'{log_suffix}Python HTTP trigger function processed a request.')

    try:
        req_body = req.get_json()
    except ValueError:
        logging.warning(f'{log_suffix}Invalid JSON payload received.')
        return func.HttpResponse("Please pass a JSON payload", status_code=400)

    # Check whether 'values' exists; if not, check whether it is a direct input with 'data'
    values = req_body.get("values")
    #single_output = False
    if not values:
        if "data" in req_body:
            values = [req_body]  # Wrap single data object in a list
            #single_output = True
        else:
            logging.error(f'{log_suffix}No values or data found in the JSON payload.')
            return func.HttpResponse("Please pass 'values' or 'data' in the JSON payload", status_code=400)

    results = {"values": []}  # Initialise as a dictionary with an empty list

    for value in values:
        record_id = value.get("recordId", None)

        pdf_bytes = None

        pdf_base64 = value.get("data", "")
        if not isinstance(pdf_base64, str):
            logging.error(f"Invalid data format, expected base64 string but got {type(pdf_base64)}")
            results["values"].append({
                "recordId": record_id if record_id else "unknown",
                "data": {},
                "errors": [{"message": "Invalid data format: expected base64 string."}],
                "warning": None
            })
            continue

        try:
            pdf_bytes = base64.b64decode(pdf_base64)
        except Exception as e:
            logging.error(f"Error decoding base64 data: {e}")
            results["values"].append({
                "recordId": record_id if record_id else "unknown",
                "data": {},
                "errors": [{"message": f"Base64 decoding failed: {e}"}],
                "warning": None
            })
            continue

        if pdf_bytes is None:
            results["values"].append({
                "recordId": record_id if record_id else "unknown",
                "data": {},
                "errors": [{"message": "PDF data is missing or not properly decoded."}],
                "warning": None
            })
            continue

        processed_pages = process_pdf(pdf_bytes)

        if processed_pages is None:
            results["values"].append({
                "recordId": record_id if record_id else "unknown",
                "data": {},
                "errors": [{"message": "PDF processing failed."}],
                "warning": None
            })
            continue

        page_results = []
        for page_data in processed_pages:
            filepath = page_data['metadata'].get("file_path", "")
            page_number = str(page_data['metadata'].get('page', -1))
            record_id_suffix = generate_safe_id(filename=filepath, page_number=page_number)
            full_record_id = f"{record_id}{record_id_suffix}" if record_id else record_id_suffix
            page_results.append(
                {
                    "contentRaw": page_data.get("context", ""),
                    "contentMarkdown": page_data.get("text", ""),
                    "base64Data": page_data.get("base64_data", ""),
                    "pageNumber": page_number,
                    "title": page_data['metadata'].get("title", "Unknown"),
                    "base64ImgsList": page_data.get("base64_imgs_list", []),
                    "parentIdGenerated": generate_parent_id(filename=filepath),
                    "documentId": full_record_id
                }
            )

        # Append the current result to the list of results
        results["values"].append({
            "recordId": record_id if record_id else generate_parent_id(filename=filepath),
            "data": {"complexPages": page_results},
            "errors": None,
            "warning": None
        })

    logging.info(f'{log_suffix}Successfully processed PDF.')
    return func.HttpResponse(json.dumps(results), mimetype="application/json")
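For reference, the request I send when testing the function directly is built roughly like the sketch below (a small Python script using the requests package; the endpoint URL and the sample file name are placeholders for my local setup):

import base64

import requests  # third-party package, only needed for this local test

# Placeholders: adjust to your local Functions host and test document.
FUNCTION_URL = "http://localhost:7071/api/AIProcessPdfFunction"
PDF_PATH = "sample.pdf"

with open(PDF_PATH, "rb") as f:
    pdf_base64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "values": [
        {
            "recordId": "12345",
            "data": pdf_base64
        }
    ]
}

response = requests.post(FUNCTION_URL, json=payload, timeout=120)
response.raise_for_status()

# Print one line per returned page chunk.
for value in response.json()["values"]:
    for page in value["data"]["complexPages"]:
        print(page["pageNumber"], page["documentId"])
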
When I send that request with Postman, it works exactly as I want. However, when I add the function to my skillset, I run into problems:
"@odata.etag": "\"0x8DD92AB43532119\"",
"name": "skillset-qrmaiproject-1",
"skills": [
    {
        "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
        "name": "#1",
        "description": "Base64, markdown and chunks",
        "context": "/document",
        "uri": "https://mycode==",
        "httpMethod": "POST",
        "timeout": "PT30S",
        "batchSize": 1000,
        "inputs": [
            {
                "name": "file_data",
                "source": "/document/file_data",
                "inputs": []
            }
        ],
        "outputs": [
            {
                "name": "complexPages",
                "targetName": "content_pages"
            }
        ],
        "httpHeaders": {}
    }
It returns the following warning:

enrichment.webApiSkill.#1
Could not execute skill because the Web API skill response is invalid.

Does anyone understand what might be wrong or missing in my skill configuration?
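For comparison, as far as I understand the documented interface for custom Web API skills, the indexer expects each record in the response in roughly this envelope, with the recordId echoing the one it sent (schematic sketch, not my exact output):

{
    "values": [
        {
            "recordId": "record id from the request",
            "data": {
                "complexPages": [ ... ]
            },
            "errors": [],
            "warnings": []
        }
    ]
}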
