Wie können Faiss-Ergebnisse verbessert werden?

Guest · Post by **Guest** » 16 Jan 2025, 12:00

Ich schreibe gerade ein Programm, in dem ich Informationen per RAG (Retrieval-Augmented Generation) abrufen muss. Diese Informationen sollen dann von einem LLM verwendet werden. Ich verwende FAISS in einer Python-Umgebung mit dem LangChain-Wrapper.
Die Datenquelle ist ein Dokument mit Vorschriften, das ich anhand der Absätze in einzelne Texte zerlege. Dadurch kann ich sicherstellen, dass der Text für jedes Thema kohärent ist.
Allerdings habe ich jetzt das Problem, dass die Datenbank mir recht seltsame Ergebnisse liefert. Wenn ich die Datenbank (z. B. zum Thema Obst) frage, wie man eine Banane schält, erhalte ich Ergebnisse, die sich beispielsweise damit befassen, wie man eine Kiwi am besten pflanzt.
Entsprechend helfen mir die Ergebnisse nicht viel, und ich frage mich, wie ich sie verbessern kann.
Hier ist mein Code:
Die Eingabe der Suchmethode besteht meist aus einem Satz, der eine Frage enthält.
Die Suchmethode sollte dann ein oder zwei Dokumente zurückgeben.

Code: Select all

import logging
import os
from pathlib import Path

import PyPDF2
from langchain_core.documents import Document

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm

# Logging setup: everything at INFO and above goes to logs/api.log.
# The FileHandler created by basicConfig opens the file immediately, so the
# target directory has to exist first; otherwise importing this module fails
# with FileNotFoundError on a fresh checkout.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    filename="logs/api.log",
    # Document text is logged verbatim; pin the encoding so non-ASCII
    # characters do not depend on the platform default.
    encoding="utf-8",
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

class FaissConnection:
    """Singleton wrapper around a FAISS index built from the regulation PDF.

    The first ``FaissConnection()`` call loads the PDF, splits it into chunks,
    embeds the chunks and builds the index. Every later call returns that same
    instance.
    """

    _instance = None

    def __new__(cls):
        """Return the shared instance, building it on first use."""
        if cls._instance is None:
            instance = super().__new__(cls)
            # Publish the singleton only after initialisation succeeded.
            # Otherwise a failed load would leave a cached instance without
            # ``db`` that every later call silently receives.
            instance._initialize()
            cls._instance = instance
        return cls._instance

    def _initialize(self):
        """Initializes the FAISS connection, loading and processing the PDF."""
        # Load and filter documents. (Plain assignment: there is no earlier
        # list to extend, so ``+=`` raised UnboundLocalError here.)
        character_chunks = self.get_regulation_chunks()

        self.embeddings = HuggingFaceEmbeddings()

        logging.info("Text split into %d chunks successfully.", len(character_chunks))

        # Create FAISS index
        self.db = FAISS.from_documents(character_chunks, self.embeddings)
        logging.info("FAISS index created successfully.")

    @staticmethod
    def get_regulation_chunks() -> list[Document]:
        """Return the regulation text split into chunks at ``"\\n§"``.

        Returns:
            The chunks produced by ``CharacterTextSplitter`` from the filtered
            per-page documents.
        """
        documents = FaissConnection.get_regulation_documents()
        logging.info("Text extracted from PDF file successfully.  Total pages: %d", len(documents))

        # NOTE(review): the splitter runs with its default chunk size. As far
        # as the LangChain docs describe it, pieces are merged back together
        # up to that size, so several "§" sections may share one chunk --
        # confirm and tune chunk_size/chunk_overlap if retrieval mixes topics.
        text_splitter = CharacterTextSplitter(separator="\n§")
        return text_splitter.split_documents(documents)

    @staticmethod
    def get_regulation_documents() -> list[Document]:
        """Load ``resources/document.pdf`` and drop near-empty pages.

        The PDF is looked up relative to this file: two directory levels above
        the file's own directory, then ``resources/document.pdf``.

        Returns:
            One document per PDF page whose text is longer than 100 characters.

        Raises:
            FileNotFoundError: If the PDF is not at the expected location.
        """
        project_root = Path(__file__).resolve().parents[2]
        pdf_path = project_root / "resources" / "document.pdf"

        if not pdf_path.exists():
            raise FileNotFoundError("the file does not exist.")

        documents = FaissConnection.load_pdf_from_file(pdf_path)
        # filter all docs with less than 100 characters
        return [doc for doc in documents if len(doc.page_content) > 100]

    @staticmethod
    def load_pdf_from_file(file_path: str) -> list[Document]:
        """Load the text of every page of a PDF file.

        Args:
            file_path: Location of the PDF.

        Returns:
            One ``Document`` per page, in page order, holding the extracted
            text.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")

        reader = PyPDF2.PdfReader(file_path)
        pages = tqdm(reader.pages, desc="Reading PDF pages")
        # ``or ""`` keeps a page without extractable text from producing a
        # Document with a non-string body.
        return [Document(page_content=page.extract_text() or "") for page in pages]

    def search(self, query, return_amount=1, search_type="mmr"):
        """
        Searches the FAISS index with the given query and returns the most relevant documents.

        Args:
            query (str): The search query.
            return_amount (int): Number of documents to return.
            search_type (str): Retriever search type passed to
                ``as_retriever``. Defaults to ``"mmr"`` (the previous
                hard-coded value); ``"similarity"`` is the alternative to try
                when the results look off-topic.

        Returns:
            The single best ``Document`` when ``return_amount`` is 1,
            otherwise the list of documents returned by the retriever.
        """
        retriever = self.db.as_retriever(
            search_type=search_type,
            search_kwargs={"k": return_amount},  # Limit results
        )
        docs = retriever.invoke(query)
        logging.info("Search query executed. Returning top %d result(s).", return_amount)
        for doc in docs:
            logging.info("Document: %s", doc.page_content)
        return docs[0] if return_amount == 1 else docs

if __name__ == "__main__":
    # Obtain the shared connection (built on first use), then run a sample
    # query asking for the two best-matching documents.
    connection = FaissConnection()
    top_matches = connection.search("How to peel a Banana?", return_amount=2)

Wie können Faiss-Ergebnisse verbessert werden?

Wie können Faiss-Ergebnisse verbessert werden? ⇐ Python

Quick Reply