As such:
Code:
from sklearn.decomposition import IncrementalPCA
from sklearn.cluster import MiniBatchKMeans
from bertopic import BERTopic
from bertopic.vectorizers import OnlineCountVectorizer, ClassTfidfTransformer
from tqdm import tqdm
import numpy as np

class SafeIncrementalPCA(IncrementalPCA):
    def partial_fit(self, X, y=None):
        # Ensure the input is contiguous and in float64
        X = np.ascontiguousarray(X, dtype=np.float64)
        return super().partial_fit(X, y)

    def transform(self, X):
        result = super().transform(X)
        # Force the output to be float64 and contiguous
        return np.ascontiguousarray(result, dtype=np.float64)

vectorizer_model = OnlineCountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=True)
umap_model = SafeIncrementalPCA(n_components=100)
cluster_model = MiniBatchKMeans(n_clusters=1000, random_state=0)

topic_model = BERTopic(umap_model=umap_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model,
                       ctfidf_model=ctfidf_model)

for docs_delayed, emb_delayed in tqdm(zip(docs_partitions, embeddings_partitions), total=len(docs_partitions)):
    docs_pdf = docs_delayed.compute()
    emb_pdf = emb_delayed.compute()
    docs = docs_pdf["text"].tolist()
    embeddings = np.vstack(emb_pdf["embeddings"].tolist())
    # Partial fit the model (all components above support online learning)
    topic_model.partial_fit(docs, embeddings)
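For reference, BERTopic's online topic modeling documentation notes that after each partial_fit call, topic_model.topics_ only holds the assignments of the most recent batch, so the training-time topics have to be accumulated manually if they are needed later. A minimal sketch of that pattern (batches is a hypothetical iterable over the same (docs, embeddings) pairs as above):
Code:
# Sketch: collect the per-batch topic assignments during online training.
all_topics = []
for docs_batch, emb_batch in batches:  # hypothetical iterable of batches
    topic_model.partial_fit(docs_batch, emb_batch)
    all_topics.extend(topic_model.topics_)
topic_model.topics_ = all_topics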
Code:
import pandas as pd

for docs_delayed, emb_delayed in tqdm(zip(docs_partitions, embeddings_partitions), total=len(docs_partitions)):
    docs_pdf = docs_delayed.compute()
    emb_pdf = emb_delayed.compute()
    docs = docs_pdf["text"].tolist()
    embeddings = np.vstack(emb_pdf["embeddings"].tolist())

    # 3) Apply BERTopic on this shard
    topics, probs = topic_model.transform(docs, embeddings)

    # Save topics to a DataFrame
    df_topics = pd.DataFrame({
        "tweet_id": docs_pdf["id"].tolist(),
        "topic": topics,
        "probability": probs
    })

    # Merge & store in DB
    docs_pdf["topic"] = df_topics["topic"]
    docs_pdf["probability"] = df_topics["probability"]
    docs_pdf.to_sql("tweets", engine, if_exists="append", index=False)
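The engine used above is a plain SQLAlchemy engine; a minimal sketch of how it is created (the connection string is a placeholder, not from my actual setup):
Code:
from sqlalchemy import create_engine

# Placeholder URL; the real pipeline points at the actual tweets database.
engine = create_engine("sqlite:///tweets.db")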
I would greatly appreciate any kind of suggestion as to what might be going wrong and how I could fix the problem!
Thank you very much!