Code: Select all
class IngenxPreTokenizer:
def pre_tokenize(self, pretok: PreTokenizedString):
processed = self.base_tokenizer.process_text(pretok)
normalized_tokens = []
current_offset = 0
for token in processed:
token_len = len(token)
normalized_tokens.append((
token,
(current_offset, current_offset + token_len)
))
current_offset += token_len + 1
pretok.tokens = normalized_tokens
return pretok
class IngenxTokenTrainer:
def __init__(self,df,size_dataset =240340,vocab_size=150000,min_freq = 5,batch_size=1000):
self.tokenizer = IngenxTokenizer()
self.df = df
self.size_dataset = size_dataset
self.vocab_size = vocab_size
self.min_freq = min_freq
self.batch_size=1000
self.special_tokens = ["","","",]
self.training_corpus = self.preprare_dataset()
def preprare_dataset(self):
X2 = np.random.choice(len(self.df), size=self.size_dataset, replace=False)
training_texts = [f"{df.iloc[i]['problem']} {df.iloc[i]['solution']}" for i in X2]
return examples
def get_training_corpus(self):
dataset = self.training_corpus
with tqdm(total=len(dataset), desc="Processing training corpus", unit="text") as pbar:
for text in dataset:
pbar.update(1)
yield text
def train_tokenizer(self):
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = PreTokenizer.custom(IngenxPreTokenizer())
trainer = BpeTrainer(
vocab_size=self.vocab_size,
min_frequency=self.min_freq,
special_tokens=self.special_tokens
)
tokenizer.train_from_iterator(self.get_training_corpus(),trainer=trainer, length=len(self.training_corpus))
tokenizer.save("ingenx_tokenizewr.json")
return tokenizer
Exception Traceback (letzter Aufruf zuletzt) in () ----> 1 a.train_tokenizer() in train_tokenizer(self) 41 special_tokens=self.special_tokens 42 ) ---> 43 tokenizer.train_from_iterator(self.get_training_corpus(),trainer=trainer, length=len(self.training_corpus)) 44 tokenizer.save("ingenx_tokenizewr.json") 45 return tokenizer Exception : TypeError: erwarteter String oder Puffer
Ich kann nicht herausfinden, was ich hier falsch mache. Ich habe auch auf die Dokumentation verwiesen und jeden einzelnen Schritt befolgt, erhalte aber immer noch die Fehlermeldung.