Blenderbot: KeyError when training from JSON
Posted: 29 Dec 2024, 12:03
I am trying to train the Blenderbot model facebook/blenderbot-90m on a set of question-answer pairs based on my input JSON.
The sample JSON (snippet) looks like this:
Code: Select all
[
    {
        "input": "Who is Carole Nicholas?",
        "output": "She is a software engineer!"
    },
    {
        "input": "Where was Carole Nicholas born?",
        "output": "She was born in San Francisco."
    },
    {
        "input": "Which was Carole's school?",
        "output": "Lakeside school."
    },
    {
        "input": "Where is Lakeside school located?",
        "output": "It is located in Seattle!"
    }
]
Training code:
Code: Select all
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorWithPadding
from datasets import Dataset
import torch

modelName = "facebook/blenderbot-90m"

# Load the dataset
dataset = Dataset.from_json("C:\\Users\\Raj\\Desktop\\To-Dos\\AI_related\\chatInput.json")
model = AutoModelForSeq2SeqLM.from_pretrained(modelName)
tokenizer = AutoTokenizer.from_pretrained(modelName)

# Preprocess the data
def preprocess(examples):
    model_inputs = tokenizer(
        examples["input"],
        examples["output"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    return model_inputs

dataset = dataset.map(preprocess, batched=True)
#print(dataset['input'])

train_size = int(len(dataset) * 0.8)
val_size = int(len(dataset) * 0.1)
test_size = len(dataset) - train_size - val_size

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from torch.utils.data import Dataset

class BlenderBotDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        encoding = self.dataset[idx]
        return {
            "input_ids": torch.tensor(encoding["input_ids"]),
            "attention_mask": torch.tensor(encoding["attention_mask"]),
            "labels": torch.tensor(encoding["input_ids"]),
        }

from torch.utils.data import DataLoader

train_dataset = BlenderBotDataset(dataset[train_size], tokenizer)
val_dataset = BlenderBotDataset(dataset[val_size], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

from transformers import Trainer, TrainingArguments

# Define the training arguments
#training_args = TrainingArguments("test-trainer")
training_args = TrainingArguments(
    output_dir="C:\\Users\\Raj\\Desktop\\To-Dos\\AI_related",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    save_total_limit=2,
    save_steps=500,
    #load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_on_each_node=True
)

print(dataset[train_size])

# Create the trainer
trainer = Trainer(
    model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    #tokenizer = tokenizer,
    data_collator=data_collator
    #compute_metrics=lambda pred: {"accuracy": torch.sum(torch.argmax(pred.label_ids, dim=1) == torch.argmax(pred.predictions, dim=1)) / len(pred.label_ids)},
)

# Train the model
trainer.train()
It throws this error:
Code: Select all
0%| | 0/3 [00:00
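
From stepping through it, I think the KeyError comes from dataset[train_size]: indexing a Hugging Face Dataset with a single integer returns that one row as a dict of column values, so the integer index inside __getitem__ then hits a plain dict and fails. A minimal sketch of the row-range split I believe I actually want, assuming the standard datasets API (select); the ranges mirror the 80/10 sizes computed above:
Code: Select all
# Sketch (untested): take row ranges instead of indexing a single row.
# select(range(n)) keeps rows 0..n-1 as a new Dataset.
train_split = dataset.select(range(train_size))
val_split = dataset.select(range(train_size, train_size + val_size))

train_dataset = BlenderBotDataset(train_split, tokenizer)
val_dataset = BlenderBotDataset(val_split, tokenizer)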
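I also wonder whether my preprocess step is what a seq2seq model expects: passing examples["output"] as the second positional argument makes the tokenizer treat it as a text pair (merged into input_ids) rather than as targets. If I read the transformers docs correctly, the target side should go through text_target so it ends up under labels. A sketch of what I mean, assuming a transformers version recent enough to accept the text_target keyword:
Code: Select all
# Sketch (assumption: this transformers version accepts text_target).
def preprocess(examples):
    return tokenizer(
        examples["input"],
        text_target=examples["output"],  # tokenized into the "labels" field
        padding="max_length",
        truncation=True,
        max_length=512,
    )
With labels produced here, I could presumably drop the input_ids-as-labels workaround in BlenderBotDataset, but I have not gotten that far yet.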