Traceback (most recent call last):
File "F:\Pendrive\!!! Pivate !!!\Other\Other\Private\AI\PyTorch\Model_1\main.py", line 20, in
model.pretrain_on_corpus(
File "F:\Pendrive\!!! Pivate !!!\Other\Other\Private\AI\PyTorch\Model_1\ai_dev_kit.py", line 84, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "F:\Pendrive\!!! Pivate !!!\Other\Other\Private\AI\PyTorch\Model_1\ai_dev_kit.py", line 386, in pretrain_on_corpus
x, y = x.to(device), y.to(device)
^^^^^^^^^^^^
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
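For reference, the message itself points to the first debugging step: device-side asserts are reported asynchronously, so the line shown in the trace is often not where the failure actually happens. A minimal sketch (assuming main.py is the entry point) of forcing synchronous kernel launches so the trace points at the failing operation:

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before the first CUDA call

# ... rest of main.py (imports torch, builds the model, calls pretrain_on_corpus) ...

Running the same script once with the model and tensors on the CPU usually turns the assert into an ordinary Python IndexError that names the real problem.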
This is my code:
"""
Advanced AI Dev Kit for making powerful AI models. Powered by PyTorch, TensorFlow and Scikit-Learn
"""
from sklearn.model_selection import train_test_split
from transformers import pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer, util
from mixture_of_experts import MoE
import nltk
from nltk.tokenize import sent_tokenize
from textwrap import wrap
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.amp.autocast_mode import autocast
from tqdm import tqdm
import pandas as pd
import tiktoken
import json
import csv
from numba import cuda
import numpy as np
import functools
from functools import wraps
from torch.nn.utils.rnn import pad_sequence
torch.cuda.empty_cache()
def general_accelerator(func):
    """A decorator to optimize functions."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Example optimization: simplify mathematical computations like expensive trig or powers.
        # Check if arguments have constant values that can be pre-calculated (if possible)
        optimized_args = []
        for arg in args:
            if isinstance(arg, float) and (arg.is_integer() or abs(arg) > 100):  # Example condition to optimize calculations
                optimized_args.append(arg)
            else:
                optimized_args.append(arg)  # No change
        # Example optimization for a function that does math: avoid redundant power calculations
        if 'n' in kwargs and kwargs['n'] == 2:  # Just an example: if `n` is 2, use sqrt instead of power
            kwargs['n'] = 2  # Use a faster operation like sqrt instead of `** 2`
        # You can apply other specific optimizations depending on known behavior of the function
        return func(*optimized_args, **kwargs)
    return wrapper
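
# Note: both branches of the loop above append `arg` unchanged, and `kwargs['n'] = 2` only
# re-assigns the value it already has, so as written this wrapper forwards every call
# through untouched; the "optimizations" are placeholders.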
@general_accelerator
def accelerator(func, log: bool = False):
    """
    A decorator to run operations on the GPU if Numba CUDA is available.
    """
    def wrapper(*args, **kwargs):
        if cuda.is_available():
            try:
                if log:
                    print("CUDA is available. Running on GPU.")
                # Allocate memory on the GPU
                d_args = [cuda.to_device(arg) if isinstance(arg, np.ndarray) else arg for arg in args]
                d_kwargs = {k: cuda.to_device(v) if isinstance(v, np.ndarray) else v for k, v in kwargs.items()}
                result = func(*d_args, **d_kwargs)
                # Copy result back to host
                if isinstance(result, cuda.devicearray.DeviceNDArray):
                    result = result.copy_to_host()
                return result
            except cuda.CudaSupportError:
                if log:
                    print("CUDA is not available. Running function on CPU.")
                return func(*args, **kwargs)
            except cuda.KernelRuntimeError:
                if log:
                    print("CUDA runtime error. Running function on CPU.")
                return func(*args, **kwargs)
            except:
                if log:
                    print("Unknown Error encountered. Running function on CPU.")
                return func(*args, **kwargs)
        else:
            if log:
                print("CUDA is not available. Running function on CPU.")
            return func(*args, **kwargs)
    return wrapper
# # Download correct tokenizer
nltk.download("punkt_tab")
# Load the question generation model (fine-tuned for QA)
model_name = "valhalla/t5-small-qg-prepend"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Semantic similarity model
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
@general_accelerator
def model_accelerator(cls):
    original_init = cls.__init__
    @wraps(original_init)
    def new_init(self, *args, **kwargs):
        original_init(self, *args, **kwargs)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(device)
        print(f"[Accelerate] Model moved to {device}.")
    cls.__init__ = new_init
    return cls
@general_accelerator
def generate_qa_pairs(text, top_k=50000, similarity_threshold=0.4):
    sentences = sent_tokenize(text)
    qa_pairs = []
    for sentence in sentences:
        input_text = f"answer: {sentence.strip()} context: {text.strip()}"
        encoding = tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        input_ids = encoding['input_ids']
        output = model.generate(
            input_ids=input_ids,
            max_length=128,
            num_beams=10,
            early_stopping=True,
            do_sample=True,
            top_k=top_k,
            top_p=0.9
        )
        question = tokenizer.decode(output[0], skip_special_tokens=True).strip()
        answer = sentence.strip()
        embeddings = sentence_model.encode([question, answer])
        similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
        if similarity >= similarity_threshold:
            qa_pairs.append({
                "question": question,
                "answer": answer,
                "similarity": round(similarity, 4)
            })
        if len(qa_pairs) >= top_k:
            break
    return qa_pairs
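
# Illustrative (hypothetical) use of generate_qa_pairs, kept as a comment so that importing
# this module stays side-effect free:
#
#     pairs = generate_qa_pairs("The Eiffel Tower is in Paris. It was built in 1889.",
#                               top_k=10, similarity_threshold=0.4)
#     # -> list of {"question": ..., "answer": ..., "similarity": ...} dicts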
# Dataset class
@general_accelerator
class TextDataset(Dataset):
    def __init__(self, dataframe, input_col, output_col, tokenizer, max_seq_len):
        self.data = dataframe
        self.input_col = input_col
        self.output_col = output_col
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data.iloc[idx][self.input_col]
        output_text = self.data.iloc[idx][self.output_col]
        input_ids = self.tokenizer.encode(input_text)
        output_ids = self.tokenizer.encode(output_text)
        input_ids = input_ids[:self.max_seq_len] + [0] * (self.max_seq_len - len(input_ids[:self.max_seq_len]))
        output_ids = output_ids[:self.max_seq_len] + [0] * (self.max_seq_len - len(output_ids[:self.max_seq_len]))
        return torch.tensor(input_ids), torch.tensor(output_ids)
# Linear Attention Layer
@accelerator
class LinearAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)
        self.epsilon = 1e-6

    def forward(self, x):
        B, T, D = x.shape
        H = self.n_heads
        d = self.head_dim
        # Project queries, keys, and values
        Q = self.q_proj(x).reshape(B, T, H, d).transpose(1, 2)  # (B, H, T, d)
        K = self.k_proj(x).reshape(B, T, H, d).transpose(1, 2)  # (B, H, T, d)
        V = self.v_proj(x).reshape(B, T, H, d).transpose(1, 2)  # (B, H, T, d)
        # Apply kernel feature (elu + 1) to Q and K for positive values
        Q = F.elu(Q) + 1
        K = F.elu(K) + 1
        # Compute KV and normalize
        KV = torch.einsum("bhnd,bhne->bhde", K, V)  # (B, H, d, d)
        Z = 1 / (torch.einsum("bhnd,bhnd->bhn", Q, K.sum(dim=2)) + self.epsilon)  # (B, H, T)
        # Compute output
        context = torch.einsum("bhnd,bhde->bhne", Q, KV)  # (B, H, T, d)
        context = context * Z.unsqueeze(-1)
        context = context.transpose(1, 2).contiguous().view(B, T, D)
        return self.out_proj(context)
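
# The einsums in LinearAttention.forward are meant to implement the "linear attention"
# trick: with the feature map phi(x) = elu(x) + 1, each head computes (roughly)
#     out_i = phi(q_i) @ (sum_j phi(k_j)^T v_j) / (phi(q_i) . sum_j phi(k_j) + eps)
# so K^T V is aggregated once (the `KV` einsum) and reused for every query position,
# avoiding the full T x T attention matrix.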
# Transformer block
@accelerator
class TransformerBlock(nn.Module):
    def __init__(self, d_model, nhead, ff_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=d_model, num_heads=nhead, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model)
        )
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x, alibi_bias=None):
        attn_output, _ = self.attn(x, x, x)
        x = self.ln1(x + attn_output)
        ff_output = self.ff(x)
        return self.ln2(x + ff_output)
# MoE (Mixture of Experts) Transformer block
@accelerator
class MoETransformerBlock(nn.Module):
    def __init__(self, d_model, nhead, ff_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=d_model, num_heads=nhead, batch_first=True)
        self.ff = MoE(dim=d_model, hidden_dim=d_model * 4, num_experts=16)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x, alibi_bias=None):
        attn_output, _ = self.attn(x, x, x)
        x = self.ln1(x + attn_output)
        ff_output = self.ff(x)
        return self.ln2(x + ff_output)
# Linear Transformer block
@accelerator
class LinearTransformerBlock(nn.Module):
    def __init__(self, d_model, nhead, ff_dim):
        super().__init__()
        self.attn = LinearAttention(d_model, nhead)
        self.ff = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model)
        )
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x, alibi_bias=None):
        attn_output = self.attn(x)
        x = self.ln1(x + attn_output)
        ff_output = self.ff(x)
        return self.ln2(x + ff_output)
# Alibi bias generator
@accelerator
def get_alibi_bias(n_heads, seq_len, device):
    return torch.zeros((1, n_heads, seq_len, seq_len), device=device)
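
# Note: this returns an all-zero bias, i.e. a positional no-op placeholder, and none of the
# blocks above actually use the `alibi_bias` argument they receive.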
# Full transformer model
@model_accelerator
class TransformerModel(nn.Module):
    def __init__(self, model_type, model_class, dropout, vocab_size, max_seq_len, d_model, nhead, ff_dim, num_layers, e_model=2, ehead=2):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.model_type = model_type
        if model_type in [0, 2]:
            self.encoder = nn.ModuleList([
                TransformerBlock(d_model, nhead, ff_dim) if model_class == 0
                else MoETransformerBlock(e_model, ehead, ff_dim) if model_class == 1
                else LinearTransformerBlock(d_model, nhead, ff_dim)
                for _ in range(num_layers)
            ])
        if model_type in [1, 2]:
            self.decoder = nn.ModuleList([
                TransformerBlock(d_model, nhead, ff_dim) if model_class == 0
                else MoETransformerBlock(e_model, ehead, ff_dim) if model_class == 1
                else LinearTransformerBlock(d_model, nhead, ff_dim)
                for _ in range(num_layers)
            ])
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)
        self.max_seq_len = max_seq_len
        self.d_model = d_model
        self.nhead = nhead
        self.vocab = vocab_size if vocab_size < tokenizer.vocab_size else tokenizer.vocab_size
        self.dropout = nn.Dropout(dropout)

    @accelerator
    def forward(self, input_ids, decoder_input_ids=None):
        if decoder_input_ids is None:
            decoder_input_ids = input_ids
        B, T = input_ids.shape
        alibi_bias = get_alibi_bias(self.nhead, T, input_ids.device)
        x = self.token_embed(input_ids)
        x = self.dropout(x)
        if self.model_type == 0:
            for block in self.encoder:
                x = block(x, alibi_bias=alibi_bias)
            x = self.ln_f(x)
            return self.head(x)
        elif self.model_type == 1:
            for block in self.decoder:
                x = block(x, alibi_bias=alibi_bias)
            x = self.ln_f(x)
            return self.head(x)
        elif self.model_type == 2:
            enc_x = self.token_embed(input_ids)
            for block in self.encoder:
                enc_x = block(enc_x, alibi_bias=alibi_bias)
            dec_input = self.token_embed(decoder_input_ids)
            for block in self.decoder:
                dec_input = block(dec_input, alibi_bias=alibi_bias)
            x = self.ln_f(dec_input)
            return self.head(x)
    @accelerator
    def pretrain_on_corpus(self, corpus_path, tokenizer, num_epochs=3, batch_size=4, learning_rate=1e-4, weight_decay=3e-4):
        """
        Pretrain the model on a text corpus for next-token prediction (causal language modeling).
        """
        # Load and tokenize corpus
        with open(corpus_path, 'r', encoding='utf-8') as file:
            corpus = file.read().splitlines()
        token_sequences = []
        for line in corpus:
            tokens = tokenizer.encode(line, add_special_tokens=False)
            token_sequences.append(tokens)
        # Create input-output pairs for next-token prediction
        input_ids = []
        target_ids = []
        for tokens in token_sequences:
            for i in range(1, len(tokens)):
                input_ids.append(torch.tensor(tokens[:i], dtype=torch.long))  # e.g., [101]
                target_ids.append(torch.tensor(tokens, dtype=torch.long))  # e.g., 102
        # Pad input sequences to the max sequence length
        input_tensor = pad_sequence(input_ids, batch_first=True, padding_value=0)  # shape: (N, seq_len)
        target_tensor = torch.stack(target_ids)  # shape: (N,)
        # DataLoader
        train_data = TensorDataset(input_tensor, target_tensor)
        train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
        # Training setup
        optimizer = torch.optim.AdamW(self.parameters(), lr=learning_rate, weight_decay=weight_decay)
        loss_fn = nn.CrossEntropyLoss()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(device)
        self.train()
        print(f"Starting pretraining on corpus: {corpus_path}")
        for epoch in range(num_epochs):
            total_loss = 0
            pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Pretraining]")
            for x, y in pbar:
                x, y = x.to(device), y.to(device)
                optimizer.zero_grad()
                outputs = self.forward(x)  # outputs shape: (batch_size, seq_len, vocab_size)
                # Get last token logits for each sequence
                lengths = (x != 0).sum(dim=1) - 1  # index of last token in each sequence
                batch_indices = torch.arange(x.size(0), device=device)
                logits = outputs[batch_indices, lengths]  # shape: (batch_size, vocab_size)
                loss = loss_fn(logits, y)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                pbar.set_postfix(loss=loss.item())
            avg_loss = total_loss / len(train_loader)
            print(f"Epoch {epoch+1}: Avg Loss = {avg_loss:.4f}")
        print("Pretraining complete!")
    @accelerator
    def train_on_dataset(self, dataset_path, tokenizer, input_col="prompt", output_col="completion", num_epochs=3, batch_size=4, learning_rate=1e-4, weight_decay=3e-4):
        """
        Train the model for next-token prediction using a CSV dataset.
        The dataset should contain 'prompt' and 'completion' columns.
        """
        # Load the CSV dataset
        df = pd.read_csv(dataset_path)
        # Convert the dataset to a text-based dialogue format
        corpus = []
        for _, row in df.iterrows():
            prompt = f"User: {row[input_col]}\nAI: "
            completion = f"{row[output_col]}"
            corpus.append(prompt + " " + completion)
        # Tokenize the entire corpus
        inputs = []
        for line in corpus:
            tokenized = tokenizer.encode(line, add_special_tokens=False)
            inputs.append(tokenized)
        # Create the input-output pairs for next-token prediction
        input_ids = []
        target_ids = []
        for tokens in inputs:
            for i in range(len(tokens) - 1):
                input_ids.append(tokens[:i+1])  # Input sequence up to current token
                target_ids.append(tokens[i+1])  # Target is the next token
        # Padding to make sequences of equal length
        max_seq_len = max(len(seq) for seq in input_ids)
        input_ids = [seq + [0] * (max_seq_len - len(seq)) for seq in input_ids]
        target_ids = [seq + [0] * (max_seq_len - len(seq)) for seq in target_ids]
        # Convert to tensors
        input_tensor = torch.tensor(input_ids)
        target_tensor = torch.tensor(target_ids)
        # Create DataLoader
        train_data = torch.utils.data.TensorDataset(input_tensor, target_tensor)
        train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
        # Optimizer and weight decay
        optimizer = torch.optim.AdamW(self.parameters(), lr=learning_rate, weight_decay=weight_decay)
        loss_fn = nn.CrossEntropyLoss()
        self.train()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(device)
        print(f"Starting training on CSV dataset: {dataset_path}")
        for epoch in range(num_epochs):
            total_loss = 0
            pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Training]")
            for x, y in pbar:
                x, y = x.to(device), y.to(device)
                optimizer.zero_grad()
                output = self.forward(x)
                loss = loss_fn(output.view(-1, output.size(-1)), y.view(-1))
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                pbar.set_postfix(loss=loss.item())
            avg_loss = total_loss / len(train_loader)
            print(f"Epoch {epoch+1}: Avg Loss = {avg_loss:.4f}")
        print("Training complete!")
    # @accelerator
    # def train_on_dataset(self, dataset_path, input_col, output_col, num_epochs, learning_rate, weight_decay, tokenizer, batch_size=4):
    #     df = pd.read_csv(dataset_path)
    #     train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True)
    #     train_set = TextDataset(train_df, input_col, output_col, tokenizer, self.max_seq_len)
    #     test_set = TextDataset(test_df, input_col, output_col, tokenizer, self.max_seq_len)
    #     train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, drop_last=False)
    #     test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, drop_last=False)
    #     ln = learning_rate
    #     optimizer = torch.optim.AdamW(self.parameters(), lr=ln, weight_decay=weight_decay)
    #     loss_fn = nn.CrossEntropyLoss()
    #     self.train()
    #     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #     self.to(device)
    #     print(f"Train samples: {len(train_set)}, Batches: {len(train_loader)}")
    #     print(f"Test samples: {len(test_set)}, Batches: {len(test_loader)}")
    #     for epoch in range(num_epochs):
    #         total_loss = 0
    #         self.train()
    #         pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs} [Training]")
    #         for x, y in pbar:
    #             x, y = x.to(device), y.to(device)
    #             if self.model_type == 2:
    #                 out = self.forward(x, y[:, :-1])
    #                 target = y[:, 1:]
    #             else:
    #                 out = self.forward(x)
    #                 target = y
    #             out = out.view(-1, out.size(-1))
    #             target = target.reshape(-1)
    #             loss = loss_fn(out, target)
    #             optimizer.zero_grad()
    #             loss.backward()
    #             optimizer.step()
    #             total_loss += loss.item()
    #             pbar.set_postfix(loss=loss.item())
    #         avg_train_loss = total_loss / len(train_loader)
    #         self.eval()
    #         val_loss = 0
    #         with torch.no_grad():
    #             for x, y in test_loader:
    #                 x, y = x.to(device), y.to(device)
    #                 if self.model_type == 2:
    #                     out = self.forward(x, y[:, :-1])
    #                     target = y[:, 1:]
    #                 else:
    #                     out = self.forward(x)
    #                     target = y
    #                 out = out.view(-1, out.size(-1))
    #                 target = target.reshape(-1)
    #                 loss = loss_fn(out, target)
    #                 val_loss += loss.item()
    #         avg_val_loss = val_loss / len(test_loader)
    #         print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")
    #         ln = learning_rate * avg_train_loss
    #         optimizer = torch.optim.AdamW(self.parameters(), lr=ln, weight_decay=weight_decay)
    @general_accelerator
    def save_model(self, path):
        torch.save(self.state_dict(), path)
        print(f"Model saved to {path}")

    @general_accelerator
    def load_model(self, path):
        self.load_state_dict(torch.load(path))
        print(f"Model loaded from {path}")

    @accelerator
    def communicate(self, prompt, tokenizer, max_length=50):
        self.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(device)
        input_ids = tokenizer.encode(prompt)[:self.max_seq_len]
        input_ids += [0] * (self.max_seq_len - len(input_ids))
        input_tensor = torch.tensor([input_ids]).to(device)
        with torch.no_grad():
            output = self.forward(input_tensor)
        logits = output
        logits[:, tokenizer.pad_token_id] = float('-inf')  # nuke pad probs
        predicted_ids = output.argmax(dim=-1)[0].tolist()
        return tokenizer.decode([predicted_id if predicted_id < self.vocab - 1 else self.vocab - 1 for predicted_id in predicted_ids], skip_special_tokens=True)
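
    # Illustrative (hypothetical) call, assuming `my_model` is a trained TransformerModel
    # and `tok` is the same tokenizer the model was trained with:
    #
    #     reply = my_model.communicate("Hello, how are you?", tok)
    #     print(reply)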
@general_accelerator
def process_data(full_path, max_length, similarity_threshold):
    input_text = ""
    with open(full_path, "r") as file:
        input_text = file.read()
    qa_pairs = generate_qa_pairs(input_text, max_length, similarity_threshold)
    qa_pairs = [[qa['question'], qa['answer']] for qa in qa_pairs]
    with open("./dataset.csv", "w", newline='') as file:
        writer = csv.writer(file, quoting=csv.QUOTE_ALL)
        writer.writerow(["prompt", "completion"])
        writer.writerows(qa_pairs)
I am trying to build a GPT-4-inspired AI model, but I keep getting the error shown above, specifically during pretraining and possibly also during training. I would appreciate any help with fixing all the errors and problems in this code.
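One thing I am not sure about: the T5 tokenizer can produce token ids larger than a small custom vocab_size, and if any id is >= the size of nn.Embedding, the lookup fails on the GPU with exactly this kind of device-side assert. A small, purely illustrative sanity check I could run before pretraining (the corpus path and `my_model` are placeholders for my actual objects):

max_id = max(
    max(tokenizer.encode(line, add_special_tokens=False), default=0)
    for line in open("corpus.txt", encoding="utf-8")
)
print(max_id, my_model.token_embed.num_embeddings)  # max_id must be strictly smaller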