Decoder-only model gives repetitive answers


Post by Anonymous »

I'm building a decoder-only transformer with PyTorch, and the dataset I chose is the Plaintext Wikipedia (full English) dataset from Kaggle.
The problem is that my model's output repeats itself:

Code: Select all

print(generate(model=model, prompt="what is anarchism", tokenizer=tokenizer, max_new_tokens=100, temperature=1.0, device=DEVICE))
For this dummy generation the output is:

Code: Select all

what is anarchism anarchism anarchism anarch anarch anarchism anarchism anarchismismismismismismismismismismismismismismism anarchism anarchismismismismismismismismismismismismismismismismismismismismismismism anarchismismism anarchismismism anarchism anarchismismismismismismismism anarchismismismismism anarch anarchism anarchismismism anarchism anarchism anarchismismismismism anarchism anarchism anarchism
Since I'm fairly new to transformers, I can't really figure out the problem on my own. I know posting the entire code isn't ideal, and I apologize for that.
Here is my code:

Code: Select all

import os
import math
from pathlib import Path
import random
import bisect

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm

"""#Config"""

DATA_DIR = './data'
TOKENIZER_DIR = './tokenizer'
OUT_DIR = './out'
MODEL_DIR = './models'
MODEL_PATH = './models/wiki_chatbot.pth'
BLOCK_SIZE = 256
APPROX_SAMPLE = 100000
BATCH_SIZE = 8
EPOCHS = 2
LR = 3e-4
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
D_MODEL = 256
N_HEADS = 4
N_LAYERS = 4
DROPOUT = 0.1
SEED = 42

torch.manual_seed(SEED)
if DEVICE=='cuda': torch.cuda.manual_seed_all(SEED)

"""#Tokenizer"""

if not Path(TOKENIZER_DIR).exists():
    Path(TOKENIZER_DIR).mkdir()
    files = [str(p) for p in Path(DATA_DIR).glob('**/*') if p.is_file() and not p.name.startswith('.')]
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=files, vocab_size=30000, min_frequency=2, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.save_model(TOKENIZER_DIR)
else:
    tokenizer = ByteLevelBPETokenizer.from_file(f"{TOKENIZER_DIR}/vocab.json", f"{TOKENIZER_DIR}/merges.txt")

VOCAB_SIZE = tokenizer.get_vocab_size()
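# Sanity check: this should come out close to the 30000 requested above; a much
# smaller value would mean the tokenizer saw too little data, or that a stale
# vocab was loaded from TOKENIZER_DIR.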

"""#Dataset & Dataloader"""

class RandomWindowWikiDataset(Dataset):
    def __init__(self, data_dir, tokenizer, block_size, approx_sample):
        self.files = sorted([p for p in Path(data_dir).glob('**/*') if p.is_file() and not p.name.startswith('.')])

        if len(self.files) == 0:
            raise FileNotFoundError('No files found in data_dir')

        self.tokenizer = tokenizer
        self.block_size = block_size
        self.approx_sample = approx_sample

        self.sizes = [os.path.getsize(p) for p in self.files]
        self.cum_sizes = []
        total = 0

        for s in self.sizes:
            total += s
            self.cum_sizes.append(total)

        self.total = total

    def __len__(self):
        return self.approx_sample

    def __getitem__(self, idx):
        offset = random.randint(0, self.total - 1)
        file_idx = bisect.bisect_right(self.cum_sizes, offset)
        if file_idx == 0:
            local_offset = offset
        else:
            local_offset = offset - self.cum_sizes[file_idx - 1]

        fp = self.files[file_idx]
        with open(fp, 'rb') as f:
            f.seek(local_offset)
            chunk = f.read(self.block_size * 4)

        text = chunk.decode('utf-8', errors='ignore')

        ids = self.tokenizer.encode(text).ids
        if len(ids) < self.block_size + 1:
            pad_id = self.tokenizer.token_to_id('<pad>') or 0
            ids += [pad_id] * (self.block_size + 1 - len(ids))

        x = ids[:self.block_size]
        y = ids[1:self.block_size + 1]
        return torch.tensor(x), torch.tensor(y)

dataset = RandomWindowWikiDataset(data_dir=DATA_DIR, tokenizer=tokenizer, block_size=BLOCK_SIZE, approx_sample=APPROX_SAMPLE)
dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
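# Optional sanity check: one batch should yield two tensors of shape
# (BATCH_SIZE, BLOCK_SIZE), i.e. torch.Size([8, 256]) with the config above.
# xb, yb = next(iter(dataloader))
# print(xb.shape, yb.shape)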

"""#DecoderModel"""

class DecoderOnlyModel(nn.Module):
    def __init__(self, vocab_size, block_size, d_model, n_heads, n_layers, dropout):
        super().__init__()
        self.tok_emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
        self.pos_emb = nn.Parameter(torch.zeros(1, block_size, d_model))
        self.dropout = nn.Dropout(dropout)
        dec_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model, dropout=dropout, activation='gelu', batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer=dec_layer, num_layers=n_layers)
        self.ln = nn.LayerNorm(d_model)
        self.head = nn.Linear(in_features=d_model, out_features=vocab_size, bias=False)
        self.block_size = block_size
        self._init_weights()

    def _init_weights(self):
        nn.init.normal_(tensor=self.tok_emb.weight, mean=0.0, std=0.2)
        nn.init.normal_(tensor=self.head.weight, mean=0.0, std=0.2)
        nn.init.normal_(tensor=self.pos_emb, mean=0.0, std=0.2)
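    # Note: GPT-style references typically initialize with std=0.02; std=0.2
    # here is an order of magnitude larger, which I should double-check.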

    def forward(self, x):
        b, t = x.size()
        tok = self.tok_emb(x) * math.sqrt(self.tok_emb.embedding_dim)
        pos = self.pos_emb[:, :t, :]
        x = self.dropout(tok + pos)
        mask = torch.triu(torch.ones(t, t, device=x.device) * float('-inf'), diagonal=1)
        out = self.decoder(tgt=x, memory=x, tgt_mask=mask)
        out = self.ln(out)
        logits = self.head(out)
        return logits
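
# Note on forward(): tgt_mask makes the self-attention causal, but the
# cross-attention over memory (which is the same sequence here) has no
# memory_mask, so every position can still see future tokens through that
# path during training. I am not sure whether this matters, but it is the
# one place where information from the future could leak in.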

"""#Train function"""

model = DecoderOnlyModel(vocab_size=VOCAB_SIZE, block_size=BLOCK_SIZE, d_model=D_MODEL, n_heads=N_HEADS, n_layers=N_LAYERS, dropout=DROPOUT)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()
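# Padded positions currently count toward the loss; if that turns out to
# matter, nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id('<pad>'))
# would exclude them.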

def train(model, optimizer, criterion, epochs, device, vocab_size, dataloader):
    model.to(device)
    model.train()

    for epoch in range(epochs):
        epoch_loss = 0.0

        for xb, yb in tqdm(dataloader):
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits.view(-1, vocab_size), yb.view(-1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        print(f"Epoch: {epoch+1}/{epochs} | Loss: {epoch_loss/len(dataloader):.4f}")

if not os.path.exists(MODEL_PATH):
    if not Path(MODEL_DIR).exists():
        Path(MODEL_DIR).mkdir()
    train(model=model, optimizer=optimizer, criterion=criterion, epochs=EPOCHS, device=DEVICE, vocab_size=VOCAB_SIZE, dataloader=dataloader)
    torch.save(obj=model.state_dict(), f=MODEL_PATH)
else:
    model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))

"""#Generate function"""

@torch.inference_mode()
def generate(model, prompt, tokenizer, max_new_tokens, temperature, device, top_k=None):
    model.to(device)
    model.eval()

    input_ids = torch.tensor([tokenizer.encode(prompt).ids], dtype=torch.long).to(device)

    for _ in range(max_new_tokens):
        # Crop the context so positional embeddings never run past block_size.
        idx_cond = input_ids[:, -model.block_size:]
        logits = model(idx_cond)[:, -1, :] / temperature
        if top_k is not None:
            # Keep only the top_k most likely tokens; everything else gets -inf.
            v, _ = torch.topk(logits, top_k)
            logits[logits < v[:, [-1]]] = float('-inf')
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        input_ids = torch.cat([input_ids, next_token], dim=1)

    text = tokenizer.decode(input_ids[0].tolist())
    return text

print(generate(model=model, prompt="what is anarchism", tokenizer=tokenizer, max_new_tokens=100, temperature=1.0, device=DEVICE))
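
For reference, the alternative decoder-only layout I have seen in tutorials uses nn.TransformerEncoder with a causal mask instead of nn.TransformerDecoder with a memory input, so there is no unmasked cross-attention path at all. I am not sure whether the two are equivalent; this is just a sketch of what I might try next:

Code: Select all

import torch
from torch import nn

# Sketch only: a self-attention-only stack, mirroring the config above
# (D_MODEL=256, N_HEADS=4, N_LAYERS=4, DROPOUT=0.1).
enc_layer = nn.TransformerEncoderLayer(d_model=256, nhead=4, dim_feedforward=1024,
                                       dropout=0.1, activation='gelu', batch_first=True)
encoder = nn.TransformerEncoder(encoder_layer=enc_layer, num_layers=4)

t = 8
x = torch.randn(2, t, 256)  # stands in for the embedded input batch
causal = torch.triu(torch.full((t, t), float('-inf')), diagonal=1)
out = encoder(x, mask=causal)  # same additive-mask convention as tgt_mask
print(out.shape)  # torch.Size([2, 8, 256])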
