Guest
Why does my reinforcement learning AI not increase its reward over time?
Post by Guest » 17 Jan 2025, 06:20
I'm working on building a reinforcement learning AI to play the game Master Mind. I followed a tutorial ([youtube]L8ypSXwyBds[/youtube]) for the game Snake, but changed it so that it works for my game. I've reached the point where the AI can make guesses, but there doesn't seem to be any improvement.
This is my agent.py
Code: Select all
import torch
import random
import numpy as np
from collections import deque
from game import MasterMindAI
from model import Linear_QNet, QTrainer
from helper import plot
MAX_MEMORY = 100_000
BATCH_SIZE = 1000
LR = 0.001
class Agent:
def __init__(self):
self.n_games = 0
self.epsilon = 0 # randomness
self.gamma = 0.9 # discount rate
self.memory = deque(maxlen=MAX_MEMORY) # popleft()
self.model = Linear_QNet(4, 256, 4)
self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)
def get_state(self, game):
compare = game.get_compare()
guesses = game.get_guesses()
        while len(compare) < 8:
            compare.append([-1, -1, -1, -1])
        while len(compare) > 8:  # trim to at most 8 rows
            compare.pop()
        while len(guesses) < 8:
            guesses.append([-1, -1, -1, -1])
        while len(guesses) > 8:
            guesses.pop()
# print("compare ", compare)
# print("guesses ", guesses)
        state = np.array(compare + guesses, dtype=int)
#print("state", state)
# state = [
# #DOES THIS WORK
# compare,
# guesses
# ]
# return np.array(state, dtype=int)
return state
def remember(self, state, action, reward, next_state, done):
self.memory.append((state, action, reward, next_state, done)) # popleft if MAX_MEMORY is reached
def train_long_memory(self):
if len(self.memory) > BATCH_SIZE:
mini_sample = random.sample(self.memory, BATCH_SIZE) # list of tuples
else:
mini_sample = self.memory
states, actions, rewards, next_states, dones = zip(*mini_sample)
self.trainer.train_step(states, actions, rewards, next_states, dones)
def train_short_memory(self, state, action, reward, next_state, done):
self.trainer.train_step(state, action, reward, next_state, done)
def get_action(self, state):
# random moves: tradeoff exploration / exploitation
self.epsilon = 800 - self.n_games
final_move = [0,0,0,0]
if random.randint(0, 2000) < self.epsilon:
final_move = [random.randint(1, 5) for _ in range(4)]
else:
state_tensor = torch.tensor(state, dtype=torch.float)
prediction = self.model(state_tensor) # Get model prediction
move = torch.argmax(prediction).item() # Choose the highest predicted value
#
# print("move ", move)
# Assuming the output needs to be transformed to 4 integers:
final_move = [
(move % 5) + 1, # First digit (1-5)
((move // 5) % 5) + 1, # Second digit (1-5)
((move // 25) % 5) + 1, # Third digit (1-5)
((move // 125) % 5) + 1 # Fourth digit (1-5)
]
return final_move
def train():
plot_scores = []
plot_mean_scores = []
total_score = 0
plot_reward = []
plot_mean_reward = []
total_reward = 0
record = 10
agent = Agent()
game = MasterMindAI()
game.start_game()
while True:
# get old state
state_old = agent.get_state(game)
# get move
final_move = agent.get_action(state_old)
# perform move and get new state
reward, done, score = game.play_step(final_move)
# print("reward in here", reward)
state_new = agent.get_state(game)
# train short memory
agent.train_short_memory(state_old, final_move, reward, state_new, done)
# remember
agent.remember(state_old, final_move, reward, state_new, done)
if done:
#print("reward", reward)
# train long memory, plot result
game.start_game()
agent.n_games += 1
agent.train_long_memory()
if score < record:
record = score
agent.model.save()
print('Game', agent.n_games, 'Score', score, 'Record:', record)
# plot_scores.append(score)
# total_score += score
# mean_score = total_score / agent.n_games
# plot_mean_scores.append(mean_score)
# plot(plot_scores, plot_mean_scores)
plot_reward.append(reward)
total_reward += reward
mean_reward = total_reward / agent.n_games
plot_mean_reward.append(mean_reward)
plot(plot_reward, plot_mean_reward)
if __name__ == '__main__':
train()
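One thing I'm already unsure about: Linear_QNet(4, 256, 4) gives only 4 outputs per state row (64 values in total for the 16x4 state), so the torch.argmax index in get_action can never exceed 63, while the base-5 decoding assumes one Q-value for each of the 5^4 = 625 possible codes, meaning most guesses can never be chosen greedily. Below is a minimal sketch of the encoding I think would be consistent; the helper names index_to_guess and guess_to_index are my own, not from the tutorial.
Code: Select all
# Sketch of a consistent action encoding (my own helpers, untested):
# one Q-value per possible code means 5**4 = 625 network outputs, and the
# argmax index can be decoded back into a four-digit guess.

def index_to_guess(move):
    # the same base-5 decoding as in get_action
    return [
        (move % 5) + 1,
        ((move // 5) % 5) + 1,
        ((move // 25) % 5) + 1,
        ((move // 125) % 5) + 1,
    ]

def guess_to_index(guess):
    # inverse mapping, useful for building Q-learning targets later
    return sum((d - 1) * (5 ** i) for i, d in enumerate(guess))

# The network would then need 625 outputs and a flat state as input, e.g.
# Linear_QNet(64, 256, 625), where 64 = 16 rows x 4 columns of the state.
assert guess_to_index(index_to_guess(123)) == 123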
This is model.py
Code: Select all
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
class Linear_QNet(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super().__init__()
self.linear1 = nn.Linear(input_size, hidden_size)
self.linear2 = nn.Linear(hidden_size, output_size)
def forward(self, x):
x = F.relu(self.linear1(x))
x = self.linear2(x)
return x
def save(self, file_name='model.pth'):
model_folder_path = './model'
if not os.path.exists(model_folder_path):
os.makedirs(model_folder_path)
file_name = os.path.join(model_folder_path, file_name)
torch.save(self.state_dict(), file_name)
class QTrainer:
def __init__(self, model, lr, gamma):
self.lr = lr
self.gamma = gamma
self.model = model
self.optimizer = optim.Adam(model.parameters(), lr=self.lr)
self.criterion = nn.MSELoss()
def train_step(self, state, action, reward, next_state, done):
state = torch.tensor(state, dtype=torch.float)
next_state = torch.tensor(next_state, dtype=torch.float)
action = torch.tensor(action, dtype=torch.long)
reward = torch.tensor(reward, dtype=torch.float)
# (n, x)
if len(state.shape) == 2:
# (1, x)
state = torch.unsqueeze(state, 0)
next_state = torch.unsqueeze(next_state, 0)
action = torch.unsqueeze(action, 0)
reward = torch.unsqueeze(reward, 0)
done = (done, )
# 1: predicted Q values with current state
pred = self.model(state)
target = pred.clone()
for idx in range(len(done)):
Q_new = reward[idx]
if not done[idx]:
Q_new = reward[idx] + self.gamma * torch.max(self.model(next_state[idx]))
target[idx][torch.argmax(action[idx]).item()] = Q_new
#target[idx][action[idx]] = Q_new
# 2: Q_new = r + y * max(next_predicted Q value) -> only do this if not done
# pred.clone()
# preds[argmax(action)] = Q_new
self.optimizer.zero_grad()
loss = self.criterion(target, pred)
loss.backward()
self.optimizer.step()
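While pasting this I also noticed target[idx][torch.argmax(action[idx]).item()]: in the Snake tutorial the action is a one-hot vector, so argmax recovers the action index, but I store the four-digit guess itself, so argmax only picks the position of the largest digit. A toy sketch of what I think the target update should look like with a scalar action index (the numbers are made up):
Code: Select all
import torch

# Toy sketch (made-up numbers): if the replay buffer stored a scalar action
# index (e.g. from guess_to_index) instead of the raw four-digit guess, the
# Bellman target could index the Q-vector directly.

gamma = 0.9
q_pred = torch.zeros(625)   # Q-values predicted for the current state
q_next = torch.rand(625)    # Q-values predicted for the next state
action_idx = 123            # index of the guess that was actually played
reward, done = 3.0, False

target = q_pred.clone()
q_new = reward if done else reward + gamma * torch.max(q_next).item()
target[action_idx] = q_new  # no argmax over the guess digits needed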
And my game.py
Code: Select all
import random
class MasterMindAI:
def start_game(self):
self.hidden = [random.randint(1, 5) for _ in range(4)]
self.guessMax = 8
self.guessMade = 0
self.reward = 0
self.allGuess = []
self.compare = []
def get_guesses(self):
if(self.allGuess == []):
return [[-1,-1,-1,-1]]
else:
return self.allGuess
def get_compare(self):
if(self.compare == []):
return [[-1,-1,-1,-1]]
else:
return self.compare
#Action
def guess_list(self):
self.guessMade += 1
guess = input("Enter four numbers separated by spaces: ")
nums = [int(num) for num in guess.split()]
return nums
def play_step(self,action):
game_over = False
self.compare.insert(0, self.checkHidden(self.hidden, action))
self.guessMade += 1
for num in self.compare[0]:
if num == 2:
self.reward += 3
elif num == 1:
self.reward += 1
self.allGuess.insert(0, action)
if(action == self.hidden or self.guessMade >= self.guessMax ):
game_over = True
if(action == self.hidden):
self.reward += (self.guessMax-self.guessMade) * 20
else:
self.reward += 0
#self.display()
#print("reward ", self.reward)
return self.reward, game_over, self.guessMade
def display(self):
for i in range(self.guessMax - self.guessMade):
print("_ _ _ _ |")
for lis, lis2 in zip(self.allGuess, self.compare):
if(lis != [-1,-1,-1,-1]):
print(*lis, "|", *lis2)
    def checkHidden(self, answer, guess):
        spot = [0, 1, 2, 3]
        ans = answer.copy()
        lis = []
        # exact matches: right number in the right spot
        for i in range(4):
            if guess[i] == self.hidden[i]:
                lis.append(2)
                spot[i] = -1
                ans[i] = -1
        # partial matches: right number in the wrong spot
        for i in range(4):
            if i in spot and guess[i] in ans:
                lis.append(1)
                # take the matched value out of the pool so duplicate
                # digits aren't counted twice
                ans[ans.index(guess[i])] = -1
        while len(lis) < 4:
            lis.append(0)
        return lis
# def play(self):
# self.start_game()
# self.display()
#play_step
#recent_guess = self.guess_list()
#self.play_step(recent_guess)
#
# while(recent_guess != self.hidden and self.guessMade < self.guessMax):
# self.display()
# #
# recent_guess = self.guess_list()
# self.play_step(recent_guess)
# #
# if (recent_guess == self.hidden):
# reward = (self.guessMax - self.guessMade) * 10
# return 1, reward
# else:
# reward = 0
# return 0, reward
if __name__ == "__main__":
    # play() is commented out above, so just set up a game for a manual check
    ai = MasterMindAI()
    ai.start_game()
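One more suspicion: play_step returns the running total self.reward, which is only reset in start_game, so the value passed to train_step grows within an episode and the same feedback is rewarded more the later it comes. A minimal sketch of a per-step variant (drop-in replacement for the method above, untested):
Code: Select all
    def play_step(self, action):
        # sketch: return only the reward earned by this guess, not the total
        game_over = False
        step_reward = 0
        self.compare.insert(0, self.checkHidden(self.hidden, action))
        self.guessMade += 1
        for num in self.compare[0]:
            if num == 2:
                step_reward += 3
            elif num == 1:
                step_reward += 1
        self.allGuess.insert(0, action)
        if action == self.hidden or self.guessMade >= self.guessMax:
            game_over = True
            if action == self.hidden:
                step_reward += (self.guessMax - self.guessMade) * 20
        self.reward += step_reward  # keep the episode total for logging
        return step_reward, game_over, self.guessMade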
Right now I don't know where I need to fix errors.
Thanks for any help