Warum erhöht meine KI mit Verstärkungslernen ihre Belohnung nicht mit der Zeit?
Posted: 17 Jan 2025, 06:20
Ich arbeite an der Entwicklung einer Reinforcement Learning AI, um das Spiel Master Mind zu spielen. Ich habe einem Tutorial ( für das Spiel Snake gefolgt, habe es aber so geändert, dass es für mein Spiel funktioniert. Ich bin an einem Punkt angelangt, an dem die KI Vermutungen anstellen kann, aber es scheint keine Verbesserung zu geben.
Das ist meine agent.py
Das ist model.py
Und mein game.py
Ich weiß gerade nicht, wo ich Fehler beheben muss.
Vielen Dank für jede Hilfe
Das ist meine agent.py
Code: Select all
import torch
import random
import numpy as np
from collections import deque
from game import MasterMindAI
from model import Linear_QNet, QTrainer
from helper import plot
MAX_MEMORY = 100_000
BATCH_SIZE = 1000
LR = 0.001
class Agent:
def __init__(self):
self.n_games = 0
self.epsilon = 0 # randomness
self.gamma = 0.9 # discount rate
self.memory = deque(maxlen=MAX_MEMORY) # popleft()
self.model = Linear_QNet(4, 256, 4)
self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)
def get_state(self, game):
compare = game.get_compare()
guesses = game.get_guesses()
while len(compare) < 8:
compare.append([-1, -1, -1, -1])
while len(compare) >= 8:
compare.pop()
while len(guesses) < 8:
guesses.append([-1, -1, -1, -1])
while len(guesses) >= 8:
guesses.pop()
# print("compare ", compare)
# print("guesses ", guesses)
state = np.array(compare + guesses,dtype=int)
#print("state", state)
# state = [
# #DOES THIS WORK
# compare,
# guesses
# ]
# return np.array(state, dtype=int)
return state
def remember(self, state, action, reward, next_state, done):
self.memory.append((state, action, reward, next_state, done)) # popleft if MAX_MEMORY is reached
def train_long_memory(self):
if len(self.memory) > BATCH_SIZE:
mini_sample = random.sample(self.memory, BATCH_SIZE) # list of tuples
else:
mini_sample = self.memory
states, actions, rewards, next_states, dones = zip(*mini_sample)
self.trainer.train_step(states, actions, rewards, next_states, dones)
def train_short_memory(self, state, action, reward, next_state, done):
self.trainer.train_step(state, action, reward, next_state, done)
def get_action(self, state):
# random moves: tradeoff exploration / exploitation
self.epsilon = 800 - self.n_games
final_move = [0,0,0,0]
if random.randint(0, 2000) < self.epsilon:
final_move = [random.randint(1, 5) for _ in range(4)]
else:
state_tensor = torch.tensor(state, dtype=torch.float)
prediction = self.model(state_tensor) # Get model prediction
move = torch.argmax(prediction).item() # Choose the highest predicted value
#
# print("move ", move)
# Assuming the output needs to be transformed to 4 integers:
final_move = [
(move % 5) + 1, # First digit (1-5)
((move // 5) % 5) + 1, # Second digit (1-5)
((move // 25) % 5) + 1, # Third digit (1-5)
((move // 125) % 5) + 1 # Fourth digit (1-5)
]
return final_move
def train():
plot_scores = []
plot_mean_scores = []
total_score = 0
plot_reward = []
plot_mean_reward = []
total_reward = 0
record = 10
agent = Agent()
game = MasterMindAI()
game.start_game()
while True:
# get old state
state_old = agent.get_state(game)
# get move
final_move = agent.get_action(state_old)
# perform move and get new state
reward, done, score = game.play_step(final_move)
# print("reward in here", reward)
state_new = agent.get_state(game)
# train short memory
agent.train_short_memory(state_old, final_move, reward, state_new, done)
# remember
agent.remember(state_old, final_move, reward, state_new, done)
if done:
#print("reward", reward)
# train long memory, plot result
game.start_game()
agent.n_games += 1
agent.train_long_memory()
if score < record:
record = score
agent.model.save()
print('Game', agent.n_games, 'Score', score, 'Record:', record)
# plot_scores.append(score)
# total_score += score
# mean_score = total_score / agent.n_games
# plot_mean_scores.append(mean_score)
# plot(plot_scores, plot_mean_scores)
plot_reward.append(reward)
total_reward += reward
mean_reward = total_reward / agent.n_games
plot_mean_reward.append(mean_reward)
plot(plot_reward, plot_mean_reward)
if __name__ == '__main__':
train()
Code: Select all
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
class Linear_QNet(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super().__init__()
self.linear1 = nn.Linear(input_size, hidden_size)
self.linear2 = nn.Linear(hidden_size, output_size)
def forward(self, x):
x = F.relu(self.linear1(x))
x = self.linear2(x)
return x
def save(self, file_name='model.pth'):
model_folder_path = './model'
if not os.path.exists(model_folder_path):
os.makedirs(model_folder_path)
file_name = os.path.join(model_folder_path, file_name)
torch.save(self.state_dict(), file_name)
class QTrainer:
def __init__(self, model, lr, gamma):
self.lr = lr
self.gamma = gamma
self.model = model
self.optimizer = optim.Adam(model.parameters(), lr=self.lr)
self.criterion = nn.MSELoss()
def train_step(self, state, action, reward, next_state, done):
state = torch.tensor(state, dtype=torch.float)
next_state = torch.tensor(next_state, dtype=torch.float)
action = torch.tensor(action, dtype=torch.long)
reward = torch.tensor(reward, dtype=torch.float)
# (n, x)
if len(state.shape) == 2:
# (1, x)
state = torch.unsqueeze(state, 0)
next_state = torch.unsqueeze(next_state, 0)
action = torch.unsqueeze(action, 0)
reward = torch.unsqueeze(reward, 0)
done = (done, )
# 1: predicted Q values with current state
pred = self.model(state)
target = pred.clone()
for idx in range(len(done)):
Q_new = reward[idx]
if not done[idx]:
Q_new = reward[idx] + self.gamma * torch.max(self.model(next_state[idx]))
target[idx][torch.argmax(action[idx]).item()] = Q_new
#target[idx][action[idx]] = Q_new
# 2: Q_new = r + y * max(next_predicted Q value) -> only do this if not done
# pred.clone()
# preds[argmax(action)] = Q_new
self.optimizer.zero_grad()
loss = self.criterion(target, pred)
loss.backward()
self.optimizer.step()
Code: Select all
import random
class MasterMindAI:
def start_game(self):
self.hidden = [random.randint(1, 5) for _ in range(4)]
self.guessMax = 8
self.guessMade = 0
self.reward = 0
self.allGuess = []
self.compare = []
def get_guesses(self):
if(self.allGuess == []):
return [[-1,-1,-1,-1]]
else:
return self.allGuess
def get_compare(self):
if(self.compare == []):
return [[-1,-1,-1,-1]]
else:
return self.compare
#Action
def guess_list(self):
self.guessMade += 1
guess = input("Enter four numbers separated by spaces: ")
nums = [int(num) for num in guess.split()]
return nums
def play_step(self,action):
game_over = False
self.compare.insert(0, self.checkHidden(self.hidden, action))
self.guessMade += 1
for num in self.compare[0]:
if num == 2:
self.reward += 3
elif num == 1:
self.reward += 1
self.allGuess.insert(0, action)
if(action == self.hidden or self.guessMade >= self.guessMax ):
game_over = True
if(action == self.hidden):
self.reward += (self.guessMax-self.guessMade) * 20
else:
self.reward += 0
#self.display()
#print("reward ", self.reward)
return self.reward, game_over, self.guessMade
def display(self):
for i in range(self.guessMax - self.guessMade):
print("_ _ _ _ |")
for lis, lis2 in zip(self.allGuess, self.compare):
if(lis != [-1,-1,-1,-1]):
print(*lis, "|", *lis2)
def checkHidden(self, answer, guess):
spot = [0,1,2,3]
ans = answer.copy()
lis = []
if(guess[0] == self.hidden[0]):
lis.append(2)
spot[0] = -1
ans[0] = -1
if(guess[1] == self.hidden[1]):
lis.append(2)
spot[1] = -1
ans[1] = -1
if(guess[2] == self.hidden[2]):
lis.append(2)
spot[2] = -1
ans[2] = -1
if(guess[3] == self.hidden[3]):
lis.append(2)
spot[3] = -1
ans[3] = -1
if 0 in spot:
if guess[0] in ans:
lis.append(1)
ans[0] = -1
if 1 in spot:
if guess[1] in ans:
lis.append(1)
ans[1] = -1
if 2 in spot:
if guess[2] in ans:
lis.append(1)
ans[2] = -1
if 3 in spot:
if guess[3] in ans:
lis.append(1)
ans[3] = -1
while len(lis) < 4:
lis.append(0)
return lis
# def play(self):
# self.start_game()
# self.display()
#play_step
#recent_guess = self.guess_list()
#self.play_step(recent_guess)
#
# while(recent_guess != self.hidden and self.guessMade < self.guessMax):
# self.display()
# #
# recent_guess = self.guess_list()
# self.play_step(recent_guess)
# #
# if (recent_guess == self.hidden):
# reward = (self.guessMax - self.guessMade) * 10
# return 1, reward
# else:
# reward = 0
# return 0, reward
if __name__ == "__main__":
ai = MasterMindAI()
num, reward = ai.play()
Vielen Dank für jede Hilfe