import numpy as np
import gym
import random
# --- Environment -------------------------------------------------------------
env = gym.make("Taxi-v2")
env.render()

# The `.n` attribute of each space gives the Q-table dimensions:
# one row per state, one column per action.
action_size = env.action_space.n
state_size = env.observation_space.n
print("Number of actions: %d" % action_size)
print("Number of states: %d" % state_size)

# --- Q-table -----------------------------------------------------------------
# Every Q(s, a) estimate starts at zero.
qtable = np.zeros(shape=(state_size, action_size))
print(qtable)

# --- Run lengths -------------------------------------------------------------
total_episodes = 50000      # number of training episodes
total_test_episodes = 5     # number of greedy evaluation episodes
max_steps = 99              # cap on steps taken within a single episode

# --- Q-learning hyperparameters ----------------------------------------------
learning_rate = 0.7         # step size of each Q-value update
discount_rate = 0.9         # gamma: weight given to the next state's best Q-value

# --- Epsilon-greedy exploration schedule -------------------------------------
epsilon = 1.0               # current exploration probability
max_epsilon = 1.0           # value the schedule starts from
min_epsilon = 0.01          # floor the schedule decays toward
decay_rate = 0.01           # exponential decay rate applied per episode
# Train the Q-table with epsilon-greedy Q-learning.
#
# Each episode starts from a fresh environment state and runs for at most
# `max_steps` steps. After every episode, epsilon is decayed exponentially
# from `max_epsilon` toward `min_epsilon`, so the agent explores less over time.
for episode in range(total_episodes):
    # Reset the environment every time a new episode begins.
    state = env.reset()

    for step in range(max_steps):
        # Exploration/exploitation trade-off: draw a number in [0, 1] and
        # compare it with the current epsilon.
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            # Exploitation: take the action with the largest Q-value for
            # this state.
            action = np.argmax(qtable[state, :])
        else:
            # Exploration: take a random action.
            action = env.action_space.sample()

        # Perform the action (a); observe the next state (s') and reward (R).
        new_state, reward, done, info = env.step(action)

        # Q(s,a) = Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
        td_target = reward + discount_rate * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        state = new_state

        # Truthiness rather than `is True`: an identity check would miss a
        # non-`bool` flag such as a NumPy boolean and keep stepping.
        if done:
            break

    # Decay epsilon using the 1-based episode count, without mutating the
    # loop variable.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode + 1))
# Evaluate the learned Q-table by always taking the greedy (argmax) action.
#
# Every test episode contributes exactly one entry to `rewards`, whether it
# finished or ran out of `max_steps`, so the mean below is taken over all
# `total_test_episodes` episodes.
rewards = []
for episode in range(total_test_episodes):
    state = env.reset()
    total_rewards = 0
    print("******************************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        env.render()
        # Greedy policy: pick the action with the largest Q-value.
        action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)
        total_rewards += reward

        # Truthiness rather than `is True`, so a non-`bool` flag such as a
        # NumPy boolean still ends the episode.
        if done:
            # Show the final frame before leaving the episode.
            env.render()
            break
        state = new_state

    # Record the score even when the step limit was hit before `done`;
    # otherwise the episode's reward would be dropped from the mean.
    rewards.append(total_rewards)
    print("Score: ", total_rewards)

env.close()
print("Mean score over time: " + str(sum(rewards) / total_test_episodes))