import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# Discount factor applied to the bootstrapped next-state value in the Q target.
gamma = 0.5
# Probability of picking a random action (epsilon-greedy); starts fully random
# and is reduced step by step during training.
epsilon = 1

def tau(x, u):
    """Transition function: shift the state one cell in direction ``u``.

    Args:
        x: State vector (list or NumPy array). In this script it is a
            one-hot encoding of the agent's cell on a chain.
        u: Action, ``-1`` to move left or ``1`` to move right.

    Returns:
        A new NumPy array with the contents rolled by one position when the
        move is allowed. The move is refused, and ``x`` itself is returned
        unchanged (same object), when the agent already sits on the boundary
        cell it is trying to move towards, or when ``u`` is neither ``-1``
        nor ``1``.
    """
    # A left move is only legal while the left-most cell is unoccupied;
    # otherwise np.roll would wrap the agent around to the other end.
    if u == -1 and x[0] == 0:
        return np.roll(x, -1)
    # Index the right boundary as x[-1] rather than a hard-coded 4 so the
    # wrap-around guard works for chains of any length.
    if u == 1 and x[-1] == 0:
        return np.roll(x, 1)
    return x
def rho(x):
    """Reward function for state ``x``.

    Args:
        x: State vector (list or NumPy array).

    Returns:
        ``1`` when the first cell equals 1, ``2`` when the last cell equals 1,
        ``0`` when neither does (the two contributions add up, so a state
        with both cells set yields ``3``).
    """
    # The last cell is addressed as x[-1] instead of a hard-coded index 4 so
    # the reward stays attached to the right end for any chain length.
    return (x[0] == 1) + 2 * (x[-1] == 1)
def terminal_state(x):
    """Return a truthy value when ``x`` is a terminal state.

    A state is terminal when its first or its last cell equals 1.

    Args:
        x: State vector (list or NumPy array).
    """
    # The last cell is addressed as x[-1] instead of a hard-coded index 4 so
    # the check follows the right end for any chain length.
    return (x[0] == 1) or (x[-1] == 1)

def weight_variable(shape):
    """Create a TensorFlow variable of the given shape for use as weights.

    The initial values are drawn from a truncated normal distribution with a
    standard deviation of 0.1.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
def bias_variable(shape):
    """Create a TensorFlow variable of the given shape for use as biases.

    Every element is initialised to the constant 0.1.
    """
    return tf.Variable(tf.constant(0.1, shape=shape))
    
# Clear the default graph before building, then open the session that all
# later sess.run calls in this script use.
tf.reset_default_graph()
sess = tf.InteractiveSession()

# The network: 5 state inputs -> 10 ReLU hidden units -> 2 linear outputs,
# one Q-value estimate per action.
inpt = tf.placeholder(tf.float32, [None, 5])
W1 = weight_variable([5, 10])
b1 = bias_variable([10])
W2 = weight_variable([10, 2])
b2 = bias_variable([2])
h1 = tf.nn.relu(tf.matmul(inpt, W1) + b1)
outpt = tf.matmul(h1, W2) + b2

# The loss function and trainer: summed squared error between the network
# output and the target Q-values, minimised by plain gradient descent.
Qtarget = tf.placeholder(tf.float32, [None, 2])
cost = tf.reduce_sum(tf.square(Qtarget - outpt))
train_step = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cost)

# tf.initialize_all_variables() is deprecated; global_variables_initializer()
# is its documented replacement and performs the same initialisation.
sess.run(tf.global_variables_initializer())

print('training the network ... ')
for trial in range(500):
    # Every episode starts with the agent in the middle cell.
    x = [0, 0, 1, 0, 0]
    for t in range(5):
        if terminal_state(x):
            break

        # Reduce exploration by 0.001 per step once past the first few
        # trials, for as long as epsilon is still above 0.1.
        if trial > 3 and epsilon > 0.1:
            epsilon = epsilon - 0.001

        # Current Q-value estimates for the state (one row, one column per
        # action).
        prediction = sess.run(outpt, feed_dict={inpt: np.expand_dims(x, axis=0)})

        # Epsilon-greedy selection: random action index with probability
        # epsilon, otherwise the index of the largest predicted Q-value.
        if np.random.rand() < epsilon:
            uidx = np.random.randint(0, 2)
        else:
            uidx = np.argmax(prediction)
        # Map action index {0, 1} to action {-1, +1}.
        u = 2 * uidx - 1

        next_x = tau(x, u)

        # Q-learning target: the immediate reward, plus the discounted best
        # next-state Q-value unless the next state is terminal.
        yy = rho(next_x)
        if not terminal_state(next_x):
            yy = yy + gamma * np.max(
                sess.run(outpt, feed_dict={inpt: np.expand_dims(next_x, axis=0)})
            )

        # Only the chosen action's entry is replaced, so the other action
        # contributes zero error to the squared loss.
        prediction[0, uidx] = yy
        sess.run(
            [train_step, cost, h1],
            feed_dict={inpt: np.expand_dims(x, axis=0), Qtarget: prediction},
        )

        x = np.array(next_x, copy=True)
     
# Query the network once per cell of the chain, collecting the predicted
# Q-values and the greedy action (-1 or +1) for each position.
policy = np.zeros(5)
xarray = [1, 0, 0, 0, 0]
Q = []
for xx in range(5):
    Qxx = sess.run(outpt, feed_dict={inpt: np.expand_dims(xarray, axis=0)})
    Q.append(Qxx)
    uidxx = np.argmax(Qxx)
    policy[xx] = 2 * uidxx - 1
    # Shift the occupied cell one position along for the next query.
    xarray = np.roll(xarray, 1)
print(np.asarray(Q).T, 'policy:', policy.T)