"""Epsilon-greedy Q-learning with a small neural network on a 5-cell chain.

The state is a length-5 vector with a single 1 marking the occupied cell.
Actions are u = -1 (shift the marker one cell left) and u = +1 (shift it one
cell right).  An episode starts in the middle cell and ends as soon as the
marker reaches either end cell.  A one-hidden-layer network maps a state to
two Q-values, one per action, and is trained online towards the one-step
target  rho(next_x) + gamma * max_u Q(next_x, u).
"""
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

gamma = 0.5  # discount factor applied to the bootstrapped next-state value
epsilon = 1  # probability of a random action; decayed during training


def tau(x, u):
    """Transition function: return the state reached from ``x`` under ``u``.

    ``u == -1`` rolls the state one position to the left, ``u == 1`` one
    position to the right.  A move is only carried out when the end cell on
    that side (``x[0]`` for left, ``x[4]`` for right) is 0; in every other
    case ``x`` itself is returned unchanged (same object, not a copy).
    """
    if u == -1 and x[0] == 0:
        return np.roll(x, -1)
    if u == 1 and x[4] == 0:
        return np.roll(x, 1)
    return x


def rho(x):
    """Reward of state ``x``: 1 if ``x[0] == 1``, plus 2 if ``x[4] == 1``."""
    return (x[0] == 1) + 2 * (x[4] == 1)


def terminal_state(x):
    """Return True when either end cell (``x[0]`` or ``x[4]``) equals 1."""
    return (x[0] == 1) or (x[4] == 1)


def weight_variable(shape):
    """Create a ``tf.Variable`` of ``shape`` from a truncated normal (std 0.1)."""
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    """Create a ``tf.Variable`` of ``shape`` filled with the constant 0.1."""
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


tf.reset_default_graph()
# NOTE(review): as in the original, the session is left open after the script
# finishes (convenient for interactive inspection); call sess.close() when done.
sess = tf.InteractiveSession()

# the network
inpt = tf.placeholder(tf.float32, [None, 5])
W1 = weight_variable([5, 10])
b1 = bias_variable([10])
W2 = weight_variable([10, 2])
b2 = bias_variable([2])
h1 = tf.nn.relu(tf.matmul(inpt, W1) + b1)
outpt = tf.matmul(h1, W2) + b2  # one Q-value per action (column 0: u=-1, column 1: u=+1)

# the loss function and trainer
Qtarget = tf.placeholder(tf.float32, [None, 2])
cost = tf.reduce_sum(tf.square(Qtarget - outpt))
train_step = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cost)

# tf.initialize_all_variables() is deprecated; this is its documented successor.
sess.run(tf.global_variables_initializer())

print('training the network ... \n')
for trial in range(500):
    x = [0, 0, 1, 0, 0]  # every episode starts in the middle cell
    for t in range(5):  # at most five steps per episode
        if terminal_state(x):
            break
        # Decay exploration by 0.001 per step (after the first few trials)
        # until it drops to roughly 0.1.
        if trial > 3 and epsilon > 0.1:
            epsilon -= 0.001

        state_batch = np.expand_dims(x, axis=0)
        prediction = sess.run(outpt, feed_dict={inpt: state_batch})

        # Epsilon-greedy choice of the action index (0 or 1).
        uidx = np.argmax(prediction)
        if np.random.rand() < epsilon:
            uidx = np.random.randint(0, 2)
        u = 2 * uidx - 1  # map index {0, 1} to action {-1, +1}

        next_x = tau(x, u)
        if terminal_state(next_x):
            # No bootstrapping from a terminal state: the target is the reward.
            yy = rho(next_x)
        else:
            next_q = sess.run(
                outpt, feed_dict={inpt: np.expand_dims(next_x, axis=0)}
            )
            yy = rho(next_x) + gamma * np.max(next_q)

        # Only the taken action's entry is overwritten, so the squared error
        # of the other action is zero and contributes no gradient.
        prediction[0, uidx] = yy
        # Only the update op is fetched; the original also fetched `cost` and
        # `h1` but discarded both values.
        sess.run(train_step, feed_dict={inpt: state_batch, Qtarget: prediction})
        x = np.copy(next_x)

# Read out the learned Q-values and the greedy action for each of the five
# marker positions, starting at cell 0 and shifting right by one each time.
policy = np.zeros(5)
xarray = [1, 0, 0, 0, 0]
Q = []
for xx in range(5):
    Qxx = sess.run(outpt, feed_dict={inpt: np.expand_dims(xarray, axis=0)})
    uidxx = np.argmax(Qxx)
    Q.append(Qxx)
    policy[xx] = 2 * uidxx - 1
    xarray = np.roll(xarray, 1)
print(np.transpose(Q),'policy:',np.transpose(policy))