## Reinforcement learning in a maze

```python
import numpy as np
import matplotlib.pyplot as plt

# Linear maze with 5 states; states 0 and 4 are absorbing (terminal).
def tau(s, u):
    """Transition function: move left (u=-1) or right (u=1); terminal states absorb."""
    if s == 0 or s == 4:
        return s
    else:
        return s + u

def rho(s, u):
    """Reward function: 1 for entering state 0 from state 1, 2 for entering state 4 from state 3."""
    return (s == 1 and u == -1) + 2 * (s == 3 and u == 1)

def calc_policy(Q):
    """Greedy policy (action -1 or +1 per state) from a Q-table."""
    policy = np.zeros(5)
    for s in range(0, 5):
        uidx = np.argmax(Q[s, :])
        policy[s] = 2 * uidx - 1
    return policy

def idx(u):
    """Map action u in {-1, 1} to column index {0, 1}."""
    return int((u + 1) // 2)

gamma = 0.5

#################################################
print('Analytic solution for optimal policy:')
# Reward vector R over the 10 state-action pairs
i = 0
R = np.zeros(10)
for s in range(0, 5):
    for u in range(-1, 2, 2):
        R[i] = rho(s, u)
        i += 1
# Transition matrix between state-action pairs under the optimal policy
T = np.zeros([10, 10])
T[0, 0] = 1; T[1, 1] = 1; T[2, 0] = 1; T[3, 5] = 1; T[4, 2] = 1
T[5, 7] = 1; T[6, 5] = 1; T[7, 9] = 1; T[8, 7] = 1; T[9, 9] = 1
# Solve the Bellman equation Q = R + gamma*T*Q in closed form
Q = np.linalg.inv(np.eye(10) - gamma * T) @ np.transpose(R)
Q = np.reshape(Q, [5, 2])
policy = calc_policy(Q)
print(np.transpose(Q), 'policy:', np.transpose(policy))
Qana = Q  # keep the analytic solution for later comparisons

#################################################
print('Dynamic Programming:')
Q = np.zeros([5, 2])
for iteration in range(3):
    for s in range(0, 5):
        for u in range(-1, 2, 2):
            act = int(policy[tau(s, u)])
            Q[s, idx(u)] = rho(s, u) + gamma * Q[tau(s, u), idx(act)]
print(np.transpose(Q), 'policy:', np.transpose(calc_policy(Q)))

#################################################
print('Policy iteration:')
Q = np.zeros([5, 2])
policy = calc_policy(Q)
for iteration in range(3):
    # Policy evaluation
    for s in range(0, 5):
        for u in range(-1, 2, 2):
            act = int(policy[tau(s, u)])
            Q[s, idx(u)] = rho(s, u) + gamma * Q[tau(s, u), idx(act)]
    # Policy improvement
    policy = calc_policy(Q)
print(np.transpose(Q), 'policy:', np.transpose(policy))

#################################################
print('Q-iteration:')
Q_new = np.zeros([5, 2])
Q = np.zeros([5, 2])
policy = np.zeros(5)
for iteration in range(2):
    for s in range(0, 5):
        for u in range(-1, 2, 2):
            maxValue = np.maximum(Q[tau(s, u), 0], Q[tau(s, u), 1])
            Q_new[s, idx(u)] = rho(s, u) + gamma * maxValue
    Q = np.copy(Q_new)
print(np.transpose(Q), 'policy:', np.transpose(calc_policy(Q)))

#################################################
print('SARSA:')
Q = np.zeros([5, 2])
error = []
alpha = 1
for trial in range(100):
    policy = calc_policy(Q)
    s = 2  # start in the middle of the maze
    for t in range(0, 5):
        u = int(policy[s])
        if np.random.rand() < 0.1: u = -u  # epsilon greedy
        u2 = idx(policy[tau(s, u)])  # action that will be taken in the next state
        Q[s, idx(u)] = (1 - alpha) * Q[s, idx(u)] + alpha * (rho(s, u) + gamma * Q[tau(s, u), u2])
        s = tau(s, u)
    error.append(np.sum(np.abs(Q - Qana)))
print(np.transpose(Q), 'policy:', np.transpose(calc_policy(Q)))
plt.figure()
plt.plot(error)

#################################################
print('Q-Learning:')
Q = np.zeros([5, 2])
alpha = 1
error = []
for trial in range(1000):
    s = 2
    for t in range(0, 5):
        uidx = np.argmax(Q[s, :])
        u = 2 * uidx - 1
        if np.random.rand() < 0.1: u = -u  # epsilon greedy
        Q[s, idx(u)] = (1 - alpha) * Q[s, idx(u)] + alpha * (rho(s, u) + gamma * np.maximum(Q[tau(s, u), 0], Q[tau(s, u), 1]))
        Q[0] = 0; Q[4] = 0  # terminal states keep zero value
        s = tau(s, u)
    error.append(np.sum(np.abs(Q - Qana)))
print(np.transpose(Q), 'policy:', np.transpose(calc_policy(Q)))
plt.plot(error, 'r')
plt.show()

#################################################
print('TD(lambda) for Q-Learning:')
Q = np.zeros([5, 2])
alpha = 1
lam = .1
error = []
for trial in range(100):
    s = 2
    eligibility = np.zeros(5)
    for t in range(0, 5):
        if s == 0 or s == 4: break
        eligibility *= gamma * lam  # decay all traces
        eligibility[s] = 1          # refresh trace of the visited state
        uidx = np.argmax(Q[s, :])
        u = 2 * uidx - 1
        if np.random.rand() < 0.1: u = -u  # epsilon greedy
        for x in range(1, 4):  # update all non-terminal states, weighted by their traces
            Q[x, idx(u)] = Q[x, idx(u)] + alpha * (rho(x, u) + gamma * np.maximum(Q[tau(x, u), 0], Q[tau(x, u), 1]) - Q[x, idx(u)]) * eligibility[x]
        s = tau(s, u)
    error.append(np.sum(np.abs(Q - Qana)))
print(np.transpose(Q), 'policy:', np.transpose(calc_policy(Q)))
plt.plot(error, 'r')
plt.show()
```
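For reference, the update rules implemented above can be written compactly. The analytic section solves the Bellman equation for the Q-values of the ten state-action pairs under the policy encoded in the transition matrix $T$; the model-free sections apply the SARSA and Q-learning temporal-difference updates with learning rate $\alpha$ and discount $\gamma$, where $s' = \tau(s,u)$ is the next state:

$$
Q = R + \gamma T Q \quad\Rightarrow\quad Q = (I - \gamma T)^{-1} R
$$

$$
\text{SARSA:}\qquad Q(s,u) \leftarrow (1-\alpha)\,Q(s,u) + \alpha\bigl[\rho(s,u) + \gamma\, Q(s',u')\bigr]
$$

$$
\text{Q-learning:}\qquad Q(s,u) \leftarrow (1-\alpha)\,Q(s,u) + \alpha\bigl[\rho(s,u) + \gamma \max_{u'} Q(s',u')\bigr]
$$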
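As a quick sanity check, a minimal sketch like the one below can be appended after the script to compare any of the learned tables with the analytic solution `Qana`; the helper name `report_deviation` and the tolerance value are arbitrary choices for illustration, not part of the listing above.

```python
import numpy as np

def report_deviation(name, Q_learned, Q_analytic, tol=1e-2):
    """Print the summed absolute deviation of a learned Q-table from the analytic one."""
    dev = np.sum(np.abs(Q_learned - Q_analytic))
    status = 'converged' if dev < tol else 'not yet converged'
    print(f'{name}: total |Q - Qana| = {dev:.4f} ({status})')

# Example usage after running the script, where Q holds the last learned table:
# report_deviation('TD(lambda) Q-learning', Q, Qana)
```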