% Source: www.gusucode.com > Markov decision process, including some example source programs > demo_russell.m

    function russell_demo()
% RUSSELL_DEMO  Solve the 3x4 grid world from Russell and Norvig, ch 17 (p501).
%
% Takes no inputs and returns nothing. The script builds the transition
% model with mk_grid_world, assigns rewards, and then solves the MDP twice:
% once with value iteration (followed by a greedy policy read off the
% Q-values) and once with policy iteration. All results are kept in local
% variables only; un-comment the reshape lines to display the value grids.
%
% Grid convention: (1,1) is the top left corner.

% ---- Grid-world layout ------------------------------------------------
n_rows = 3;
n_cols = 4;
grid_size = [n_rows n_cols];
p_intended = 0.8;          % probability argument handed to mk_grid_world
action_cost = -1/25;       % reward for every non-terminal state/action pair

obstacle = zeros(grid_size);
obstacle(2,2) = 1;         % single blocked cell

terminal = zeros(grid_size);
terminal(1,4) = 1;
terminal(2,4) = 1;

absorb = 1;
wrap_around = 0;
noop = 0;

T = mk_grid_world(n_rows, n_cols, p_intended, obstacle, terminal, ...
                  absorb, wrap_around, noop);

% ---- Reward table -----------------------------------------------------
% One state per grid cell plus one extra state (presumably the absorbing
% state created because absorb = 1 -- confirm against mk_grid_world).
nstates = n_rows*n_cols + 1;
nact = 4 + (noop ~= 0);    % a fifth action exists only when noop is enabled

R = repmat(action_cost, nstates, nact);
% Terminal rewards; sub2ind gives the column-major state index of a cell,
% i.e. 10 for cell (1,4) and 11 for cell (2,4) on this 3x4 grid.
R(sub2ind(grid_size, 1, 4), :) = 1;
R(sub2ind(grid_size, 2, 4), :) = -1;
R(nstates, :) = 0;         % the extra state yields no reward

discount_factor = 1;

% ---- Method 1: value iteration ----------------------------------------
V = value_iteration(T, R, discount_factor);
%reshape(V(1:end-1),[n_rows n_cols])
%    0.8116    0.8678    0.9178    1.0000
%    0.7616    0.7964    0.6603   -1.0000
%    0.7053    0.6553    0.6114    0.3878
% Same as the book p501

% Greedy policy: best action (and its value) per state from the Q-table.
Q = Q_from_V(V, T, R, discount_factor);
[V, greedy_policy] = max(Q, [], 2);

% ---- Method 2: policy iteration ---------------------------------------
% (I-gT) is singular since g=1 and there is an absorbing state
% (i.e., T(i,i)=1), hence value determination cannot be used; ask
% policy_iteration to evaluate policies with value iteration instead.
use_val_iter = 1;
[policy, V] = policy_iteration(T, R, discount_factor, use_val_iter);

%reshape(V(1:end-1),[n_rows n_cols])
%    0.8115    0.8678    0.9178    1.0000
%    0.7615    0.7964    0.6603   -1.0000
%    0.7048    0.6539    0.6085    0.3824