% demo_sutton.m -- Markov decision process toolbox: example routine (Sutton & Barto grid-world demo)

    function sutton_demo()
% SUTTON_DEMO Reproduce the 5x5 grid-world example of Sutton and Barto (1998), p. 79.
% Builds the transition/reward model, then solves it with both value
% iteration and policy iteration and compares against the book's numbers.

nrows = 5; ncols = 5;
obstacle = zeros(nrows, ncols);   % no obstacles in this world
terminal = zeros(nrows, ncols);   % no terminal states either
psucc_act = 1.0;                  % actions always succeed
absorb = 0;
wrap_around = 0;
noop = 0;
T = mk_grid_world(nrows, ncols, psucc_act, obstacle, terminal, absorb, wrap_around, noop);

% Reward is -1 whenever the agent bumps into the boundary, 0 otherwise.
R = -1 * mk_bump(nrows, ncols);

% Special state A: every action yields +10 and teleports to state AA.
stateA = 6; destA = 10;
R(stateA,:) = 10;
T(stateA,:,:) = 0.0; T(stateA,:,destA) = 1.0;

% Special state B: every action yields +5 and teleports to state BB.
stateB = 16; destB = 18;
R(stateB,:) = 5;
T(stateB,:,:) = 0.0; T(stateB,:,destB) = 1.0;

discount_factor = 0.9;
V = value_iteration(T, R, discount_factor);

%reshape(V,[nrows ncols])
%   21.9773   24.4193   21.9773   19.4193   17.4773
%   19.7796   21.9773   19.7796   17.8016   16.0214
%   17.8016   19.7796   17.8016   16.0214   14.4193
%   16.0214   17.8016   16.0214   14.4193   12.9773
%   14.4193   16.0214   14.4193   12.9773   11.6796

% Recover a greedy policy from the value function.
Q = Q_from_V(V, T, R, discount_factor);
[V, p] = max(Q, [], 2);
%reshape(p,[nrows ncols])
%     2     1     4     1     4
%     1     1     1     4     4
%     1     1     1     1     1
%     1     1     1     1     1
%     1     1     1     1     1
% Note: this might not match the book because of ties in the argmax

% Same problem solved by policy iteration.
[p, V] = policy_iteration(T, R, discount_factor);

%reshape(V,[nrows ncols])
%ans =
%   21.9775   24.4194   21.9775   19.4194   17.4775
%   19.7797   21.9775   19.7797   17.8018   16.0216
%   17.8018   19.7797   17.8018   16.0216   14.4194
%   16.0216   17.8018   16.0216   14.4194   12.9775
%   14.4194   16.0216   14.4194   12.9775   11.6797

%%%%%%%%%

function bump = mk_bump(nrows, ncols)
% MK_BUMP Will moving cause the agent to bump into the boundary?
% bump = mk_bump(nrows, ncols)
%
% Returns an (nrows*ncols) x 4 matrix; bump(s,a) = 1 iff taking action a
% (1=N, 2=E, 3=S, 4=W) in state s would push the agent into the edge.
% States are numbered column-major, matching reshape(1:nstates,[nrows ncols]).

nact = 4;
nstates = nrows * ncols;
bump = zeros(nstates, nact);
for s = 1:nstates
  % Recover (row, col) of state s under column-major numbering.
  col = ceil(s / nrows);
  row = s - (col - 1) * nrows;
  if row == 1,     bump(s, 1) = 1; end  % North edge
  if row == nrows, bump(s, 3) = 1; end  % South edge
  if col == 1,     bump(s, 4) = 1; end  % West edge
  if col == ncols, bump(s, 2) = 1; end  % East edge
end