% www.gusucode.com > stats toolbox example source code > stats/CrossValidateLinearRegressionModelExample.m

    %% Find Good Lasso Penalty Using Cross-Validation
% To determine a good lasso-penalty strength for a linear regression 
% model that uses least squares, implement 5-fold
% cross-validation.
% 
%%
% Simulate 10000 observations from this model:
%
% $$y = x_{100} + 2x_{200} + e.$$
%
%
% * $X = \{x_1,...,x_{1000}\}$ is a 10000-by-1000 sparse matrix with 10%
% nonzero standard normal elements.
% * _e_ is random normal error with mean 0 and standard deviation
% 0.3.
%
rng(1) % Seed the generator so the simulated data is reproducible
numObs  = 1e4;  % number of observations
numPred = 1e3;  % number of predictor variables
density = 0.1;  % fraction of nonzero entries in the design matrix
X = sprandn(numObs,numPred,density);  % sparse standard-normal predictors
noise = 0.3*randn(numObs,1);          % additive error, mean 0, std 0.3
Y = X(:,100) + 2*X(:,200) + noise;    % true model: y = x_100 + 2*x_200 + e
%%
% Create a set of 15 logarithmically-spaced regularization strengths from
% $10^{-5}$ through $10^{-1}$.
% 15 penalty strengths spaced evenly on a log scale over [1e-5, 1e-1]
Lambda = 10.^linspace(-5,-1,15);
%%
% Cross-validate the models.  To increase execution speed, transpose the
% predictor data and specify that the observations are in columns. Solve
% the objective function using SpaRSA.
%
% Orient observations along columns (faster for fitrlinear with tall sparse
% data), then cross-validate one least-squares model per penalty in Lambda
% using 5 folds; SpaRSA solves the lasso-regularized objective.
X = X';
CVMdl = fitrlinear(X,Y,'ObservationsIn','columns','Learner','leastsquares',...
    'Solver','sparsa','Regularization','lasso','KFold',5,'Lambda',Lambda);

numCLModels = numel(CVMdl.Trained)
%%
% |CVMdl| is a |RegressionPartitionedLinear| model.  Because |fitrlinear|
% implements 5-fold cross-validation, |CVMdl| contains 5
% |RegressionLinear| models that the software trains on each fold.
%%
% Display the first trained linear regression model.
% Pull out the RegressionLinear model trained on the first fold (no
% semicolon, so its summary is displayed).
Mdl1 = CVMdl.Trained{1}
%%
% |Mdl1| is a |RegressionLinear| model object. |fitrlinear| constructed
% |Mdl1| by training on the first four folds.  Because |Lambda| is a
% sequence of 15 regularization strengths, you can think of |Mdl1| as 15
% models, one for each regularization strength in |Lambda|.
%%
% Estimate the cross-validated MSE.
% Average the held-out MSE over the 5 folds; yields one value per
% regularization strength in Lambda (plotted against Lambda below).
mse = kfoldLoss(CVMdl);
%%
% Higher values of |Lambda| lead to predictor variable sparsity, which is a
% good quality of a regression model.  For each regularization strength,
% train a linear regression model using the entire data set and the same
% options as when you cross-validated the models.  Determine the number of
% nonzero coefficients per model.
% Refit on the entire data set with the same training options, then count
% the nonzero coefficients obtained at each regularization strength.
Mdl = fitrlinear(X,Y,'ObservationsIn','columns','Learner','leastsquares',...
    'Solver','sparsa','Regularization','lasso','Lambda',Lambda);
numNZCoeff = sum(Mdl.Beta~=0);
%%
% In the same figure, plot the cross-validated MSE and frequency of nonzero
% coefficients for each regularization strength. Plot all variables on the
% log scale.
% Overlay cross-validated MSE and nonzero-coefficient counts versus the
% penalty strength, all on log10 axes.
figure;
[ax,lineMSE,lineNZ] = plotyy(log10(Lambda),log10(mse),...
    log10(Lambda),log10(numNZCoeff));
set(lineMSE,'Marker','o');
set(lineNZ,'Marker','o');
ylabel(ax(1),'log_{10} MSE')
ylabel(ax(2),'log_{10} nonzero-coefficient frequency')
xlabel('log_{10} Lambda')
hold off
%%
% Choose the index of the regularization strength that balances predictor
% variable sparsity and low MSE (for example,
% |Lambda(10)|).
% Index into Lambda chosen by inspecting the plot above (trades a small
% MSE increase for many fewer nonzero coefficients).
idxFinal = 10;
%%
% Extract the model corresponding to the chosen regularization strength.
% selectModels keeps only the model trained at Lambda(idxFinal).
MdlFinal = selectModels(Mdl,idxFinal)
idxNZCoeff = find(MdlFinal.Beta~=0)
% Bug fix: index the SELECTED model's coefficient vector. Mdl.Beta is a
% d-by-numel(Lambda) matrix, so linear indexing Mdl.Beta(idxNZCoeff) would
% silently read column 1 (the weakest penalty, Lambda(1)) rather than the
% coefficients of the model chosen at Lambda(idxFinal).
EstCoeff = MdlFinal.Beta(idxNZCoeff)
%%
% |MdlFinal| is a |RegressionLinear| model with one regularization strength.
% The nonzero coefficients |EstCoeff| are close to the coefficients that
% simulated the data.