% Source: www.gusucode.com > stats toolbox example code > stats/FindGoodLassoPenaltyUsingCrossValidationExample.m

    %% Find Good Lasso Penalty Using Cross-Validation
% To determine a good lasso-penalty strength for an ECOC model composed of
% linear classification models that use logistic regression learners,
% implement 5-fold cross-validation.
% 
%%
% Load the NLP data set.
load nlpdata
%%
% |X| is a sparse matrix of predictor data, and |Y| is a categorical vector
% of class labels. 
%%
% For simplicity, use the label 'others' for all observations in |Y| that
% are not |'simulink'|, |'dsp'|, or |'comm'|.
Y(~(ismember(Y,{'simulink','dsp','comm'}))) = 'others';
%%
% Create a set of 11 logarithmically-spaced regularization strengths from $10^{-7}$
% through $10^{-2}$.
Lambda = logspace(-7,-2,11);  
%%
% Create a linear classification model template that specifies to use
% logistic regression learners, use lasso penalties with strengths in
% |Lambda|, train using SpaRSA, and lower the tolerance on the gradient of the
% objective function to |1e-8|.
t = templateLinear('Learner','logistic','Solver','sparsa',...
    'Regularization','lasso','Lambda',Lambda,'GradientTolerance',1e-8);
%%
% Cross-validate the models.  To increase execution speed, transpose the
% predictor data and specify that the observations are in columns.
X = X'; 
rng(10); % For reproducibility
CVMdl = fitcecoc(X,Y,'Learners',t,'ObservationsIn','columns','KFold',5);
%%
% |CVMdl| is a |ClassificationPartitionedLinearECOC| model.
%%
% Dissect |CVMdl|, and each model within it. The display order below
% (fold count, first fold, learner count, first learner) mirrors the
% containment hierarchy of the cross-validated model.
numECOCModels = numel(CVMdl.Trained)
ECOCMdl1 = CVMdl.Trained{1}
numCLModels = numel(ECOCMdl1.BinaryLearners)
CLMdl1 = ECOCMdl1.BinaryLearners{1}
%%
% Because |fitcecoc| implements 5-fold cross-validation, |CVMdl| contains
% a 5-by-1 cell array of |CompactClassificationECOC| models that the
% software trains on each fold. The |BinaryLearners| property of each
% |CompactClassificationECOC| model contains the |ClassificationLinear|
% models.  The number of |ClassificationLinear| models within each compact
% ECOC model depends on the number of distinct labels and coding design.
% Because |Lambda| is a sequence of regularization strengths, you can think
% of |CLMdl1| as 11 models, one for each regularization strength in
% |Lambda|.
%%
% Determine how well the models generalize by plotting the averages of the
% 5-fold classification error for each regularization strength.  Identify
% the regularization strength that minimizes the generalization error over
% the grid.
ce = kfoldLoss(CVMdl);   % 1-by-11 vector: one averaged error per Lambda
figure;
plot(log10(Lambda),log10(ce))
[~,minCEIdx] = min(ce);
minLambda = Lambda(minCEIdx);
hold on
% Mark the grid point achieving the minimum cross-validated error.
plot(log10(minLambda),log10(ce(minCEIdx)),'ro');
ylabel('log_{10} 5-fold classification error')
xlabel('log_{10} Lambda')
% The first curve is classification error, not MSE; label it accordingly.
legend('Classification error','Min classification error')
hold off

%%
% Retrain an ECOC model of linear classification models on the entire data
% set, fixing the lasso penalty at the cross-validation-minimizing
% strength |minLambda|.
t = templateLinear('Learner','logistic','Regularization','lasso',...
    'Lambda',minLambda,'Solver','sparsa','GradientTolerance',1e-8);
MdlFinal = fitcecoc(X,Y,'Learners',t,'ObservationsIn','columns');
%%
% Classify new observations by passing |MdlFinal| and the new predictor
% data to |predict|.