www.gusucode.com > stats 源码程序 matlab案例代码 > stats/FindGoodLassoPenaltyUsingkfoldMarginsECOCExample.m

    %% Find Good Lasso Penalty Using _k_-fold Margins
% To determine a good lasso-penalty strength for a linear classification
% model that uses a logistic regression learner, compare distributions of
% _k_-fold margins.
%%
% Load the NLP data set and preprocess the data as in
% <docid:stats_ug.bu624sq-1>.
% Every label in |Y| other than 'simulink', 'dsp', or 'comm' is replaced
% by 'others', and |X| is transposed (the later |fitcecoc| calls specify
% 'ObservationsIn','columns').
load('nlpdata')
Y(~ismember(Y,{'simulink','dsp','comm'})) = 'others';
X = X';
%%
% Create a set of 11 logarithmically-spaced regularization strengths from
% $10^{-8}$ through $10^{1}$. The whole vector is passed to the learner
% template below, and a single element of it is selected for the final
% model.
Lambda = logspace(-8,1,11);  
%%
% Build the template for the binary learners: a linear classification
% model that uses logistic regression with a lasso penalty, is fit for each
% of the regularization strengths in |Lambda|, solves the objective
% function using SpaRSA, and reduces the tolerance on the gradient of the
% objective function to |1e-8|.
t = templateLinear( ...
    'Learner','logistic', ...
    'Regularization','lasso', ...
    'Lambda',Lambda, ...
    'Solver','sparsa', ...
    'GradientTolerance',1e-8);
%%
% Cross-validate an ECOC model composed of binary, linear classification
% models described by the template |t|. Use 5-fold cross-validation and
% specify that the observations in |X| correspond to columns.
% The trailing semicolon is omitted on purpose so that |CVMdl| is displayed.
rng(10); % For reproducibility
CVMdl = fitcecoc(X,Y, ...
    'Learners',t, ...
    'KFold',5, ...
    'ObservationsIn','columns')
%%
% |CVMdl| is a |ClassificationPartitionedLinearECOC| model.
%%
% Estimate the _k_-fold margins for each regularization strength. The
% scores for logistic regression are in [0,1].  Apply the quadratic binary
% loss.
m = kfoldMargin(CVMdl,'BinaryLoss','quadratic');
% Display the dimensions of |m| (no semicolon, so the result is printed).
size(m)
%%
% |m| is a 31572-by-11 matrix of cross-validated margins for each
% observation. The columns correspond to the regularization strengths.
%%
% Draw box plots of the _k_-fold margins in a new figure, one box per
% column of |m|, that is, one box per regularization strength.
figure;
boxplot(m)
xlabel('Lambda indices')
ylabel('Cross-validated margins')
%%
% Several values of |Lambda| yield similarly high margin distribution
% centers with low spreads. Higher values of |Lambda| lead to predictor
% variable sparsity, which is a good quality of a classifier.
%%
% Choose the regularization strength that occurs just before the margin
% distribution center starts decreasing and spread starts increasing.
% The index is read off the box plot of the _k_-fold margins; change it
% here only, because the final template below uses |LambdaFinal|.
LambdaFinal = Lambda(5);
%%
% Train an ECOC model composed of binary, linear classification models
% using the entire data set and specify the regularization strength
% |LambdaFinal|. All other template settings match the ones used for
% cross-validation.
t = templateLinear('Learner','logistic','Solver','sparsa',...
    'Regularization','lasso','Lambda',LambdaFinal,'GradientTolerance',1e-8);
MdlFinal = fitcecoc(X,Y,'Learners',t,'ObservationsIn','columns');
%%
% To estimate labels for new observations, pass |MdlFinal| and the new data
% to |predict|.