www.gusucode.com > stats 源码程序 matlab案例代码 > stats/FindGoodLassoPenaltyUsingCrossValidatedAUCExample.m

    %% Find Good Lasso Penalty Using Cross-Validated AUC
% To determine a good lasso-penalty strength for a linear classification
% model that uses a logistic regression learner, compare cross-validated
% AUC values.
%%
% Load the NLP data set.  Preprocess the data as in
% <docid:stats_ug.bu7wsfc-1>.
% Load the NLP data: sparse predictor matrix X and label vector Y
% (assumes nlpdata.mat is on the path — ships with the toolbox).
load nlpdata
% Binary response: true where the document label is 'stats'.
Ystats = Y == 'stats';
% Put observations in columns to match 'ObservationsIn','columns' below
% (faster for sparse predictor data).
X = X'; 
%%
% There are 31572 observations in the data set.
%%
% Create a set of 11 logarithmically-spaced regularization strengths from
% $10^{-6}$ through $10^{-0.5}$.
Lambda = logspace(-6,-0.5,11);  
%%
% Cross-validate binary, linear classification models that use each of
% the regularization strengths and 5-fold cross-validation.  Solve the
% objective function using SpaRSA. Lower the tolerance on the gradient of
% the objective function to |1e-8|.
rng(10); % For reproducibility of the cross-validation partition
% 5-fold cross-validated linear classifiers: logistic learner, lasso
% penalty, one model per value in Lambda, solved with SpaRSA and a
% tightened gradient tolerance. No terminating semicolon, so CVMdl is
% displayed.
CVMdl = fitclinear(X,Ystats,'ObservationsIn','columns',...
    'KFold',5,'Learner','logistic','Solver','sparsa',...
    'Regularization','lasso','Lambda',Lambda,'GradientTolerance',1e-8)
% First trained fold: a ClassificationLinear object holding all 11
% regularization strengths at once.
Mdl1 = CVMdl.Trained{1}
%%
% |Mdl1| is a |ClassificationLinear| model object. Because |Lambda| is a
% sequence of regularization strengths, you can think of |Mdl1| as 11
% models, one for each regularization strength in |Lambda|.
%%
% Predict the cross-validated labels and posterior class
% probabilities.
% Cross-validated predicted labels and class posterior probabilities for
% every observation and every regularization strength.
[label,posterior] = kfoldPredict(CVMdl);
% Display the class order so the posterior columns can be interpreted.
% (The original statement ended in a semicolon, which suppressed the
% display and made the line a no-op.)
CVMdl.ClassNames
% posterior is n-by-K-by-L: observations by classes by lambdas.
[n,K,L] = size(posterior)
% Example entry: observation 3, first class, model for Lambda(5).
posterior(3,1,5)
%%
% |label| is a 31572-by-11 matrix of predicted labels. Each column
% corresponds to the predicted labels of the model trained using the
% corresponding regularization strength. |posterior| is a 31572-by-2-by-11
% matrix of posterior class probabilities.  Columns correspond to classes
% and pages correspond to regularization strengths. For example,
% |posterior(3,1,5)| is the posterior probability that observation 3
% belongs to the first class (label |0|), according to the model that uses
% |Lambda(5)| as a regularization strength.
%%
% For each model, compute the AUC.  Designate the second class as the
% positive class.
% Cross-validated AUC per regularization strength; the second class
% (true) is the positive class.
auc = zeros(1,numel(Lambda));  % Preallocate (original used 1:numel(Lambda), a misleading placeholder ramp)
for j = 1:numel(Lambda)
    % perfcurve returns [X,Y,T,AUC]; only the AUC is needed here.
    [~,~,~,auc(j)] = perfcurve(Ystats,posterior(:,2,j),CVMdl.ClassNames(2));
end
%%
% Higher values of |Lambda| lead to predictor variable sparsity, which is a
% good quality of a classifier.  Train a linear classification model for
% each regularization strength using the entire data set and the same
% options as when you cross-validated the models.
% Determine the number of nonzero coefficients per model.
% Refit on the entire data set with the same training options (no KFold)
% so coefficient sparsity can be inspected per regularization strength.
Mdl = fitclinear(X,Ystats,'ObservationsIn','columns',...
    'Learner','logistic','Solver','sparsa','Regularization','lasso',...
    'Lambda',Lambda,'GradientTolerance',1e-8);
% Mdl.Beta has one column per Lambda; count nonzero coefficients per
% column (i.e., per model).
numNZCoeff = sum(Mdl.Beta~=0);
%%
% In the same figure, plot the cross-validated AUC values and frequency of
% nonzero coefficients for each regularization strength. Plot all variables
% on the log scale.
figure;
% Left axis: log10 AUC. Right axis: log10 of nonzero-coefficient counts;
% the +1 guards against taking log10 of zero for fully sparse models.
logLambda = log10(Lambda);
[ax,lineAUC,lineNZ] = plotyy(logLambda,log10(auc),...
    logLambda,log10(numNZCoeff + 1));
set(lineAUC,'Marker','o');
set(lineNZ,'Marker','o');
ylabel(ax(1),'log_{10} AUC')
ylabel(ax(2),'log_{10} nonzero-coefficient frequency')
xlabel('log_{10} Lambda')
title('Cross-Validated Statistics')
hold off
%%
% Choose the index of the regularization strength that balances predictor
% variable sparsity and high AUC.  In this case, a value
% between $10^{-3}$ and $10^{-1}$ should suffice.
% Index 9 corresponds to Lambda(9) = 10^{-1.6}, chosen from the plot as a
% trade-off between sparsity and AUC.
idxFinal = 9;
%%
% Select the model from |Mdl| with the chosen regularization strength.
MdlFinal = selectModels(Mdl,idxFinal);
%%
% |MdlFinal| is a |ClassificationLinear| model containing one
% regularization strength. To estimate labels for new observations, pass
% |MdlFinal| and the new data to |predict|.