www.gusucode.com > stats 源码程序 matlab案例代码 > stats/SelectBestRegularizedModelsExample.m

    %% Select Best Regularized Models
%%
% Load the NLP data set, which provides the sparse predictor matrix |X|
% and the categorical response |Y|.
load nlpdata
%%
% |X| holds word counts per web page and |Y| holds class labels; the data
% contain more than two classes.
%%
% The goal is to detect whether the word counts in a web page come from
% the Statistics and Machine Learning Toolbox(TM) documentation. Convert
% the multiclass labels into a binary response that flags those pages.
isStatsPage = Y == 'stats';
%%
% Define a grid of 11 logarithmically spaced regularization strengths
% ranging from $10^{-8}$ to $10^{-1}$.
lambdaGrid = logspace(-8,-1,11);
%%
% Reserve 30% of the observations for testing and record the test-set
% indices.
rng(1); % For reproducibility
partition = cvpartition(isStatsPage,'Holdout',0.30);
testIdx = test(partition);
%%
% Fit binary linear classification models with a lasso penalty over the
% entire lambda grid, using the SpaRSA solver and the holdout partition.
% Transposing the predictors so that observations lie along columns
% speeds up training.
X = X';
cvModel = fitclinear(X,isStatsPage,'Lambda',lambdaGrid,'Solver','SpaRSA',...
    'ObservationsIn','columns','Regularization','lasso','CVPartition',partition);
mdlCV = cvModel.Trained{1};
numel(mdlCV.Lambda)
%%
% |mdlCV| is a |ClassificationLinear| model object. Because |lambdaGrid|
% contains 11 regularization strengths, |mdlCV| effectively bundles
% eleven trained models, one per strength.
%%
% Compute the test-sample misclassification rate of every regularized
% model.
testError = loss(mdlCV,X(:,testIdx),isStatsPage(testIdx),'ObservationsIn','columns');
%%
% Larger lambda values drive more predictor coefficients to zero, which
% yields a sparser — and therefore more interpretable — classifier.
% Refit on the full data set with the same options, omitting only the
% partition, and count the nonzero coefficients in each model.
fullModel = fitclinear(X,isStatsPage,'Lambda',lambdaGrid,'Solver','SpaRSA',...
    'ObservationsIn','columns','Regularization','lasso');
nnzPerModel = sum(fullModel.Beta~=0);
%%
% Overlay the classification error rates and the nonzero-coefficient
% counts against the regularization strength in one figure, plotting
% every quantity on the log scale.
figure;
[ax,errLine,nnzLine] = plotyy(log10(lambdaGrid),log10(testError),...
    log10(lambdaGrid),log10(nnzPerModel));
errLine.Marker = 'o';
nnzLine.Marker = 'o';
ylabel(ax(1),'log_{10} classification error')
ylabel(ax(2),'log_{10} nonzero-coefficient frequency')
xlabel('log_{10} Lambda')
hold off
%%
% Pick the lambda index that best trades off low classification error
% against predictor-variable sparsity (here, the eighth strength).
idx = 8;
MdlFinal = selectModels(fullModel,idx)
%%
% |MdlFinal| is a trained |ClassificationLinear| model object that uses
% the eighth strength in |lambdaGrid| as its regularization strength.