
%% Identify Relevant Features for Classification
%% Load sample data
load ovariancancer; 
whos
%%
% This example uses the high-resolution ovarian cancer data set that was
% generated using the WCX2 protein array. The data is from the
% <http://home.ccr.cancer.gov/ncifdaproteomics/ppatterns.asp FDA-NCI
% Clinical Proteomics Program Databank>. After some preprocessing steps,
% the data set has two variables: |obs| and |grp|. The |obs| variable
% consists of 216 observations with 4000 features. Each element in |grp|
% defines the group to which the corresponding row of |obs| belongs.
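%%
% As a quick sanity check (an optional step, not part of the original
% example), tabulate the class labels. Here |grp| is assumed to be a cell
% array of character vectors, as loaded from |ovariancancer.mat|.
tabulate(grp)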

%% Divide data into training and test sets
% Use |cvpartition| to divide data into a training set of size 160 and a
% test set of size 56. Both the training set and the test set have roughly
% the same group proportions as in |grp|.
rng(1); % For reproducibility
cvp = cvpartition(grp,'holdout',56)
Xtrain = obs(cvp.training,:);
ytrain = grp(cvp.training,:);
Xtest  = obs(cvp.test,:);
ytest  = grp(cvp.test,:);
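%%
% Optionally, confirm that the partition is roughly stratified (this
% check is not part of the original example) by comparing the group
% proportions in the training and test sets.
tabulate(ytrain)
tabulate(ytest)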

%% Determine whether feature selection is necessary
% Compute generalization error without fitting. 
nca = fscnca(Xtrain,ytrain,'FitMethod','none');
L = loss(nca,Xtest,ytest)

%%
% This option computes the generalization error of the neighborhood
% component analysis (NCA) feature selection model using the initial
% feature weights (in this case, the default feature weights) provided
% to |fscnca|.
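%%
% Because |'FitMethod','none'| skips the fitting step, |nca.FeatureWeights|
% should still equal the initial weights, which default to all ones. A
% quick optional check (assuming the default initialization):
all(nca.FeatureWeights == 1)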
%%
% Fit NCA without regularization (Lambda = 0).
nca = fscnca(Xtrain,ytrain,'FitMethod','exact','Lambda',0,...
      'Solver','sgd','Standardize',true);
L = loss(nca,Xtest,ytest)

%%
% The improvement in the loss value suggests that feature selection is a
% good idea. Tuning the $\lambda$ value usually improves the results.

%% Tune the regularization parameter for NCA using five-fold cross-validation
% Tuning $\lambda$ means finding the $\lambda$ value that produces the
% minimum classification loss. To tune $\lambda$ using
% cross-validation:
%
% 1. Partition the training data into five folds and extract the number of
% validation (test) sets. For each fold, |cvpartition| assigns four-fifths
% of the data as a training set, and one-fifth of the data as a test set.
cvp = cvpartition(ytrain,'kfold',5);
numvalidsets = cvp.NumTestSets;

%%
% Assign the $\lambda$ values to test, and create an array to store the
% loss function values. The candidate values are divided by the number of
% observations |n| because the default regularization parameter in
% |fscnca| is $1/n$.
n = length(ytrain);
lambdavals = linspace(0,20,20)/n;
lossvals = zeros(length(lambdavals),numvalidsets);

%%
% 2. Train the NCA model for each $\lambda$ value, using the training set
% in each fold.
%
% 3. Compute the classification loss for the corresponding test
% set in the fold using the NCA model. Record the loss value. 
%
% 4. Repeat this process for all folds and all $\lambda$ values.

for i = 1:length(lambdavals)
    for k = 1:numvalidsets
        X = Xtrain(cvp.training(k),:);
        y = ytrain(cvp.training(k),:);
        Xvalid = Xtrain(cvp.test(k),:);
        yvalid = ytrain(cvp.test(k),:);

        nca = fscnca(X,y,'FitMethod','exact', ...
             'Solver','sgd','Lambda',lambdavals(i), ...
             'IterationLimit',30,'GradientTolerance',1e-4, ...
             'Standardize',true);
                  
        lossvals(i,k) = loss(nca,Xvalid,yvalid,'LossFunction','classiferror');
    end
end

%%
% Compute the average loss obtained from the folds for each $\lambda$ value.
meanloss = mean(lossvals,2);
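%%
% Optionally (not part of the original example), examine the variability
% of the loss across folds by computing the standard error at each
% $\lambda$ value.
stderrloss = std(lossvals,0,2)/sqrt(numvalidsets);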

%%
% Plot the average loss values versus the $\lambda$ values.
figure()
plot(lambdavals,meanloss,'ro-')
xlabel('Lambda')
ylabel('Loss (classification error)')
grid on

%%
% Find the best $\lambda$ value, which corresponds to the minimum average loss.
[~,idx] = min(meanloss) % Find the index
bestlambda = lambdavals(idx) % Find the best lambda value
bestloss = meanloss(idx)
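%%
% Optionally, mark the best $\lambda$ value on the existing plot (this
% annotation is not part of the original example).
hold on
plot(bestlambda,bestloss,'b*','MarkerSize',10)
hold off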

%% Fit the NCA model on all of the training data using the best $\lambda$ and plot the feature weights
% Use the sgd solver and standardize the predictor values.
nca = fscnca(Xtrain,ytrain,'FitMethod','exact','Solver','sgd',...
    'Lambda',bestlambda,'Standardize',true,'Verbose',1);
%%
% Plot the feature weights.
figure()
plot(nca.FeatureWeights,'ro')
xlabel('Feature index')
ylabel('Feature weight')
grid on

%%
% Select features using the feature weights and a relative threshold.
tol    = 0.02;
selidx = find(nca.FeatureWeights > tol*max(1,max(nca.FeatureWeights)))
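%%
% Optionally, display how many features pass the threshold (not part of
% the original example).
fprintf('Selected %d of %d features.\n',numel(selidx),size(Xtrain,2))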
%%
% Compute the classification loss using the test set.
L = loss(nca,Xtest,ytest)

%% Classify observations using the selected features
% Extract the features whose weights exceed the threshold from the training data.
features = Xtrain(:,selidx);

%%
% Apply a support vector machine classifier using the selected features to
% the reduced training set.
svmMdl = fitcsvm(features,ytrain);

%%
% Evaluate the accuracy of the trained classifier on the test data, which
% was not used to select features.
L = loss(svmMdl,Xtest(:,selidx),ytest)
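%%
% For a more detailed view of the test-set performance (an optional step,
% not part of the original example), compute the confusion matrix of the
% predicted labels.
ypred = predict(svmMdl,Xtest(:,selidx));
confusionmat(ytest,ypred)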