www.gusucode.com > stats 源码程序 matlab案例代码 > stats/SelectFeaturesAndPerformClassificationNCAExample.m
%% Identify Relevant Features for Classification
%% Load sample data
load ovariancancer;
whos
%%
% This example uses the high-resolution ovarian cancer data set that was
% generated using the WCX2 protein array. The data is from the
% <http://home.ccr.cancer.gov/ncifdaproteomics/ppatterns.asp FDA-NCI
% Clinical Proteomics Program Databank>. After some preprocessing steps,
% the data set has two variables: |obs| and |grp|. The |obs| variable
% consists of 216 observations with 4000 features. Each element in |grp|
% defines the group to which the corresponding row of |obs| belongs.
%% Divide data into training and test sets
% Use |cvpartition| to divide data into a training set of size 160 and a
% test set of size 56. Both the training set and the test set have roughly
% the same group proportions as in |grp|.
rng(1); % For reproducibility
cvp = cvpartition(grp,'holdout',56)
Xtrain = obs(cvp.training,:);
ytrain = grp(cvp.training,:);
Xtest  = obs(cvp.test,:);
ytest  = grp(cvp.test,:);
%% Determine if feature selection is necessary
% Compute the generalization error without fitting (the weights stay at
% their initial values, so this is the baseline loss before any selection).
nca = fscnca(Xtrain,ytrain,'FitMethod','none');
L = loss(nca,Xtest,ytest)
%%
% This option computes the generalization error of the neighborhood
% component analysis (NCA) feature selection model using the initial
% feature weights (in this case the default feature weights) provided in
% |fscnca|.
%%
% Fit NCA without a regularization parameter (Lambda = 0).
nca = fscnca(Xtrain,ytrain,'FitMethod','exact','Lambda',0,...
    'Solver','sgd','Standardize',true);
L = loss(nca,Xtest,ytest)
%%
% The improvement on the loss value suggests that feature selection is a
% good idea. Tuning the $\lambda$ value usually improves the results.
%% Tune the regularization parameter for NCA using five-fold cross-validation
% Tuning $\lambda$ means finding the $\lambda$ value that produces
% the minimum classification loss. To tune $\lambda$ using
% cross-validation:
%
% 1.
% Partition the training data into five folds and extract the number of
% validation (test) sets. For each fold, |cvpartition| assigns four-fifths
% of the data as a training set, and one-fifth of the data as a test set.
cvp = cvpartition(ytrain,'kfold',5);
numvalidsets = cvp.NumTestSets;
%%
% Assign $\lambda$ values and create an array to store the loss function values.
n = length(ytrain);
lambdavals = linspace(0,20,20)/n;
lossvals = zeros(length(lambdavals),numvalidsets);
%%
% 2. Train the NCA model for each $\lambda$ value, using the training set
% in each fold.
%
% 3. Compute the classification loss for the corresponding test
% set in the fold using the NCA model. Record the loss value.
%
% 4. Repeat this process for all folds and all $\lambda$ values.
for i = 1:length(lambdavals)
    for k = 1:numvalidsets
        X      = Xtrain(cvp.training(k),:);
        y      = ytrain(cvp.training(k),:);
        Xvalid = Xtrain(cvp.test(k),:);
        yvalid = ytrain(cvp.test(k),:);
        % NOTE(review): 'GradientTolerance' is a convergence criterion for
        % gradient-based solvers; confirm it has an effect with 'sgd'.
        nca = fscnca(X,y,'FitMethod','exact', ...
            'Solver','sgd','Lambda',lambdavals(i), ...
            'IterationLimit',30,'GradientTolerance',1e-4, ...
            'Standardize',true);
        lossvals(i,k) = loss(nca,Xvalid,yvalid,'LossFunction','classiferror');
    end
end
%%
% Compute the average loss obtained from the folds for each $\lambda$ value.
meanloss = mean(lossvals,2);
%%
% Plot the average loss values versus the $\lambda$ values.
figure()
plot(lambdavals,meanloss,'ro-')
xlabel('Lambda')
% The recorded loss is the misclassification rate ('classiferror'), not a
% mean squared error, so label the axis accordingly.
ylabel('Loss (misclassification rate)')
grid on
%%
% Find the best lambda value that corresponds to the minimum average loss.
[~,idx] = min(meanloss) % Find the index
bestlambda = lambdavals(idx) % Find the best lambda value
bestloss = meanloss(idx)
%% Fit the nca model on all data using best $\lambda$ and plot the feature weights
% Use the sgd solver and standardize the predictor values.
nca = fscnca(Xtrain,ytrain,'FitMethod','exact','Solver','sgd',...
    'Lambda',bestlambda,'Standardize',true,'Verbose',1);
%%
% Plot the feature weights.
figure()
plot(nca.FeatureWeights,'ro')
xlabel('Feature index')
ylabel('Feature weight')
grid on
%%
% Select features using the feature weights and a relative threshold:
% keep a feature when its weight exceeds 2% of the largest weight (with a
% floor of 1 on the reference value, so tiny-weight solutions select nothing).
tol = 0.02;
selidx = find(nca.FeatureWeights > tol*max(1,max(nca.FeatureWeights)))
%%
% Compute the classification loss using the test set.
L = loss(nca,Xtest,ytest)
%% Classify observations using the selected features
% Extract the features whose weights exceed the relative threshold above
% from the training data.
features = Xtrain(:,selidx);
%%
% Apply a support vector machine classifier using the selected features to
% the reduced training set.
svmMdl = fitcsvm(features,ytrain);
%%
% Evaluate the accuracy of the trained classifier on the test data which
% has not been used for selecting features.
L = loss(svmMdl,Xtest(:,selidx),ytest)