% Source: www.gusucode.com > stats source code > MATLAB example code > stats/SelectNaiveBayesClassifierFeaturesByExaminingTestSampleMExample.m

    %% Select Naive Bayes Classifier Features by Examining Test Sample Margins
% The classifier margins measure, for each observation, the difference
% between the true class observed score and the maximal false class score
% for a particular class. One way to perform feature selection is to
% compare test sample margins from multiple models.  Based solely on this
% criterion, the model with the highest margins is the best model.
%%
% Load Fisher's iris data set.

% Copyright 2015 The MathWorks, Inc.

% Seed the generator first so the random holdout split made later in this
% script comes out the same on every run.
rng(1);
% fisheriris supplies the |meas| and |species| variables used below.
load('fisheriris');
Y = species; % Response (class labels)
X = meas;    % Predictors
%%
% Partition the data set into training and test sets. Specify a 30% holdout
% sample for testing.
Partition = cvpartition(Y,'Holdout',0.30); % Holdout partition built from the labels; 0.30 is the test fraction
testInds = test(Partition); % Indices for the test set
XTest = X(testInds,:); % Predictor rows held out for testing
YTest = Y(testInds); % Class labels for the same held-out rows
%%
% |Partition| defines the data set partition.
%%
% Define these two data sets:
%
% * |fullX| contains all predictors.
% * |partX| contains the last 2 predictors.
%
fullX = X; % Every predictor column
partX = X(:,3:4); % Only columns 3 and 4 of X
%%
% Train naive Bayes classifiers for each predictor set. Specify the partition
% definition.
FCVMdl = fitcnb(fullX,Y,'CVPartition',Partition); % Cross-validated model on all predictors
FCMdl = FCVMdl.Trained{1};                        % Classifier fitted on the training portion

PCVMdl = fitcnb(partX,Y,'CVPartition',Partition); % Cross-validated model on the reduced predictor set
PCMdl = PCVMdl.Trained{1};                        % Classifier fitted on the training portion
%%
% |FCVMdl| and |PCVMdl| are
% |ClassificationPartitionedModel| classifiers. They contain the property
% |Trained|, which is a 1-by-1 cell array holding a
% |CompactClassificationNaiveBayes| classifier that the software trained using the
% training set.
%%
% Estimate the test sample margins for each classifier. Display the
% distributions of the margins for each model using boxplots.
fullM = margin(FCMdl,XTest,YTest);        % Margins from the all-predictor classifier
partM = margin(PCMdl,XTest(:,3:4),YTest); % PCMdl was trained on columns 3:4, so pass the same columns

% One box per model, side by side in a new figure.
figure;
boxplot([fullM partM],'Labels',{'All Predictors','Two Predictors'});
title('Boxplots of Test-Sample Margins');
h = gca;
set(h,'YLim',[0.98 1.01]); % Narrow the y-axis so the boxes are visible.
%%
% The margins have a similar distribution, but |PCMdl| is less complex.