
%% Robust Feature Selection Using NCA for Regression
% Perform feature selection that is robust to outliers using a custom
% robust loss function in NCA.

%% Generate data with outliers
% Generate sample data for regression where the response depends on three
% of the predictors, namely predictors 4, 7, and 13.
rng(123,'twister'); % For reproducibility
n = 200;
X = randn(n,20);
y = cos(X(:,7)) + sin(X(:,4).*X(:,13)) + 0.1*randn(n,1);

%%
% Add outliers to data.
numoutliers   = 25;
outlieridx    = floor(linspace(10,90,numoutliers));
y(outlieridx) = 5*randn(numoutliers,1);

%%
% Plot the data.
figure;
plot(y);

%% Use non-robust loss function
% The performance of the feature selection algorithm depends strongly on
% the value of the regularization parameter, so it is good practice to tune
% this parameter before using it for feature selection. Tune the
% regularization parameter using five-fold cross-validation. Use the mean
% squared error (mse):
%
% $mse = \frac{1}{n}\sum\limits_{i = 1}^n \left( y_i - \hat{y}_i \right)^2$
%
% where $\hat{y}_i$ is the predicted response for observation $i$.
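
%%
% As a quick illustration of this definition, compute the mse between two
% short, hand-picked vectors (hypothetical values, unrelated to the data
% above):
yDemo    = [1; 2; 3];
yhatDemo = [1.1; 1.9; 3.2];
mseDemo  = mean((yDemo - yhatDemo).^2)  % (0.01 + 0.01 + 0.04)/3 = 0.02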

%%
% First, partition the data into five folds. In each fold, the software
% uses 4/5 of the data for training and 1/5 of the data for validation
% (testing).

cvp         = cvpartition(length(y),'kfold',5);
numtestsets = cvp.NumTestSets;
%%
% Compute the $\lambda$ values to test, and create an array to store the
% loss values.
lambdavals  = linspace(0,3,50)*std(y)/length(y);
lossvals    = zeros(length(lambdavals),numtestsets);

%%
% Perform NCA and compute the loss for each $\lambda$ value and each fold.
for i = 1:length(lambdavals)
    for k = 1:numtestsets

        Xtrain = X(cvp.training(k),:);
        ytrain = y(cvp.training(k),:);
        Xtest  = X(cvp.test(k),:);
        ytest  = y(cvp.test(k),:);

        nca = fsrnca(Xtrain,ytrain,'FitMethod','exact',...
            'Solver','lbfgs','Verbose',0,'Lambda',lambdavals(i),...
            'LossFunction','mse');

        lossvals(i,k) = loss(nca,Xtest,ytest,'LossFunction','mse');
    end
end

%%
% Plot the mean loss corresponding to each lambda value.
meanloss = mean(lossvals,2);
figure;
plot(lambdavals,meanloss,'ro-');
xlabel('Lambda');
ylabel('Loss (MSE)');
grid on;

%%
% Find the $\lambda$ value that produces the minimum average loss.
[~,idx] = min(mean(lossvals,2));
bestlambda = lambdavals(idx)

%%
% Perform feature selection using the best $\lambda$ value and mse.
nca = fsrnca(X,y,'FitMethod','exact','Solver','lbfgs',...
     'Verbose',1,'Lambda',bestlambda,'LossFunction','mse');
 
%%
% Plot selected features.
figure;
plot(nca.FeatureWeights,'ro');
grid on;
xlabel('Feature index');
ylabel('Feature weight');

%%
% Predict the response values using the |nca| model and plot the fitted
% (predicted) response values and the actual response values.
fitted = predict(nca,X);
figure;
plot(y,'r.');
hold on;
plot(fitted,'b-');
xlabel('index');
ylabel('Fitted values');

%%
% |fsrnca| tries to fit every point in the data, including the outliers. As
% a result, it assigns nonzero weights to many features besides predictors
% 4, 7, and 13.
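
%%
% To make this concrete, count the features whose weight is well above
% zero. The relative tolerance used here is an arbitrary illustrative
% choice:
numNonzeroWeights = sum(nca.FeatureWeights > 0.02*max(nca.FeatureWeights))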

%% Use built-in robust loss function
% Repeat the same process of tuning the regularization parameter, this time
% using the built-in $\epsilon$-insensitive loss function:
%
% $l(y_i,y_j) = \max \left( 0, \left| y_i - y_j \right| - \epsilon \right)$
%
% The $\epsilon$-insensitive loss function is more robust to outliers than
% mean squared error.
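%
% For example, with the $\epsilon$ value of 0.8 used in the fits below, a
% residual of 0.5 incurs zero loss, while a residual of 5 incurs a loss of
% only 4.2:
smallResidualLoss = max(0, abs(0.5) - 0.8)  % 0: inside the epsilon tube
largeResidualLoss = max(0, abs(5) - 0.8)    % 4.2: loss grows only linearly
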
lambdavals  = linspace(0,3,50)*std(y)/length(y);
cvp         = cvpartition(length(y),'kfold',5);
numtestsets = cvp.NumTestSets;
lossvals    = zeros(length(lambdavals),numtestsets);

for i = 1:length(lambdavals)
    for k = 1:numtestsets

        Xtrain = X(cvp.training(k),:);
        ytrain = y(cvp.training(k),:);
        Xtest  = X(cvp.test(k),:);
        ytest  = y(cvp.test(k),:);

        nca = fsrnca(Xtrain,ytrain,'FitMethod','exact',...
            'Solver','sgd','Verbose',0,'Lambda',lambdavals(i),...
            'LossFunction','epsiloninsensitive','Epsilon',0.8);

        lossvals(i,k) = loss(nca,Xtest,ytest,'LossFunction','mse');
    end
end

%%
% The $\epsilon$ value to use depends on the data, and the best value can
% also be determined using cross-validation. However, choosing the
% $\epsilon$ value is outside the scope of this example; the value used
% here mainly illustrates the robustness of the method. A minimal sketch of
% such a search follows.
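
%%
% This sketch tunes $\epsilon$ by cross-validation at a fixed,
% representative $\lambda$ value. The candidate grid and the $\lambda$
% choice are hypothetical, for illustration only; the RNG state is saved
% and restored so the rest of the example is unaffected.
s = rng;                          % save RNG state
lamDemo = lambdavals(25);         % representative lambda value (arbitrary)
epsvals = [0.4 0.8 1.2];          % hypothetical candidate epsilon values
epscvloss = zeros(length(epsvals),numtestsets);
for j = 1:length(epsvals)
    for k = 1:numtestsets
        m = fsrnca(X(cvp.training(k),:),y(cvp.training(k)),...
            'FitMethod','exact','Solver','sgd','Verbose',0,...
            'Lambda',lamDemo,'LossFunction','epsiloninsensitive',...
            'Epsilon',epsvals(j));
        epscvloss(j,k) = loss(m,X(cvp.test(k),:),y(cvp.test(k)),...
            'LossFunction','mse');
    end
end
mean(epscvloss,2)                 % mean CV loss for each candidate epsilon
rng(s);                           % restore RNG state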

%%
% Plot the mean loss corresponding to each lambda value.
meanloss = mean(lossvals,2);
figure;
plot(lambdavals,meanloss,'ro-');
xlabel('Lambda');
ylabel('Loss (MSE)');
grid on;

%%
% Find the $\lambda$ value that produces the minimum average loss.
[~,idx] = min(mean(lossvals,2));
bestlambda = lambdavals(idx)

%%
% Fit the neighborhood component analysis model using the
% $\epsilon$-insensitive loss function and the best $\lambda$ value.
nca = fsrnca(X,y,'FitMethod','exact','Solver','sgd',...
     'Lambda',bestlambda,'LossFunction','epsiloninsensitive','Epsilon',0.8);

%% 
% Plot selected features.
figure;
plot(nca.FeatureWeights,'ro');
grid on;
xlabel('Feature index');
ylabel('Feature weight');

%%
% Plot fitted values.
fitted = predict(nca,X);
figure;
plot(y,'r.');
hold on;
plot(fitted,'b-');
xlabel('index');
ylabel('Fitted values');

%%
% The $\epsilon$-insensitive loss appears more robust to outliers: it
% identifies fewer features than mse does as relevant. However, the fit
% shows that it is still affected by some of the outliers.

%% Use custom robust loss function
% Define a custom loss function that is robust to outliers to use in
% feature selection for regression:
%
% $f(y_i,y_j) = 1 - \exp(-|y_i - y_j|)$
%
customlossFcn = @(yi,yj) 1 - exp(-abs(bsxfun(@minus,yi,yj')));
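
%%
% |fsrnca| calls a custom loss with two column vectors and expects back the
% matrix of pairwise losses. A quick check with small, hand-picked vectors
% (hypothetical values) confirms the shape and shows the loss saturating
% near 1 for large differences:
pairwiseLossDemo = customlossFcn([0; 1; 10],[0; 1])  % 3-by-2 matrix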

%% 
% Tune the regularization parameter using the custom-defined robust loss
% function.
lambdavals  = linspace(0,3,50)*std(y)/length(y);
cvp         = cvpartition(length(y),'kfold',5);
numtestsets = cvp.NumTestSets;
lossvals    = zeros(length(lambdavals),numtestsets);

for i = 1:length(lambdavals)
    for k = 1:numtestsets

        Xtrain = X(cvp.training(k),:);
        ytrain = y(cvp.training(k),:);
        Xtest  = X(cvp.test(k),:);
        ytest  = y(cvp.test(k),:);

        nca = fsrnca(Xtrain,ytrain,'FitMethod','exact',...
            'Solver','lbfgs','Verbose',0,'Lambda',lambdavals(i),...
            'LossFunction',customlossFcn);

        lossvals(i,k) = loss(nca,Xtest,ytest,'LossFunction','mse');
    end
end

%%
% Plot the mean loss corresponding to each lambda value.
meanloss = mean(lossvals,2);
figure;
plot(lambdavals,meanloss,'ro-');
xlabel('Lambda');
ylabel('Loss (MSE)');
grid on;

%%
% Find the $\lambda$ value that produces the minimum average loss.
[~,idx] = min(mean(lossvals,2));
bestlambda = lambdavals(idx)

%%
% Perform feature selection using the custom robust loss function and best
% $\lambda$ value.
nca = fsrnca(X,y,'FitMethod','exact','Solver','lbfgs',...
     'Verbose',1,'Lambda',bestlambda,'LossFunction',customlossFcn);
 
%%
% Plot selected features.
figure;
plot(nca.FeatureWeights,'ro');
grid on;
xlabel('Feature index');
ylabel('Feature weight');

%%
% Plot fitted values.
fitted = predict(nca,X);
figure;
plot(y,'r.');
hold on;
plot(fitted,'b-');
xlabel('index');
ylabel('Fitted values');

%%
% In this case, the loss is not dominated by the outliers, so the results
% are based on the bulk of the observations. |fsrnca| detects predictors 4,
% 7, and 13 as relevant and does not select any other features.
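
%%
% To list the selected features programmatically, threshold the fitted
% feature weights. The relative tolerance used here is an arbitrary
% illustrative choice:
selectedFeatures = find(nca.FeatureWeights > 0.02*max(nca.FeatureWeights))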

%% Why does the loss function choice affect the results?
% First, compute each loss function over a range of values of the
% difference between two observations.
deltay   = linspace(-10,10,1000)';
% Compute custom loss function values
customlossvals = customlossFcn(deltay,0); 
% Compute epsilon insensitive loss function and values
epsinsensitive = @(yi,yj,E) max(0,abs(bsxfun(@minus,yi,yj'))-E); 
epsinsenvals = epsinsensitive(deltay,0,0.5);
% Compute mse loss function and values
mse = @(yi,yj) (yi-yj').^2;
msevals = mse(deltay,0);

%%
% Now, plot the loss functions to see their difference and why they affect
% the results in the way they do.
figure;
plot(deltay,customlossvals,'g-',deltay,epsinsenvals,'b-',deltay,msevals,'r-')
% Label after plotting; plot resets the axes and would erase earlier labels
xlabel('(yi - yj)')
ylabel('loss(yi,yj)')
legend('customloss','epsiloninsensitive','mse')
ylim([0 20])

%%
% As the difference between two response values increases, mse increases
% quadratically, which makes it very sensitive to outliers. As |fsrnca|
% tries to minimize this loss, it ends up identifying more features as
% relevant. The $\epsilon$-insensitive loss is more resistant to outliers
% than mse, but past $\epsilon$ it still increases linearly as the
% difference between two observations grows. The custom robust loss, in
% contrast, approaches 1 as the difference increases and stays there, no
% matter how large the difference becomes. Of the three, it is the most
% robust to outliers.
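
%%
% A single numeric comparison makes these growth rates concrete. At a
% difference of 8 between two responses, the three losses evaluate as
% follows:
mseAt8    = mse(8,0)                 % 64: quadratic growth
epsAt8    = epsinsensitive(8,0,0.5)  % 7.5: linear growth past epsilon
customAt8 = customlossFcn(8,0)       % ~0.9997: saturates near 1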