www.gusucode.com > stats 源码程序 matlab案例代码 > stats/FindNearestNeighborUsingACustomDistanceMetricExample.m

    %% Find Nearest Neighbors Using a Custom Distance Metric
% This example shows how to find the indices of the three nearest
% observations in |X| to each observation in |Y| with respect to the
% chi-square distance.  This distance metric is used in correspondence
% analysis, particularly in ecological applications.
%%
% Randomly generate normally distributed data into two matrices.  The
% number of rows can vary, but the number of columns must be equal. This
% example uses 2-D data for plotting.

% Copyright 2015 The MathWorks, Inc.

rng(1); % For reproducibility
X = randn(50,2);
Y = randn(4,2);

h = zeros(3,1);
figure;
h(1) = plot(X(:,1),X(:,2),'bx');
hold on;
h(2) = plot(Y(:,1),Y(:,2),'rs','MarkerSize',10);
title('Heterogenous Data')
%%
% The rows of |X| and |Y| correspond to observations, and the columns are,
% in general, dimensions (for example, predictors).
%%
% The chi-square distance between _j_-dimensional points _x_ and _z_ is 
%
% $$\chi(x,z) = \sqrt{\displaystyle\sum^J_{j = 1}w_j\left(x_j - z_j\right)^2},$$
%
% where $w_j$ is the weight associated with dimension _j_.  
%%
% Choose weights for each dimension, and specify the chi-square distance
% function.  The distance function must:
% 
% * Take as input arguments one row of |X|, e.g., |x|, and the matrix |Z|.
% * Compare |x| to each row of |Z|.
% * Return a vector |D| of length $n_z$, where $n_z$ is the number of
% rows of |Z|.  Each element of |D| is the distance between the observation
% corresponding to |x| and the observations
% corresponding to each row of |Z|.
%
w = [0.4; 0.6];
chiSqrDist = @(x,Z)sqrt((bsxfun(@minus,x,Z).^2)*w);
%%
% This example uses arbitrary weights for illustration.
%%
% Find the indices of the three nearest observations in |X| to each
% observation in |Y|.
k = 3;
[Idx,D] = knnsearch(X,Y,'Distance',chiSqrDist,'k',k);
%%
% |idx| and |D| are 4-by-3 matrices.
% 
% * |idx(j,1)| is the row index of the closest observation in |X| to
% observation _j_ of |Y|, and |D(j,1)| is their distance.
% * |idx(j,2)| is the row index of the next closest observation in |X| to observation _j_ of |Y|, and |D(j,2)| is their
% distance.
% * And so on.
%
%%
% Identify the nearest observations in the plot.
for j = 1:k;
    h(3) = plot(X(Idx(:,j),1),X(Idx(:,j),2),'ko','MarkerSize',10);
end 
legend(h,{'\texttt{X}','\texttt{Y}','Nearest Neighbor'},'Interpreter','latex');
title('Heterogenous Data and Nearest Neighbors')
hold off;
%%
% Several observations of |Y| share nearest neighbors. 
%%
% Verify that the chi-square distance metric is equivalent to the Euclidean
% distance metric, but with an optional scaling parameter.
[IdxE,DE] = knnsearch(X,Y,'Distance','seuclidean','k',k,...
    'Scale',1./(sqrt(w)));
AreDiffIdx = sum(sum(Idx ~= IdxE))
AreDiffDist = sum(sum(abs(D - DE) > eps))
%%
% The indices and distances between the two implementations of three
% nearest neighbors are practically equivalent.