%-------
% "compareLink" demonstrate two clustering algorithms
% single link and complete link
%-------
n = input('dimensionality of data space n (n >= 2):');
assert( n >= 2 );
m = input('number of data points (vectors in R^n) to generate:');
assert( m >=2 );
% generate random points in the 0:1 hypercube
% represent the points in an m*n array called d, so that each point
% is a row of the d matrix
d = rand(m, n);
page_output_immediately = 1;
more off;
% keep the same plots going through each pass
hold on;
% build a distance matrix, using Euclidean distance
% to speed up the clustering algorithms, we might want to build a list of
% each node's nearest neighbors
% with the O(n^4) loops below, I'm not going to worry about optimizing this
% use the same distance matrix s for both algorithms
s=zeros(m);
for i = 1:m
for j = 1:m
s(i,j) = sqrt(sumsq((d(i,:) - d(j,:)))) ;
endfor
endfor
% using the same data matrix d and distance matrix s, run the single link
% and then run the complete link
%
% the single link
%
printf("begin single link\n");
if n <= 3
% plot the points in a subplot
subplot(1,2,1);
if n == 2
gplot clear
gset noparametric
gset size ratio 1
gset nokey
gset pointsize 2
xlabel("x axis");
ylabel("y axis");
gplot d title "single link" with points
elseif n == 3
gsplot clear
gset parametric
gset size ratio 1
gset nokey
gset pointsize 2
xlabel("x axis");
ylabel("y axis");
zlabel("z axis");
gsplot d title "single link" with points
endif
endif
clusterS = zeros(1,m); % clusters according to single link
for i=1:m
clusterS(i) = i; % point i belongs to cluster clusterS(i) in single link
endfor
nClusters = m; % each point is in its own cluster to start
for pass = 1:m-1
% loop through the points, pairwise, and find closesti and closestj, the points
% in different clusters with the smallest distance
closestDist = 2*n;
for i=1:m;
for j=i+1:m;
if (clusterS(i) != clusterS(j)) & (s(i,j) < closestDist)
closesti = i;
closestj = j;
closestDist = s(i,j);
endif
endfor
endfor
% closesti and closestj are the two closest points still in different clusters
assert(clusterS(closesti) != clusterS(closestj));
newLink = zeros(2,n);
newLink(1,:) = d(closesti,:);
newLink(2,:) = d(closestj,:);
assert(closesti < closestj);
oldc = clusterS(closestj);
newc = clusterS(closesti);
printf("merge cluster %d into cluster %d\n", oldc, newc);
if n == 2
gplot newLink with lines
elseif n == 3
gsplot newLink with lines
endif
% connect the clusters by making each point in closesti's cluster a member of
% closestj's cluster
for k=1:m
if clusterS(k) == oldc
clusterS(k) = newc;
endif
if clusterS(k) > oldc
clusterS(k) = clusterS(k) - 1;
endif
endfor
nClusters = nClusters - 1;
endfor
printf("end single link\n");
%
% the complete link
%
printf("begin complete link\n");
if n <= 3
% plot the points in a subplot
subplot(1,2,2);
if n == 2
gplot clear
gset noparametric
gset size ratio 1
gset nokey
gset pointsize 2
xlabel("x axis");
ylabel("y axis");
gplot d title "complete link" with points
elseif n == 3
gsplot clear
gset parametric
gset size ratio 1
gset nokey
gset pointsize 2
xlabel("x axis");
ylabel("y axis");
zlabel("z axis");
gsplot d title "complete link" with points
endif
endif
clusterC = zeros(1,m); % clusters according to complete link
for i=1:m
clusterC(i) = i; % point i belongs to cluster clusterC(i) in complete link
endfor
nClusters = m; % each point is in its own cluster to start
for pass = 1:m-1
% loop through the points, pairwise, and find points ci and cj
% in different clusters, such that no other pair of points in
% their respective clusters are farther apart
% bestCi and bestCj are the closest clusters seen so far
% bestDist is the distance between besti's and bestj's most distant points
bestCi = 0;
bestCj = 0;
bestDist = 2*n;
% loop through the existing clusters, and find the two whose farthest points are closest
for i=1:nClusters-1
for j=i+1:nClusters
assert(i != j);
% look at each point in distinct clusters i and j
% find the points bigki and bigkj, belonging to clusters i and j resp.
% where s(bigki,bigkj) is maximal
bigs = 0;
for ki = 1:m
if clusterC(ki) == i % is point ki in cluster i?
for kj=1:m
if clusterC(kj) == j % and is point kj in cluster j?
if s(ki,kj) > bigs % are they farther apart than any other pair of points?
bigs = s(ki,kj);
bigki = ki;
bigkj = kj;
endif
endif
endfor % next kj
endif
endfor % next ki
% at this point, bigki and bigkj are the most distant points
% in clusters i and j, and bigs = s(bigki,bigkj)
assert(clusterC(bigki) != clusterC(bigkj));
if bigs < bestDist % are clusters i and j closer than any other pair of clusters?
bestDist = bigs;
bestCi = i;
bestCj = j;
endif
endfor % next j
endfor % next i
% at this point, bestCi and bestCj are the closest clusters
% in terms of their most distant points
%printf ("the two closest clusters are %d and %d\n", bestCi, bestCj);
assertMsg( bestCi != bestCj, "the two closest clusters are the same");
if bestCi < bestCj
oldc = bestCj;
newc = bestCi;
elseif bestCi > bestCj
oldc = bestCi;
newc = bestCj;
else
error("the two closest clusters are the same :-(")
endif
% in case we're graphing, find the two closest points in the two clusters
if (n <= 3)
closestD = 2*n;
for ki = 1:m
if clusterC(ki) == bestCi % point ki is in cluster bestCi
for kj = 1:m
if clusterC(kj) == bestCj % point kj is in cluster bestCj
if s(ki,kj) < closestD % are points ki and kj closer?
closestD = s(ki,kj);
closesti = ki;
closestj = kj;
endif
endif
endfor % next kj
endif
endfor % next ki
%printf ("their closest points are %d and %d\n", closesti, closestj);
newLink = zeros(2,n);
newLink(1,:) = d(closesti,:);
newLink(2,:) = d(closestj,:);
if n == 2
gplot newLink with lines
elseif n == 3
gsplot newLink with lines
replot
endif
endif
% connect the clusters by making each point in oldc a member of newc
printf("merge cluster %d into cluster %d\n", oldc, newc);
% since we're eliminating oldc as a separate cluster,
% renumber the clusters greater than oldc to prevent gaps in the
% range 1..nClusters
for k=1:m
if clusterC(k) == oldc
clusterC(k) = newc;
endif
if clusterC(k) > oldc
clusterC(k) = clusterC(k) - 1;
endif
endfor
nClusters = nClusters - 1;
endfor
printf("end complete link\n");