%-------
% "completeLink" demonstrate clustering algorithm
%-------
n = input('dimensionality of data space n (n >= 2):');
assert( n >= 2 );
m = input('number of data points (vectors in R^n) to generate:');
assert( m >=2 );
% generate random points in the 0:1 hypercube
% represent the points in an m*n array called d, so that each point
% is a row of the d matrix
d = rand(m, n);
page_output_immediately = 1;
more off;
% clear the old plots, if any
if n == 2
gplot clear
gset noparametric
elseif n == 3
gsplot clear
gset parametric
endif
if n <= 3
xlabel("x axis");
ylabel("y axis");
zlabel("z axis");
gset size ratio 1
gset nokey
gset pointsize 2
if n == 2
gplot [0:1] [0:1] d with points
elseif n == 3
gsplot [0:1] [0:1] [0:1] d with points
endif
% keep the same plot going through each pass
hold on;
endif;
% now build a distance matrix, using Euclidean distance
% to speed up the clustering algorithms, we might want to build a list of
% each node's nearest neighbors
% with the O(n^4) loops below, I'm not going to worry about optimizing this
if n>3
printf("building similarity matrix\n");
endif
s=zeros(m);
for i = 1:m
for j = 1:m
s(i,j) = sqrt(sumsq((d(i,:) - d(j,:)))) ;
endfor
endfor
% complete link is another of the hierarchical, agglomerative methods
% to implement, following Rasmussen, we loop through the data
% at each step, join the most similar pair of objects (points or clusters)
% that are not yet in the same cluster. Differs from single link in
% calculating intercluster similarity, in that the least similar pair of points
% in any two clusters is used
% make a vector which says which cluster a given point is in
% initialze with each point in its own cluster
printf("initial clustering\n");
cluster = zeros(1,m);
for i=1:m
cluster(i) = i;
endfor
nClusters = m;
for pass = 1:m-1
printf("starting pass %d\n",pass);
% loop through the points, pairwise, and find points ci and cj
% in different clusters, such that no other pair of points in
% their respective clusters are farther apart
% bestCi and bestCj are the closest clusters seen so far
% bestDist is the distance between besti's and bestj's most distant points
bestCi = 0;
bestCj = 0;
bestDist = 2;
for i=1:nClusters-1
for j=i+1:nClusters
assert(i != j);
% look at each point in distinct clusters i and j
% find the points bigki and bigkj, belonging to clusters i and j resp.
% where s(bigki,bigkj) is maximal
bigs = 0;
for ki = 1:m
if cluster(ki) == i % point ki is in cluster i
for kj=1:m
if cluster(kj) == j % point kj is in cluster j
if s(ki,kj) > bigs
bigs = s(ki,kj);
bigki = ki;
bigkj = kj;
endif
endif
endfor % next kj
endif
endfor % next ki
% at this point, bigki and bigkj are the most distant points
% in clusters i and j, and bigs = s(bigki,bigkj)
assert(cluster(bigki) != cluster(bigkj));
if bigs < bestDist
bestDist = bigs;
bestCi = i;
bestCj = j;
endif
endfor % next j
endfor % next i
% at this point, bestCi and bestCj are the closest clusters
% in terms of their most distant points
printf ("the two closest clusters are %d and %d\n", bestCi, bestCj);
if bestCi < bestCj
oldc = bestCj;
newc = bestCi;
elseif bestCi > bestCj
oldc = bestCi;
newc = bestCj;
else
error("the two closest clusters are the same :-(")
endif
% in case we're graphing, find the two closest points
if (n <= 3)
closestD = 2;
for ki = 1:m
if cluster(ki) == bestCi % point ki is in cluster bestCi
for kj = 1:m
if cluster(kj) == bestCj % point kj is in cluster bestCj
if s(ki,kj) < closestD % are points ki and kj closer?
closestD = s(ki,kj);
closesti = ki;
closestj = kj;
endif
endif
endfor % next kj
endif
endfor % next ki
printf ("their closest points are %d and %d\n", closesti, closestj);
newLink = zeros(2,n);
newLink(1,:) = d(closesti,:);
newLink(2,:) = d(closestj,:);
if n == 2
gplot newLink with lines
elseif n == 3
gsplot newLink with lines
endif
endif
% connect the clusters by making each point in oldc a member of newc
printf("merge cluster %d into cluster %d\n", oldc, newc);
% since we're eliminating oldc as a separate cluster,
% renumber the clusters greater than oldc to prevent gaps in the
% range 1..nClusters
for k=1:m
if cluster(k) == oldc
cluster(k) = newc;
endif
if cluster(k) > oldc
cluster(k) = cluster(k) - 1;
endif
endfor
% cluster
nClusters = nClusters - 1;
endfor