%------- % "compareLink" demonstrate two clustering algorithms % single link and complete link %------- n = input('dimensionality of data space n (n >= 2):'); assert( n >= 2 ); m = input('number of data points (vectors in R^n) to generate:'); assert( m >=2 ); % generate random points in the 0:1 hypercube % represent the points in an m*n array called d, so that each point % is a row of the d matrix d = rand(m, n); page_output_immediately = 1; more off; % keep the same plots going through each pass hold on; % build a distance matrix, using Euclidean distance % to speed up the clustering algorithms, we might want to build a list of % each node's nearest neighbors % with the O(n^4) loops below, I'm not going to worry about optimizing this % use the same distance matrix s for both algorithms s=zeros(m); for i = 1:m for j = 1:m s(i,j) = sqrt(sumsq((d(i,:) - d(j,:)))) ; endfor endfor % using the same data matrix d and distance matrix s, run the single link % and then run the complete link % % the single link % printf("begin single link\n"); if n <= 3 % plot the points in a subplot subplot(1,2,1); if n == 2 gplot clear gset noparametric gset size ratio 1 gset nokey gset pointsize 2 xlabel("x axis"); ylabel("y axis"); gplot d title "single link" with points elseif n == 3 gsplot clear gset parametric gset size ratio 1 gset nokey gset pointsize 2 xlabel("x axis"); ylabel("y axis"); zlabel("z axis"); gsplot d title "single link" with points endif endif clusterS = zeros(1,m); % clusters according to single link for i=1:m clusterS(i) = i; % point i belongs to cluster clusterS(i) in single link endfor nClusters = m; % each point is in its own cluster to start for pass = 1:m-1 % loop through the points, pairwise, and find closesti and closestj, the points % in different clusters with the smallest distance closestDist = 2*n; for i=1:m; for j=i+1:m; if (clusterS(i) != clusterS(j)) & (s(i,j) < closestDist) closesti = i; closestj = j; closestDist = s(i,j); endif endfor endfor % closesti and closestj are the two closest points still in different clusters assert(clusterS(closesti) != clusterS(closestj)); newLink = zeros(2,n); newLink(1,:) = d(closesti,:); newLink(2,:) = d(closestj,:); assert(closesti < closestj); oldc = clusterS(closestj); newc = clusterS(closesti); printf("merge cluster %d into cluster %d\n", oldc, newc); if n == 2 gplot newLink with lines elseif n == 3 gsplot newLink with lines endif % connect the clusters by making each point in closesti's cluster a member of % closestj's cluster for k=1:m if clusterS(k) == oldc clusterS(k) = newc; endif if clusterS(k) > oldc clusterS(k) = clusterS(k) - 1; endif endfor nClusters = nClusters - 1; endfor printf("end single link\n"); % % the complete link % printf("begin complete link\n"); if n <= 3 % plot the points in a subplot subplot(1,2,2); if n == 2 gplot clear gset noparametric gset size ratio 1 gset nokey gset pointsize 2 xlabel("x axis"); ylabel("y axis"); gplot d title "complete link" with points elseif n == 3 gsplot clear gset parametric gset size ratio 1 gset nokey gset pointsize 2 xlabel("x axis"); ylabel("y axis"); zlabel("z axis"); gsplot d title "complete link" with points endif endif clusterC = zeros(1,m); % clusters according to complete link for i=1:m clusterC(i) = i; % point i belongs to cluster clusterC(i) in complete link endfor nClusters = m; % each point is in its own cluster to start for pass = 1:m-1 % loop through the points, pairwise, and find points ci and cj % in different clusters, such that no other pair of points in % their respective clusters are farther apart % bestCi and bestCj are the closest clusters seen so far % bestDist is the distance between besti's and bestj's most distant points bestCi = 0; bestCj = 0; bestDist = 2*n; % loop through the existing clusters, and find the two whose farthest points are closest for i=1:nClusters-1 for j=i+1:nClusters assert(i != j); % look at each point in distinct clusters i and j % find the points bigki and bigkj, belonging to clusters i and j resp. % where s(bigki,bigkj) is maximal bigs = 0; for ki = 1:m if clusterC(ki) == i % is point ki in cluster i? for kj=1:m if clusterC(kj) == j % and is point kj in cluster j? if s(ki,kj) > bigs % are they farther apart than any other pair of points? bigs = s(ki,kj); bigki = ki; bigkj = kj; endif endif endfor % next kj endif endfor % next ki % at this point, bigki and bigkj are the most distant points % in clusters i and j, and bigs = s(bigki,bigkj) assert(clusterC(bigki) != clusterC(bigkj)); if bigs < bestDist % are clusters i and j closer than any other pair of clusters? bestDist = bigs; bestCi = i; bestCj = j; endif endfor % next j endfor % next i % at this point, bestCi and bestCj are the closest clusters % in terms of their most distant points %printf ("the two closest clusters are %d and %d\n", bestCi, bestCj); assertMsg( bestCi != bestCj, "the two closest clusters are the same"); if bestCi < bestCj oldc = bestCj; newc = bestCi; elseif bestCi > bestCj oldc = bestCi; newc = bestCj; else error("the two closest clusters are the same :-(") endif % in case we're graphing, find the two closest points in the two clusters if (n <= 3) closestD = 2*n; for ki = 1:m if clusterC(ki) == bestCi % point ki is in cluster bestCi for kj = 1:m if clusterC(kj) == bestCj % point kj is in cluster bestCj if s(ki,kj) < closestD % are points ki and kj closer? closestD = s(ki,kj); closesti = ki; closestj = kj; endif endif endfor % next kj endif endfor % next ki %printf ("their closest points are %d and %d\n", closesti, closestj); newLink = zeros(2,n); newLink(1,:) = d(closesti,:); newLink(2,:) = d(closestj,:); if n == 2 gplot newLink with lines elseif n == 3 gsplot newLink with lines replot endif endif % connect the clusters by making each point in oldc a member of newc printf("merge cluster %d into cluster %d\n", oldc, newc); % since we're eliminating oldc as a separate cluster, % renumber the clusters greater than oldc to prevent gaps in the % range 1..nClusters for k=1:m if clusterC(k) == oldc clusterC(k) = newc; endif if clusterC(k) > oldc clusterC(k) = clusterC(k) - 1; endif endfor nClusters = nClusters - 1; endfor printf("end complete link\n");