输入: 包含n个对象的数据库,终止条件k
输出: k个簇,到达终止条件规定簇的数目
伪代码:
FOR(i=1;i≠k;i++) DO BEGIN
在所有簇中挑具有最大直径的簇;
找出所挑选的簇里与其他平均相异度最大的一个点放入splinter group,剩余的放入old party中
REPEAT
在old party里找到splinter group中点的最近距离不大于到old party中点的最近距离的点,并将该点加入splinter group中
UNTIL 没有新的old party的点被分配给splinter group;
splinter group 和old party 为被选中的簇分裂成两个簇,与其他簇一起组成新的簇集合
END
import numpy as np
def DIANA(d, kk):
"""DIANA algorithm"""
# 当前簇的个数
c = 1
cluster = []
cluster.append(list(range(len(d))))
a = cluster[0]
if kk == 1:
return cluster
while True:
dmaxi = 0
# 找最大直径的簇
idx = 0
if c > 1:
dmax = 0
for i in cluster:
mx = 0
idx += 1
for j in range(len(i)):
for k in range(len(i)):
t = np.linalg.norm(np.array(d[i[j]]) - np.array(d[i[k]]))
if mx < t:
mx = t
if dmax < mx:
dmax = mx
dmaxi = idx
splinter = []
remaining = cluster[dmaxi]
cluster.remove(cluster[dmaxi])
mxd = 0
mxi = 0
for i in range(len(remaining)):
sm = 0
for j in range(len(remaining)):
sm += np.linalg.norm(np.array(d[remaining[i]]) - np.array(d[remaining[j]]))
if mxd<sm:
mxd = sm
mxi = i
print(remaining)
print(c)
splinter.append(remaining[mxi])
remaining.remove(remaining[mxi])
bChange = True
while bChange:
bChange = False
for i in range(len(remaining)):
d1 = 0.0
for j in range(len(splinter)):
d1 += np.linalg.norm(np.array(splinter[j]) - np.array(remaining[i]))
d1 /= float(len(splinter))
d2 = 0.0
for j in range(len(remaining)):
d2 += np.linalg.norm(np.array(remaining[j]) - np.array(remaining[i]))
if len(remaining) > 1:
d2 /= (len(remaining) - 1)
if d1 < d2:
bChange = True
splinter.append(remaining[i])
remaining.remove(remaining[i])
break
cluster.append(splinter)
cluster.append(remaining)
c += 1
if c == kk:
break
return cluster
from sklearn import datasets
data = datasets.load_iris()
# data.data[:5]
data.target
print(len(data.target[data.target==0]))
cluster = DIANA(data.data, 3)
print(data.target[cluster[0]])
print(data.target[cluster[1]])
print(data.target[cluster[2]])