DIANA(自顶向下分裂算法)

输入: 包含n个对象的数据库,终止条件k
输出: k个簇,到达终止条件规定簇的数目
伪代码:

FOR(i=1;ik;i++) DO BEGIN
        在所有簇中挑具有最大直径的簇;
        找出所挑选的簇里与其他平均相异度最大的一个点放入splinter group,剩余的放入old party中
        REPEAT
            old party里找到splinter group中点的最近距离不大于到old party中点的最近距离的点,并将该点加入splinter group中
        UNTIL 没有新的old party的点被分配给splinter group;
        splinter group old party 为被选中的簇分裂成两个簇,与其他簇一起组成新的簇集合
     END
In [35]:
import numpy as np
def DIANA(d, kk):
    """DIANA algorithm"""
    
        
    # 当前簇的个数
    c = 1
    cluster = []
    cluster.append(list(range(len(d))))
    a = cluster[0]
    if kk == 1:
        return cluster
    while True:
        
        dmaxi = 0
        # 找最大直径的簇
        idx = 0
        if c > 1:
            dmax = 0
            
            for i in cluster:
                mx = 0
                idx += 1
                for j in range(len(i)):
                    for k in range(len(i)):
                        t = np.linalg.norm(np.array(d[i[j]]) - np.array(d[i[k]]))
                        if mx < t:
                            mx = t
                if dmax < mx:
                    dmax = mx
                    dmaxi = idx
        
        splinter = []
        remaining = cluster[dmaxi]
        cluster.remove(cluster[dmaxi])
        
        
        mxd = 0
        mxi = 0
        
        for i in range(len(remaining)):
            sm = 0 
            for j in range(len(remaining)):
                sm += np.linalg.norm(np.array(d[remaining[i]]) - np.array(d[remaining[j]]))
            if mxd<sm:
                mxd = sm
                mxi = i
        print(remaining)
        print(c)    
        splinter.append(remaining[mxi])
        remaining.remove(remaining[mxi])
        
        
        bChange = True
        while bChange:
            bChange = False
            for i in range(len(remaining)):
                d1 = 0.0
                for j in range(len(splinter)):
                    d1 += np.linalg.norm(np.array(splinter[j]) - np.array(remaining[i]))
                d1 /= float(len(splinter))
                d2 = 0.0
                for j in range(len(remaining)):
                    d2 += np.linalg.norm(np.array(remaining[j]) - np.array(remaining[i]))
                if len(remaining) > 1:
                    d2 /= (len(remaining) - 1)
                
                if d1 < d2:
                    bChange = True
                    splinter.append(remaining[i])
                    remaining.remove(remaining[i])
                    break
        cluster.append(splinter)
        cluster.append(remaining)
        c += 1
        if c == kk:
            break
            
    return cluster
In [13]:
from sklearn import datasets
data = datasets.load_iris()
# data.data[:5]
In [37]:
data.target
Out[37]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
In [ ]:
print(len(data.target[data.target==0]))
In [38]:
cluster = DIANA(data.data, 3)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
1
[76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
2
In [41]:
print(data.target[cluster[0]])
[2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1]
In [42]:
print(data.target[cluster[1]])
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
In [44]:
print(data.target[cluster[2]])
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
In [ ]: