我怎么能继续合并集群直到没有更多的集群可以合并?

问题描述 投票:0回答:0

我有一个代码,我想根据特定条件(Radius = 500, K, alpha)连续聚类。我试过使用levels,发现并没有按照num_level=2之后的条件执行。我该如何修改它?

你好,
我的原始数据是一个 OD(起点-终点)数据集,其中包括 id、rent_statino、x1、y1、return_station、x2、y2、st 等

x1,y1为rent_station的坐标

x2,y2为return_station的坐标

st是出发时间

et是到达时间

我想通过计算相似度并在 500m 的中点距离限制内对所有 OD 流进行 KNN,以找到最终的簇数、平均 OD 坐标和流数的总和。

但是我发现无法继续合并到最终结果。于是,我设置了要聚类的层数,但测试后发现后续层数不符合Radius = 500, K, alpha的条件

我的代码如下:

有什么解决方案可以修改这段代码或者让它在不设置级别的情况下连续自动聚类,直到不能再聚类为止?

非常感谢!

import os
import math
import csv
import sys
import folium
from  tqdm import tqdm
import numpy as np
from sklearn.neighbors import BallTree

def readData(fileName):
    data = []
    full_data = []
    with open(fileName, 'r',encoding="utf_8_sig") as f:
        f.readline()
        while True:
            line = f.readline()
            if line:
                sl = line.split(',')
                if len(sl) > 1: 
                    d = [float(sl[2]),float(sl[3]),float(sl[5]),float(sl[6])]# x1 y1 x2 y2
                    data.append(d+d+[1,1])# [x1 y1 x2 y2 cx1 cy1 cx2 cy2 weight num_of_flows_in_cluster]
                    sl[-1]=sl[-1][:-1]
                    full_data.append(sl)
            else:
                break
    return data, full_data

def KNN(i, k, data_mid_point, tree):
    dist, ind = tree.query(np.expand_dims(data_mid_point[i], axis=0), k=k)
    return list(ind[0])

def N_radius(i, d, data_mid_point, tree):     
    distance_xy = d *0.000001
    ind = tree.query_radius(np.expand_dims(data_mid_point[i], axis=0), r=distance_xy)
    return list(ind[0])

def calcClusterFlow(c, data):
    ox = 0
    oy = 0
    dx = 0
    dy = 0
    for k in c:
        ox += data[k][0]*data[k][8]
        oy += data[k][1]*data[k][8]
        dx += data[k][2]*data[k][8]
        dy += data[k][3]*data[k][8]
    d = 0
    for k in c:
        d += data[k][8]
    ox /= d
    oy /= d
    dx /= d
    dy /= d
    return ox, oy, dx, dy

def flowSim(vi, vj, alpha):
    leni = math.sqrt((vi[0]**2+vi[1]**2))
    lenj = math.sqrt((vj[0]**2+vj[1]**2))
    dv = math.sqrt((vi[0] - vj[0]) ** 2 + (vi[1] - vj[1]) ** 2)
    if leni > lenj:
        return dv/(alpha*leni)
    else:
        return dv/(alpha*lenj)

def clusterSim(ci, cj, data, alpha):
    oix, oiy, dix, diy = data[ci[0]][4], data[ci[0]][5], data[ci[0]][6], data[ci[0]][7]
    ojx, ojy, djx, djy = data[cj[0]][4], data[cj[0]][5], data[cj[0]][6], data[cj[0]][7]
    vi = [dix-oix, diy-oiy]
    vj = [djx-ojx, djy-ojy]
    return flowSim(vi, vj, alpha)

def merge(c, ci_ID, cj_ID, l):
    if ci_ID > cj_ID :
        ci_ID, cj_ID = cj_ID, ci_ID
    for l_ID in c[cj_ID]:
        l[l_ID] = ci_ID
        c[ci_ID].append(l_ID)
    c.pop(cj_ID)

def outputSLabeledData(filename, full_data):
    rf = open(filename, 'w', newline='',encoding="utf_8_sig")
    sheet = csv.writer(rf)
    sheet.writerow(['id','ori_id','rent_station','x1','y1','return_station','x2','y2','st','et',]+['cluster_level'+str(i+1) for i in range(len(full_data[0])-9)])
    for i in range(len(full_data)):
        r = [i]
        r=r+full_data[i]
        sheet.writerow(r)
    rf.close()

def outputSClusterData(filename, data, c):
    rf = open(filename, 'w', newline='',encoding="utf_8_sig")
    sheet = csv.writer(rf)
    sheet.writerow(['clusterID','x1','y1','x2','y2','flownum'])
    for clusterID in c.keys():
        r = [clusterID, data[c[clusterID][0]][4], data[c[clusterID][0]][5], data[c[clusterID][0]][6], data[c[clusterID][0]][7], data[c[clusterID][0]][9]]
        sheet.writerow(r)
    rf.close()

def Scluster(K,alpha):        
    num_level = 5
    Radius =  500
    dataFile = "tbike0304.csv" # 'test2_flow.csv' ## # #sys.argv[1]
    ldataFile = dataFile[:-4]+'_ld '+str(K)+' '+str(alpha)+' r'+str(Radius)+' 0.csv' 
    clusterFile = dataFile[:-4]+'_c '+str(K)+' '+str(alpha)+' r'+str(Radius)+' 0.csv'

    print('file: ', dataFile)
    print('alpha =', alpha, '; k =', K)#, '; Radius =', Radius)

    #----------------------------initialize------------------------------------
    print('\ninitialize...')
    data, full_data = readData(dataFile)
    dataLen = len(data)
    c = [{} for i in range(num_level)]
    l = [[] for i in range(num_level)]
    level = 0
    #----------------------------spatial clustering----------------------------------
    for i in range(dataLen):
        c[0][i] = [i] 
        l[0].append(i)

    print('start clustering...')

    data_arr = np.array(data)
    data_mid_point_x = (data_arr[:,0]+data_arr[:,2])/2
    data_mid_point_y = (data_arr[:,1]+data_arr[:,3])/2
    data_mid_point = np.concatenate((np.expand_dims(data_mid_point_x, axis=1),np.expand_dims(data_mid_point_y, axis=1)),axis=1)
    tree = BallTree(data_mid_point, metric='l2', leaf_size=2)              

    for i in tqdm(range(dataLen)):
        neighbors = KNN(i, K, data_mid_point, tree)
        for j in neighbors:
            if (data_mid_point[i][0]-data_mid_point[j][0])**2+(data_mid_point[i][1]-data_mid_point[j][1])**2>(Radius*0.000009)**2:
                continue
            if l[level][i] != l[level][j]:
                if clusterSim(c[level][l[level][i]], c[level][l[level][j]], data, alpha) <= 1:
                    merge(c[level], l[level][i], l[level][j], l[level])
                    new_cluster_ID = min(l[level][i],l[level][j])
                    cox, coy, cdx, cdy = calcClusterFlow(c[level][new_cluster_ID],data)
                    num_of_flow_in_cluster=0
                    for m in c[level][new_cluster_ID]:
                        num_of_flow_in_cluster+=data[m][8]
                    for m in c[level][new_cluster_ID]:
                        data[m][4], data[m][5], data[m][6], data[m][7], data[m][9] = cox, coy, cdx, cdy, num_of_flow_in_cluster

    if os.path.exists(ldataFile):
        os.remove(ldataFile)
    if os.path.exists(clusterFile):
        os.remove(clusterFile)

    outputSClusterData(clusterFile, data, c[level])
    print('num of cluster:',len(c[level]))
    num_c = [len(value) for key, value in c[level].items()]
    average_num_c = sum(num_c)/len(num_c)
    print('average num of c:',average_num_c)
    for i in range(dataLen):
        full_data[i].append(l[0][i])

    for level in range(1,num_level):
        print('The',level,'run')
        ldataFile = dataFile[:-4]+'_ld '+str(K)+' '+str(alpha)+' '+str(level)+'.csv'
        clusterFile = dataFile[:-4]+'_c '+str(K)+' '+str(alpha)+' level'+str(level+1)+'.csv' 

        new_data = [[data[c[level-1][i][0]][4], data[c[level-1][i][0]][5], data[c[level-1][i][0]][6], data[c[level-1][i][0]][7], data[c[level-1][i][0]][4], data[c[level-1][i][0]][5], data[c[level-1][i][0]][6], data[c[level-1][i][0]][7], data[c[level-1][i][0]][9], data[c[level-1][i][0]][9]] for i in c[level-1].keys()]
        dataLen = len(new_data)

        for i in range(dataLen):
            c[level][i] = [i]
            l[level].append(i)

        data_arr = np.array(new_data)
        data_mid_point_x = (data_arr[:,0]+data_arr[:,2])/2
        data_mid_point_y = (data_arr[:,1]+data_arr[:,3])/2
        data_mid_point = np.concatenate((np.expand_dims(data_mid_point_x, axis=1),np.expand_dims(data_mid_point_y, axis=1)),axis=1)
        tree = BallTree(data_mid_point, metric='l2', leaf_size=2)

        for i in tqdm(range(dataLen)):
            neighbors = KNN(i, K, data_mid_point, tree)
            for j in neighbors:
                if (data_mid_point[i][0]-data_mid_point[j][0])**2+(data_mid_point[i][1]-data_mid_point[j][1])**2>(Radius**0.000009)**2:
                    continue
                if l[level][i] != l[level][j]:
                    if clusterSim(c[level][l[level][i]], c[level][l[level][j]], new_data, alpha) <= 1:
                        merge(c[level], l[level][i], l[level][j], l[level])
                        new_cluster_ID = min(l[level][i],l[level][j])
                        cox, coy, cdx, cdy = calcClusterFlow(c[level][new_cluster_ID],new_data)
                        num_of_flow_in_cluster=0
                        for m in c[level][new_cluster_ID]:
                            num_of_flow_in_cluster+=new_data[m][8]
                        for m in c[level][new_cluster_ID]:
                            new_data[m][4], new_data[m][5], new_data[m][6], new_data[m][7], new_data[m][9] = cox, coy, cdx, cdy, num_of_flow_in_cluster

        if os.path.exists(ldataFile):
            os.remove(ldataFile)
        if os.path.exists(clusterFile):
            os.remove(clusterFile)

        outputSClusterData(clusterFile, new_data, c[level])
        print('num of cluster:',len(c[level]))
        num_c = [len(value) for key, value in c[level].items()]
        average_num_c = sum(num_c)/len(num_c)
        print('average num of c:',average_num_c)

    prev_level = 0
    cur_level = 1
    while(cur_level<num_level):
        cur2prev=[i for i in c[prev_level].keys()]
        for cur_cluster_ID, prev_cluster_ID in enumerate(c[prev_level].keys()):
            try:   
                l_prev=[]
                for i in c[cur_level][cur_cluster_ID]:
                    iter_ID = cur2prev[i]
                    for j in c[prev_level][iter_ID]:
                        l_prev.append(j)
                        full_data[j].append(cur_cluster_ID)
                c[cur_level][cur_cluster_ID] = l_prev
            except:
                pass
        cur_level+=1
        prev_level+=1
    outputSLabeledData(ldataFile, full_data)

alpha = 0.3
for K in [8]:
    Scluster(K,alpha)

#K = 634
#for alpha in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
#    Scluster(K,alpha)
python cluster-analysis knn
© www.soinside.com 2019 - 2024. All rights reserved.