我有一个代码,我想根据特定条件(Radius = 500, K, alpha)连续聚类。我试过使用levels,发现并没有按照num_level=2之后的条件执行。我该如何修改它?
你好,
我的原始数据是一个 OD(起点-终点)数据集,其中包括 id、rent_station、x1、y1、return_station、x2、y2、st、et 等字段
x1,y1为rent_station的坐标
x2,y2为return_station的坐标
st是出发时间
et是到达时间
我想通过计算相似度并在 500m 的中点距离限制内对所有 OD 流进行 KNN,以找到最终的簇数、平均 OD 坐标和流数的总和。
但是我发现无法继续合并到最终结果。于是,我设置了要聚类的层数,但测试后发现后续层数不符合Radius = 500, K, alpha的条件
我的代码如下:
有什么解决方案可以修改这段代码或者让它在不设置级别的情况下连续自动聚类,直到不能再聚类为止?
非常感谢!
import os
import math
import csv
import sys
import folium
from tqdm import tqdm
import numpy as np
from sklearn.neighbors import BallTree
def readData(fileName):
    """Read the OD-flow CSV file (header skipped).

    Assumed 0-based column layout: x1=2, y1=3, x2=5, y2=6 -- TODO confirm
    against the actual file header.

    Returns:
        data: one list per flow,
              [x1, y1, x2, y2, cx1, cy1, cx2, cy2, weight, num_flows_in_cluster],
              where the cluster centroid (cx*/cy*) starts as the flow's own
              endpoints and weight / num_flows both start at 1.
        full_data: the raw CSV fields of each row, newline stripped.
    """
    data = []
    full_data = []
    with open(fileName, 'r', encoding="utf_8_sig") as f:
        f.readline()  # skip header row
        for line in f:
            # rstrip('\n') instead of slicing off the last char: the original
            # sl[-1][:-1] corrupted the final row when the file had no
            # trailing newline.
            sl = line.rstrip('\n').split(',')
            if len(sl) > 1:
                d = [float(sl[2]), float(sl[3]), float(sl[5]), float(sl[6])]  # x1 y1 x2 y2
                data.append(d + d + [1, 1])
                full_data.append(sl)
    return data, full_data
def KNN(i, k, data_mid_point, tree):
    """Return the IDs of the k flows whose midpoints are nearest to flow i's."""
    query_point = np.expand_dims(data_mid_point[i], axis=0)
    _dist, indices = tree.query(query_point, k=k)
    return list(indices[0])
def N_radius(i, d, data_mid_point, tree):
    """Return IDs of all flows whose midpoint lies within d of flow i's midpoint.

    d is scaled by the hard-coded 1e-6 factor before the radius query --
    presumably a metres-to-degrees conversion; TODO confirm the units.
    """
    search_radius = d * 0.000001
    query_point = np.expand_dims(data_mid_point[i], axis=0)
    neighbor_arrays = tree.query_radius(query_point, r=search_radius)
    return list(neighbor_arrays[0])
def calcClusterFlow(c, data):
    """Compute the weighted-mean flow of cluster c.

    Each member k contributes its original endpoints (data[k][0..3]) weighted
    by data[k][8]; returns the centroid endpoints (ox, oy, dx, dy).
    """
    total_weight = sum(data[k][8] for k in c)
    ox = sum(data[k][0] * data[k][8] for k in c) / total_weight
    oy = sum(data[k][1] * data[k][8] for k in c) / total_weight
    dx = sum(data[k][2] * data[k][8] for k in c) / total_weight
    dy = sum(data[k][3] * data[k][8] for k in c) / total_weight
    return ox, oy, dx, dy
def flowSim(vi, vj, alpha):
    """Dissimilarity of two flow vectors.

    The Euclidean difference of the vectors divided by alpha times the longer
    vector's length; values <= 1 are treated as "similar" by the caller.
    """
    len_i = math.sqrt(vi[0] ** 2 + vi[1] ** 2)
    len_j = math.sqrt(vj[0] ** 2 + vj[1] ** 2)
    diff = math.sqrt((vi[0] - vj[0]) ** 2 + (vi[1] - vj[1]) ** 2)
    return diff / (alpha * max(len_i, len_j))
def clusterSim(ci, cj, data, alpha):
    """Similarity between two clusters via their centroid flow vectors.

    Reads the centroid OD coordinates (columns 4-7: cx1, cy1, cx2, cy2) from
    each cluster's first member -- all members share the same centroid -- and
    delegates to flowSim.
    """
    rep_i = data[ci[0]]
    rep_j = data[cj[0]]
    vec_i = [rep_i[6] - rep_i[4], rep_i[7] - rep_i[5]]
    vec_j = [rep_j[6] - rep_j[4], rep_j[7] - rep_j[5]]
    return flowSim(vec_i, vec_j, alpha)
def merge(c, ci_ID, cj_ID, l):
    """Merge two clusters in place; the lower cluster ID survives.

    c: dict cluster_ID -> list of member item IDs (mutated).
    l: list item_ID -> cluster_ID (members of the absorbed cluster relabelled).
    """
    keep, absorb = sorted((ci_ID, cj_ID))
    for member in c[absorb]:
        l[member] = keep
        c[keep].append(member)
    del c[absorb]
def outputSLabeledData(filename, full_data):
    """Write the labelled flow records to a CSV file.

    Each row gets a running id followed by the original fields plus the
    per-level cluster labels appended to full_data (one 'cluster_levelN'
    column per label beyond the 9 original fields).
    """
    # with-statement instead of manual open/close: the original leaked the
    # file handle if csv writing raised.
    with open(filename, 'w', newline='', encoding="utf_8_sig") as rf:
        sheet = csv.writer(rf)
        header = (['id', 'ori_id', 'rent_station', 'x1', 'y1', 'return_station',
                   'x2', 'y2', 'st', 'et', ]
                  + ['cluster_level' + str(i + 1) for i in range(len(full_data[0]) - 9)])
        sheet.writerow(header)
        for i, row in enumerate(full_data):
            sheet.writerow([i] + row)
def outputSClusterData(filename, data, c):
    """Write one CSV row per cluster: ID, centroid OD coordinates and flow count.

    The centroid (columns 4-7) and flow count (column 9) are read from the
    cluster's first member; all members hold identical values after merging.
    """
    # with-statement instead of manual open/close: the original leaked the
    # file handle if csv writing raised.
    with open(filename, 'w', newline='', encoding="utf_8_sig") as rf:
        sheet = csv.writer(rf)
        sheet.writerow(['clusterID', 'x1', 'y1', 'x2', 'y2', 'flownum'])
        for clusterID, members in c.items():
            rep = data[members[0]]
            sheet.writerow([clusterID, rep[4], rep[5], rep[6], rep[7], rep[9]])
def Scluster(K, alpha):
    """Hierarchically cluster OD flows and write the results to CSV files.

    Level 0 merges individual flows whose midpoints lie within Radius metres
    of each other and whose flow vectors are similar (clusterSim <= 1),
    restricted to each flow's K nearest midpoints.  Every later level applies
    the SAME (Radius, K, alpha) criteria to the previous level's cluster
    centroids, and the loop stops early as soon as a level performs no merge,
    so it effectively clusters until no more merging is possible (bounded by
    num_level).

    Args:
        K: number of nearest neighbours (by flow midpoint) considered per item.
        alpha: similarity tolerance used by flowSim/clusterSim.

    Side effects: reads dataFile, deletes/rewrites the per-level cluster CSVs
    and the labelled-data CSV, prints progress.
    """
    num_level = 5   # upper bound on clustering levels
    Radius = 500    # midpoint distance limit, metres
    dataFile = "tbike0304.csv"  # could also come from sys.argv[1]
    ldataFile = dataFile[:-4] + '_ld ' + str(K) + ' ' + str(alpha) + ' r' + str(Radius) + ' 0.csv'
    clusterFile = dataFile[:-4] + '_c ' + str(K) + ' ' + str(alpha) + ' r' + str(Radius) + ' 0.csv'
    print('file: ', dataFile)
    print('alpha =', alpha, '; k =', K)
    # ---------------------------- initialize ------------------------------
    print('\ninitialize...')
    data, full_data = readData(dataFile)
    dataLen = len(data)
    c = [{} for i in range(num_level)]  # c[level]: cluster ID -> member item IDs
    l = [[] for i in range(num_level)]  # l[level]: item ID -> cluster ID
    level = 0
    # ---------------------------- level-0 clustering ----------------------
    for i in range(dataLen):
        c[0][i] = [i]
        l[0].append(i)
    print('start clustering...')
    data_arr = np.array(data)
    data_mid_point_x = (data_arr[:, 0] + data_arr[:, 2]) / 2
    data_mid_point_y = (data_arr[:, 1] + data_arr[:, 3]) / 2
    data_mid_point = np.concatenate((np.expand_dims(data_mid_point_x, axis=1),
                                     np.expand_dims(data_mid_point_y, axis=1)), axis=1)
    tree = BallTree(data_mid_point, metric='l2', leaf_size=2)
    for i in tqdm(range(dataLen)):
        neighbors = KNN(i, K, data_mid_point, tree)
        for j in neighbors:
            # enforce the Radius midpoint limit
            # (metres -> degrees via the 0.000009 factor -- TODO confirm)
            if (data_mid_point[i][0] - data_mid_point[j][0]) ** 2 + \
               (data_mid_point[i][1] - data_mid_point[j][1]) ** 2 > (Radius * 0.000009) ** 2:
                continue
            if l[level][i] != l[level][j]:
                if clusterSim(c[level][l[level][i]], c[level][l[level][j]], data, alpha) <= 1:
                    merge(c[level], l[level][i], l[level][j], l[level])
                    new_cluster_ID = min(l[level][i], l[level][j])
                    cox, coy, cdx, cdy = calcClusterFlow(c[level][new_cluster_ID], data)
                    num_of_flow_in_cluster = 0
                    for m in c[level][new_cluster_ID]:
                        num_of_flow_in_cluster += data[m][8]
                    # propagate the new centroid and flow count to every member
                    for m in c[level][new_cluster_ID]:
                        data[m][4], data[m][5], data[m][6], data[m][7], data[m][9] = \
                            cox, coy, cdx, cdy, num_of_flow_in_cluster
    if os.path.exists(ldataFile):
        os.remove(ldataFile)
    if os.path.exists(clusterFile):
        os.remove(clusterFile)
    outputSClusterData(clusterFile, data, c[level])
    print('num of cluster:', len(c[level]))
    num_c = [len(value) for key, value in c[level].items()]
    average_num_c = sum(num_c) / len(num_c)
    print('average num of c:', average_num_c)
    for i in range(dataLen):
        full_data[i].append(l[0][i])
    # ---------------------------- higher levels ---------------------------
    for level in range(1, num_level):
        print('The', level, 'run')
        ldataFile = dataFile[:-4] + '_ld ' + str(K) + ' ' + str(alpha) + ' ' + str(level) + '.csv'
        clusterFile = dataFile[:-4] + '_c ' + str(K) + ' ' + str(alpha) + ' level' + str(level + 1) + '.csv'
        # one representative row per previous-level cluster: its centroid flow,
        # with the cluster's flow count (col 9) also used as the weight (col 8)
        new_data = [[data[c[level - 1][i][0]][4], data[c[level - 1][i][0]][5],
                     data[c[level - 1][i][0]][6], data[c[level - 1][i][0]][7],
                     data[c[level - 1][i][0]][4], data[c[level - 1][i][0]][5],
                     data[c[level - 1][i][0]][6], data[c[level - 1][i][0]][7],
                     data[c[level - 1][i][0]][9], data[c[level - 1][i][0]][9]]
                    for i in c[level - 1].keys()]
        dataLen = len(new_data)
        for i in range(dataLen):
            c[level][i] = [i]
            l[level].append(i)
        data_arr = np.array(new_data)
        data_mid_point_x = (data_arr[:, 0] + data_arr[:, 2]) / 2
        data_mid_point_y = (data_arr[:, 1] + data_arr[:, 3]) / 2
        data_mid_point = np.concatenate((np.expand_dims(data_mid_point_x, axis=1),
                                         np.expand_dims(data_mid_point_y, axis=1)), axis=1)
        tree = BallTree(data_mid_point, metric='l2', leaf_size=2)
        for i in tqdm(range(dataLen)):
            neighbors = KNN(i, K, data_mid_point, tree)
            for j in neighbors:
                # BUG FIX: the original used (Radius ** 0.000009) here --
                # exponentiation instead of multiplication -- which evaluates
                # to ~1.00006 and effectively disabled the 500 m limit at
                # every level after the first.
                if (data_mid_point[i][0] - data_mid_point[j][0]) ** 2 + \
                   (data_mid_point[i][1] - data_mid_point[j][1]) ** 2 > (Radius * 0.000009) ** 2:
                    continue
                if l[level][i] != l[level][j]:
                    if clusterSim(c[level][l[level][i]], c[level][l[level][j]], new_data, alpha) <= 1:
                        merge(c[level], l[level][i], l[level][j], l[level])
                        new_cluster_ID = min(l[level][i], l[level][j])
                        cox, coy, cdx, cdy = calcClusterFlow(c[level][new_cluster_ID], new_data)
                        num_of_flow_in_cluster = 0
                        for m in c[level][new_cluster_ID]:
                            num_of_flow_in_cluster += new_data[m][8]
                        for m in c[level][new_cluster_ID]:
                            new_data[m][4], new_data[m][5], new_data[m][6], new_data[m][7], new_data[m][9] = \
                                cox, coy, cdx, cdy, num_of_flow_in_cluster
        if os.path.exists(ldataFile):
            os.remove(ldataFile)
        if os.path.exists(clusterFile):
            os.remove(clusterFile)
        outputSClusterData(clusterFile, new_data, c[level])
        print('num of cluster:', len(c[level]))
        num_c = [len(value) for key, value in c[level].items()]
        average_num_c = sum(num_c) / len(num_c)
        print('average num of c:', average_num_c)
        # converged: this level merged nothing, so every further level would be
        # identical -- stop instead of running all num_level passes
        if len(c[level]) == dataLen:
            num_level = level + 1
            break
    # ------------------ propagate labels back to raw flows ----------------
    prev_level = 0
    cur_level = 1
    while cur_level < num_level:
        cur2prev = [i for i in c[prev_level].keys()]  # position -> prev-level cluster ID
        for cur_cluster_ID, prev_cluster_ID in enumerate(c[prev_level].keys()):
            try:
                l_prev = []
                for i in c[cur_level][cur_cluster_ID]:
                    iter_ID = cur2prev[i]
                    for j in c[prev_level][iter_ID]:
                        l_prev.append(j)
                        full_data[j].append(cur_cluster_ID)
                # replace the member list with raw-flow IDs so the next pass
                # can propagate one level further down
                c[cur_level][cur_cluster_ID] = l_prev
            except (KeyError, IndexError):
                # cur_cluster_ID was absorbed by a merge at this level -- skip
                pass
        cur_level += 1
        prev_level += 1
    outputSLabeledData(ldataFile, full_data)
# ---------------------------------------------------------------------------
# Script entry point: run the clustering for each K at a fixed alpha, or
# sweep alpha at a fixed K (commented-out variant below).  The __main__ guard
# lets the functions above be imported without triggering a full run.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    alpha = 0.3
    for K in [8]:
        Scluster(K, alpha)
    # K = 634
    # for alpha in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    #     Scluster(K, alpha)