Python中的集群:如何对一组唯一的ID对进行分组?

问题描述 投票:0回答:1

我需要将彼此交叉的唯一ID对列表进行聚类。最简单的例子:

data_rows=[
    {'clid':1,'uid':'a'},
    {'clid':2,'uid':'b'},
    {'clid':3,'uid':'b'},
    {'clid':4,'uid':'c'},
    {'clid':4,'uid':'x'},
    {'clid':5,'uid':'d'},
    {'clid':5,'uid':'e'},
    {'clid':6,'uid':'d'},
    {'clid':6,'uid':'e'},
    {'clid':6,'uid':'f'},
]
# I transform data to this to this.
clid_dic = {1:['a'], 2:['b'],3:['b'], 4:['c','x'],5:['d','e'],6:['d','e','f']}
uid_dic = {'a':[1], 'b':[2,3], 'c':[4], 'x':[4], 'd':[5,6], 'e':[5,6], 'f':[6]}
# expected result
eid_dic = {
    1:{'clids':[1],'uids':['a']},
    2:{'clids':[2,3],'uids':['b']},
    3:{'clids':[4],'uids':['c','x']},
    4:{'clids':[5,6],'uids':['d','e','f']}
}

我一直都在努力创建解决方案,该解决方案将适用于更大的对对(> 500.000),并且不会陷入递归或其他错误。我的解决方案如下所示:

def crawl(clid):
    clids, uids = [],[]
    # add current clid to batch
    if clid not in clids: clids.append(clid)

    #collect its uids
    for uid in clid_dic[clid]:
        if uid not in uids: uids.append(uid)

    #remove from dic
    del clid_dic[clid]

    # collect new clids for each uid
    for uid in uids:
        if uid in uid_dic:
            for cld in uid_dic[uid]:
                if cld not in clids:
                    clids.append(cld)
    # for each collected clid call self
    for cld in clids:
        if cld in clid_dic:
            tmp  = crawl(cld)
            clids.extend([x for x in tmp[0] if x not in clids])
            uids.extend([x for x in tmp[1] if x not in uids])
    return(clids,uids)

def loop():
    i = 0
    eid_dic = {}
    while True:
        n = len(clid_dic)
        if n == 0: break
        i += 1
        clid = next(iter(clid_dic))
        tmp = crawl(clid)
        clids, uids= tmp[0], tmp[1]

        eid_dic[i] = {'clids': clids, 'uids': uids}
    print(eid_dic)
if __name__ == '__main__':
    loop()

似乎是基本任务(集群化),但是缺乏经验和知识使我走到了尽头。

python python-3.x algorithm graph-algorithm
1个回答
0
投票

如果项目顺序没有问题,这是一个解决方案(使用clid_dicuid_dic值的集合):

data_rows=[
    {'clid':1,'uid':'a'},
    {'clid':2,'uid':'b'},
    {'clid':3,'uid':'b'},
    {'clid':4,'uid':'c'},
    {'clid':4,'uid':'x'},
    {'clid':5,'uid':'d'},
    {'clid':5,'uid':'e'},
    {'clid':6,'uid':'d'},
    {'clid':6,'uid':'e'},
    {'clid':6,'uid':'f'},
]

clid_dic = {}
uid_dic = {}
for d in data_rows:
    clid_dic.setdefault(d['clid'], set()).add(d['uid'])
    uid_dic.setdefault(d['uid'], set()).add(d['clid'])

eid_dic, cnt = {}, 1

while clid_dic:
    k, v = clid_dic.popitem()
    i, s = {k}, set(v)

    to_remove = []
    for k2, v2 in clid_dic.items():
        if s & v2:
            s = s.union(v2)
            to_remove.append(k2)
    for k2 in to_remove:
        del clid_dic[k2]

    to_remove = []
    for k2, v2 in uid_dic.items():
        if i & v2:
            i = i.union(v2)
            to_remove.append(k2)
    for k2 in to_remove:
        del uid_dic[k2]

    eid_dic[cnt] = {'clids':list(i), 'uids':list(s)}
    cnt += 1

from pprint import pprint
pprint(eid_dic)

打印:

{1: {'clids': [5, 6], 'uids': ['e', 'd', 'f']},
 2: {'clids': [4], 'uids': ['c', 'x']},
 3: {'clids': [2, 3], 'uids': ['b']},
 4: {'clids': [1], 'uids': ['a']}}
© www.soinside.com 2019 - 2024. All rights reserved.