在Python中从边列表到邻接矩阵的快速转换

问题描述 投票:0回答:1

在Python中将边列表转换为邻接矩阵最有效的方法是什么?下面是我目前最好的尝试,但对于我的需要来说仍然很慢。

import string
import random
import pandas as pd

# 100 synthetic user names, each built from 10 random ASCII letters.
users = []
for _ in range(100):
    letters = [random.choice(string.ascii_letters) for _ in range(10)]
    users.append(''.join(letters))

# Number of edges (rows) in the generated edge list.
connections = 100000

# Columns are drawn in the same order as before: all sources, then all
# targets, then all events.
event_kinds = ['event1', 'event2', 'event3', 'event4', 'event5']
sources = [random.choice(users) for _ in range(connections)]
targets = [random.choice(users) for _ in range(connections)]
events = [random.choice(event_kinds) for _ in range(connections)]
edge_list = pd.DataFrame({'source': sources, 'target': targets, 'event': events})

# Count the edges for every (source, target) pair, then pivot the targets
# into columns; pairs that never occur are filled with 0.
pair_counts = edge_list.groupby(['source', 'target'])['target'].count()
adj_matrix = pair_counts.unstack(fill_value=0)
%%timeit
# Timed cell: only the groupby/count/unstack conversion below is measured.
adj_matrix = edge_list.groupby(['source', 'target'])['target'].count().unstack(fill_value=0)

9.95 ms ± 143 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

python pandas network-programming
1个回答
0
投票

您可以尝试对 `source`/`target` 列使用 `pd.factorize`,并使用 `numpy` 构建邻接矩阵:

def get_adj_matrix(edge_list: pd.DataFrame) -> pd.DataFrame:
    """Build a source-by-target edge-count matrix from an edge list.

    Args:
        edge_list: DataFrame with a ``"source"`` and a ``"target"`` column,
            one row per edge.

    Returns:
        DataFrame of ``int32`` counts. Rows are the distinct sources and
        columns the distinct targets, each in order of first appearance.
        Edges whose source or target is missing (NaN/None) are ignored.
    """
    source_codes, source_uniques = pd.factorize(edge_list["source"])
    target_codes, target_uniques = pd.factorize(edge_list["target"])

    # factorize() encodes missing values as -1. Used as an index, -1 would
    # wrap around to the last row/column and silently inflate its count (or
    # raise IndexError when there are no uniques at all), so keep only the
    # edges where both endpoints are present.
    valid = (source_codes >= 0) & (target_codes >= 0)

    out = np.zeros((len(source_uniques), len(target_uniques)), dtype=np.int32)
    # np.add.at accumulates repeated (row, col) pairs; plain fancy-index
    # assignment (out[rows, cols] += 1) would count each pair only once.
    np.add.at(out, (source_codes[valid], target_codes[valid]), 1)

    return pd.DataFrame(out, index=source_uniques, columns=target_uniques, copy=False)

基准:

import random
import string

import pandas as pd
import numpy as np
import perfplot
from matplotlib import pyplot as plt


def get_edge_list(connections: int, n_users: int = 100) -> pd.DataFrame:
    """Generate a random edge list to benchmark against.

    Args:
        connections: Number of edges (rows) to generate.
        n_users: Number of random user names that edge endpoints are drawn
            from. Defaults to 100, the value that used to be hard-coded.

    Returns:
        DataFrame with ``connections`` rows and the columns ``"source"``,
        ``"target"`` and ``"event"``.
    """
    # Each user name is 10 random ASCII letters.
    users = [
        "".join(random.choices(string.ascii_letters, k=10))
        for _ in range(n_users)
    ]
    events = ["event1", "event2", "event3", "event4", "event5"]

    # random.choices draws all k samples (with replacement) in a single call
    # instead of one Python-level random.choice call per edge.
    return pd.DataFrame(
        {
            "source": random.choices(users, k=connections),
            "target": random.choices(users, k=connections),
            "event": random.choices(events, k=connections),
        }
    )


def get_adj_matrix(edge_list: pd.DataFrame) -> pd.DataFrame:
    """Build a source-by-target edge-count matrix from an edge list.

    Args:
        edge_list: DataFrame with a ``"source"`` and a ``"target"`` column,
            one row per edge.

    Returns:
        DataFrame of ``int32`` counts. Rows are the distinct sources and
        columns the distinct targets, each in order of first appearance.
        Edges whose source or target is missing (NaN/None) are ignored.
    """
    source_codes, source_uniques = pd.factorize(edge_list["source"])
    target_codes, target_uniques = pd.factorize(edge_list["target"])

    # factorize() encodes missing values as -1. Used as an index, -1 would
    # wrap around to the last row/column and silently inflate its count (or
    # raise IndexError when there are no uniques at all), so keep only the
    # edges where both endpoints are present.
    valid = (source_codes >= 0) & (target_codes >= 0)

    out = np.zeros((len(source_uniques), len(target_uniques)), dtype=np.int32)
    # np.add.at accumulates repeated (row, col) pairs; plain fancy-index
    # assignment (out[rows, cols] += 1) would count each pair only once.
    np.add.at(out, (source_codes[valid], target_codes[valid]), 1)

    return pd.DataFrame(out, index=source_uniques, columns=target_uniques, copy=False)

# Turn on matplotlib's automatic figure layout for the benchmark plot.
plt.rcParams.update({"figure.autolayout": True})


def _groupby_kernel(edge_list):
    """Count edges per (source, target) pair and pivot targets into columns."""
    pair_counts = edge_list.groupby(["source", "target"])["target"].count()
    return pair_counts.unstack(fill_value=0)


def _crosstab_kernel(edge_list):
    """Cross-tabulate the source column against the target column."""
    return pd.crosstab(edge_list["source"], edge_list["target"])


# Benchmark configuration: each kernel receives the edge list produced by
# get_edge_list(n) for every n in n_range; labels pair up with kernels by
# position.
_benchmark_options = {
    "setup": get_edge_list,
    "kernels": [_groupby_kernel, _crosstab_kernel, get_adj_matrix],
    "labels": ["groupby", "crosstab", "pd.factorize"],
    "n_range": [10, 100, 1000, 10000, 20000, 30000, 50000, 100000, 200000],
    "xlabel": "N",
    "logx": True,
    "logy": True,
    "equality_check": None,
}

perfplot.show(**_benchmark_options)

结果:

© www.soinside.com 2019 - 2024. All rights reserved.