在 Python 中将边列表转换为邻接矩阵最高效的方法是什么?下面是我目前最好的尝试,但对我的需求来说仍然太慢。
import string
import random
import pandas as pd
# Synthetic test data: 100 random 10-letter user ids and 100,000 random
# directed edges between them, each tagged with one of five event labels.
connections = 100000
users = [
    ''.join([random.choice(string.ascii_letters) for _ in range(10)])
    for _ in range(100)
]

event_names = ['event1', 'event2', 'event3', 'event4', 'event5']
sources = [random.choice(users) for _ in range(connections)]
targets = [random.choice(users) for _ in range(connections)]
events = [random.choice(event_names) for _ in range(connections)]
edge_list = pd.DataFrame({'source': sources, 'target': targets, 'event': events})

# Count edges per (source, target) pair, then pivot targets into columns;
# pairs that never occur are filled with 0.
pair_counts = edge_list.groupby(['source', 'target'])['target'].count()
adj_matrix = pair_counts.unstack(fill_value=0)
%%timeit
# Time the baseline: count edges per (source, target) pair with groupby,
# then unstack the targets into columns, filling absent pairs with 0.
adj_matrix = edge_list.groupby(['source', 'target'])['target'].count().unstack(fill_value=0)
9.95 ms ± 143 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
您可以尝试对 `source`/`target` 两列使用 `pd.factorize` 进行整数编码,然后用 numpy 构建邻接矩阵:
def get_adj_matrix(edge_list):
    """Build a source-by-target edge-count matrix from an edge list.

    Parameters
    ----------
    edge_list : pandas.DataFrame
        One row per edge, with "source" and "target" columns.

    Returns
    -------
    pandas.DataFrame
        int32 counts of edges for every (source, target) pair. Rows are the
        distinct sources and columns the distinct targets, each in order of
        first appearance. Edges whose source or target is missing are
        ignored.
    """
    source_codes, source_uniques = pd.factorize(edge_list["source"])
    target_codes, target_uniques = pd.factorize(edge_list["target"])

    # factorize encodes missing values as -1. Used as an array index, -1
    # would wrap around and be counted on the last row/column, so drop them.
    valid = (source_codes >= 0) & (target_codes >= 0)

    out = np.zeros((len(source_uniques), len(target_uniques)), dtype=np.int32)
    # np.add.at is unbuffered, so repeated (source, target) pairs accumulate
    # instead of overwriting each other as `out[i, j] += 1` would.
    np.add.at(out, (source_codes[valid], target_codes[valid]), 1)
    return pd.DataFrame(out, index=source_uniques, columns=target_uniques, copy=False)
基准:
import random
import string
import pandas as pd
import numpy as np
import perfplot
from matplotlib import pyplot as plt
def get_edge_list(connections):
    """Generate a random edge list with `connections` rows.

    Draws 100 random 10-letter user ids, then samples a source user, a
    target user and an event label ("event1".."event5") for every edge.
    Returns a DataFrame with "source", "target" and "event" columns.
    """
    alphabet = string.ascii_letters
    event_names = ["event1", "event2", "event3", "event4", "event5"]

    user_pool = []
    for _ in range(100):
        user_pool.append("".join([random.choice(alphabet) for _ in range(10)]))

    # Columns are drawn one after another (source, target, event) so the
    # sequence of random draws matches a dict literal built in that order.
    sources = [random.choice(user_pool) for _ in range(connections)]
    targets = [random.choice(user_pool) for _ in range(connections)]
    events = [random.choice(event_names) for _ in range(connections)]

    return pd.DataFrame({"source": sources, "target": targets, "event": events})
def get_adj_matrix(edge_list):
    """Build a source-by-target edge-count matrix from an edge list.

    Parameters
    ----------
    edge_list : pandas.DataFrame
        One row per edge, with "source" and "target" columns.

    Returns
    -------
    pandas.DataFrame
        int32 counts of edges for every (source, target) pair. Rows are the
        distinct sources and columns the distinct targets, each in order of
        first appearance. Edges whose source or target is missing are
        ignored.
    """
    source_codes, source_uniques = pd.factorize(edge_list["source"])
    target_codes, target_uniques = pd.factorize(edge_list["target"])

    # factorize encodes missing values as -1. Used as an array index, -1
    # would wrap around and be counted on the last row/column, so drop them.
    valid = (source_codes >= 0) & (target_codes >= 0)

    out = np.zeros((len(source_uniques), len(target_uniques)), dtype=np.int32)
    # np.add.at is unbuffered, so repeated (source, target) pairs accumulate
    # instead of overwriting each other as `out[i, j] += 1` would.
    np.add.at(out, (source_codes[valid], target_codes[valid]), 1)
    return pd.DataFrame(out, index=source_uniques, columns=target_uniques, copy=False)
# Turn on matplotlib's "figure.autolayout" setting for the benchmark plot.
plt.rcParams["figure.autolayout"] = True


def _groupby_kernel(edge_list):
    """Baseline: groupby on (source, target), count, then unstack."""
    counts = edge_list.groupby(["source", "target"])["target"].count()
    return counts.unstack(fill_value=0)


def _crosstab_kernel(edge_list):
    """Alternative: let pandas cross-tabulate source against target."""
    return pd.crosstab(edge_list["source"], edge_list["target"])


# Compare the three approaches over increasing edge counts on log-log axes.
# equality_check=None: the kernels' outputs are not compared with each other.
perfplot.show(
    setup=get_edge_list,
    kernels=[_groupby_kernel, _crosstab_kernel, get_adj_matrix],
    labels=["groupby", "crosstab", "pd.factorize"],
    n_range=[10, 100, 1000, 10000, 20000, 30000, 50000, 100000, 200000],
    xlabel="N",
    logx=True,
    logy=True,
    equality_check=None,
)
结果: