我想计算 pandas DataFrame 列的皮尔逊相关性。我不仅仅想使用
DataFrame.corr()
,因为我还需要相关性的 p 值;因此,我正在使用scipy.stats.pearsonr(x, y)
。我现在的问题是我的数据框很大(形状:(1166, 49262)),所以我正在查看 (49262^2-49262)/2 相关性。
请告知我如何优化它以减少计算时间。
我的相关性代码:
# the variable `data` contains the dataframe of shape (1166, 49262)
# setting up output dataframes
dfcols = pd.DataFrame(columns=data.columns)
correlation = dfcols.T.join(dfcols, how='outer')
pvalues = correlation.copy()
# pairwise calculation
for r in range(len(data.columns)):
for c in range(r+1, len(data.columns)):
# iterate over all combinations of columns to calculate correlation
tmp = input.iloc[:, [r,c]].dropna()
if len(tmp) < 2:
# too few data points to calculate correlation coefficient
result = (0, 1)
else:
result = pearsonr(tmp.iloc[:, 0], tmp.iloc[:, 1])
correlation.iloc[r, c] = result[0]
pvalues.iloc[r, c] = result[1]
一些注意事项:
.dropna()
并捕获剩余少于两个数据点的情况。def foo():
data = load_df() # the pd.DataFrame of shape (1166, 49262)
cols = data.columns
for i in range(len(cols)):
logging.info(f"{i+1}/{len(cols)}")
for j in range(i+1, len(cols)):
tmp = data.iloc[:, [i, j]].dropna()
if len(tmp) < 2:
# You may ignore this for this post; I was looking for columns pairs with too few data points to correlate
logging.warn(f"correlating columns '{cols[i]}' and '{cols[j]}' results in less than 2 usable data points")
foo()
您可以尝试通过多重处理(使用共享数组)来加速计算:
import ctypes
import multiprocessing as mp
from itertools import combinations
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from tqdm import tqdm
df = None
def init_df(shared_arr, r, c, columns):
global df
a = np.frombuffer(shared_arr.get_obj()).reshape(r, c)
df = pd.DataFrame(
a,
columns=columns,
copy=False, # <-- don't copy the memory, use shared array
)
def get_values(comb):
r, c = comb
tmp = df.loc[:, [r, c]].dropna()
if len(tmp) < 2:
# too few data points to calculate correlation coefficient
result = (0, 1)
else:
result = pearsonr(tmp.iloc[:, 0], tmp.iloc[:, 1])
return r, c, result
if __name__ == "__main__":
# generate sample data:
def get_df(rows=1166, columns=49262):
out = []
for r in range(rows):
d = {}
for c in range(columns):
d[f"column_{c+1}"] = np.random.random() - 0.5
out.append(d)
return pd.DataFrame(out)
r, c = 1166, 49262
data = get_df(r, c)
# create shared array across processes
shared_arr = mp.Array(ctypes.c_double, r * c)
shared_arr[:] = data.values.ravel()
correlation = {}
pvalues = {}
with mp.Pool(
processes=16,
initializer=init_df,
initargs=(shared_arr, r, c, data.columns.to_list()),
) as pool:
for r, c, res in pool.imap_unordered(
get_values,
tqdm(
combinations(data.columns, 2),
total=1213347691, # all number of 2-combinations for 49262 columns
),
):
correlation.setdefault(c, {})[r] = res[0]
pvalues.setdefault(c, {})[r] = res[1]
correlation = pd.DataFrame(correlation)
pvalues = pd.DataFrame(pvalues)
print(correlation)
print()
print(pvalues)
在我的计算机(AMD 5700x,8x2 核心)上运行此程序显示平均每秒约 5000 次迭代(根据 tqdm),因此计算将在约 70 小时内完成。
我认为 128 个核心可以在几个小时内完成。
通过使用
pd.corr()
并将 R 值转换为具有 beta 分布的概率,您可以获得大约 200 倍的加速。
我建议通过查看 SciPy 是如何做到这一点并查看是否有任何适用于您的案例的改进来实现这一点。源代码可以在这里找到。这准确地告诉您他们如何实现 p 值。具体来说,他们采用 a = b = n / 2 - 1 的 beta 分布,从 -1 到 1,并在指定的 R 值处找到该分布的累积分布函数或生存函数。
因此,虽然
pearsonr()
不支持在所有列对之间进行矢量化,但底层 beta 分布确实 支持这一点。使用此功能,您可以将 pd.corr()
为您提供的相关性转换为相关性加 p 值。
我已经根据您现有的算法对此进行了检查,并且它与机器 epsilon 内的算法一致。我还用 NA 值对其进行了测试。
就速度而言,它比原始解决方案快大约 200 倍,比仅使用一个核心的多核解决方案更快。
这是代码。请注意,只有
calculate_corr_fast
和 get_pvalue_vectorized
对解决方案很重要。剩下的只是设置测试数据或进行比较。
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import scipy
M = 1000
N = 200
P = 0.1
A = np.random.rand(M, N)
A[np.random.rand(M, N) < P] = np.nan
df = pd.DataFrame(A, columns=[f"a{i}" for i in range(1, N + 1)])
# setting up output dataframes
def calculate_corr(data):
dfcols = pd.DataFrame(columns=data.columns)
correlation = dfcols.T.join(dfcols, how='outer')
pvalues = correlation.copy()
# pairwise calculation
for r in range(len(data.columns)):
for c in range(r, len(data.columns)):
# iterate over all combinations of columns to calculate correlation
tmp = data.iloc[:, [r,c]].dropna()
if len(tmp) < 2:
# too few data points to calculate correlation coefficient
result = (0, 1)
else:
result = pearsonr(tmp.iloc[:, 0], tmp.iloc[:, 1])
correlation.iloc[r, c] = result[0]
correlation.iloc[c, r] = result[0]
pvalues.iloc[r, c] = result[1]
pvalues.iloc[c, r] = result[1]
return correlation, pvalues
def get_pvalue_vectorized(r, ab, alternative='two-sided'):
"""Get p-value from beta dist given the statistic, and alternative."""
assert len(r.shape) == 2
assert len(ab.shape) == 2
diag = np.arange(r.shape[0])
# This is just to keep squareform happy. These don't actually
# get sent to the beta distribution function.
r[diag, diag] = 0
ab[diag, diag] = 0
# Avoid doing repeated computations of r,c and c,r
rsq = scipy.spatial.distance.squareform(r)
r[diag, diag] = 1
absq = scipy.spatial.distance.squareform(ab)
kwargs = dict(a=absq, b=absq, loc=-1, scale=2)
if alternative == 'less':
pvalue = scipy.stats.beta.cdf(rsq, **kwargs)
elif alternative == 'greater':
pvalue = scipy.stats.beta.sf(rsq, **kwargs)
elif alternative == 'two-sided':
pvalue = 2 * (scipy.stats.beta.sf(np.abs(rsq), **kwargs))
else:
message = "`alternative` must be 'less', 'greater', or 'two-sided'."
raise ValueError(message)
# Put back into 2d matrix
pvalue = scipy.spatial.distance.squareform(pvalue)
return pvalue
def calculate_corr_fast(data):
correlation = data.corr()
# For each pair of data values, count how many cases where both data values are
# defined at the same position, using matrix multiply as a pairwise boolean and.
data_notna = data.notna().values.astype('float32')
value_count = data_notna.T @ data_notna
# This is the a and b parameter for the beta distribution. It is different
# for every p-value, because each one can potentially have a different number
# of missing values
ab = value_count / 2 - 1
pvalues = get_pvalue_vectorized(correlation.values, ab)
invalid = value_count < 2
pvalues[invalid] = np.nan
# Convert back to dataframe
pvalues = pd.DataFrame(pvalues, columns=correlation.columns, index=correlation.index)
return correlation, pvalues
correlation, pvalues = calculate_corr(df)
correlation_fast, pvalues_fast = calculate_corr_fast(df)
assert np.allclose(pvalues_fast.values.astype('float64'), pvalues.values.astype('float64'))
assert np.allclose(correlation_fast.values.astype('float64'), correlation.values.astype('float64'))
1000x200 数据帧的基准测试结果:
Original code
40.5 s ± 1.18 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
@Andrej Kesely's answer
~20 seconds
My answer
190 ms ± 209 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
基准注释: