I want to calculate the Pearson correlations between the columns of a pandas DataFrame. I don't want to just use DataFrame.corr(), because I also need the p-value of each correlation; therefore, I am using scipy.stats.pearsonr(x, y). My problem now is that my dataframe is huge (shape: (1166, 49262)), so I am looking at (49262^2 - 49262) / 2 ≈ 1.21 billion pairwise correlations.
Please advise how I can optimize this to reduce the computation time.
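For reference, a minimal sketch of the API difference (the two small example columns are made up for illustration): scipy.stats.pearsonr returns the coefficient together with the two-sided p-value, which DataFrame.corr() does not expose.

import pandas as pd
from scipy.stats import pearsonr

# two small example columns, made up for illustration
x = pd.Series([1.0, 2.0, 3.0, 4.0])
y = pd.Series([1.1, 1.9, 3.2, 3.8])

r, p = pearsonr(x, y)  # correlation coefficient and two-sided p-value
r_only = x.corr(y)     # Series.corr() gives the coefficient but no p-value
print(r, p, r_only)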
My correlation code:
import pandas as pd
from scipy.stats import pearsonr

# the variable `data` contains the dataframe of shape (1166, 49262)

# setting up output dataframes
dfcols = pd.DataFrame(columns=data.columns)
correlation = dfcols.T.join(dfcols, how='outer')
pvalues = correlation.copy()

# pairwise calculation
for r in range(len(data.columns)):
    for c in range(r + 1, len(data.columns)):
        # iterate over all combinations of columns to calculate correlation
        tmp = data.iloc[:, [r, c]].dropna()
        if len(tmp) < 2:
            # too few data points to calculate correlation coefficient
            result = (0, 1)
        else:
            result = pearsonr(tmp.iloc[:, 0], tmp.iloc[:, 1])
        correlation.iloc[r, c] = result[0]
        pvalues.iloc[r, c] = result[1]
Some notes: the columns contain missing values, so for each pair I .dropna() and catch the case where fewer than two usable data points remain:

import logging

def foo():
    data = load_df()  # the pd.DataFrame of shape (1166, 49262)
    cols = data.columns
    for i in range(len(cols)):
        logging.info(f"{i+1}/{len(cols)}")
        for j in range(i + 1, len(cols)):
            tmp = data.iloc[:, [i, j]].dropna()
            if len(tmp) < 2:
                # You may ignore this for this post; I was looking for column pairs with too few data points to correlate
                logging.warning(f"correlating columns '{cols[i]}' and '{cols[j]}' results in less than 2 usable data points")

foo()
You can try to speed the computation up via multiprocessing (using a shared array):
import ctypes
import multiprocessing as mp
from itertools import combinations

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from tqdm import tqdm

df = None

def init_df(shared_arr, r, c, columns):
    # rebuild the dataframe inside each worker as a view of the shared array
    global df
    a = np.frombuffer(shared_arr.get_obj()).reshape(r, c)
    df = pd.DataFrame(
        a,
        columns=columns,
        copy=False,  # <-- don't copy the memory, use shared array
    )

def get_values(comb):
    r, c = comb
    # iterate over all combinations of columns to calculate correlation
    tmp = df.loc[:, [r, c]].dropna()
    if len(tmp) < 2:
        # too few data points to calculate correlation coefficient
        result = (0, 1)
    else:
        result = pearsonr(tmp.iloc[:, 0], tmp.iloc[:, 1])
    return r, c, result

if __name__ == "__main__":
    # generate sample data:
    def get_df(rows=1166, columns=49262):
        out = []
        for r in range(rows):
            d = {}
            for c in range(columns):
                d[f"column_{c+1}"] = np.random.random() - 0.5
            out.append(d)
        return pd.DataFrame(out)

    r, c = 1166, 49262
    data = get_df(r, c)

    # create shared array across processes
    shared_arr = mp.Array(ctypes.c_double, r * c)
    shared_arr[:] = data.values.ravel()

    correlation = {}
    pvalues = {}

    with mp.Pool(
        processes=16,
        initializer=init_df,
        initargs=(shared_arr, r, c, data.columns.to_list()),
    ) as pool:
        for r, c, res in pool.imap_unordered(
            get_values,
            tqdm(
                combinations(data.columns, 2),
                total=1213347691,  # C(49262, 2) column pairs
            ),
        ):
            correlation.setdefault(c, {})[r] = res[0]
            pvalues.setdefault(c, {})[r] = res[1]

    correlation = pd.DataFrame(correlation)
    pvalues = pd.DataFrame(pvalues)

    print(correlation)
    print()
    print(pvalues)
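The point of the shared mp.Array is that the (1166, 49262) array is placed in shared memory once, and each worker rebuilds a zero-copy DataFrame view over it in init_df via np.frombuffer; only the pair of column names is pickled per task, not the data itself.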
Running this on my machine (AMD 5700x, 8 cores / 16 threads) shows on average ~5000 iterations per second (according to tqdm), so the computation would finish in roughly 70 hours.
I think 128 cores could finish it in a few hours.
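As a rough sanity check of those estimates, plain arithmetic on the numbers above (the 128-core figure assumes roughly linear scaling, which is an assumption):

n_cols = 49262
n_pairs = n_cols * (n_cols - 1) // 2       # 1,213,347,691 column pairs
rate = 5_000                               # observed iterations per second (tqdm)
print(n_pairs / rate / 3600)               # ~67 hours with 16 worker processes
print(n_pairs / (rate * 128 / 16) / 3600)  # ~8.4 hours if throughput scaled to 128 workers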