我想用 sklearn 计算 Matthews 相关系数(MCC),以找出二维 NumPy 数组中不同特征(布尔向量)之间的相关性。目前我的做法是循环遍历每一列,逐对计算特征之间的相关系数。
这是我的代码:
from sklearn.metrics import matthews_corrcoef
import numpy as np
# Toy data set: 10 samples (rows) by 5 binary features (columns).
X = np.array(
    [
        [1, 0, 0, 0, 0],
        [1, 0, 0, 1, 0],
        [1, 0, 0, 0, 1],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 1, 0],
        [1, 1, 0, 0, 1],
        [1, 0, 1, 0, 0],
        [1, 0, 1, 1, 0],
        [1, 0, 1, 0, 1],
        [1, 0, 0, 0, 0],
    ]
)
n_sample, n_feature = X.shape

# Absolute MCC of every unordered pair of feature columns (i < j),
# collected in the order (0, 1), (0, 2), ..., (n_feature - 2, n_feature - 1).
rff_all = [
    abs(matthews_corrcoef(X[:, i], X[:, j]))
    for i in range(n_feature)
    for j in range(i + 1, n_feature)
]

# Mean absolute feature-feature correlation over all pairs.
rff = np.mean(rff_all)
由于我的二维 NumPy 数组非常大,这种做法非常慢,不切实际。在不使用循环的情况下,一次性完成这类计算的最高效方法是什么?
编辑:后来我想到了下面这个办法,但它仍然很慢。
from more_itertools import distinct_combinations
# Absolute MCC for every distinct pair of column indices of X.
all_c = []
for i, j in distinct_combinations(np.arange(X.shape[1]), r=2):
    # Slice each column directly. Indexing with the index pair
    # (X[:, item]) is advanced indexing and materialises a fresh
    # (n_samples, 2) copy every time it is evaluated -- the original did
    # that twice per pair only to pick one column out of each copy.
    c = matthews_corrcoef(X[:, i], X[:, j])
    all_c.append(abs(c))
您可以使用 numba 来加快计算速度,例如:
import numba
import numpy as np
@numba.njit
def _fill_cm(m, c1, c2):
    """Reset ``m`` in place and tally the value pairs of ``c1`` and ``c2``.

    For each position ``k`` the cell ``m[c1[k], c2[k]]`` is incremented by
    one, so the entries of ``c1`` and ``c2`` are used directly as row and
    column indices into ``m``. Nothing is returned; ``m`` is the output.
    """
    # Clear whatever the previous pair left behind in the shared buffer.
    m[:] = 0
    # Walk both columns in lockstep, stopping at the shorter one.
    for k in range(min(len(c1), len(c2))):
        m[c1[k], c2[k]] += 1
@numba.njit
def mcc(confusion_matrix):
    """Return the Matthews correlation coefficient of a 2x2 count matrix.

    Parameters
    ----------
    confusion_matrix : 2x2 array of counts
        ``confusion_matrix[a, b]`` is the number of samples whose first
        label is ``a`` and whose second label is ``b``.

    Returns
    -------
    float
        The coefficient, or ``0.0`` when any row or column sum of the
        matrix is zero (the denominator would be zero).
    """
    # https://stackoverflow.com/a/56875660/992687
    # Promote to float64 first: the caller may hand over a float32 matrix,
    # and the products below need more precision than float32 offers.
    # Which diagonal cell is called "tp" does not matter -- the formula is
    # unchanged when tp/tn and fp/fn are swapped together.
    tp = float(confusion_matrix[0, 0])
    tn = float(confusion_matrix[1, 1])
    fp = float(confusion_matrix[1, 0])
    fn = float(confusion_matrix[0, 1])

    x = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if x == 0.0:
        # A zero factor forces the numerator to zero as well; return 0
        # explicitly instead of adding a small epsilon to every result.
        return 0.0
    # np.sqrt keeps this snippet self-contained: it imports numpy but not
    # math.sqrt.
    return ((tp * tn) - (fp * fn)) / np.sqrt(x)
@numba.njit
def get_all_mcc_numba(X):
    """Return ``abs(MCC)`` for every unordered pair of columns of ``X``.

    Parameters
    ----------
    X : 2-D array
        Its entries are used as indices into a 2x2 count matrix, so they
        must be the integers 0 and 1.

    Returns
    -------
    list of float
        One value per column pair ``(i, j)`` with ``i < j``, in the order
        ``(0, 1), (0, 2), ..., (1, 2), ...``.
    """
    rows, columns = X.shape
    # float64 counters: a float32 cell cannot represent integers above
    # 2**24 exactly, so "+= 1" silently stops counting on very tall inputs
    # and the products inside mcc() lose precision well before that.
    confusion_matrix = np.zeros((2, 2), dtype=np.float64)
    out = []
    for i in range(columns):
        c1 = X[:, i]
        for j in range(i + 1, columns):
            # Reuse the single 2x2 buffer: _fill_cm clears and refills it
            # for this pair of columns.
            c2 = X[:, j]
            _fill_cm(confusion_matrix, c1, c2)
            out.append(abs(mcc(confusion_matrix)))
    return out
基准:
from timeit import timeit
from math import sqrt
import numba
import numpy as np
from sklearn.metrics import matthews_corrcoef
def get_all_mcc_normal(X):
    """Baseline: ``abs(matthews_corrcoef)`` for every pair of columns of ``X``.

    Returns a list with one value per column pair ``(a, b)``, ``a < b``,
    ordered ``(0, 1), (0, 2), ..., (1, 2), ...``.
    """
    # Two-name unpacking also insists that X is two-dimensional.
    _, n_cols = X.shape
    return [
        abs(matthews_corrcoef(X[:, a], X[:, b]))
        for a in range(n_cols)
        for b in range(a + 1, n_cols)
    ]
@numba.njit
def _fill_cm(m, c1, c2):
    """Zero ``m`` in place, then count the value pairs of ``c1`` and ``c2``.

    Each position ``pos`` adds one to ``m[c1[pos], c2[pos]]``; the column
    entries themselves serve as the row and column index. The function
    returns nothing and communicates through ``m``.
    """
    # The buffer is shared between calls, so start from a clean slate.
    m[:] = 0
    # Pairwise traversal that ends with the shorter of the two columns.
    n_pairs = min(len(c1), len(c2))
    for pos in range(n_pairs):
        m[c1[pos], c2[pos]] += 1
@numba.njit
def mcc(confusion_matrix):
    """Return the Matthews correlation coefficient of a 2x2 count matrix.

    Parameters
    ----------
    confusion_matrix : 2x2 array of counts
        ``confusion_matrix[a, b]`` is the number of samples whose first
        label is ``a`` and whose second label is ``b``.

    Returns
    -------
    float
        The coefficient, or ``0.0`` when any row or column sum of the
        matrix is zero (the denominator would be zero).
    """
    # https://stackoverflow.com/a/56875660/992687
    # Promote to float64 first: the caller may hand over a float32 matrix,
    # and the products below need more precision than float32 offers.
    # Which diagonal cell is called "tp" does not matter -- the formula is
    # unchanged when tp/tn and fp/fn are swapped together.
    tp = float(confusion_matrix[0, 0])
    tn = float(confusion_matrix[1, 1])
    fp = float(confusion_matrix[1, 0])
    fn = float(confusion_matrix[0, 1])

    x = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if x == 0.0:
        # A zero factor forces the numerator to zero as well; return 0
        # explicitly instead of adding a small epsilon to every result.
        return 0.0
    return ((tp * tn) - (fp * fn)) / sqrt(x)
@numba.njit
def get_all_mcc_numba(X):
    """Return ``abs(MCC)`` for every unordered pair of columns of ``X``.

    Parameters
    ----------
    X : 2-D array
        Its entries are used as indices into a 2x2 count matrix, so they
        must be the integers 0 and 1.

    Returns
    -------
    list of float
        One value per column pair ``(i, j)`` with ``i < j``, in the order
        ``(0, 1), (0, 2), ..., (1, 2), ...``.
    """
    rows, columns = X.shape
    # float64 counters: a float32 cell cannot represent integers above
    # 2**24 exactly, so "+= 1" silently stops counting on very tall inputs
    # and the products inside mcc() lose precision well before that.
    confusion_matrix = np.zeros((2, 2), dtype=np.float64)
    out = []
    for i in range(columns):
        c1 = X[:, i]
        for j in range(i + 1, columns):
            # Reuse the single 2x2 buffer: _fill_cm clears and refills it
            # for this pair of columns.
            c2 = X[:, j]
            _fill_cm(confusion_matrix, c1, c2)
            out.append(abs(mcc(confusion_matrix)))
    return out
# Reproducible 2000 x 100 matrix of 0/1 values.
np.random.seed(42)
X = np.random.randint(0, 2, size=(2000, 100), dtype="uint8")

# Sanity check: both implementations must agree before they are timed.
# This is also the first call of the numba version, so any one-time
# compilation cost is presumably paid here rather than in t_numba below.
reference = get_all_mcc_normal(X)
candidate = get_all_mcc_numba(X)
assert np.allclose(reference, candidate)

# One timed run of each implementation.
t_normal = timeit(lambda: get_all_mcc_normal(X), number=1)
t_numba = timeit(lambda: get_all_mcc_numba(X), number=1)

print(f"{t_normal=}")
print(f"{t_numba=}")
在我的计算机(AMD 5700X)上的输出:
t_normal=4.352220230008243
t_numba=0.008588693017372862