我想用 sklearn 计算矩阵 X 的每一列与输出 y 之间的马修斯相关系数(MCC)。这是我的代码:
from sklearn.metrics import matthews_corrcoef
import numpy as np
# Binary feature matrix: one row per sample, one column per feature.
X = np.array(
    [
        [1, 0, 0, 0, 0],
        [1, 0, 0, 1, 0],
        [1, 0, 0, 0, 1],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 1, 0],
        [1, 1, 0, 0, 1],
        [1, 0, 1, 0, 0],
        [1, 0, 1, 1, 0],
        [1, 0, 1, 0, 1],
        [1, 0, 0, 0, 0],
    ]
)
# Binary target, one entry per sample.
y = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0])
n_sample, n_feature = X.shape

# Absolute MCC of every feature column against y (each row of X.T is a column of X).
rcf_all = [abs(matthews_corrcoef(feature, y)) for feature in X.T]

# Average the per-feature scores into one number.
rcf = np.mean(rcf_all)
这段代码在这个小例子上运行良好,但当矩阵非常大、包含许多特征时,逐列循环计算就会非常慢。有没有不使用循环、一次性对所有列完成计算的最高效方法,以加快计算速度?
下面的代码改编自我之前给出的 numba 答案:
from math import sqrt
from statistics import mean
from timeit import timeit
import numba
import numpy as np
from sklearn.metrics import matthews_corrcoef
def get_all_mcc_normal(X, y):
    """Return the mean absolute MCC between each column of X and y.

    Baseline implementation: calls ``matthews_corrcoef`` once per column
    and averages the absolute values with ``statistics.mean``.

    Args:
        X: 2-D array; every column is scored against ``y``.
        y: 1-D array of target labels.

    Returns:
        The mean of the per-column absolute coefficients.
    """
    _, column_count = X.shape
    scores = [
        abs(matthews_corrcoef(X[:, col], y)) for col in range(column_count)
    ]
    return mean(scores)
@numba.njit
def _fill_cm(m, c1, c2):
    """Reset ``m`` in place and tally the label pairs of ``c1`` and ``c2``.

    For every position ``k``, ``m[c1[k], c2[k]]`` is incremented by one, so
    the labels are used directly as row/column indices into ``m``.
    """
    m[:] = 0
    # Like zip(), only walk as far as the shorter of the two inputs.
    pair_count = min(len(c1), len(c2))
    for k in range(pair_count):
        m[c1[k], c2[k]] += 1
@numba.njit
def mcc(confusion_matrix):
    """Return the Matthews correlation coefficient of a 2x2 confusion matrix.

    Args:
        confusion_matrix: 2x2 array of pair counts; entries ``[0, 0]`` and
            ``[1, 1]`` are the agreeing pairs, ``[1, 0]`` and ``[0, 1]`` the
            disagreeing ones.

    Returns:
        The coefficient as a float, or ``0.0`` when the denominator is zero
        (an entire row or column of the matrix is empty).
    """
    # https://stackoverflow.com/a/56875660/992687
    # Work in float64 regardless of the matrix dtype so the products of the
    # counts below are not evaluated in a narrower type.
    tp = float(confusion_matrix[0, 0])
    tn = float(confusion_matrix[1, 1])
    fp = float(confusion_matrix[1, 0])
    fn = float(confusion_matrix[0, 1])
    x = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    # Explicit guard instead of adding an epsilon under the square root: the
    # epsilon slightly biased every non-degenerate result. A zero denominator
    # implies a zero numerator, so 0.0 is the same value as before.
    if x == 0.0:
        return 0.0
    return ((tp * tn) - (fp * fn)) / sqrt(x)
@numba.njit
def get_all_mcc_numba(X, y):
    """Return the mean absolute MCC between each column of X and y.

    A single 2x2 confusion matrix is allocated once and refilled for every
    column, so no per-column allocation happens inside the loop.

    Args:
        X: 2-D array of labels; each column is compared against ``y``. The
            values are used as indices into the 2x2 matrix, so they are
            assumed to be 0 or 1.
        y: 1-D array of labels (same 0/1 assumption).

    Returns:
        The mean of the per-column absolute coefficients.
    """
    columns = X.shape[1]
    # float64 keeps integer counts exact up to 2**53. With float32 the
    # "+= 1" updates stop having any effect once a cell reaches 2**24, and
    # the products formed from the counts lose precision much earlier.
    confusion_matrix = np.zeros((2, 2), dtype=np.float64)
    total = 0.0
    for i in range(columns):
        _fill_cm(confusion_matrix, X[:, i], y)
        total += abs(mcc(confusion_matrix))
    return total / columns
# Small hand-written example (same data as in the question).
# NOTE: X and y are reassigned with random data further down before either
# function is called, so this example does not take part in the benchmark.
X = np.array(
[
[1, 0, 0, 0, 0],
[1, 0, 0, 1, 0],
[1, 0, 0, 0, 1],
[1, 1, 0, 0, 0],
[1, 1, 0, 1, 0],
[1, 1, 0, 0, 1],
[1, 0, 1, 0, 0],
[1, 0, 1, 1, 0],
[1, 0, 1, 0, 1],
[1, 0, 0, 0, 0],
]
)
y = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0])
# Make a 2000 x 100 matrix and a length-2000 target of random 0/1 values
# (fixed seed, so the data is reproducible):
np.random.seed(42)
X = np.random.randint(low=0, high=2, size=(2000, 100), dtype="uint8")
y = np.random.randint(low=0, high=2, size=2000, dtype="uint8")
# Both implementations must agree on the benchmark data.
# NOTE(review): this is also the first call of the njit-decorated function,
# so it presumably absorbs the JIT compilation before the timing below.
assert np.allclose(get_all_mcc_normal(X, y), get_all_mcc_numba(X, y))
# Time one single call of each implementation (number=1).
t_normal = timeit("get_all_mcc_normal(X, y)", number=1, globals=globals())
t_numba = timeit("get_all_mcc_numba(X, y)", number=1, globals=globals())
# Report both timings in seconds.
print(f"{t_normal=}")
print(f"{t_numba=}")
在我的计算机(AMD 5700X)上的输出:
t_normal=0.08777441698475741
t_numba=0.00017813200247474015