使用 scipy.stats.chi2_contingency 时出现 ValueError

问题描述 投票:0回答:1
import numpy as np
from scipy.stats import chi2_contingency

class Node_chi2:
    def __init__(self, chi2, num_samples, num_samples_per_class, predicted_class):
        self.chi2 = chi2
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None

class DecisionTree_chi2():

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y, depth=0):
        classes = list(set(y))
        num_samples_per_class = [np.sum(y == i) for i in classes]
        predicted_class = classes[np.argmax(num_samples_per_class)]
        node = Node_chi2(
              chi2=0,  # chi2 is 0 at the start
              num_samples=len(y),
              num_samples_per_class=num_samples_per_class,
              predicted_class=predicted_class,
          )

        if depth < self.max_depth:
            idx, thr = self.best_split(X, y, classes)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self.fit(X_left, y_left, depth + 1)
                node.right = self.fit(X_right, y_right, depth + 1)

        self.node = node
        return self.node
    def best_split(self, X, y, classes):
        m, n = X.shape
        if m <= 1:
            return None, None

        best_chi2 = 0  # Best chi2 score
        best_idx, best_thr = None, None

        for idx in range(n):
            thresholds, classes_sorted = zip(*sorted(zip(X[:, idx], y)))
            for i in range(1, m):  # iterate through each threshold
                if thresholds[i] == thresholds[i - 1]:
                    continue
                y_left = np.array(classes_sorted[:i])
                y_right = np.array(classes_sorted[i:])
                left_counts = np.bincount(y_left, minlength=len(classes)) 
                right_counts = np.bincount(y_right, minlength=len(classes))
                chi2, p, _ , _ = chi2_contingency(np.array([left_counts, right_counts]))

                if chi2 > best_chi2:
                    best_chi2 = chi2
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2

        return best_idx, best_thr

    def predict(self, X):
        yhat = []
        for sample in X: 
            node = self.node
            while node.left:
                if sample[node.feature_index] < node.threshold:
                    node = node.left
                else:
                    node = node.right
            yhat.append(node.predicted_class)
        return np.array(yhat)
    



from sklearn.datasets import make_moons, make_classification

X, y = make_moons(n_samples=100, noise=0.1, random_state=42)

def plot_decision_boundaries(clf, X, y, label=''):
    # Generate a grid of points to make predictions:
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, 0.1), np.arange(x2_min, x2_max, 0.1))

    # Use the classifier to make predictions on the grid:
    Z = clf.predict(np.c_[xx1.ravel(), xx2.ravel()])
    '''Here is where the magic happens! Above we have created the (x1, x2) grid,
    and now we calculate the class for each and every point.'''

    Z = Z.reshape(xx1.shape)

    colors = {0: 'C0', 1: 'C1', 2: 'C2'}
    # colors dictionary indexed by class label

    # Create a contour plot to display the decision boundaries:
    fig = plt.figure(figsize=(4, 4))

    from matplotlib.colors import ListedColormap
    n_classes = len(np.unique(y))
    custom_cmap = ListedColormap(list(colors.values())[:n_classes])

    plt.contourf(xx1, xx2, Z, cmap=custom_cmap, alpha=0.4)

    plt.scatter(X[y==0,0], X[y==0,1], c=colors[0], marker='o', edgecolors='k', alpha=0.5)
    plt.scatter(X[y==1,0], X[y==1,1], c=colors[1], marker='o', edgecolors='k', alpha=0.5)
    plt.scatter(X[y==2,0], X[y==2,1], c=colors[2], marker='o', edgecolors='k', alpha=0.5)

    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.title('Decision Boundaries - '+label+' DT')
    plt.show()


clf_gini = DecisionTree_chi2(max_depth=5)
clf_gini.fit(X, y)

plot_decision_boundaries(clf_gini, X, y, label='Gini')

我收到的错误是“ValueError:内部计算的预期频率表在 (0, 0) 处有一个零元素。” 。我找到的唯一解决方案是放置:

left_counts = np.bincount(y_left, minlength=len(classes)) + 1e-10  # add small constant to avoid zero frequencies
                right_counts = np.bincount(y_right, minlength=len(classes)) + 1e-10  # add small constant to avoid zero frequencies

但我对这个答案不满意,我想修复它而不必添加一个小常数。这对我来说似乎不正确,所以

python scikit-learn decision-tree scipy.stats contingency
1个回答
0
投票

导致错误的输入本质上是

from scipy import stats
stats.chi2_contingency([[0, 1], [0, 31]])

我不知道您的用例是什么,但让我们将

chi2_contingency
文档中的示例改编为您的数据。

下表总结了参与者多年来定期服用阿司匹林或安慰剂的实验结果。记录了缺血性中风的病例:

                  Aspirin   Control/Placebo
Ischemic stroke     0           1
No stroke           0          31

有证据表明阿司匹林可以降低缺血性中风的风险吗?

如果没有参与者服用阿司匹林,则不能期望测试返回有意义的结果。

您需要考虑这在您的问题背景下意味着什么,并重新评估是否要提供该输入。

© www.soinside.com 2019 - 2024. All rights reserved.