数据呈正态分布,但 ks 测试返回统计量为 1.0

问题描述 投票:0回答:1

我有一个

age
变量。当我使用
kde
qq-plot
绘制它时,分布看起来呈正态;然而,当我执行 ks-test 时,测试
statistics = 1.0
p = 0.0

有人可以帮我解释一下这个观察结果吗?我对其他变量也使用了 ks-test,其结果与这些变量的可视化结果是一致的。

# library
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as sps                         

# the age variable 
age = np.array([87, 88, 75, 76, 80, 88, 90, 80, 83, 85, 71, 73, 75, 93, 95, 68, 69,
       66, 68, 78, 80, 83, 81, 82, 85, 76, 77, 88, 90, 80, 81, 85, 86, 87,
       88, 92, 80, 82, 84, 72, 76, 61, 64, 86, 87, 82, 84, 69, 71, 73, 74,
       64, 66, 77, 80, 60, 62, 86, 88, 91, 90, 92, 79, 80, 82, 84, 88, 89,
       69, 70, 73, 75, 82, 85, 88, 89, 81, 83, 84, 86, 88, 71, 73, 75, 70,
       73, 72, 73, 68, 69, 71, 75, 77, 83, 85, 77, 78, 66, 66, 68, 68, 69,
       69, 70, 71, 71, 72, 92, 94, 97, 74, 78, 82, 84, 85, 87, 65, 67, 71,
       73, 81, 83, 85, 78, 79, 80, 75, 78, 68, 70, 72, 79, 81, 83, 80, 81,
       78, 81, 82, 61, 62, 67, 68, 71, 73, 88, 90, 81, 82, 80, 82, 84, 85,
       86, 83, 84, 70, 72, 75, 76, 77, 73, 75, 66, 69, 71, 69, 73, 89, 91,
       92, 69, 71, 73, 66, 68, 69, 82, 84, 78, 80, 63, 65, 96, 98, 78, 80,
       70, 72, 73, 75, 76, 75, 78, 83, 84, 61, 63, 71, 72, 74, 89, 91, 74,
       77, 66, 67, 80, 83, 77, 80, 82, 71, 74, 76, 82, 84, 86, 69, 74, 75,
       70, 71, 86, 87, 70, 72, 77, 79, 81, 83, 62, 65, 76, 78, 73, 75, 76,
       78, 73, 75, 73, 74, 76, 78, 67, 71, 81, 83, 85, 76, 78, 73, 74, 86,
       88, 70, 71, 74, 75, 77, 79, 81, 81, 84, 86, 76, 79, 78, 80, 82, 65,
       67, 78, 81, 70, 71, 74, 78, 74, 75, 73, 75, 67, 68, 76, 78, 81, 65,
       68, 69, 71, 89, 91, 93, 77, 79, 68, 73, 80, 82, 77, 78, 80, 82, 81,
       83, 73, 75, 66, 68, 69, 75, 77, 78, 81, 73, 75, 73, 76, 73, 76, 76,
       78, 77, 79, 80, 82, 84, 77, 79, 78, 80, 71, 73, 76, 77, 81, 75, 79,
       60, 62, 64, 70, 72, 73, 84, 87, 89, 68, 70, 89, 90, 93, 79, 81, 74,
       75, 77, 73, 75, 66, 66, 68, 72, 72, 73, 80, 82, 86, 61, 63, 65])

# Visualization: KDE of `age` on the left axes, Q-Q plot on the right axes
fig, ax = plt.subplots(1, 2, figsize=(8, 4))         # (row, col) grid; 8 wide x 4 high
sns.kdeplot(age, color='red', alpha=.1,
            fill=True,                               # real boolean, not the string 'true'
            ax=ax[0])                                # Distribution plot
# NOTE(review): `sm` is never imported in this snippet; presumably
# `import statsmodels.api as sm` is intended -- confirm before running.
sm.qqplot(age, fit=True, line='45', ax=ax[1])        # qqplot
fig.tight_layout()                                   # Tight layout
plt.show()                                           # show plots

# KS test (because n > 50)
print('n =', age.size)
# No distribution parameters are passed, so 'norm' here is the standard
# normal (mean 0, standard deviation 1), not a normal fitted to `age`.
# The result is neither assigned nor printed by this statement.
sps.kstest(age, 'norm')

python statistics normal-distribution kolmogorov-smirnov
1个回答
0
投票

@Timur Shtatland 是正确的。您的代码是:

sps.kstest(age, 'norm')

在不指定正态分布参数的情况下,您将数据与标准正态分布(平均值为 0,标准差为 1)进行比较。因此,检验的 p 值实际上为零也就不足为奇了。相反,您应该使用数据的平均值和标准差:

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy import stats

# Data
data = np.array([87, 88, 75, 76, 80, 88, 90, 80, 83, 85, 71, 73, 75, 93, 95, 68, 69,
        66, 68, 78, 80, 83, 81, 82, 85, 76, 77, 88, 90, 80, 81, 85, 86, 87,
        88, 92, 80, 82, 84, 72, 76, 61, 64, 86, 87, 82, 84, 69, 71, 73, 74,
        64, 66, 77, 80, 60, 62, 86, 88, 91, 90, 92, 79, 80, 82, 84, 88, 89,
        69, 70, 73, 75, 82, 85, 88, 89, 81, 83, 84, 86, 88, 71, 73, 75, 70,
        73, 72, 73, 68, 69, 71, 75, 77, 83, 85, 77, 78, 66, 66, 68, 68, 69,
        69, 70, 71, 71, 72, 92, 94, 97, 74, 78, 82, 84, 85, 87, 65, 67, 71,
        73, 81, 83, 85, 78, 79, 80, 75, 78, 68, 70, 72, 79, 81, 83, 80, 81,
        78, 81, 82, 61, 62, 67, 68, 71, 73, 88, 90, 81, 82, 80, 82, 84, 85,
        86, 83, 84, 70, 72, 75, 76, 77, 73, 75, 66, 69, 71, 69, 73, 89, 91,
        92, 69, 71, 73, 66, 68, 69, 82, 84, 78, 80, 63, 65, 96, 98, 78, 80,
        70, 72, 73, 75, 76, 75, 78, 83, 84, 61, 63, 71, 72, 74, 89, 91, 74,
        77, 66, 67, 80, 83, 77, 80, 82, 71, 74, 76, 82, 84, 86, 69, 74, 75,
        70, 71, 86, 87, 70, 72, 77, 79, 81, 83, 62, 65, 76, 78, 73, 75, 76,
        78, 73, 75, 73, 74, 76, 78, 67, 71, 81, 83, 85, 76, 78, 73, 74, 86,
        88, 70, 71, 74, 75, 77, 79, 81, 81, 84, 86, 76, 79, 78, 80, 82, 65,
        67, 78, 81, 70, 71, 74, 78, 74, 75, 73, 75, 67, 68, 76, 78, 81, 65,
        68, 69, 71, 89, 91, 93, 77, 79, 68, 73, 80, 82, 77, 78, 80, 82, 81,
        83, 73, 75, 66, 68, 69, 75, 77, 78, 81, 73, 75, 73, 76, 73, 76, 76,
        78, 77, 79, 80, 82, 84, 77, 79, 78, 80, 71, 73, 76, 77, 81, 75, 79,
        60, 62, 64, 70, 72, 73, 84, 87, 89, 68, 70, 89, 90, 93, 79, 81, 74,
        75, 77, 73, 75, 66, 66, 68, 72, 72, 73, 80, 82, 86, 61, 63, 65])


# Estimate the normal parameters (mean and standard deviation) from the data
mu, std = norm.fit(data)

# Shapiro-Wilk test of normality; it needs no distribution parameters
shapiro_test = stats.shapiro(data)

print("\nShapiro-Wilk Test:")
print(f"Statistic: {shapiro_test.statistic:.2f}")
print(f"p-value: {shapiro_test.pvalue:.2f}")

# KS test against a normal with the fitted parameters, instead of the
# default standard normal (mean 0, standard deviation 1).
# NOTE: mu and std are estimated from the same sample being tested, so this
# p-value is only approximate -- the classical KS null distribution assumes
# fully specified parameters (a Lilliefors-type correction addresses this).
ks_statistic, p_value = stats.kstest(data, 'norm', args=(mu, std))

print("\nKolmogorov-Smirnov Test:")
print(f"Statistic: {ks_statistic:.2f}")
print(f"p-value: {p_value:.2f}")

产生这个:

Shapiro-Wilk Test:
Statistic: 0.99
p-value: 0.07

Kolmogorov-Smirnov Test:
Statistic: 0.05
p-value: 0.21

我还会绘制数据的直方图,然后在其上叠加一条正态密度曲线,其参数由数据估计得到:

# Histogram of the data, normalised so the bar areas sum to 1 and the
# bars are comparable with a probability density curve
plt.hist(data, 20, density=True, alpha=0.5, color='gray')

# Evaluate the fitted normal PDF across the current x-axis range
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)

plt.plot(x, p, 'k', linewidth=2)
plt.title(f"Fit results: mu = {mu:.2f},  std = {std:.2f}")

# The plotted variable is age, so label the axis accordingly
plt.xlabel('Age')
plt.ylabel('Density')
plt.show()

© www.soinside.com 2019 - 2024. All rights reserved.