比率分布 - 积分可能发散，或缓慢收敛或在 100 次迭代后未能收敛

Question

import numpy as np
from scipy.stats import norm, rv_continuous
from scipy.special import erf
import scipy.integrate as integrate

class normal_ratio_wiki(rv_continuous):
    def _pdf(self, z, mu_x, mu_y, sigma_x, sigma_y):
    
        a_z = np.sqrt(((1/(sigma_x**2))*(np.power(z,2))) + (1/(sigma_y**2)))
        b_z = ((mu_x/(sigma_x**2)) * z) + (mu_y/sigma_y**2)
        c = ((mu_x**2)/(sigma_x**2)) + ((mu_y**2)/(sigma_y**2))
        d_z = np.exp(((b_z**2)-((c*a_z**2))) / (2*(a_z**2)))
        pdf_z = ((b_z * d_z) / (a_z**3)) * (1/(np.sqrt(2*math.pi)*sigma_x*sigma_y)) * \
        (norm.cdf(b_z/a_z) - norm.cdf(-b_z/a_z)) + ((1/((a_z**2) * math.pi * sigma_x * sigma_y))*np.exp(-c/2))

        return pdf_z

    def _cdf(self, z, mu_x, mu_y, sigma_x, sigma_y):
        cdf_z = integrate.quad(self._pdf, -np.inf, np.inf, args=(mu_x, mu_y, sigma_x, sigma_y))[0]
        
        return cdf_z

rng1 = np.random.default_rng(99)
rng2 = np.random.default_rng(88)

# Sample Data 1
x = rng1.normal(141739.951, 1.223808e+06, 1000)
y = rng2.normal(333.91, 64.494571, 1000)

# Sample Data 2
# x = rng1.normal(500, 20, 1000)
# y = rng2.normal(400, 10, 1000)

z = x / y

# 1st approach with normal_ratio_wiki 
mu_x = x.mean()
mu_y = y.mean()
sigma_x = x.std()
sigma_y = y.std()

rng3 = np.random.default_rng(11)
nr_wiki_inst = normal_ratio_wiki(name='normal_ratio_wiki', seed=rng3)
nr_wiki_vars = nr_wiki_inst.rvs(mu_x, mu_y, sigma_x, sigma_y, size = 100)
nr_wiki_params = nr_wiki_vars.fit(nr_wiki_vars)

您好，我正在通过使用

scipy

定义自定义分布来模拟两个不相关正态分布的比率分布。

方法来自这里。

使用任一方法从上面的自定义分布调用

scipy.dist.rvs

或

fit

方法时，我分别收到以下错误

RuntimeError: Failed to converge after 100 iterations.

和

IntegrationWarning: The integral is probably divergent, or slowly convergent.

。如果我们注释

_cdf(...)

，那么该过程需要花费大量时间来运行

scipy.dist.rvs

，但随后在调用

fit

时失败。尝试了不同的有界区间但没有成功。

我相信实施自定义

_rvs

、

_ppf

和/或

_fit

方法可能有助于解决问题。根据上面的

_pdf

和

_cdf

方法我们应该如何定义它呢？请指教。

请注意，示例

integrate.quad(nr_wiki_inst.pdf, -np.inf, np.inf, args=(mu_x, mu_y, sigma_x, sigma_y))

可以单独运行，没有任何问题。

Answer 1

_cdf

方法无法按编写的方式工作，因为它不返回任何内容。它本质上也与通用实现相同，因此其余部分假设它已被删除。我还假设

_pdf

实现是正确的，而且我还没有看到任何其他建议。当

mu

为 0 且

sigma

为 1 时，它会简化为柯西分布，即使参数发生变化，它似乎也会积分为 1，并且 PDF 似乎与随机变量一致（见下文）。

使用

quad

整合 PDF 以获得 CDF 肯定会相当慢，但有一点可以让它变得更快，那就是在 PDF 中用

scipy.stats.norm.cdf

替换

scipy.special.ndtr

的使用。 SciPy 分布有很多开销，但特殊函数会直接跳转到评估正态分布的 CDF。

from scipy import stats, special
x = 1.5
%timeit special.ndtr(x)
# 1.43 µs ± 726 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
%timeit stats.norm.cdf(x)
# 107 µs ± 21.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

这会将您的发行版的 CDF 速度提高 10 倍至 20 倍。

提高RVS的速度也很容易。

_rvs

的通用实现将 CDF（数值）反转以获得 PPF，并传入均匀分布的随机数以获得每个观测值（逆变换采样）。我们重写

_rvs

来生成两个正态随机变量并获取比率。

    def _rvs(self, mu_x, mu_y, sigma_x, sigma_y, size=None, random_state=None):
      x = random_state.normal(mu_x, sigma_x, size)
      y = random_state.normal(mu_y, sigma_y, size)
      return x/y

检查 RVS 实现与 PDF 之间的一致性：

rng = np.random.default_rng(8235834852452842)
nr_wiki_inst = normal_ratio_wiki(name='normal_ratio_wiki', seed=rng)

mu_x = 1
mu_y = 2
sigma_x = 3
sigma_y = 4
dist = nr_wiki_inst(mu_x, mu_y, sigma_x, sigma_y)

rvs = dist.rvs(size = 100_000)
plt.hist(rvs, density=True, bins=np.linspace(-10, 10, 300))

x = np.linspace(-10, 10, 300)
pdf = dist.pdf(x)
plt.plot(x, pdf)

这会根据分布生成 100_000 个随机变量（以 8 毫秒为单位），并允许我们直观地将直方图与 PDF 进行比较。一致性很好，直方图的明显过量可以通过以下事实来解释：直方图在 [-10, 10] 范围内而不是整个实线进行归一化。

您可以使用

scipy.stats.sampling.NumericalInversePolynomial

获得快速（但近似）的 PPF 方法。描述

NumericalInverseHermite

的作用更容易：在多个点评估 CDF 并使用该数据对 PPF 进行插值。

NumericalInversePolynomial

也可以使用插值，但更高级，可以直接处理 PDF。

dist2 = sampling.NumericalInversePolynomial(dist, center=0, random_state=rng)
# show that `dist2`'s `ppf` method is the inverse of the original `cdf`
dist2.ppf(dist.cdf(1.5))  # 1.5000000001244032

我认为

fit

遇到了麻烦，部分原因是它不知道

sigma_x

和

sigma_y

不能为负数。为此，您需要定义

_argcheck

:

    def _argcheck(self, mu_x, mu_y, sigma_x, sigma_y,):
        return (sigma_x > 0.) & (sigma_y > 0.)

但是还有其他问题，我想我现在不会调试它。相反，我们可以定义

_shape_info

，以便我们可以在此分布中使用

scipy.stats.fit

函数，而不是分布的

fit

方法。

# _ShapeInfo is private/not documented
# so don't use this for production code
from scipy.stats._distn_infrastructure import _ShapeInfo
...
    def _shape_info(self):
        # parameter name, discrete (True or False), domain, domain includes endpoints
        return [_ShapeInfo("mu_x", False, (-np.inf, np.inf), (False, False)),
                _ShapeInfo("mu_y", False, (-np.inf, np.inf), (False, False)),
                _ShapeInfo("sigma_x", False, (0, np.inf), (False, False)),
                _ShapeInfo("sigma_y", False, (0, np.inf), (False, False))]

现在可以使用

fit

功能了：

# generate random variates to fit
rvs = dist.rvs(size=200)

# assign rough bounds to parameters
bounds = dict(mu_x=(-10, 10), mu_y=(-10, 10), sigma_x=(1e-1, 10), sigma_y=(1e-1, 10))
res = stats.fit(nr_wiki_inst, rvs, bounds=bounds)

# Show that negative log likelihood of fitted parameters is better
print(res.nllf((1, 2, 3, 4, 0, 1)))  # 454.43887891645863
print(res.nllf())  # 453.19908132025944
# The latter corresponds with the fitted parameters.
# It's lower, so it's a better fit according to MLE

大家一起：

import numpy as np
import matplotlib.pyplot as plt
from scipy import integrate, stats, special
from scipy.stats import sampling
from scipy.stats._distn_infrastructure import _ShapeInfo

class normal_ratio_wiki(stats.rv_continuous):
    def _pdf(self, z, mu_x, mu_y, sigma_x, sigma_y):
    
        a_z = np.sqrt(((1/(sigma_x**2))*(np.power(z,2))) + (1/(sigma_y**2)))
        b_z = ((mu_x/(sigma_x**2)) * z) + (mu_y/sigma_y**2)
        c = ((mu_x**2)/(sigma_x**2)) + ((mu_y**2)/(sigma_y**2))
        d_z = np.exp(((b_z**2)-((c*a_z**2))) / (2*(a_z**2)))
        pdf_z = ((b_z * d_z) / (a_z**3)) * (1/(np.sqrt(2*np.pi)*sigma_x*sigma_y)) * \
        (special.ndtr(b_z/a_z) - special.ndtr(-b_z/a_z)) + ((1/((a_z**2) * np.pi * sigma_x * sigma_y))*np.exp(-c/2))

        return pdf_z
        
    def _rvs(self, mu_x, mu_y, sigma_x, sigma_y, size=None, random_state=None):
        x = random_state.normal(mu_x, sigma_x, size)
        y = random_state.normal(mu_y, sigma_y, size)
        return x/y
    
    def _argcheck(self, mu_x, mu_y, sigma_x, sigma_y):
        return (sigma_x > 0) & (sigma_y > 0)

    def _shape_info(self):
        return [_ShapeInfo("mu_x", False, (-np.inf, np.inf), (False, False)),
                _ShapeInfo("mu_y", False, (-np.inf, np.inf), (False, False)),
                _ShapeInfo("sigma_x", False, (0, np.inf), (False, False)),
                _ShapeInfo("sigma_y", False, (0, np.inf), (False, False))]

rng = np.random.default_rng(11)
nr_wiki_inst = normal_ratio_wiki(name='normal_ratio_wiki', seed=rng)

mu_x = 1
mu_y = 2
sigma_x = 3
sigma_y = 4
dist = nr_wiki_inst(mu_x, mu_y, sigma_x, sigma_y)
rvs = dist.rvs(size = 100_000, random_state=rng)
x = np.linspace(-10, 10, 300)
pdf = dist.pdf(x)
plt.plot(x, pdf)
plt.hist(rvs, density=True, bins=np.linspace(-10, 10, 300))

dist2 = sampling.NumericalInversePolynomial(dist, center=0, random_state=rng)
dist2.ppf(dist.cdf(1.5))  # 1.5000000001244032

rvs = dist.rvs(size=200, random_state=rng)
bounds = dict(mu_x=(-10, 10), mu_y=(-10, 10), sigma_x=(1e-1, 10), sigma_y=(1e-1, 10))
res = stats.fit(nr_wiki_inst, rvs, bounds=bounds)
print(res.nllf((1, 2, 3, 4, 0, 1)))  # 399.2571952688255
print(res.nllf())  # 396.24324681778955

比率分布 - 积分可能发散，或缓慢收敛或在 100 次迭代后未能收敛

问题描述投票：0回答：1

1个回答

最新问题

比率分布 - 积分可能发散，或缓慢收敛或在 100 次迭代后未能收敛

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1