我想拆分一个数字列表并根据两个条件创建两个子集:
我目前的做法只是一种概率方法,而不是一种开发方法;在加上一个对条件进行计数检查的 while 循环之后,它可以用于较小的列表。然而,我希望有一种更稳健的方法,而不必把代码运行上百次迭代才能找到(或许能找到)最佳分割。
import numpy as np
from scipy.stats import norm
def split_into_normal_distributions(numbers, target_percentage):
    """Randomly split ``numbers`` into two subsets using two normal densities.

    Two normal distributions are centred on ``mean * target_percentage`` and
    ``mean * (1 - target_percentage)``, where ``mean`` is the mean of
    ``numbers``; both use the standard deviation of ``numbers``. Each number
    is placed in the first subset with probability
    ``pdf_1 / (pdf_1 + pdf_2)`` and in the second subset otherwise, so the
    result depends on NumPy's global random state.

    Args:
        numbers: Sequence of numeric values to split.
        target_percentage: Factor applied to the overall mean to obtain the
            centre of the first distribution; ``1 - target_percentage`` is
            used for the second.

    Returns:
        A ``(subset_1, subset_2)`` tuple of lists. Together they contain
        every element of ``numbers`` exactly once, in input order.
    """
    # Nothing to split; also avoids taking the mean of an empty sequence.
    if len(numbers) == 0:
        return [], []

    # Centres and common spread of the two normal distributions.
    overall_mean = np.mean(numbers)
    mean_1 = overall_mean * target_percentage
    mean_2 = overall_mean * (1 - target_percentage)
    std = np.std(numbers)

    subset_1 = []
    subset_2 = []

    for num in numbers:
        # Density of this number under each distribution.
        pdf_1 = norm.pdf(num, loc=mean_1, scale=std)
        pdf_2 = norm.pdf(num, loc=mean_2, scale=std)
        pdf_sum = pdf_1 + pdf_2

        if pdf_sum > 0:
            ratio = pdf_1 / pdf_sum
        else:
            # Both densities are zero (underflow) or NaN (zero spread), so
            # the ratio is undefined. With equal spreads the ratio tends to
            # 1 for the nearer centre and 0 for the farther one, so use that
            # limit instead of letting NaN send everything to subset_2.
            dist_1 = abs(num - mean_1)
            dist_2 = abs(num - mean_2)
            ratio = 0.5 if dist_1 == dist_2 else float(dist_1 < dist_2)

        # One uniform draw per number decides the assignment.
        if np.random.rand() < ratio:
            subset_1.append(num)
        else:
            subset_2.append(num)

    return subset_1, subset_2
# Demo input: the sample values to be split.
numbers = [
    10, 20, 30, 40, 50, 60, 70, 80, 10, 20, 25, 20, 21, 26, 65,
    95, 84, 65, 2, 3, 6, 198, 16, 651, 984, 651, 35, 61, 651, 16,
    56, 651, 651, 651, 2, 32, 615, 651, 984, 615, 351, 651, 651, 3, 5,
]

# Randomly partition the sample; 0.4 scales the overall mean for the first
# distribution and 0.6 (= 1 - 0.4) for the second.
subset_1, subset_2 = split_into_normal_distributions(numbers, 0.4)

# For each subset print its members, its share of the grand total, and its size.
for label, subset in (
    ("Subset 1 (40% mean):", subset_1),
    ("Subset 2 (60% mean):", subset_2),
):
    print(label, subset, sum(subset) / sum(numbers), len(subset))

# Bare expression: the value is only echoed by an interactive session or
# notebook; a plain script run discards it.
len(numbers)
谢谢你
这就是我的想法。这是一种简单的贪心方法,可以产生总数相等的分割。不完全符合您的要求,但希望有帮助。
import bisect
class Cluster:
    """A group of numbers stored together with their sum."""

    def __init__(self, nums):
        """Keep a reference to ``nums`` and record their sum as ``total``.

        Args:
            nums: The numbers belonging to this cluster. The object is stored
                as given (not copied); ``total`` is computed once here and is
                not refreshed if ``nums`` is mutated later.
        """
        self.total = sum(nums)
        self.nums = nums
def main():
    """Greedily split a fixed list of numbers into two groups and print stats.

    Every number starts as its own single-element cluster. While more than
    one cluster remains, the neighbouring pair whose totals are closest is
    merged into one cluster holding the first cluster's numbers followed by
    the negated numbers of the second, and the merged cluster is re-inserted
    with ``bisect.insort`` keyed on ``total``. In the single remaining
    cluster the sign of each entry tells which side it belongs to.

    Prints, one per line: the sum of the left group, the sum of the right
    group, the size of the left group, and the size of the right group.
    """
    nums = [
        10, 20, 30, 40, 50, 60, 70, 80, 10, 20, 25, 20, 21, 26, 65,
        95, 84, 65, 2, 3, 6, 198, 16, 651, 984, 651, 35, 61, 651, 16,
        56, 651, 651, 651, 2, 32, 615, 651, 984, 615, 351, 651, 651, 3, 5,
    ]
    clusters = [Cluster([value]) for value in nums]

    while len(clusters) > 1:
        # Position of the adjacent pair with the smallest gap between totals;
        # min() returns the first such position when several tie.
        idx = min(
            range(len(clusters) - 1),
            key=lambda i: abs(clusters[i].total - clusters[i + 1].total),
        )
        first, second = clusters[idx], clusters[idx + 1]

        # Negating the second cluster's numbers makes the merged total the
        # difference of the two totals and marks those numbers as being on
        # the opposite side from the first cluster's.
        merged = Cluster(first.nums + [-value for value in second.nums])

        del clusters[idx : idx + 2]
        # NOTE(review): bisect.insort assumes ``clusters`` is already ordered
        # by ``total``, but the list starts out in input order - confirm
        # whether an initial sort was intended. ``key=`` needs Python 3.10+.
        bisect.insort(clusters, merged, key=lambda c: c.total)

    (final,) = clusters
    # Positive entries form one side; negative entries (sign restored) form
    # the other. Entries equal to zero would appear on neither side.
    left = [value for value in final.nums if value > 0]
    right = [-value for value in final.nums if value < 0]

    for figure in (sum(left), sum(right), len(left), len(right)):
        print(figure)
# Run the demo; this executes unconditionally whenever the code is run or imported.
main()