从 100 个组合中选择 14 个组合,包括列表中找到的所有数字,并考虑它们出现的频率

问题描述 投票:0回答:1

我无法完成这个脚本。你能帮帮我吗?

由 1 到 49 的 5 个数字组成的 100 个组合的列表。此脚本的目的是以等比例的方式选择这些组合中的 14 个,即包括列表中找到的所有数字,并考虑它们的出现频率。

#!/bin/bash
#
# Given a list of 100 combinations, each made of 5 numbers ranging from 1 to
# 49, the goal is to choose 14 of these combinations in an equiproportional
# way, i.e. by covering the numbers found in the list while taking their
# frequency of occurrence into account.
#
# This script only performs the first stage: it counts how often each number
# occurs and prints how many of the 14 picked combinations should contain it.
# The selection itself is not implemented here.
#
# Usage: <script> [file]      (file defaults to ./tmp)
# Input: one combination per line, comma separated. Only the first five
#        fields are used; any further fields are ignored.

#######################################
# Print the occurrence count of every number found in the input and the
# number of picked combinations (out of 14) that should contain it.
# Arguments: $1 - path of the combination list (default: ./tmp)
# Outputs:   report on stdout; warnings and errors on stderr
# Returns:   0 on success, 1 if the input file cannot be read
#######################################
report_frequencies() {
  local -r input=${1:-./tmp}
  local -r total=100 picks=14 max_number=49
  local -r digits_re='^[0-9]+$'
  local -a numbers=() fields=()
  local f1 f2 f3 f4 f5 _ignored field joined
  local line_no=0 count_numbers=0 occurrences=0 valid=1 i=0

  if [[ ! -r "$input" || -d "$input" ]]; then
    printf 'error: cannot read %s\n' "$input" >&2
    return 1
  fi

  # Count the frequency of appearance of the numbers in each combination.
  # The "|| [[ -n ... ]]" part keeps a last line without trailing newline.
  while IFS=',' read -r f1 f2 f3 f4 f5 _ignored || [[ -n "$f1" ]]; do
    line_no=$((line_no + 1))

    joined="${f1}${f2}${f3}${f4}${f5}"
    if [[ -z "${joined//[[:space:]]/}" ]]; then
      continue # blank line: nothing to count
    fi

    # Tolerate stray blanks or a carriage return around the numbers.
    fields=(
      "${f1//[[:space:]]/}" "${f2//[[:space:]]/}" "${f3//[[:space:]]/}"
      "${f4//[[:space:]]/}" "${f5//[[:space:]]/}"
    )

    valid=1
    for field in "${fields[@]}"; do
      # 10# forces base 10 so that "08" is not parsed as an invalid octal.
      if [[ ! "$field" =~ $digits_re ]] \
        || ((10#$field < 1 || 10#$field > max_number)); then
        valid=0
        break
      fi
    done
    if ((valid == 0)); then
      printf 'warning: %s: line %d skipped (expected five numbers between 1 and %d)\n' \
        "$input" "$line_no" "$max_number" >&2
      continue
    fi

    for field in "${fields[@]}"; do
      i=$((10#$field))
      numbers[i]=$((${numbers[i]:-0} + 1))
    done
  done < "$input"

  for ((i = 1; i <= max_number; i++)); do
    occurrences=${numbers[i]:-0}
    # Only numbers found in at least one combination are reported.
    if ((occurrences > 0)); then
      # Target share = picks/total * occurrences. The integer division
      # truncates (27 occurrences -> 3.78 -> 3; anything below 1 gives 0).
      # NOTE(review): earlier comments described this as rounding to the
      # nearest integer, but the computation has always truncated; that
      # behaviour is kept so the figures match the published output.
      echo "Numéro $i: trouvée dans $occurrences combinaisons. Doit être présente dans: $((occurrences * picks / total)) combinaisons sur 14."
      count_numbers=$((count_numbers + 1))
    fi
  done

  echo
  echo "Among the 100 combinations submitted, $count_numbers different numbers have been identified."
  echo "Here is an equiproportional selection (14 combinations) in terms of frequency of appearance of all the numbers found in the list :"
}

report_frequencies "${1:-./tmp}"

tmp 文件的内容(我们只使用前 5 列,即组合中的 5 个数字,不关心最后 2 列的值)。在这个例子中,100 个组合仅由 16 个不同的数字组成:

8,9,27,28,37,.00000006733472960649,1042634448565697025823.29223835702291394323
9,21,27,28,37,.00000007433230311830,816578840834854780620.13253676488983761033
8,9,21,27,28,.00000006642084702349,809583352099871759384.05331707930246135263
9,27,28,29,37,.00000005982349878999,800530309603058114263.12955413405628415282
6,9,27,28,37,.00000007677535066051,800164830173237853165.68451385478173226421
8,9,27,28,29,.00000005345626725995,793672305840536872484.65748459622627763414
6,8,9,27,28,.00000006860387217224,793309957409349354243.65941057807868650664
8,9,21,27,37,.00000006472830766877,768637109161657531107.36215020404453633090
9,27,28,37,38,.00000008008797293678,765950367200722139792.70808848724282392434
8,9,27,28,38,.00000007156392006835,759388603783204921837.34470299821797400650
8,9,27,29,37,.00000005209409197692,753530794822570333580.65275316963420871913
6,8,9,27,37,.00000006685570486120,753186772863701201928.68059602960855865670
9,27,28,36,37,.00000003432168134102,727853914382720354900.15532540758878547038
8,9,27,28,36,.00000003066870055556,721618516642592812941.04427522172377915669
8,9,27,37,38,.00000006974032466839,720981057266385504975.79715852947176227786
9,27,28,37,46,.00000005144956964138,718022526388470928972.97785383018648684494
8,9,27,28,46,.00000004597360570323,711871352437307481044.77587576701276610693
8,9,27,36,37,.00000002988719919004,685121265291608575149.31043507275577808827
8,9,27,37,46,.00000004480210397722,675867082757046931729.25940513590106310814
9,21,27,28,29,.00000005901156037548,621594665702391687553.43013182247200790357
6,9,21,27,28,.00000007573333777678,621310878741079917810.57774679751990516394
9,27,28,37,49,.00000004442255830508,614678394023520350905.01043018227203039752
8,9,27,28,49,.00000003969450462039,609412551258561144676.58410104523326163896
6,9,27,28,29,.00000006095106771602,609100022246271695262.14788563171252710286
7,9,27,28,37,.00000007444479121835,605182789838561454149.80337014063999988950
7,8,9,27,28,.00000006652136260785,599998294261143659553.68993902199947450672
9,21,27,28,38,.00000007900100037444,594744079934757091037.51330773951167761787
9,21,27,29,37,.00000005750782483469,590156313956482285380.80530485500334762951
6,9,21,27,37,.00000007380349706568,589886880069292444789.82126753661936208955
8,9,21,27,29,.00000005138705886617,585100547581320219884.62506487681855338656
6,8,9,21,27,.00000006594832371326,584833421887324228676.39262034659385409017
9,27,28,29,38,.00000006358092043491,583055350733788901897.78183793916738435592
6,9,27,28,38,.00000008159749196288,582789158767559040767.62734661030033784836
8,9,27,37,49,.00000003868300726306,578590333498391952651.15525467892911055788
6,9,27,29,37,.00000005939790955191,578293611245010343632.90742371286894491226
6,8,9,27,29,.00000005307597502506,573339470578273090908.31403107777688051328
7,8,9,27,37,.00000006482626190997,569652220746158360861.35522743767092811838
9,21,27,28,36,.00000003385585951356,565162999031530919194.51233471587012689512
9,21,27,37,38,.00000007698789292116,564663748465602707521.63668015584129901528
8,9,21,27,38,.00000006879379279072,559826372459868951918.89617650416784242267
9,21,27,28,46,.00000005075128413741,557529136502695279289.55156342081339103387
6,9,27,28,36,.00000003496858535285,553802685700155918271.01357691825459534638
9,27,29,37,38,.00000006196074823194,553566199338014782208.76998317905139074649
6,9,27,37,38,.00000007951822058074,553313470545019867022.55535333566508575555
8,9,27,29,38,.00000005536604150721,548823894103192244759.94482073114846514850
6,8,9,27,38,.00000007105480851801,548573330393758149895.64917257914602595971
9,27,28,29,46,.00000004084522149613,546571806588718629159.92064540053319477980
6,9,27,28,46,.00000005241930456425,546322271062291246273.25588036439036272563
9,21,27,36,37,.00000003299314280364,536578787908597598170.79193454897582606947
8,9,21,27,36,.00000002948156318386,531982010869398545389.96342190571381624838
9,27,28,36,38,.00000003647737318401,530122487859086803418.64868308115410484158
9,21,27,37,46,.00000004945803736995,529331022733236284942.26382949677300114065
6,9,27,36,37,.00000003407751410729,525793044383177958946.57397440934865598143
8,9,21,27,46,.00000004419403942056,524796336036209466212.41703875905369622354
9,27,28,38,46,.00000005468103771834,522961930280661839556.42505126510016346856
6,8,9,27,36,.00000003045052092438,521288666930592427701.60136424953761747976
9,27,29,37,46,.00000003980440151366,518927880959922515171.78106435240870101195
6,9,27,37,46,.00000005108355321662,518690966175090557706.84712106147803788961
8,9,27,29,46,.00000003556787497345,514482316166923041430.49883827271356944124
6,8,9,27,46,.00000004564654573150,514247430989797734450.13442289012972910533
8,9,21,28,37,.00000006230275852199,510605063807495101906.25762259740312578814
8,21,27,28,37,.00000006274360142252,509841688152476933474.00965151703414029002
9,27,36,37,38,.00000003554785493127,503310518321191790547.54841388196036742552
8,9,28,29,37,.00000005014198192031,500569950351395440607.19350166955691685548
6,8,9,28,37,.00000006435043624344,500341416818255016592.47976878334013986869
8,27,28,29,37,.00000005049677739442,499821577605378229220.75484908108824416688
6,8,27,28,37,.00000006480576813623,499593385739306779790.41679723157247181854
8,9,27,36,38,.00000003176436805201,498998744754398585476.38638656885447421481
9,27,37,38,46,.00000005328765277306,496512119783486692198.31239741466684797828
8,9,27,38,46,.00000004761605499358,492258586913131096818.63894368884260211659
9,27,28,37,40,.00000003491858014695,485235942921813566213.29465453782181982073
9,22,27,28,37,.00000003909687178847,485235942921813564272.35088284863398638337
8,9,22,27,28,.00000003493565019808,481079011095061875374.20935199698841867302
8,9,27,28,40,.00000003120206926088,481079011095061871685.93693360151844571234
8,9,28,37,38,.00000006712696134888,478947183738005855802.00670201343590077565
8,27,28,37,38,.00000006760193942443,478231137920953501587.46466433784148271390
9,21,27,28,49,.00000004381964503040,477284627782535041437.11297160028593654028
7,9,21,27,28,.00000007343440922844,469911494200724179611.21320224196959249553
9,27,28,29,49,.00000003526655803038,467904373393879427639.03524344357786389758
6,9,27,28,49,.00000004525984628343,467690753220418167446.88473155114670318971
7,9,27,28,29,.00000005910086338412,460676146780810152696.99944525511300839499
6,7,9,27,28,.00000007584794608191,460465826629983226941.93084428564568750982
9,27,28,31,37,.00000006811103364324,459496996558773363769.05382757323633883513
8,9,22,27,37,.00000003404541820778,456747510194405716766.20695671516633183801
8,9,27,37,40,.00000003040697656726,456747510194405713264.47604522472668474869
8,9,27,28,31,.00000006086172977889,455560565803478399282.82240491410563855330
8,9,28,36,37,.00000002876724297450,455125550419553196042.14370655612878221285
8,27,28,36,37,.00000002897079471336,454445118927988480071.15879710827294996539
9,21,27,37,49,.00000004270303063827,453145035152364493764.22369236831715865305
8,9,21,27,49,.00000003815799250764,449263020544343383973.74189948192262048935
6,9,21,27,29,.00000005859174733335,449032622060132765603.33487380932941207583
8,9,28,37,46,.00000004312324492792,448978003798105522076.50929286694923319877
8,27,28,37,46,.00000004342837710545,448306763142593054164.31696119829074720829
9,27,28,38,49,.00000004721267064345,447692638637972009362.46827979134118950117
7,9,21,27,37,.00000007156314992990,446144812053549198717.17483369028309441043
9,27,29,37,49,.00000003436789383011,444239205261228390891.39536705114368980098
6,9,27,37,49,.00000004410653261075,444036389340962753703.93526391684813155733
7,8,9,21,27,.00000006394642483293,442322767137836136294.10504260814917115230
7,9,27,28,38,.00000007912055367847,440776644624908730133.00186293320269407438
8,9,27,29,49,.00000003070999448216,440433485347073485038.79524946280933301948

脚本的实际结果:

Number 6: found in 24 combinaisons. Must be part in: 3 of the 14 combinations that will be picked up.
Number 7: found in 9 combinaisons. Must be part in: 1 of the 14 combinations that will be picked up.
Number 8: found in 50 combinaisons. Must be part in: 7 of the 14 combinations that will be picked up.
Number 9: found in 94 combinaisons. Must be part in: 13 of the 14 combinations that will be picked up.
Number 21: found in 27 combinaisons. Must be part in: 3 of the 14 combinations that will be picked up.
Number 22: found in 3 combinaisons. Must be part in: 0 of the 14 combinations that will be picked up.
Number 27: found in 94 combinaisons. Must be part in: 13 of the 14 combinations that will be picked up.
Number 28: found in 56 combinaisons. Must be part in: 7 of the 14 combinations that will be picked up.
Number 29: found in 22 combinaisons. Must be part in: 3 of the 14 combinations that will be picked up.
Number 31: found in 2 combinaisons. Must be part in: 0 of the 14 combinations that will be picked up.
Number 36: found in 14 combinaisons. Must be part in: 1 of the 14 combinations that will be picked up.
Number 37: found in 51 combinaisons. Must be part in: 7 of the 14 combinations that will be picked up.
Number 38: found in 22 combinaisons. Must be part in: 3 of the 14 combinations that will be picked up.
Number 40: found in 3 combinaisons. Must be part in: 0 of the 14 combinations that will be picked up.
Number 46: found in 17 combinaisons. Must be part in: 2 of the 14 combinations that will be picked up.
Number 49: found in 12 combinaisons. Must be part in: 1 of the 14 combinations that will be picked up.

Among the 100 combinations submitted, 16 different numbers have been identified.

所以我们确定了 16 个唯一数字,但其中 3 个(40、31 和 22)取整后的频率等于 0。这意味着它们在 100 个组合中出现的频率太低,不足以出现在由最具代表性的 14 个组合构成的样本中。

关于剩下的13个数字,脚本表示它们必须分别出现在: 数字 6:将被拾取的 14 种组合中的 3 种。 数字 7:将被拾取的 14 种组合中的一种。 数字 9:将被拾取的 14 种组合中的 13 种。 ...等等

所以,期望的脚本输出示例可能如下(这里是我自己动脑筋对组合进行排序,并从中挑选出 14 个,尽量保持比例。我没能完全做到:数字 21、36 和 49 的出现次数比应有的多一次。有时结果不可能完美,但我希望尽可能接近预期的比例):

Number 6: found in 24 combinaisons. Must be part in: 3 of the 14 combinations that will be picked up.
Number 7: found in 9 combinaisons. Must be part in: 1 of the 14 combinations that will be picked up.
Number 8: found in 50 combinaisons. Must be part in: 7 of the 14 combinations that will be picked up.
Number 9: found in 94 combinaisons. Must be part in: 13 of the 14 combinations that will be picked up.
Number 21: found in 27 combinaisons. Must be part in: 3 of the 14 combinations that will be picked up.
Number 22: found in 3 combinaisons. Must be part in: 0 of the 14 combinations that will be picked up.
Number 27: found in 94 combinaisons. Must be part in: 13 of the 14 combinations that will be picked up.
Number 28: found in 56 combinaisons. Must be part in: 7 of the 14 combinations that will be picked up.
Number 29: found in 22 combinaisons. Must be part in: 3 of the 14 combinations that will be picked up.
Number 31: found in 2 combinaisons. Must be part in: 0 of the 14 combinations that will be picked up.
Number 36: found in 14 combinaisons. Must be part in: 1 of the 14 combinations that will be picked up.
Number 37: found in 51 combinaisons. Must be part in: 7 of the 14 combinations that will be picked up.
Number 38: found in 22 combinaisons. Must be part in: 3 of the 14 combinations that will be picked up.
Number 40: found in 3 combinaisons. Must be part in: 0 of the 14 combinations that will be picked up.
Number 46: found in 17 combinaisons. Must be part in: 2 of the 14 combinations that will be picked up.
Number 49: found in 12 combinaisons. Must be part in: 1 of the 14 combinations that will be picked up.

Among the 100 combinations submitted, 16 different numbers have been identified.
Here is an equiproportional selection (14 combinations) in terms of frequency of appearance of all the numbers found in the list :
7,9,21,27,28
6,9,27,28,37
6,9,27,37,38
6,9,21,27,29
8,9,27,28,49
8,9,27,37,49
8,27,28,37,46
8,9,27,29,46
8,9,28,36,37
8,9,27,36,38
8,9,27,28,37
9,21,27,37,38
9,21,27,28,29
9,27,28,29,46
bash combinations
1个回答
0
投票

问题看起来像

a knapsack problem
的变体。为了计算严格的(最优)答案,我们需要检查 C(100, 14)(= 44186942677323600)种所有可能的组合。
一个实用的替代方法是:从列表的 100 个样本中随机抽取 14 个,并通过多次重复来最小化误差(即与原始出现概率的差异)。结果不一定是最优的,但会给出一个近似解,其精度取决于重复次数。流程大纲为:

  1. 阅读原始列表并计算每个数字的概率。
  2. 生成 14 个介于 1 到 100 之间且互不相同的随机数。这些数字作为原始列表中各行的序号(索引)。
  3. 将被索引到的每一行拆分为 5 个数字,并累加各个数字的出现频率。
  4. 将挑选出的 14 个样本的概率分布与原始列表的概率分布进行比较。使用的标准是平均绝对误差(MAE)。如果你更喜欢其他方法,例如 MSE(均方误差),可以自行选择。
  5. 将计算出的 MAE 与循环中迄今为止的最小值进行比较。如果当前值更小(即更好),就用它更新最小值以及对应的选择。
  6. 回到第 2 步,并重复给定的次数。
  7. 最后报告结果

这是一个 awk 脚本:

#!/bin/bash
#
# Pick 14 of the combinations listed in a file so that the share of each
# number among the picked combinations stays as close as possible to its share
# in the whole list. An exhaustive search is out of reach, so the script
# draws random selections and keeps the one with the smallest mean absolute
# error between the two distributions.
#
# Usage: <script> [file]       (file defaults to ./tmp)
# Env:   TRIALS  number of random selections to try (default: 1000000)
#        SEED    optional seed for the random generator; when unset the awk
#                default seed is used, so every run draws the same sequence

#######################################
# Run the random search on one combination list.
# Globals:   TRIALS, SEED (read, both optional)
# Arguments: $1 - path of the combination list (default: ./tmp); one
#                 combination per line, comma separated, first five fields
# Outputs:   the best selection found and its per-number probabilities on
#            stdout; errors on stderr
# Returns:   0 on success, non-zero if the file is unreadable or holds fewer
#            than 14 combinations
#######################################
select_combinations() {
  local -r input=${1:-./tmp}

  if [[ ! -r "$input" || -d "$input" ]]; then
    printf 'error: cannot read %s\n' "$input" >&2
    return 1
  fi

  awk -F, -v trial="${TRIALS:-1000000}" -v pick=14 -v seed="${SEED:-}" '
#
# generate n distinct random integers between 1 and m (inclusive)
# return a comma separated string
# n is capped at m, otherwise the rejection loop could never finish
#
function random(m, n,   i, r, a, d, x) {
    if (n > m) n = m
    for (i = 1; i <= n; ) {
        r = int(rand() * m) + 1
        if (! (r in a)) {
            a[r]
            x = x d r
            d = ","
            i++
        }
    }
    return x
}
#
# calculate mean absolute error between array a and array b
# the keys of a drive the comparison; a key missing from b counts as 0
# (tested with "in" so that b is not modified as a side effect)
#
function mae(a, b,   i, diff, err, cnt) {
    for (i in a) {
        diff = a[i] - ((i in b) ? b[i] : 0)
        if (diff < 0) diff = - diff
        err += diff
        cnt++
    }
    return cnt ? err / cnt : 0
}

BEGIN {
    if (seed != "") srand(seed)
}

NF >= 5 {
    # loop over the input file; blank or short lines are ignored
    for (i = 1; i <= 5; i++) {
        v = int($i)                     # same key form as the %d output below
        p[v]++                          # count the occurrences
        if (v > max_v) max_v = v
    }
    a[++n] = sprintf("%d,%d,%d,%d,%d", $1, $2, $3, $4, $5)
}

END {
    trial = int(trial)
    if (n < pick || trial < 1) {
        printf("error: need at least %d combinations (got %d) and at least 1 trial (got %d)\n", pick, n, trial) > "/dev/stderr"
        exit 1
    }

    min_err = -1                        # negative value means "unset"

    for (v in p) {                      # normalize the probability
        p[v] /= 5 * n
    }
    for (t = 1; t <= trial; t++) {      # repeat the simulation "trial" times
        delete p2                       # start every trial from empty counts
        split(random(n, pick), b, ",")  # pick combinations randomly
        for (j in b) {                  # b[j] is a row index between 1 and n
            split(a[b[j]], c, ",")      # dereference numbers between 1 and 49
            for (k in c) {
                p2[c[k]]++              # count the occurrences of the number
            }
        }
        for (j in p2) {                 # normalize the probability
            p2[j] /= 5 * pick
        }
        err = mae(p, p2)                # calculate the error (distance)
        if (min_err < 0 || err < min_err) {
                                        # update the minimum error condition
            delete combo
            for (j in b) combo[b[j]]    # remember the picked ROW indices
            min_err = err
            if (min_err == 0) break     # cannot get any better
        }
    }
    #print(min_err)
    print("Here is an equiproportional selection (14 combinations) in terms of frequency of appearance of all the numbers found in the list :")

    for (i = 1; i <= n; i++) {          # ascending order keeps output stable
        if (! (i in combo)) continue
        print(a[i])
        split(a[i], d, ",")
        for (j in d) p3[d[j]]++         # count (again) the occurrences of the final result
    }
    print("")

    for (v = 1; v <= max_v; v++) {
        if (! (v in p3)) continue
        printf("%d: prob: %f (prob in the original list: %f)\n", v, p3[v] / (5 * pick), p[v])
    }
}' "$input"
}

select_combinations "${1:-./tmp}"

1000000次重复的结果:

Here is an equiproportional selection (14 combinations) in terms of frequency of appearance of all the numbers found in the list :
8,9,27,28,37
9,21,27,28,37
8,9,21,27,28
9,27,28,29,37
6,9,27,28,37
8,9,27,28,29
6,8,9,27,28
8,9,21,27,37
9,27,28,37,38
8,9,27,28,38
8,9,27,29,37
6,8,9,27,37
9,27,28,36,37
8,9,27,28,36

6: prob: 0.042857 (prob in the original list: 0.048000)
8: prob: 0.128571 (prob in the original list: 0.100000)
9: prob: 0.200000 (prob in the original list: 0.188000)
21: prob: 0.042857 (prob in the original list: 0.054000)
27: prob: 0.200000 (prob in the original list: 0.188000)
28: prob: 0.157143 (prob in the original list: 0.112000)
29: prob: 0.042857 (prob in the original list: 0.044000)
36: prob: 0.028571 (prob in the original list: 0.028000)
37: prob: 0.128571 (prob in the original list: 0.102000)
38: prob: 0.028571 (prob in the original list: 0.044000)
``
© www.soinside.com 2019 - 2024. All rights reserved.