从子序列中查找超序列的多数合并算法的实现?

问题描述 投票:0回答:2

我有一小组唯一值的序列,我想将它们组合成一个超级序列,其中尽可能保持每个值的相对顺序。例如(为简单起见,忽略字符串周围的引号):

list1 = [Mary, Bob, Sue, Roger]
list2 = [Bob, Alice, Sue, Dave]
list3 = [Mary, Bob, Larry, Sue, Roger]

superSequence = [Mary, Bob, Alice, Larry, Sue, Roger, Dave]

目标是生成一个可以重新创建原始列表的对象,例如:

obj = {
    Mary: [1, 3],
    Bob: [1, 2, 3],
    Alice: [2],
    Larry: [3],
    Sue: [1, 2, 3],
    Roger: [1, 3],
    Dave: [2]
}

暂时忽略在所有情况下都无法保证对象键顺序的情况,可以迭代这些键并使用每个关联数组中的索引来重新生成原始列表。因为超级序列是用JS对象来表示的,所以里面的值必须是唯一的。

显然,并不是每组序列都可以通过这种方式组合:

list1 = [Mary, Bob, Sue]
list2 = [Bob, Mary, Sue]

superSequence = [Mary, Bob, Sue] OR
superSequence = [Bob, Mary, Sue] 

在大多数情况下,选择其中之一应该没问题,如果算法可以突出显示顺序不确定的地方,则可以加分。

在研究这个问题时,我似乎偶然发现了一个 NP 难题,该问题在压缩、生物信息学和其他领域得到了充分研究。有一种叫做多数合并算法的东西,似乎是解决这个问题的一个相当不错的近似方法,但多年来我将学术论文伪代码转化为可用的东西的能力已经萎缩。所以我希望找到一个 JS、C、Python 或不依赖魔术库的实际实现。

javascript algorithm compression bioinformatics
2个回答
3
投票

经过进一步思考,我意识到你的问题有一个更简单的答案。由于我们可以假设相同的名称不会在给定列表中多次出现,因此这将起作用:

var simpleCombine = function (arr1, arr2) {
    "use strict";
    arr1 = JSON.parse(JSON.stringify(arr1));
    arr2 = JSON.parse(JSON.stringify(arr2));
    var i, j, arrOut = [];
    while (arr1.length) {
        var val = arr1.shift(), found = false;
        for (j = 0; j < arr2.length; j += 1) {
            if (val === arr2[j]) {
                //If we wound an overlap then put everything to the left of it 
                // in arr2 into the final array
                found = true;
                var pos = arrOut.length;
                arrOut.push(val);
                var newVals = arr2.splice(0, j);
                while (newVals.length) {
                    arrOut.splice(pos, 0, newVals.pop());
                }
                arr2.shift(); //get rid of dup
                break;
            }
        }
        if (!found) {
             //No overlap found, just add it to the out array
            arrOut.push(val);
        }
    }
    //anything left in arr2? Add it to out array
    arrOut = arrOut.concat(arr2);

    //check for duplicates based on user requirement of each item in the 
    // sequence only occurs once. 

    for (i = 0; i < arrOut.length; i += 1) {
        for (j = i + 1; j < arrOut.length; j += 1) {
            if (arrOut[i] === arrOut[j]) {
                //If we find an overlap warn the user, and remove the dup.
                console.warn('Even with strict ordering, multiple solutions are possible');
                arrOut.splice(i,1);
                i -= 1;
                break;
            }
        }
    }

    return arrOut;
};

var findMultipleSCS = function (arr) {
    var first = arr.shift();
    while (arr.length) {
        first = simpleCombine(first, arr.shift());
    }
    return first;
};

list1 = ["Mary", "Bob", "Sue", "Roger"];
list2 = ["Bob", "Alice", "Sue", "Dave"];
list3 = ["Mary", "Bob", "Larry", "Sue", "Roger"];

console.log(findMultipleSCS([list1, list2, list3]));

我的原始答案如下,因为对于可能多次包含相同名称的列表来说它更准确。

//This code works for things where the important thing is that order is
//maintained, not that each entry only occurs once
var findSCS = (function () {
    'use strict';
    var lcsLen, lcsBack, combine;
    lcsLen = function(arr1, arr2) {
        //This function moves through the arrays developing the 
        // length of the longest possible sequence of identical order.
        var dists = [[0]], i, j;
        for (i = 0; i < arr1.length; i += 1) {
            dists[i + 1] = [];
            for (j = 0; j < arr2.length; j += 1) {
                dists[i + 1][0] = 0; // initialize 0'th column/row with 0
                dists[0][j + 1] = 0; // this could be done in a separate loop
                dists[i + 1][j + 1] = dists[i + 1][j + 1] || 0; // initialize i,j
                if (arr1[i] === arr2[j]) {
                    //if this condition is met then we have a longer overlap
                    dists[i + 1][j + 1] = dists[i][j] + 1;
                } else {
                    //if not take the max length so far
                    dists[i + 1][j + 1] = Math.max(dists[i][j + 1], dists[i + 1][j]);
                }
            }
        }
        return dists;
    };

    lcsBack = function (dists, x, y, i, j) {
        //this recursive function takes the longest possible array and build
        // the actual list starting from the bottom right of the matrix
        // created by lcsLen
        if (!i || !j) {
            return [];
        } else if(x[i - 1] === y[j - 1]) {
            return lcsBack(dists, x, y, i - 1, j - 1).concat([x[i - 1]]);
        } else {
            if (dists[i][j-1] > dists[i-1][j]) {
                return lcsBack(dists, x, y, i, j-1);
            } else {
                return lcsBack(dists,x,y,i-1,j);
            }
        }
    };

    combine = function (lcs, arr1, arr2) {
        //this take the lcs and fills in the non-overlapping part of 
        // the original lists, creating the scs
        var out = JSON.parse(JSON.stringify(arr1));
        var i, testing = 0, outPos = 0, positions = [0];
        for (i = 0; i < arr1.length && testing < lcs.length; i += 1) {
            if (lcs[testing] === arr1[i]) {
                positions[testing + 1] = i;
                testing += 1;
            }
        }
        testing = 0; outPos = 0;
        for (i = 0; i < arr2.length; i += 1) {
            if (lcs[testing] === undefined || lcs[testing] !== arr2[i]) {
                out.splice(positions[testing] + outPos, 0, arr2[i]);
                outPos += 1;
            } else {
                testing += 1;
                outPos += 1;
            }
        }

        return out;
    };

    return function (arr1, arr2) {
        //get the length matrix to determine the maximum sequence overlap
        var lcsLenMat = lcsLen(arr1,arr2);
        //Take that distance matrix and build the actual sequence (recursively)
        var lcs = lcsBack(lcsLenMat, arr1, arr2, arr1.length, arr2.length);
        //Build the SCS
        var tempScs =  combine(lcs, arr1, arr2);
        //This code will allow for duplicates, and in your second example
        // It will generate a list with two bobs, which is arguably more
        // correct for general purpose use.
        return tempScs;
}());

var findMultipleSCS = function (arr) {
    var first = arr.shift();
    while (arr.length) {
        first = findSCS(first, arr.shift());
    }
    return first;
};

list1 = ["Mary", "Bob", "Sue", "Roger"];
list2 = ["Bob", "Alice", "Sue", "Dave"];
list3 = ["Mary", "Bob", "Larry", "Sue", "Roger"];

console.log(findMultipleSCS([list1, list2, list3]));

这些想法大部分取自 https://en.wikipedia.org/wiki/Longest_common_subsequence_problemhttps://en.wikipedia.org/wiki/Shortest_common_supersequence_problem

您放置这些的顺序将决定您获得哪种非唯一解决方案。例如list1,list2,list3给你你的第一个答案,但是list2,list3,list1给你也正确:

["Mary", "Bob", "Larry", "Alice", "Sue", "Dave", "Roger"]

如果你想保持列表1、列表2、列表3的优先级顺序,确实有一个独特的解决方案,这将通过控制台提醒你重复的可能性。警告寻找重复项。


2
投票

基于 aduss 的

simpleCombine()
函数,我想出了一个似乎运行良好的解决方案。目前它并未标记结果中的重复项已被删除,但这可以在最终的
filter()
调用中通过一些额外的逻辑来实现。

function combineLists(...lists)
{
    var superSequence = lists.slice(1).reduce((list1, list2) => {
        var result = [];

            // we need to make a copy of list2 since we mutate it in the loop below
        list2 = [].concat(list2);

        list1.forEach(item => {
            var overlapIndex = list2.indexOf(item);

            if (overlapIndex > -1) {
                    // add 1 to overlapIndex so we also splice out the matching item
                result = result.concat(list2.splice(0, overlapIndex + 1));
            } else {
                result.push(item);
            }
        });

            // anything remaining in list2 is by definition not in list1, so add
            // those items to the result
        return result.concat(list2);
    }, lists[0]);

        // look back at the list up to the current item and then filter it out if
        // there's a duplicate found.  this keeps the first instance of each item.
    return superSequence.filter((item, i, list) => list.slice(0, i).indexOf(item) == -1);
}

var list1 = ["Mary", "Bob", "Sue", "Roger"],
    list2 = ["Bob", "Alice", "Jimmy", "Chuck", "Sue", "Dave"],
    list3 = ["Mary", "Bob", "Larry", "Sue", "Roger"];

console.log(combineLists(list1, list2, list3).join(" "));

© www.soinside.com 2019 - 2024. All rights reserved.