区分具有恒定空间(内存限制)的非常大的文件

问题描述 投票:0回答:1

我想在内存有限的Linux环境(16 Gb RAM)中比较两个非常大的数据库转储(几个200 Gb表格文件)。我正在比较的两个文件之间的更改是稀疏的,并且预期它们将保持相同的顺序(可以假定它们是排序的,尽管不是这样)。

在线搜索答案([1][2][3])后,我尝试了 diff --speed-large-files——但我仍然达到内存限制(进程似乎在继续读取文件,但我始终没有得到任何输出)。我不愿意像其中一个答案[2]建议的那样更改内核的内存过量使用(overcommit)行为。

我希望能够为差异指定内存限制。我愿意付出比较精确的差异结果。

我想应该有某种比较算法,一次比较一个数据块,边比较边输出差异——代价是有失去同步的风险;但我认为只要块大小远大于各处修改之间的典型间距,就不会失去同步。但是,我找不到这样的算法——我找到的最接近的是一些学术论文。([4][5])

Linux是否有这样一个现有的,易于使用的命令行实用程序?

diff large-files
1个回答
1
投票

嗯,由于找不到现成的实现,我使用Python的difflib实现了自己的“窗口化差异(windowed diff)”,在此与大家分享。我添加了一些基本的输出选项和测试代码,但请注意,除您在此处看到的之外它并未经过更多测试,使用后果自负。希望有人能提出更整洁的解决方案。

import difflib
import itertools

# thanks to https://docs.python.org/3/library/itertools.html#itertools-recipes
def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    e.g. grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    # n references to one shared iterator: zip_longest pulls n items per tuple
    chunks = [iter(iterable)] * n
    return itertools.zip_longest(*chunks, fillvalue=fillvalue)


class Missing:
    """Sentinel type whose instances print as '<missing>'.

    Reports a zero size and a constant hash, so instances serve as cheap
    padding values (compared by identity with ``is``).
    """

    def __repr__(self):
        return '<missing>'

    def __sizeof__(self):
        # pretend to occupy no memory
        return 0

    def __hash__(self):
        # constant hash: every Missing instance hashes alike
        return 0


def rindex_predicate(seq, pred):
    """
    Find the next-to right-most index, such that for items following this index the given predicate holds.

    e.g. rindex_predicate([1, 2], lambda x: x>2) -> 1
         rindex_predicate([1, 2], lambda x: x>1) -> 0
         rindex_predicate([1, 2], lambda x: x>0) -> -1
    """
    # walk right-to-left; stop at the first item the predicate rejects
    for idx in range(len(seq) - 1, -1, -1):
        if not pred(seq[idx]):
            return idx
    return -1


def rsplit_predicate(seq, pred):
    """
    Split the sequence in two, so that the second part consists only of items
    for which the predicate holds, and the predicate does not hold for the
    last item of the first part (if it exists).

    e.g. rsplit_predicate([1,2,3], lambda x: x>2) -> ([1,2], [3])
    """
    # scan from the right for the first item failing the predicate
    cut = len(seq) - 1
    while cut >= 0 and pred(seq[cut]):
        cut -= 1
    if cut == len(seq) - 1:
        # nothing matched at the tail: return the sequence untouched
        return seq, []
    return seq[:cut + 1], seq[cut + 1:]


def windowed_diff(seq1, seq2, window_size):
    """
    Diff two sequences, similar to difflib.SequenceMatcher, using constant
    space (limited memory) determined by window_size. As opposed to
    SequenceMatcher, the sequences may be generators.

    This is done by breaking the sequences up into windows. Each pair of
    windows is compared almost independently. Unmatched items at the tail of
    one window are carried over and re-compared in the following window, which
    partially avoids the synchronization problems caused by the windowing.
    However, if synchronization is lost for a stretch longer than the given
    window_size, the algorithm might not be able to regain it as a
    full-memory diff would.

    Yields tuples (pos1, pos2, window1, window2, opcodes) where pos1/pos2 are
    the absolute offsets of window1/window2 within seq1/seq2, window1/window2
    are the window contents being compared, and opcodes are what
    SequenceMatcher.get_opcodes() would return with respect to these windows.
    """

    # sentinel used by grouper to pad the last, short window
    missing = Missing()
    missing_group = tuple()

    is_missing = lambda x: x is missing
    def isnt_equal(opcode):
        tag, alo, ahi, blo, bhi = opcode
        return tag != 'equal'

    # get windows ready
    groups1 = grouper(seq1, window_size, fillvalue=missing)
    groups2 = grouper(seq2, window_size, fillvalue=missing)
    tail1, tail2 = [], []
    pos1, pos2 = 0, 0
    # Initialize so the post-loop flush is well-defined even when both inputs
    # are empty (the loop body never runs, and `tail` would otherwise be
    # unbound, raising NameError at the `if tail:` check below).
    tail = []

    for group1, group2 in itertools.zip_longest(groups1, groups2, fillvalue=missing_group):
        # we prepend any previous unmatched items (see tail below)
        window1, window2 = tuple(tail1) + group1, tuple(tail2) + group2

        # strip the sentinel padding off the last (short) window
        window1, _ = rsplit_predicate(window1, is_missing)
        window2, _ = rsplit_predicate(window2, is_missing)

        assert len(window1) < window_size*2 and len(window2) < window_size*2

        tail1, tail2 = [], []
        cruncher = difflib.SequenceMatcher(isjunk=None, a=window1, b=window2)
        opcodes = list(cruncher.get_opcodes())

        # the tail is whatever didn't match at the end of the window
        resolved, tail = rsplit_predicate(opcodes, isnt_equal)

        # matches up to the last equal (synchronization) point are yielded to caller
        yield pos1, pos2, window1, window2, resolved

        # the tail contains all the unmatched items at the end of the window;
        # keep the tail at most at window_size (since we prepend the previous
        # tail, the actual window might thus be twice the window_size)
        tail = tail[:window_size]

        # push any discrepancies at the end of the window to the next block
        for tag, alo, ahi, blo, bhi in tail:
            if tag == 'delete':
                # it was in window1 but not in window2
                tail1 += window1[alo:ahi]
            elif tag == 'insert':
                tail2 += window2[blo:bhi]
            elif tag == 'replace':
                tail1 += window1[alo:ahi]
                tail2 += window2[blo:bhi]
            else:
                raise ValueError(tag)

        pos1 += len(window1) - len(tail1)
        pos2 += len(window2) - len(tail2)

    # flush out the last unresolved tail, if any
    if tail:
        # the tail opcodes relate to window1, window2, so we revert the
        # pos1, pos2 vars to where the tail parts refer to
        pos1 -= len(window1) - len(tail1)
        pos2 -= len(window2) - len(tail2)
        yield pos1, pos2, window1, window2, tail


# based on difflib code (https://github.com/python/cpython/blob/3.7/Lib/difflib.py)
def windowed_text_diff(a, b, window_size):
    """
    Same as difflib.Differ().compare(), only using windowed_diff instead of SequenceMatcher.
    """
    differ = difflib.Differ()
    for _apos, _bpos, win_a, win_b, opcodes in windowed_diff(a, b, window_size):
        for tag, alo, ahi, blo, bhi in opcodes:
            # delegate each opcode to the matching Differ output primitive
            if tag == 'replace':
                lines = differ._fancy_replace(win_a, alo, ahi, win_b, blo, bhi)
            elif tag == 'delete':
                lines = differ._dump('-', win_a, alo, ahi)
            elif tag == 'insert':
                lines = differ._dump('+', win_b, blo, bhi)
            elif tag == 'equal':
                lines = differ._dump(' ', win_a, alo, ahi)
            else:
                raise ValueError('unknown tag %r' % (tag,))
            yield from lines


def unified_diff_from_grouped_opcodes(a, b, apos, bpos, grouped_opcodes, lineterm='\n'):
    """
    Same as difflib.unified_diff, only the grouped_opcodes are given, no file info is given and no type checks are done.
    """
    for group in grouped_opcodes:
        head, last = group[0], group[-1]
        # hunk header: opcode indices are window-relative, so shift them by
        # the window's absolute offsets
        range_a = difflib._format_range_unified(head[1] + apos, last[2] + apos)
        range_b = difflib._format_range_unified(head[3] + bpos, last[4] + bpos)
        yield '@@ -{} +{} @@{}'.format(range_a, range_b, lineterm)

        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                yield from (' ' + line for line in a[i1:i2])
                continue
            if tag in {'replace', 'delete'}:
                yield from ('-' + line for line in a[i1:i2])
            if tag in {'replace', 'insert'}:
                yield from ('+' + line for line in b[j1:j2])


def windowed_unified_diff(a, b, window_size, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n'):
    """
    Same as difflib.unified_diff, but computed window by window via
    windowed_diff so that memory use is bounded by window_size.

    Yields unified-diff lines; the '---'/'+++' file header is emitted lazily,
    just before the first window's hunks.
    """
    started = False
    for apos, bpos, a, b, opcodes in windowed_diff(a, b, window_size):
        if not started:
            # emit the file header exactly once, before any hunk output
            started = True
            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
            todate = '\t{}'.format(tofiledate) if tofiledate else ''
            yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
            yield '+++ {}{}{}'.format(tofile, todate, lineterm)

        # NOTE(review): this injects the precomputed opcodes by assigning the
        # matcher's internal cache attribute; get_grouped_opcodes() then uses
        # them instead of recomputing. This relies on an undocumented difflib
        # implementation detail -- verify it still holds for the stdlib
        # version in use.
        matcher = difflib.SequenceMatcher(None, a, b)
        matcher.opcodes = opcodes
        grouped_opcodes = matcher.get_grouped_opcodes(n)
        yield from unified_diff_from_grouped_opcodes(a, b, apos, bpos, grouped_opcodes, lineterm=lineterm)


def test_windowed_diff():
    """Check windowed diffs against difflib's full-memory reference results."""
    import random

    def generate_lines(count):
        # numbered lines, ~10% of positions randomly mutated
        for idx in range(count):
            if random.random() < 0.1:
                mutation = random.choice(['insert', 'delete', 'edit'])
                if mutation == 'delete':
                    continue
                if mutation == 'edit':
                    yield '%d+' % (idx,)
                    continue
                # 'insert': emit an extra line, then fall through to the original
                yield 'inserted at %d' % (idx,)
            yield '%d' % (idx,)

    random.seed(2)

    for cut in ['a', 'b', 'neither']:
        for n in [10, 100]:
            a = list(generate_lines(n))
            b = list(generate_lines(n))

            # optionally truncate one side to exercise unbalanced inputs
            if cut == 'a':
                a = a[:int(n * 0.9)]
            elif cut == 'b':
                b = b[:int(n * 0.8)]

            expected_text = list(difflib.ndiff(a, b))
            expected_unified = list(difflib.unified_diff(a, b, n=0))

            for window_size in [1, 2, 3, 4, 10, int(n / 2), n - 1, n, n + 1, n * 2]:
                got = list(windowed_text_diff(a=(x for x in a), b=(x for x in b), window_size=window_size))
                assert got == expected_text, (window_size, got, expected_text)

                got = list(windowed_unified_diff(a=(x for x in a), b=(x for x in b), window_size=window_size, n=0))
                assert got == expected_unified, (window_size, got, expected_unified)


def main():
    """Command-line entry point: diff two text files with bounded memory."""
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('--window-size', type=int, default=1024 * 64)
    parser.add_argument('--unified-diff', action='store_true')
    parser.add_argument('--context', type=int, default=3)
    parser.add_argument('file1')
    parser.add_argument('file2')
    opts = parser.parse_args()

    with open(opts.file1, 'rt') as f1, open(opts.file2, 'rt') as f2:
        if opts.unified_diff:
            lines = windowed_unified_diff(f1, f2, opts.window_size, n=opts.context,
                                          fromfile=f1.name, tofile=f2.name)
        else:
            lines = windowed_text_diff(f1, f2, opts.window_size)

        for line in lines:
            sys.stdout.write(line)


# Run as a script: diff the two files given on the command line.
if __name__ == "__main__":
    main()
© www.soinside.com 2019 - 2024. All rights reserved.