如何在 Node.js 中实现 Unix“粘贴”命令而不将整个文件加载到内存中？

Question

基本的 Unix paste 命令可以用 Python 实现如下（该示例仅适用于两个文件；Unix paste 适用于多个文件）：

def paste(fn1, fn2):
  """Print the lines of two text files side by side, separated by a tab.

  Both files are read one line at a time, so neither is loaded into memory
  as a whole.

  Args:
    fn1: path of the file that supplies the left-hand column.
    fn2: path of the file that supplies the right-hand column.

  Output (written to stdout):
    * while both files have lines:  "<line1>\t<line2>"
    * once fn2 is exhausted:        "<line1>"
    * once fn1 is exhausted:        "\t<line2>"
  """
  with open(fn1) as f1, open(fn2) as f2:
    for l1 in f1:
      l2 = f2.readline()
      # readline() signals end of file with "" (never None); an empty line in
      # the file is returned as "\n", so it is still treated as a real line.
      if l2 != "":
        # rstrip("\n") instead of [:-1]: the last line of a file may lack a
        # trailing newline, and [:-1] would then cut off a real character.
        print(l1.rstrip("\n") + "\t" + l2.rstrip("\n"))
      else:
        print(l1.rstrip("\n"))
    # Only reached with lines left when fn2 is longer than fn1.
    for l2 in f2:
      print("\t" + l2.rstrip("\n"))

import sys
# Command-line entry point: when run as a script with at least two file
# arguments, paste the first two together; otherwise do nothing.
if __name__ == "__main__" and len(sys.argv) > 2:
  first_path, second_path = sys.argv[1:3]
  paste(first_path, second_path)

任务是在 Node.js 中实现相同的功能。 重要的是，由于输入文件可能很大，因此实现应该逐行读取输入文件，而不是将整个文件读入内存。我想看看如何使用内置 Node 功能来实现这一目标，而无需外部包。

请注意，使用同步 I/O 实现 Unix paste 很容易，如 Python 示例所示，但 Node 不提供用于逐行读取的同步 I/O。同时，有一些方法可以使用异步 I/O 逐行读取单个文件，但是同时读取两个文件会比较困难，因为两个流并不同步。

到目前为止，我能想到的唯一解决方案是使用基本的 read API 实现同步行读取。Dave Newton 在评论中指出，npm 的 n-readlines 包用 100 多行代码实现了这种方法。因为 n-readlines 会检查每个字节以查找行结尾，我怀疑它效率低下，因此做了一个微基准测试（microbenchmark），结果如下表所示。仅就逐行读取而言（并非本任务本身），n-readlines 比基于 Node Readline 的实现慢约 3 倍，并且比 Python、Perl 或 mawk 中的内置行读取慢一个数量级。

实现 Unix paste 的正确方法是什么？n-readlines 使用同步 API。一个好的异步解决方案会更简洁、更快吗？

语言	运行时	版本	耗时	用户	系统	代码
JavaScript	node	21.5.0	6.30	5.33	0.90	lc-node.js
	node	21.5.0	22.34	20.41	2.24	lc-n-readlines.js
	bun	1.0.20	4.91	5.30	1.47	lc-node.js
	bun	1.0.20	21.16	19.22	3.37	lc-n-readlines.js
	k8	1.0	1.49	1.06	0.37	lc-k8.js
C	clang	15.0.0	0.71	0.35	0.35	lc-c.c
Python	python	3.11.17	3.48	2.85	0.62	lc-python.py
perl	perl	5.34.3	1.70	1.13	0.57	lc-perl.pl
awk	mawk	1.3.4	2.08	1.27	0.80	lc-awk.awk
	apple awk	?	90.06	87.90	1.12	lc-awk.awk

Answer 1

我正在发布基于 n-readlines 的答案。它冗长且效率低下（请参阅问题中的表格），但它解决了问题。我仍在寻找更好的解决方案。

const fs = require('fs');

// the following implementation is copied from https://github.com/nacholibre/node-readlines (the n-readlines package)
// Written by Liucw and distributed under the MIT license
/**
 * Synchronous line reader built on `fs.readSync`.
 *
 * The file is read in fixed-size chunks; only the current chunk(s) and the
 * lines extracted from them are kept in memory. Call `next()` repeatedly:
 * it returns each line as a Buffer (delimiter removed) and `false` once the
 * input is exhausted, at which point the descriptor is closed.
 */
class LineByLine {
    /**
     * @param {string|number} file Path to open for reading, or an already
     *     open file descriptor.
     * @param {{readChunk?: number, newLineCharacter?: (string|number)}} [options]
     *     `readChunk` is the number of bytes per read (default 1024);
     *     `newLineCharacter` is the line delimiter, given as a string whose
     *     first character is used or as a byte value (default 0x0a).
     */
    constructor(file, options) {
        // Work on a copy: the caller's object must stay untouched so it can be
        // reused for another reader (the delimiter is converted to a number below).
        const settings = { ...options };
        if (!settings.readChunk) settings.readChunk = 1024;
        if (!settings.newLineCharacter) {
            settings.newLineCharacter = 0x0a; // linux line ending
        } else if (typeof settings.newLineCharacter !== 'number') {
            settings.newLineCharacter = settings.newLineCharacter.charCodeAt(0);
        }
        this.fd = typeof file === 'number' ? file : fs.openSync(file, 'r');
        this.options = settings;
        this.newLineCharacter = settings.newLineCharacter;
        this.reset();
    }

    /**
     * @param {Buffer} buffer Bytes to scan.
     * @param {number} hexNeedle Byte value to look for.
     * @returns {number} Index of the first occurrence, or -1 if absent.
     */
    _searchInBuffer(buffer, hexNeedle) {
        return buffer.indexOf(hexNeedle);
    }

    /** Clear the cached lines and restart reading from byte offset 0. */
    reset() {
        this.eofReached = false;
        this.linesCache = [];
        this.fdPosition = 0;
    }

    /** Close the descriptor; calling it again after closing is a no-op. */
    close() {
        if (this.fd === null) return;
        fs.closeSync(this.fd);
        this.fd = null;
    }

    /**
     * Split a buffer into lines. Every line keeps its trailing delimiter;
     * bytes after the last delimiter are returned as a final, unterminated
     * element.
     * @param {Buffer} buffer
     * @returns {Buffer[]}
     */
    _extractLines(buffer) {
        const lines = [];
        let lineStart = 0;
        let delimiterAt = buffer.indexOf(this.newLineCharacter, lineStart);
        while (delimiterAt !== -1) {
            lines.push(buffer.subarray(lineStart, delimiterAt + 1));
            lineStart = delimiterAt + 1;
            delimiterAt = buffer.indexOf(this.newLineCharacter, lineStart);
        }
        if (lineStart < buffer.length) {
            lines.push(buffer.subarray(lineStart));
        }
        return lines;
    }

    /**
     * Read chunks until one of them contains a delimiter or nothing more can
     * be read, then refill `linesCache` from the bytes read.
     * @param {Buffer} [lineLeftovers] Unterminated tail of the previous read;
     *     it is prepended to the first line extracted from the new data.
     * @returns {number} Total number of bytes read by this call.
     */
    _readChunk(lineLeftovers) {
        const chunkSize = this.options.readChunk;
        const chunks = [];
        let totalBytesRead = 0;
        let bytesRead;
        let delimiterSeen;
        do {
            const readBuffer = Buffer.alloc(chunkSize);
            bytesRead = fs.readSync(this.fd, readBuffer, 0, chunkSize, this.fdPosition);
            totalBytesRead += bytesRead;
            this.fdPosition += bytesRead;
            // Keep and scan only the bytes that were actually filled in, so
            // unused buffer space is never mistaken for file content.
            const filled = readBuffer.subarray(0, bytesRead);
            chunks.push(filled);
            delimiterSeen = this._searchInBuffer(filled, this.options.newLineCharacter) !== -1;
        } while (bytesRead && !delimiterSeen);
        // A short (or empty) final read is taken as end of input.
        if (bytesRead < chunkSize) {
            this.eofReached = true;
        }
        if (totalBytesRead) {
            this.linesCache = this._extractLines(Buffer.concat(chunks, totalBytesRead));
            if (lineLeftovers) {
                this.linesCache[0] = Buffer.concat([lineLeftovers, this.linesCache[0]]);
            }
        }
        return totalBytesRead;
    }

    /**
     * @returns {Buffer|false} The next line without its delimiter (an empty
     *     Buffer for an empty line), or `false` when there are no more lines
     *     or the reader has been closed.
     */
    next() {
        // Compare against the "closed" sentinel: descriptor 0 is a valid fd
        // and must not be mistaken for a closed reader.
        if (this.fd === null) return false;
        if (this.eofReached && this.linesCache.length === 0) return false;

        if (this.linesCache.length === 0) {
            this._readChunk();
        }

        let line = false;
        if (this.linesCache.length) {
            line = this.linesCache.shift();
            if (line[line.length - 1] !== this.newLineCharacter) {
                // Unterminated tail of the last chunk: read on so the rest of
                // the line is joined to it. Zero bytes read means this really
                // was the final line of the input.
                if (this._readChunk(line)) {
                    line = this.linesCache.shift();
                }
            }
        }

        if (this.eofReached && this.linesCache.length === 0) {
            this.close();
        }
        if (line && line[line.length - 1] === this.newLineCharacter) {
            line = line.subarray(0, line.length - 1);
        }
        return line;
    }
}

/**
 * Paste two files side by side on stdout, reading both line by line.
 *
 * Output format: "<line1>\t<line2>" while both files have lines, "<line1>"
 * once the second file is exhausted, and "\t<line2>" once the first one is.
 *
 * @param {string[]} args Command-line arguments; the first two entries are
 *     the paths of the two input files.
 */
function main(args) {
    if (args.length < 2) {
        console.log("Usage: node lc-n-readlines.js <in1.txt> <in2.txt>");
        return;
    }
    const f1 = new LineByLine(args[0]);
    const f2 = new LineByLine(args[1]);
    let l1;
    let l2;
    // next() returns a Buffer per line and false at the end of input.
    while ((l1 = f1.next()) !== false) {
        l2 = f2.next();
        // Convert explicitly: console.log() on a bare Buffer would print
        // "<Buffer 61 62 ...>" instead of the line's text.
        console.log(l2 !== false ? `${l1}\t${l2}` : l1.toString());
    }
    while ((l2 = f2.next()) !== false) {
        console.log(`\t${l2}`);
    }
}

// slice() copies the user-supplied arguments without modifying process.argv.
main(process.argv.slice(2));

Answer 2

import { open as fsOpenAsync } from 'node:fs/promises'
import { createWriteStream } from 'node:fs'

// Demo input: the files whose lines are pasted side by side.
const filenames = ['a.txt', 'b.txt', 'c.txt']
// Demo output: the file the tab-joined lines are written to.
const outname = 'out.txt'

// `paste` is a function declaration further down in this module, so it is
// hoisted and can be awaited here before its definition appears.
await paste(filenames, outname)

/**
 * Read multiple files line by line and write lines concatenated by `\t`
 *
 * All inputs are advanced in lockstep, one line each per output row; an input
 * that has run out of lines contributes an empty field. Neither the inputs
 * nor the output are held in memory as a whole.
 *
 * @param from paths of the input files, in column order
 * @param to path of the output file (created or truncated)
 */
async function paste(from: string[], to: string) {
  // Open the files named by the `from` argument (not a module-level list).
  const files = await Promise.all(from.map(fn => fsOpenAsync(fn)))
  const writeStream = createWriteStream(to, { flags: 'w' })
  // Rejects on the first stream error so every wait below can be raced
  // against it instead of hanging. The no-op catch only prevents an
  // "unhandled rejection" report; the error still surfaces through the races.
  const failed = new Promise<never>((_, reject) => writeStream.once('error', reject))
  failed.catch(() => {})
  try {
    const zip = zipAsyncs(files.map(f => f.readLines()[Symbol.asyncIterator]()))
    for await (const lines of zip) {
      const hasRoom = writeStream.write(`${lines.map(e => e ?? '').join('\t')}\n`)
      if (!hasRoom) {
        // Backpressure: pause reading until the stream has drained, otherwise
        // pending output accumulates in memory for large inputs.
        await Promise.race([failed, new Promise(resolve => writeStream.once('drain', resolve))])
      }
    }
    // end() flushes buffered data; wait for that before returning.
    await Promise.race([failed, new Promise(resolve => writeStream.end(resolve))])
  } finally {
    // Release the output stream and every input handle, also on failure.
    writeStream.destroy()
    await Promise.all(files.map(f => f.close()))
  }
}

/**
 * Zip multiple async iterators, advancing all of them in lockstep.
 *
 * Each yielded row holds one value per iterator, in the order given; an
 * iterator that is already exhausted contributes `undefined`. Iteration ends
 * as soon as every iterator is exhausted, so no all-`undefined` row is
 * produced at the end (and nothing is yielded for an empty list).
 *
 * @template T
 * @param {AsyncIterator<T>[]} its
 * @returns {AsyncGenerator<(T | undefined)[], void, undefined>}
 */
async function* zipAsyncs(its) {
  while (true) {
    const results = await Promise.all(its.map(e => e.next()))
    // Test for exhaustion before yielding; yielding first would emit one
    // extra row made only of `undefined` after the last real row.
    if (results.every(r => r.done))
      return
    // A finished iterator's `value` is its return value, not an item.
    yield results.map(r => (r.done ? undefined : r.value))
  }
}

如何在 Node.js 中实现 Unix“粘贴”命令而不将整个文件加载到内存中？

问题描述投票：0回答：2

2个回答

最新问题

如何在 Node.js 中实现 Unix“粘贴”命令而不将整个文件加载到内存中？

问题描述 投票：0回答：2

2个回答

最新问题

问题描述投票：0回答：2