如何在 Node.js 中实现 Unix paste 命令,而不将整个文件加载到内存中?

问题描述 投票:0回答:2

基本的 Unix paste 命令可以用 Python 像下面这样实现(该示例仅适用于两个文件;Unix paste 可处理多个文件):

def paste(fn1, fn2):
  """Print the lines of two text files side by side, separated by a tab.

  Both files are read one line at a time, so neither is loaded into memory
  as a whole. When one file runs out of lines before the other, the missing
  field is treated as an empty string (the tab separator is still printed),
  which is what POSIX `paste` does.

  Args:
    fn1: Path of the file supplying the first column.
    fn2: Path of the file supplying the second column.
  """
  with open(fn1) as f1, open(fn2) as f2:
    for l1 in f1:
      # readline() returns '' (never None) once f2 is exhausted, so an
      # exhausted second file simply contributes an empty field.
      l2 = f2.readline()
      # rstrip("\n") instead of [:-1]: the last line of a file may lack a
      # trailing newline, and [:-1] would then drop a real character.
      print(l1.rstrip("\n") + "\t" + l2.rstrip("\n"))
    # f1 is exhausted; emit whatever is left of f2 with an empty first field.
    for l2 in f2:
      print("\t" + l2.rstrip("\n"))

import sys
# Script entry point: run only when executed directly and when at least two
# file paths were given on the command line; otherwise do nothing.
if __name__ == "__main__" and len(sys.argv) >= 3:
  paste(sys.argv[1], sys.argv[2])

任务是在 Node.js 中实现相同的功能。 重要的是,由于输入文件可能很大,因此实现应该逐行读取输入文件,而不是将整个文件读入内存。我想看看如何使用内置 Node 功能来实现这一目标,而无需外部包。

请注意,使用同步 I/O 实现 Unix paste 很容易,如 Python 示例所示,但 Node 不提供用于逐行读取的同步 I/O。同时,有一些方法可以使用异步 I/O 逐行读取单个文件,但是同时读取两个文件会比较困难,因为两个流不同步。

到目前为止,我能想到的唯一解决方案是使用基本的 read API 实现同步行读取。Dave Newton 在评论中指出,npm 的 n-readlines 包用 100 多行代码实现了这种方法。因为 n-readlines 逐字节检查以查找行尾,我怀疑它效率低下,因此做了一个微基准测试(microbenchmark),结果如下表所示。仅就逐行读取而言(并非针对本任务),n-readlines 的耗时约为 Node Readline 实现的 3 倍,并且比 Python、Perl 或 mawk 的内置行读取慢一个数量级。

实现 Unix paste 的正确方法是什么?n-readlines 使用同步 API。一个好的异步解决方案会更简洁、更快吗?

语言 运行时 版本 耗时 用户 系统 代码
JavaScript node 21.5.0 6.30 5.33 0.90 lc-node.js
node 21.5.0 22.34 20.41 2.24 lc-n-readlines.js
bun 1.0.20 4.91 5.30 1.47 lc-node.js
bun 1.0.20 21.16 19.22 3.37 lc-n-readlines.js
k8 1.0 1.49 1.06 0.37 lc-k8.js
C clang 15.0.0 0.71 0.35 0.35 lc-c.c
Python python 3.11.7 3.48 2.85 0.62 lc-python.py
Perl perl 5.34.3 1.70 1.13 0.57 lc-perl.pl
awk mawk 1.3.4 2.08 1.27 0.80 lc-awk.awk
apple awk 90.06 87.90 1.12 lc-awk.awk
javascript node.js
2个回答
0
投票

我正在发布基于 n-readlines 的答案。它冗长且效率低下(请参阅问题中的表格),但它解决了问题。我仍在寻找更好的解决方案。

const fs = require('fs');

// the following implementation is copied from https://github.com/nacholibre/node-readlines (the n-readlines package)
// Written by Liucw and distributed under the MIT license
/**
 * Synchronous line-by-line file reader built on `fs.readSync`.
 *
 * The file is read in fixed-size chunks; each call to `next()` returns one
 * line as a `Buffer` (without its trailing newline byte), or `false` once the
 * file is exhausted. The descriptor is closed automatically when the last
 * line has been returned.
 */
class LineByLine {
    /**
     * @param {string|number} file - Path to open for reading, or an already
     *     open file descriptor.
     * @param {object} [options]
     * @param {number} [options.readChunk=1024] - Bytes requested per read.
     * @param {string|number} [options.newLineCharacter='\n'] - Line
     *     terminator: a string (its first UTF-16 code unit is used) or a
     *     byte value.
     */
    constructor(file, options) {
        // Work on a copy: the caller's object must stay untouched so it can
        // be reused for a second reader (the normalised newline is a number).
        const { readChunk, newLineCharacter, ...rest } = options || {};
        let newLineByte = 0x0a; // linux line ending
        if (newLineCharacter) {
            newLineByte = typeof newLineCharacter === 'number'
                ? newLineCharacter
                : newLineCharacter.charCodeAt(0);
        }
        this.options = {
            ...rest,
            readChunk: readChunk || 1024,
            newLineCharacter: newLineByte,
        };
        this.fd = typeof file === 'number' ? file : fs.openSync(file, 'r');
        this.newLineCharacter = newLineByte;
        this.reset();
    }

    /**
     * Find the first occurrence of a byte value in a buffer.
     *
     * @param {Buffer} buffer - Bytes to scan.
     * @param {number} hexNeedle - Byte value to look for.
     * @param {number} [fromIndex=0] - Offset at which to start scanning.
     * @returns {number} Index of the first match, or -1 if there is none.
     */
    _searchInBuffer(buffer, hexNeedle, fromIndex = 0) {
        // Buffer#indexOf coerces numbers to a byte; a needle outside 0-255
        // can never equal a stored byte, so report "not found" instead.
        if (!Number.isInteger(hexNeedle) || hexNeedle < 0 || hexNeedle > 255) {
            return -1;
        }
        return buffer.indexOf(hexNeedle, fromIndex);
    }

    /** Rewind the read position to the start and drop any cached lines. */
    reset() {
        this.eofReached = false;
        this.linesCache = [];
        this.fdPosition = 0;
    }

    /** Close the file descriptor. Calling it again is a no-op. */
    close() {
        if (this.fd === null) {
            return;
        }
        fs.closeSync(this.fd);
        this.fd = null;
    }

    /**
     * Split a buffer into lines. Every returned line keeps its trailing
     * newline byte, except possibly the last one when the buffer ends in the
     * middle of a line.
     *
     * @param {Buffer} buffer
     * @returns {Buffer[]}
     */
    _extractLines(buffer) {
        const lines = [];
        let lineStart = 0;
        let newLineAt = this._searchInBuffer(buffer, this.newLineCharacter, lineStart);
        while (newLineAt !== -1) {
            lines.push(buffer.subarray(lineStart, newLineAt + 1));
            lineStart = newLineAt + 1;
            newLineAt = this._searchInBuffer(buffer, this.newLineCharacter, lineStart);
        }
        if (lineStart < buffer.length) {
            lines.push(buffer.subarray(lineStart));
        }
        return lines;
    }

    /**
     * Read chunks until one of them contains a newline (or nothing more can
     * be read) and replace the line cache with the lines found.
     *
     * @param {Buffer} [lineLeftovers] - Unterminated line from the previous
     *     chunk; it is prepended to the first line read here.
     * @returns {number} Total number of bytes read by this call.
     */
    _readChunk(lineLeftovers) {
        const chunkSize = this.options.readChunk;
        const chunks = [];
        let totalBytesRead = 0;
        let bytesRead;
        do {
            // allocUnsafe is fine: only the bytes actually read are kept.
            const readBuffer = Buffer.allocUnsafe(chunkSize);
            bytesRead = fs.readSync(this.fd, readBuffer, 0, chunkSize, this.fdPosition);
            totalBytesRead += bytesRead;
            this.fdPosition += bytesRead;
            chunks.push(readBuffer.subarray(0, bytesRead));
        } while (bytesRead && this._searchInBuffer(chunks[chunks.length - 1], this.newLineCharacter) === -1);
        // A short read is taken to mean the end of the file was reached.
        if (bytesRead < chunkSize) {
            this.eofReached = true;
        }
        if (totalBytesRead) {
            this.linesCache = this._extractLines(Buffer.concat(chunks, totalBytesRead));
            if (lineLeftovers) {
                this.linesCache[0] = Buffer.concat([lineLeftovers, this.linesCache[0]]);
            }
        }
        return totalBytesRead;
    }

    /**
     * Return the next line.
     *
     * @returns {Buffer|false} The line without its trailing newline byte
     *     (an empty Buffer for an empty line), or `false` at end of file or
     *     after `close()`.
     */
    next() {
        // Compare against null explicitly: descriptor 0 (stdin) is valid.
        if (this.fd === null) {
            return false;
        }
        if (this.eofReached && this.linesCache.length === 0) {
            return false;
        }
        if (this.linesCache.length === 0) {
            this._readChunk();
        }
        let line = false;
        if (this.linesCache.length) {
            line = this.linesCache.shift();
            // Only the last cached line can be unterminated; read on so the
            // rest of it (if any) is appended before it is handed out.
            if (line[line.length - 1] !== this.newLineCharacter) {
                const bytesRead = this._readChunk(line);
                if (bytesRead) {
                    line = this.linesCache.shift();
                }
            }
        }
        if (this.eofReached && this.linesCache.length === 0) {
            this.close();
        }
        if (line && line[line.length - 1] === this.newLineCharacter) {
            line = line.subarray(0, line.length - 1);
        }
        return line;
    }
}

/**
 * Merge two files column-wise, like Unix `paste`: line N of the output is
 * line N of the first file, a tab, and line N of the second file.
 *
 * When one file is shorter than the other, its missing lines are treated as
 * empty, so the tab separator is always printed (POSIX `paste` behaviour).
 *
 * @param {string[]} args - Command-line arguments: the two input file paths.
 */
function main(args) {
    if (args.length < 2) {
        // Usage errors belong on stderr and should yield a non-zero status.
        console.error("Usage: node lc-n-readlines.js <in1.txt> <in2.txt>");
        process.exitCode = 1;
        return;
    }
    const f1 = new LineByLine(args[0]);
    const f2 = new LineByLine(args[1]);
    for (;;) {
        // next() returns false (not an empty value) once a file is exhausted
        // and keeps returning false on later calls, so both readers can be
        // polled on every round.
        const l1 = f1.next();
        const l2 = f2.next();
        if (l1 === false && l2 === false) {
            break;
        }
        const left = l1 === false ? '' : l1;
        const right = l2 === false ? '' : l2;
        console.log(`${left}\t${right}`);
    }
}

// Skip the node executable and script path. slice() copies the remaining
// arguments, whereas splice() would remove them from process.argv itself.
main(process.argv.slice(2));

0
投票
import { once } from 'node:events'
import { createWriteStream } from 'node:fs'
import { open as fsOpenAsync } from 'node:fs/promises'
import { finished } from 'node:stream/promises'

// Input files handed to paste() below.
const filenames = ['a.txt', 'b.txt', 'c.txt']
// Path of the file paste() writes the merged lines to.
const outname = 'out.txt'

// `paste` is a function declaration further down, so it is hoisted and can be
// called here; the top-level `await` requires this file to run as an ES module.
await paste(filenames, outname)

/**
 * Read multiple files line by line and write lines concatenated by `\t`.
 *
 * Line N of the output is line N of every input file joined with tabs; files
 * that have run out of lines contribute an empty field. Inputs are streamed
 * and output back-pressure is honoured, so memory use stays bounded even for
 * large files.
 *
 * @param {string[]} from - Paths of the input files, in output column order.
 * @param {string} to - Path of the output file (created or truncated).
 * @returns {Promise<void>} Resolves once the output has been fully flushed
 *     and all input files are closed.
 */
async function paste(from, to) {
  // Use the `from` argument, not the module-level file list.
  const files = await Promise.all(from.map(fn => fsOpenAsync(fn)))
  const writeStream = createWriteStream(to, { flags: 'w' })
  try {
    const zip = zipAsyncs(files.map(f => f.readLines()[Symbol.asyncIterator]()))
    for await (const lines of zip) {
      const row = `${lines.map(e => e ?? '').join('\t')}\n`
      // write() returns false once the internal buffer is full; wait for
      // 'drain' so pending output does not pile up in memory.
      if (!writeStream.write(row))
        await once(writeStream, 'drain')
    }
    // end() flushes what is still buffered; wait until it has been written.
    writeStream.end()
    await finished(writeStream)
  } catch (err) {
    // Release the output descriptor when anything above failed.
    writeStream.destroy()
    throw err
  } finally {
    await Promise.all(files.map(f => f.close()))
  }
}

/**
 * Zip multiple async iterators in lockstep. Each yielded row holds one value
 * per iterator, with `undefined` standing in for iterators that are already
 * exhausted. Iteration stops as soon as every iterator is exhausted, so no
 * all-`undefined` row is ever produced.
 *
 * @template T
 * @param {AsyncIterator<T>[]} its - Iterators to advance together.
 * @returns {AsyncGenerator<(T | undefined)[], void, void>}
 */
async function* zipAsyncs(its) {
  while (true) {
    const results = await Promise.all(its.map(e => e.next()))
    // Check for exhaustion BEFORE yielding: the round in which every
    // iterator reports `done` carries no data and must not become a row.
    if (results.every(r => r.done))
      return
    yield results.map(r => (r.done ? undefined : r.value))
  }
}
© www.soinside.com 2019 - 2024. All rights reserved.