基本的 Unix `paste` 命令可以用 Python 这样实现(该示例仅适用于两个文件;Unix `paste` 适用于多个文件):
def paste(fn1, fn2):
    """Print the lines of two text files side by side, separated by a tab.

    Line i of the output is line i of ``fn1``, a tab, then line i of ``fn2``.
    When one file runs out of lines before the other, its column is left
    empty but the tab separator is still printed.

    Args:
        fn1: Path of the file that supplies the first column.
        fn2: Path of the file that supplies the second column.
    """
    with open(fn1) as f1, open(fn2) as f2:
        for l1 in f1:
            # readline() returns "" at end of file (never None), so an
            # exhausted second file simply contributes an empty column.
            l2 = f2.readline()
            # Strip only a real trailing newline: slicing with [:-1] would
            # drop a character from a final line that has no newline.
            print(l1.rstrip("\n") + "\t" + l2.rstrip("\n"))
        # Whatever is left in the second file gets an empty first column.
        for l2 in f2:
            print("\t" + l2.rstrip("\n"))
import sys
if __name__ == "__main__":
    # Script entry point: the first two command-line arguments are the files
    # to paste together; with fewer than two arguments nothing happens.
    cli_args = sys.argv[1:]
    if len(cli_args) >= 2:
        paste(cli_args[0], cli_args[1])
任务是在 Node.js 中实现相同的功能。 重要的是,由于输入文件可能很大,因此实现应该逐行读取输入文件,而不是将整个文件读入内存。我想看看如何使用内置 Node 功能来实现这一目标,而无需外部包。
请注意,使用同步 I/O 实现 Unix `paste` 很容易,如 Python 示例所示,但 Node 不提供用于按行读取的同步 I/O。同时,虽然有一些方法可以使用异步 I/O 逐行读取单个文件,但同时读取两个文件会比较困难,因为两个流并不同步。
到目前为止,我能想到的唯一解决方案是使用基本的 read API 实现同步行读取。 Dave Newton 在评论中指出,npm 的 n-readlines 包用 100 多行代码实现了这种方法。因为 n-readlines 会检查每个字节以查找行结尾,所以我怀疑它效率低下,因此做了一个微基准测试(microbenchmark),结果如下表所示。对于单纯的行读取(并非本任务本身),n-readlines 比 Node 内置的 readline 实现慢约 3 倍,并且比 Python、Perl 或 mawk 中的内置行读取慢一个数量级。
实现 Unix `paste` 的正确方法是什么?n-readlines 使用同步 API。一个好的异步解决方案会更简洁、更快吗?
语言 | 运行时 | 版本 | 耗时 | 用户 | 系统 | 代码 |
---|---|---|---|---|---|---|
JavaScript | Node | 21.5.0 | 6.30 | 5.33 | 0.90 | lc-node.js |
JavaScript | Node | 21.5.0 | 22.34 | 20.41 | 2.24 | lc-n-readlines.js |
JavaScript | Bun | 1.0.20 | 4.91 | 5.30 | 1.47 | lc-node.js |
JavaScript | Bun | 1.0.20 | 21.16 | 19.22 | 3.37 | lc-n-readlines.js |
JavaScript | k8 | 1.0 | 1.49 | 1.06 | 0.37 | lc-k8.js |
C | clang | 15.0.0 | 0.71 | 0.35 | 0.35 | lc-c.c |
Python | Python | 2017年11月3日 | 3.48 | 2.85 | 0.62 | lc-python.py |
perl | perl | 5.34.3 | 1.70 | 1.13 | 0.57 | lc-perl.pl |
awk | mawk | 1.3.4 | 2.08 | 1.27 | 0.80 | lc-awk.awk |
awk | Apple awk | ? | 90.06 | 87.90 | 1.12 | lc-awk.awk |
我正在发布基于 n-readlines 的答案。它冗长且效率低下(请参阅问题中的表格),但它解决了问题。我仍在寻找更好的解决方案。
const fs = require('fs');
// the following implementation is copied from https://github.com/nacholibre/node-readlines (the n-readlines package)
// Written by Liucw and distributed under the MIT license
/**
 * Synchronous line reader: pulls fixed-size chunks from a file with
 * `fs.readSync` and hands back one line per `next()` call, so the whole
 * file is never held in memory at once.
 */
class LineByLine {
    /**
     * @param {string|number} file - Path to open for reading, or an
     *   already-open numeric file descriptor.
     * @param {{readChunk?: number, newLineCharacter?: string}} [options]
     *   `readChunk` is the number of bytes requested per read (default 1024);
     *   only the first character of `newLineCharacter` is used as the line
     *   terminator (default "\n").
     */
    constructor(file, options) {
        // Work on a copy: the caller's object must stay untouched so it can
        // be reused for another reader (the terminator is converted to a
        // char code below, which would break a second construction).
        const settings = { ...options };
        if (!settings.readChunk) settings.readChunk = 1024;
        settings.newLineCharacter = settings.newLineCharacter
            ? settings.newLineCharacter.charCodeAt(0)
            : 0x0a; // "\n"

        this.fd = typeof file === 'number' ? file : fs.openSync(file, 'r');
        this.options = settings;
        this.newLineCharacter = settings.newLineCharacter;
        this.reset();
    }

    /**
     * Find the first occurrence of a byte value in a buffer.
     * @param {Buffer} buffer
     * @param {number} hexNeedle - Byte value to look for.
     * @returns {number} Index of the first match, or -1 when absent.
     */
    _searchInBuffer(buffer, hexNeedle) {
        for (let i = 0; i < buffer.length; i++) {
            if (buffer[i] === hexNeedle) return i;
        }
        return -1;
    }

    /** Rewind to the start of the file and drop any cached lines. */
    reset() {
        this.eofReached = false;
        this.linesCache = [];
        this.fdPosition = 0;
    }

    /** Close the descriptor; later `next()` calls return `false`. */
    close() {
        fs.closeSync(this.fd);
        this.fd = null;
    }

    /**
     * Split a buffer into lines. Every returned line keeps its terminator;
     * a trailing piece without terminator is returned as the last element.
     * @param {Buffer} buffer
     * @returns {Buffer[]}
     */
    _extractLines(buffer) {
        const lines = [];
        let lineStart = 0;
        for (let i = 0; i < buffer.length; i++) {
            if (buffer[i] === this.newLineCharacter) {
                lines.push(buffer.subarray(lineStart, i + 1));
                lineStart = i + 1;
            }
        }
        if (lineStart < buffer.length) {
            lines.push(buffer.subarray(lineStart));
        }
        return lines;
    }

    /**
     * Read chunks until one contains a line terminator or nothing more can
     * be read, then refill `linesCache` from the data.
     * @param {Buffer} [lineLeftovers] - Unterminated start of a line from the
     *   previous read; it is prepended to the first line of the new data.
     * @returns {number} Total number of bytes read by this call.
     */
    _readChunk(lineLeftovers) {
        const chunkSize = this.options.readChunk;
        const buffers = [];
        let totalBytesRead = 0;
        let bytesRead;
        let chunk;

        do {
            const readBuffer = Buffer.alloc(chunkSize);
            bytesRead = fs.readSync(this.fd, readBuffer, 0, chunkSize, this.fdPosition);
            totalBytesRead += bytesRead;
            this.fdPosition += bytesRead;
            // Keep only the bytes that were actually read, so padding is
            // neither searched for a terminator nor mixed into the data.
            chunk = readBuffer.subarray(0, bytesRead);
            buffers.push(chunk);
        } while (bytesRead && this._searchInBuffer(chunk, this.newLineCharacter) === -1);

        // A read shorter than requested is taken as end of file.
        if (bytesRead < chunkSize) {
            this.eofReached = true;
        }

        if (totalBytesRead) {
            this.linesCache = this._extractLines(Buffer.concat(buffers));
            if (lineLeftovers) {
                this.linesCache[0] = Buffer.concat([lineLeftovers, this.linesCache[0]]);
            }
        }
        return totalBytesRead;
    }

    /**
     * Return the next line without its terminator.
     * @returns {Buffer|false} The line as a Buffer (an empty Buffer for an
     *   empty line), or `false` once the file is exhausted or closed.
     */
    next() {
        // Compare with null explicitly: descriptor 0 is a valid, falsy fd.
        if (this.fd == null) return false;

        let line = false;
        if (this.eofReached && this.linesCache.length === 0) {
            return line;
        }

        if (!this.linesCache.length) {
            this._readChunk();
        }

        if (this.linesCache.length) {
            line = this.linesCache.shift();
            // An unterminated line is the tail of the cached data: read on
            // and take the completed line instead, if more data exists.
            if (line[line.length - 1] !== this.newLineCharacter) {
                const bytesRead = this._readChunk(line);
                if (bytesRead) {
                    line = this.linesCache.shift();
                }
            }
        }

        // Nothing left to hand out: release the descriptor.
        if (this.eofReached && this.linesCache.length === 0) {
            this.close();
        }

        if (line && line[line.length - 1] === this.newLineCharacter) {
            line = line.subarray(0, line.length - 1);
        }
        return line;
    }
}
/**
 * Print the lines of two files side by side, separated by a tab.
 *
 * When the second file runs out first, the remaining lines of the first
 * file are printed alone; when the first file runs out first, the remaining
 * lines of the second file are printed after a leading tab.
 *
 * @param {string[]} args - Command-line arguments: the two input file paths.
 */
function main(args) {
    if (args.length < 2) {
        console.log("Usage: node lc-n-readlines.js <in1.txt> <in2.txt>");
        return;
    }
    const f1 = new LineByLine(args[0]);
    const f2 = new LineByLine(args[1]);
    let l1;
    let l2;
    // next() returns a Buffer per line and `false` once a file is exhausted.
    while ((l1 = f1.next()) !== false) {
        if ((l2 = f2.next()) !== false) {
            console.log(`${l1}\t${l2}`);
        } else {
            // Convert to text explicitly: console.log(buffer) would print
            // the "<Buffer 61 62 ...>" inspection form, not the line.
            console.log(l1.toString());
        }
    }
    while ((l2 = f2.next()) !== false) {
        console.log(`\t${l2}`);
    }
}
main(process.argv.splice(2));
import { open as fsOpenAsync } from 'node:fs/promises'
import { createWriteStream } from 'node:fs'
// Example invocation: paste three input files column-wise into one output file.
const outname = 'out.txt'
const filenames = [
  'a.txt',
  'b.txt',
  'c.txt',
]
await paste(filenames, outname)
/**
 * Read multiple files line by line and write lines concatenated by `\t`
 *
 * Row i of the output is line i of every input file, in the order given;
 * a file that has run out of lines contributes an empty column.
 * Resolves once the output stream has closed; rejects if reading or
 * writing fails. Input file handles are closed in either case.
 *
 * @param {string[]} from Paths of the input files, one per output column
 * @param {string} to Path of the output file (opened with flag `w`)
 * @returns {Promise<void>}
 */
async function paste(from, to) {
  // Open the files named by the `from` argument, not a module-level list
  const files = await Promise.all(from.map(fn => fsOpenAsync(fn)))
  try {
    const zip = zipAsyncs(files.map(f => f.readLines()[Symbol.asyncIterator]()))
    const writeStream = createWriteStream(to, { flags: 'w' })
    // Settles when the stream has closed or failed
    const closed = new Promise((resolve, reject) => {
      writeStream.on('error', reject)
      writeStream.once('close', resolve)
    })
    // Mark the promise as handled; a failure still surfaces where `closed`
    // is awaited or raced below
    closed.catch(() => {})
    try {
      for await (const lines of zip) {
        const flushed = writeStream.write(`${lines.map(e => e ?? '').join('\t')}\n`)
        if (!flushed) {
          // Internal buffer is full: stop reading until it drains, so that
          // memory use stays bounded for large inputs
          await Promise.race([
            new Promise(resolve => writeStream.once('drain', resolve)),
            closed,
          ])
        }
      }
      writeStream.end()
      await closed
    } catch (err) {
      writeStream.destroy()
      throw err
    }
  } finally {
    await Promise.all(files.map(f => f.close()))
  }
}
/**
 * Zip multiple async iterators into rows, one value per iterator.
 *
 * Iteration continues until every iterator is exhausted; in each row an
 * already exhausted iterator is represented by `undefined`. No row is
 * produced once all iterators are done, and an empty list yields nothing.
 *
 * @template T
 * @param {AsyncIterator<T>[]} its
 * @returns {AsyncGenerator<(T | undefined)[]>}
 */
async function* zipAsyncs(its) {
  // Remember finished iterators so that next() is not called on them again
  const exhausted = its.map(() => false)
  while (true) {
    const results = await Promise.all(
      its.map((it, i) => (exhausted[i] ? { done: true, value: undefined } : it.next()))
    )
    results.forEach((r, i) => {
      if (r.done) exhausted[i] = true
    })
    // Check before yielding: otherwise the last row would be all `undefined`
    if (exhausted.every(Boolean))
      return
    yield results.map(r => (r.done ? undefined : r.value))
  }
}