I have a very long file that I need to parse. Because it's so long, I need to do it chunk by chunk. I tried this:
function parseFile(file){
    var chunkSize = 2000;
    var fileSize = (file.size - 1);

    var foo = function(e){
        console.log(e.target.result);
    };

    for(var i = 0; i < fileSize; i += chunkSize) {
        (function(fil, start) {
            var reader = new FileReader();
            var blob = fil.slice(start, chunkSize + 1);
            reader.onload = foo;
            reader.readAsText(blob);
        })(file, i);
    }
}
When I run it, I only see the first chunk in the console. If I change the console.log to a jQuery append to some div, I only see the first chunk in that div. What about the other chunks? How do I make this work?
The FileReader API is asynchronous, so you should handle it with callbacks. A for loop won't do the trick, since it won't wait for each read to complete before reading the next chunk.
Here's a working approach:
function parseFile(file, callback) {
    var fileSize = file.size;
    var chunkSize = 64 * 1024; // bytes
    var offset = 0;
    var chunkReaderBlock = null;

    var readEventHandler = function (evt) {
        if (evt.target.error == null) {
            offset += evt.target.result.length; // note: counts characters, not bytes
            callback(evt.target.result); // callback for handling read chunk
        } else {
            console.log("Read error: " + evt.target.error);
            return;
        }
        if (offset >= fileSize) {
            console.log("Done reading file");
            return;
        }

        // off to the next chunk
        chunkReaderBlock(offset, chunkSize, file);
    }

    chunkReaderBlock = function (_offset, length, _file) {
        var r = new FileReader();
        var blob = _file.slice(_offset, length + _offset);
        r.onload = readEventHandler;
        r.readAsText(blob);
    }

    // now let's start the read with the first block
    chunkReaderBlock(offset, chunkSize, file);
}
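A minimal usage sketch (my addition, assuming a hypothetical <input type="file" id="file"> element on the page). Because the next read is only started inside the onload handler, the chunks arrive in order:

document.getElementById('file').addEventListener('change', function (evt) {
    parseFile(evt.target.files[0], function (chunk) {
        // each chunk arrives here as text, sequentially
        console.log('chunk:', chunk);
    });
});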
You can take advantage of Response (part of fetch) to convert most things into anything else (a blob, text, JSON), and also to get a ReadableStream that helps you read the blob in chunks 👍
var dest = new WritableStream({
    write (str) {
        console.log(str)
    }
})

var blob = new Blob(['bloby']);

(blob.stream ? blob.stream() : new Response(blob).body)
    // Decode the binary-encoded response to string
    .pipeThrough(new TextDecoderStream())
    .pipeTo(dest)
    .then(() => {
        console.log('done')
    })
Old answer (from before WritableStream's pipeTo and pipeThrough were implemented):
I came up with an interesting idea that is probably very fast, since it converts the blob to a ReadableByteStreamReader. It's probably easier too, since you don't need to handle things like chunk size and offset, and then do it all recursively in a loop.
function streamBlob(blob) {
    const reader = new Response(blob).body.getReader()
    const pump = reader => reader.read()
        .then(({ value, done }) => {
            if (done) return
            // uint8array chunk (use TextDecoder to read as text)
            console.log(value)
            return pump(reader)
        })
    return pump(reader)
}
streamBlob(new Blob(['bloby'])).then(() => {
    console.log('done')
})
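To turn those Uint8Array chunks into text, a streaming TextDecoder can be used (a sketch, my addition; { stream: true } keeps incomplete multi-byte sequences buffered between chunks):

const decoder = new TextDecoder()

function streamBlobAsText(blob) {
    const reader = new Response(blob).body.getReader()
    const pump = () => reader.read()
        .then(({ value, done }) => {
            if (done) { console.log(decoder.decode()); return } // flush any buffered bytes
            console.log(decoder.decode(value, { stream: true }))
            return pump()
        })
    return pump()
}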
The second argument of slice is actually the end byte, not a length. Your code should look something like this (though note the reads are still asynchronous, so the chunks may not arrive in order):
function parseFile(file){
    var chunkSize = 2000;
    var fileSize = (file.size - 1);

    var foo = function(e){
        console.log(e.target.result);
    };

    for(var i = 0; i < fileSize; i += chunkSize) {
        (function(fil, start) {
            var reader = new FileReader();
            var blob = fil.slice(start, chunkSize + start);
            reader.onload = foo;
            reader.readAsText(blob);
        })(file, i);
    }
}
The BlobReader library offers a simpler interface:
BlobReader(blob)
    .readText(function (text) {
        console.log('The text in the blob is', text);
    });
Revamping @alediaferia's answer as a class (a TypeScript version is also available) and returning the result in a promise. Brave coders would even have wrapped it into an async iterator… (a minimal sketch follows the usage example below).
class FileStreamer {
    constructor(file) {
        this.file = file;
        this.offset = 0;
        this.defaultChunkSize = 64 * 1024; // bytes
        this.rewind();
    }

    rewind() {
        this.offset = 0;
    }

    isEndOfFile() {
        return this.offset >= this.getFileSize();
    }

    readBlockAsText(length = this.defaultChunkSize) {
        const fileReader = new FileReader();
        const blob = this.file.slice(this.offset, this.offset + length);

        return new Promise((resolve, reject) => {
            fileReader.onloadend = (event) => {
                const target = event.target;
                if (target.error == null) {
                    const result = target.result;
                    this.offset += result.length; // note: counts characters, not bytes
                    this.testEndOfFile();
                    resolve(result);
                } else {
                    reject(target.error);
                }
            };

            fileReader.readAsText(blob);
        });
    }

    testEndOfFile() {
        if (this.isEndOfFile()) {
            console.log('Done reading file');
        }
    }

    getFileSize() {
        return this.file.size;
    }
}
An example that prints a whole file to the console (within an async context):
const fileStreamer = new FileStreamer(aFile);

while (!fileStreamer.isEndOfFile()) {
    const data = await fileStreamer.readBlockAsText();
    console.log(data);
}
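And, following the async-iterator hint above, a sketch of such a wrapper (my addition, not part of the original class):

async function* streamFileAsText(file) {
    const streamer = new FileStreamer(file);
    while (!streamer.isEndOfFile()) {
        yield await streamer.readBlockAsText();
    }
}

// usage, within an async context:
for await (const chunk of streamFileAsText(aFile)) {
    console.log(chunk);
}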
A simple approach to parsing a large file into small chunks:
// Parse large file into small chunks
var parseFile = function (file) {
    var chunkSize = 1024 * 1024 * 16; // 16MB chunk size
    var fileSize = file.size;
    var currentChunk = 1;
    var totalChunks = Math.ceil(fileSize / chunkSize); // Math.ceil takes a single argument

    while (currentChunk <= totalChunks) {
        var offset = (currentChunk - 1) * chunkSize;
        var currentFilePart = file.slice(offset, offset + chunkSize);

        console.log('Current chunk number is ', currentChunk);
        console.log('Current chunk data', currentFilePart);

        currentChunk++;
    }
};
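Note that slice() only creates a lightweight Blob reference; no file data is actually read until you ask for the bytes. A sketch of reading one of those chunks as text (my addition, using the standard Blob.text() method):

var firstPart = file.slice(0, 1024 * 1024 * 16); // same 16MB window as above
firstPart.text().then(function (text) {
    console.log('First chunk contents:', text);
});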
Getting the chunks by blindly slicing the blob at byte offsets can cut a multi-byte character in half and corrupt the encoding. TextDecoderStream exists to help with this.
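A quick demonstration of the problem (my addition; Blob encodes strings as UTF-8, where this emoji takes 4 bytes):

const blob = new Blob(['😀']);  // 4 bytes in UTF-8
blob.slice(0, 2).text().then((t) => {
    console.log(t);             // logs a replacement character (�), not half an emoji
});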
Here's a solution implemented as an async generator function:
async function* read_chunks(file, chunk_size = 1000000) {
    let offset = 0;
    const stream = new ReadableStream({
        async pull(controller) {
            let chunk = file.slice(offset, offset + chunk_size);
            chunk = await chunk.arrayBuffer();
            chunk = new Uint8Array(chunk);
            controller.enqueue(chunk);
            offset += chunk.length; // advance before the end-of-file check
            if (offset >= file.size) {
                controller.close();
            }
        }
    }).pipeThrough(new TextDecoderStream());

    const reader = stream.getReader();
    for (;;) {
        const { done, value } = await reader.read();
        if (done) return;
        yield value;
    }
}
const file = new Blob(["000😀001😀002😀003😀"]);

(async () => {
    for await (const chunk of read_chunks(file, 4)) {
        console.log(`Chunk: [${chunk.length}] "${chunk}"`);
    }
})();
If you don't care about the size of the chunks, you can get rid of the custom ReadableStream by replacing it with file.stream().
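For reference, a sketch of that simplification (my addition; the browser then picks the chunk sizes for you):

async function* read_chunks_auto(file) {
    const reader = file.stream().pipeThrough(new TextDecoderStream()).getReader();
    for (;;) {
        const { done, value } = await reader.read();
        if (done) return;
        yield value;
    }
}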