我使用 Google Cloud Vision 对 PDF 进行 OCR,并从 Firebase Storage 获取其 JSON 输出。然后,我从 JSON 中提取全文,统计单词数,并将单词数保存到 Firestore。奇怪的是,即使是同一个文档,每次尝试得到的字数也不一样。不仅如此,字数似乎会被多次保存到 Firestore 的同一个文档中,直到某个函数(我不确定是哪个,也许是统计字数的函数)执行完毕为止。Firestore 为什么会不止一次地把数据保存到同一个文档,这也让我很困惑。有谁知道发生了什么,以及我怎样才能只保存一次字数?以下是 Firebase 云函数中的代码:
// Handles one Vision OCR result file ("output*.json") that landed in Storage:
// extracts the text, word count, text confidence and detected languages, and
// writes them to the matching Firestore document under `Clients`.
//
// NOTE(review): this handler runs once per uploaded JSON. If the OCR result of
// a single PDF is split across several JSON files, every run replaces the
// fields below with the values of the JSON it processed, so the stored word
// count reflects whichever file was handled last, not the whole PDF.
if (path.basename(object.name).startsWith('output') && path.basename(object.name).endsWith('.json')) {
  // Download and parse the OCR result.
  const bucket = admin.storage().bucket(object.bucket);
  // download() resolves to an array whose first element is the file contents.
  const [contents] = await bucket.file(object.name).download();
  const jsObject = JSON.parse(contents.toString());

  // A response may lack `fullTextAnnotation` (e.g. nothing was recognised),
  // so those entries are skipped instead of crashing the function.
  const responses = Array.isArray(jsObject.responses) ? jsObject.responses : [];
  const annotations = responses
    .map((response) => response.fullTextAnnotation)
    .filter(Boolean);
  const ocrPages = annotations.flatMap((annotation) => annotation.pages || []);

  // Text: join with a space so no separator characters leak into the text.
  const fullTextReady = annotations
    .map((annotation) => annotation.text || '')
    .join(' ');

  // Count words: whitespace-separated tokens; blank text counts as 0 words.
  const countWords = (str) => {
    const trimmed = str.trim();
    return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
  };
  const words = countWords(fullTextReady);

  // Text confidence: mean page confidence as a whole-number percentage,
  // or null when no page reports a confidence.
  const pageConfidences = ocrPages
    .map((page) => page.confidence)
    .filter((value) => Number.isFinite(value));
  const confidenceSum = pageConfidences.reduce((sum, value) => sum + value, 0);
  const textConfidence = pageConfidences.length > 0
    ? Math.round((confidenceSum / pageConfidences.length) * 100)
    : null;

  // Language and language confidence. `property` is optional on a page.
  const languages = ocrPages
    .flatMap((page) => (page.property && page.property.detectedLanguages) || [])
    .map((language) => ({
      languageCode: language.languageCode,
      // Rounded to one decimal, then expressed as a percentage; Math.round
      // removes floating-point noise such as 70.00000000000001.
      languageConfidence: Math.round(Number((Number(language.confidence) || 0).toFixed(1)) * 100),
    }));

  // Save to Firestore. The document path is derived from the JSON's folder:
  // <order>/<results>/<fileName>/output*.json -> Clients/<order>/<fileName>.
  const jsonLocation = path.dirname(object.name);
  const fileName = path.basename(jsonLocation);
  const results = path.dirname(jsonLocation);
  const order = path.dirname(results);
  const destination = `${order}/${fileName}`;
  const docRef = db.collection('Clients').doc(destination);
  await docRef.set({
    fullText: fullTextReady,
    textConfidence,
    type: 'application/pdf',
    pageCount: responses.length,
    languages,
    fileName,
    location: jsonLocation,
    wordCount: words,
  }, { merge: true });
}
我已经弄清楚了。Google Vision API 会为 PDF 文件的每 20 页生成一个单独的 JSON。例如,如果您的 PDF 有 34 页,它会生成 2 个 JSON;同样,如果有 100 页,它会生成 5 个 JSON。我的云函数在每个 JSON 写入时都会运行一次,因此后一次运行会直接覆盖前一个 JSON 的信息。解决方案是把所有 JSON 的值累加起来。请记住,Firebase 云函数不保证事件的顺序,因此您必须在函数中判断当前处理的是哪一个 JSON。幸运的是,Vision API 会对 JSON 进行编号,如下所示:1-3、2-3、3-3。希望这对大家有帮助。以下是我更新后的代码:
// Handles one Vision OCR result shard ("output*.json") that landed in Storage
// and folds its text, word count, page count, text confidence and detected
// languages into the matching Firestore document under `Clients`.
//
// A PDF's OCR result can be split across several JSON shards, and the handler
// runs once per shard with no ordering guarantee. The merge therefore happens
// inside a transaction (so concurrent shards cannot overwrite each other) and
// each shard's file name is recorded in `jsonArray` so a shard that is
// delivered twice is only counted once.
if (path.basename(object.name).startsWith('output') && path.basename(object.name).endsWith('.json')) {
  const shardName = path.basename(object.name);

  // Download and parse the OCR result.
  const bucket = admin.storage().bucket(object.bucket);
  // download() resolves to an array whose first element is the file contents.
  const [contents] = await bucket.file(object.name).download();
  const jsObject = JSON.parse(contents.toString());

  // A response may lack `fullTextAnnotation` (e.g. nothing was recognised),
  // so those entries are skipped instead of crashing the function.
  const responses = Array.isArray(jsObject.responses) ? jsObject.responses : [];
  const annotations = responses
    .map((response) => response.fullTextAnnotation)
    .filter(Boolean);
  const ocrPages = annotations.flatMap((annotation) => annotation.pages || []);
  const shardPageCount = responses.length;

  // Text: join with a space so no separator characters leak into the text.
  const shardText = annotations
    .map((annotation) => annotation.text || '')
    .join(' ');

  // Count words: whitespace-separated tokens; blank text counts as 0 words.
  const countWords = (str) => {
    const trimmed = str.trim();
    return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
  };
  const shardWordCount = countWords(shardText);

  // Text confidence: mean page confidence as a whole-number percentage,
  // or null when no page in this shard reports a confidence.
  const pageConfidences = ocrPages
    .map((page) => page.confidence)
    .filter((value) => Number.isFinite(value));
  const confidenceSum = pageConfidences.reduce((sum, value) => sum + value, 0);
  const shardConfidence = pageConfidences.length > 0
    ? Math.round((confidenceSum / pageConfidences.length) * 100)
    : null;

  // Language and language confidence. `property` is optional on a page.
  const shardLanguages = ocrPages
    .flatMap((page) => (page.property && page.property.detectedLanguages) || [])
    .map((language) => ({
      languageCode: language.languageCode,
      // Rounded to one decimal, then expressed as a percentage; Math.round
      // removes floating-point noise such as 70.00000000000001.
      languageConfidence: Math.round(Number((Number(language.confidence) || 0).toFixed(1)) * 100),
    }));

  // The document path is derived from the JSON's folder:
  // <order>/<results>/<fileName>/output*.json -> Clients/<order>/<fileName>.
  const jsonLocation = path.dirname(object.name);
  const fileName = path.basename(jsonLocation);
  const results = path.dirname(jsonLocation);
  const order = path.dirname(results);
  const destination = `${order}/${fileName}`;
  const docRef = db.collection('Clients').doc(destination);

  // Read-modify-write must be atomic: two shards of the same PDF can be
  // processed at the same time. The callback may be retried on contention,
  // so it only derives values from the snapshot it just read.
  await db.runTransaction(async (transaction) => {
    const snapshot = await transaction.get(docRef);
    // data() is undefined when the document does not exist yet.
    const existing = (snapshot.exists && snapshot.data()) || {};
    const processedShards = Array.isArray(existing.jsonArray) ? existing.jsonArray : [];

    // Events can be delivered more than once; never count a shard twice.
    if (processedShards.includes(shardName)) {
      console.log(`OCR shard ${shardName} was already merged into ${destination}; skipping.`);
      return;
    }

    // First shard for this document: nothing has been recorded yet. The
    // `fullText` check keeps documents written without `jsonArray` mergeable.
    const isFirstShard = processedShards.length === 0 && !existing.fullText;
    if (isFirstShard) {
      transaction.set(docRef, {
        textConfidence: shardConfidence,
        type: 'application/pdf',
        pageCount: shardPageCount,
        languages: shardLanguages,
        fileName,
        location: jsonLocation,
        wordCount: shardWordCount,
        fullText: shardText,
        jsonArray: [shardName],
      }, { merge: true });
      return;
    }

    // Combining page counts.
    const previousPageCount = Number(existing.pageCount) || 0;
    const combinedPages = previousPageCount + shardPageCount;

    // Combining text confidences as a page-weighted mean, so the result does
    // not depend on how many shards there are or in which order they arrive.
    const previousConfidence = existing.textConfidence == null
      ? NaN
      : Number(existing.textConfidence);
    let combinedConfidence = shardConfidence;
    if (Number.isFinite(previousConfidence)) {
      if (shardConfidence === null || combinedPages === 0) {
        combinedConfidence = previousConfidence;
      } else {
        const weighted = (previousConfidence * previousPageCount + shardConfidence * shardPageCount) / combinedPages;
        combinedConfidence = Math.round(weighted * 100) / 100;
      }
    }

    // Combining texts. Shards are appended in arrival order, which is not
    // guaranteed to be page order.
    const combinedText = [existing.fullText, shardText].filter(Boolean).join(' ');

    // Combining language arrays.
    const previousLanguages = Array.isArray(existing.languages) ? existing.languages : [];

    transaction.set(docRef, {
      textConfidence: combinedConfidence,
      pageCount: combinedPages,
      languages: [...previousLanguages, ...shardLanguages],
      wordCount: (Number(existing.wordCount) || 0) + shardWordCount,
      fullText: combinedText,
      jsonArray: [...processedShards, shardName],
    }, { merge: true });
  });
}