我使用了 Tanaike 提供的 Google Apps Script 代码,将多个 PDF 文件合并为一个。我的需求是:
- 获取文件 ID
- 检查它是 PDF 还是 DOCX
- 如果是 DOCX,先转换成 PDF,然后再合并
这是修改后的片段:
/**
 * Attempts to merge the Drive files listed in cell B2 of `sheet` into one PDF
 * named "sample2.pdf" and move it into the destination folder.
 *
 * Cell B2 is read as a comma-separated list of Drive URLs containing
 * `/file/d/<fileId>/edit`. PDF files are read as-is; DOCX files go through the
 * conversion branch below.
 *
 * NOTE(review): the DOCX branch does not produce PDF bytes. It extracts the
 * document's plain text and stores that text in a file that is merely labelled
 * "application/pdf", so pdf-lib later receives text instead of a PDF.
 *
 * Relies on `sheet` and the `Drive` advanced service being available from the
 * surrounding project; neither is defined in this function.
 */
async function newMain() {
// Retrieve PDF data.
var destinationFolder = DriveApp.getFolderById("your-folder-id")
var urls = sheet.getRange(2,2).getValue().toString().split(","); //split the urls joined by commas
// Each URL becomes a one-element array [fileId], or an empty array when the
// pattern does not match.
const ids = urls.map((url) => {
const matches = url.match(/\/file\/d\/([^\/]+)\/edit/); //get file id from urls
return matches ? [matches[1]] : [];
});
// Destructures the one-element arrays built above; for an empty array `id` is
// undefined and is passed to getFileById unchanged.
const data = ids.map(([id]) => {
const file = DriveApp.getFileById(id);
const mimeType = file.getMimeType();
// Check if the file is a DOCX
if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
var blob = file.getBlob();
// Upload a converted copy of the DOCX; it is opened as a Google Document below.
var tempfile = Drive.Files.insert({}, blob, {convert:true});
// Redeclares the callback parameter `id`: from here on it is the temporary file's ID.
var id = tempfile["id"];
var doc = DocumentApp.openById(id);
var body = doc.getBody();
var text = body.getText();
// NOTE(review): the file content is the plain text from the document body; only
// the MIME type says "application/pdf". No PDF rendering happens here.
var pdfFile = destinationFolder.createFile(`temp.pdf`, text, "application/pdf");
// Get the PDF content
const pdfData = pdfFile.getBlob().getBytes();
// Delete the temporary converted document.
Drive.Files.remove(id);
// Remove the temporary PDF file
DriveApp.getFileById(pdfFile.getId()).setTrashed(true);
return new Uint8Array(pdfData);
} else if (mimeType === 'application/pdf') {
// File is already a PDF, fetch its content
return new Uint8Array(file.getBlob().getBytes());
}
// Any other MIME type falls through, so the callback returns undefined for it.
});
// Load pdf-lib
const cdnjs = "https://cdn.jsdelivr.net/npm/pdf-lib/dist/pdf-lib.min.js";
// The fetched library source is patched before being evaluated: every
// setTimeout(...) call matched by the regex is replaced with a Utilities.sleep
// call followed by `return t();`.
// NOTE(review): presumably needed because setTimeout is unavailable in this
// runtime — confirm. This also executes remotely fetched code via eval.
eval(UrlFetchApp.fetch(cdnjs).getContentText().replace(/setTimeout\(.*?,.*?(\d*?)\)/g, "Utilities.sleep($1);return t();"));
// Merge PDFs.
const pdfDoc = await PDFLib.PDFDocument.create();
for (let i = 0; i < data.length; i++) {
const pdfData = await PDFLib.PDFDocument.load(data[i]);
// Copy the pages of the loaded document into the merged document one by one.
for (let j = 0; j < pdfData.getPageCount(); j++) {
const [page] = await pdfDoc.copyPages(pdfData, [j]);
pdfDoc.addPage(page);
}
}
const bytes = await pdfDoc.save();
// Create a PDF file.
// The saved bytes are viewed as signed 8-bit values and spread into a plain array
// before building the blob (presumably the byte format newBlob expects — confirm).
var file = DriveApp.createFile(Utilities.newBlob([...new Int8Array(bytes)], MimeType.PDF, "sample2.pdf"));
file.moveTo(destinationFolder);
}
当我运行它时,我收到以下错误:
错误:Failed to parse PDF document (line:368 col:0 offset=18311): No PDF header found e
e
e.parseHeader
eval
eval
eval
eval
我
e.parseDocument
由于我还是初学者,无法自行解决或改进它,非常感谢任何这方面的指导。
当 `ids` 的值是 DOCX 文件的有效文件 ID 时,我认为您脚本中的 `pdfData` 并不是 PDF 数据。您的脚本读取的是文档的文本数据,并把这些文本保存成了一个文本文件,其中并不包含 PDF 数据。我认为这可能就是您当前问题的原因。既然如此,下面的修改如何?
/**
 * Merges the Drive files listed in cell B2 of `sheet` into a single PDF.
 *
 * Cell B2 is read as a comma-separated list of Drive URLs containing
 * `/file/d/<fileId>/edit`. Blank entries (for example after a trailing comma)
 * are ignored. PDF files are used as-is. DOCX files are uploaded as a
 * temporary converted document, that document's blob is read as the PDF data,
 * and the temporary document is always removed afterwards.
 *
 * The merged result is created as "sample2.pdf" and moved into the folder
 * "your-folder-id".
 *
 * Relies on `sheet` and the `Drive` advanced service being available from the
 * surrounding project; neither is defined in this function.
 *
 * @returns {Promise<void>} Resolves once the merged file has been moved.
 * @throws {Error} If the cell contains no URLs, if a URL does not contain a
 *     Drive file ID, or if a file is neither a PDF nor a DOCX.
 */
async function newMain() {
  const DOCX_MIME_TYPE = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
  const PDF_MIME_TYPE = 'application/pdf';

  const destinationFolder = DriveApp.getFolderById("your-folder-id");

  // Extract the file IDs. Failing here with the offending URL is clearer than
  // letting an undefined ID reach DriveApp.getFileById.
  const ids = sheet
    .getRange(2, 2)
    .getValue()
    .toString()
    .split(",")
    .map((url) => url.trim())
    .filter((url) => url !== "")
    .map((url) => {
      const matches = url.match(/\/file\/d\/([^\/]+)\/edit/);
      if (!matches) {
        throw new Error(`Cannot extract a Drive file ID from URL: ${url}`);
      }
      return matches[1];
    });
  if (ids.length === 0) {
    throw new Error("No file URLs found in cell B2.");
  }

  // Retrieve PDF data for every file, converting DOCX files on the way.
  const data = ids.map((id) => {
    const file = DriveApp.getFileById(id);
    const mimeType = file.getMimeType();
    if (mimeType === PDF_MIME_TYPE) {
      return new Uint8Array(file.getBlob().getBytes());
    }
    if (mimeType === DOCX_MIME_TYPE) {
      const tempFile = Drive.Files.insert({}, file.getBlob(), { convert: true });
      try {
        // The blob of the converted document is what gets merged as PDF data.
        return new Uint8Array(DriveApp.getFileById(tempFile.id).getBlob().getBytes());
      } finally {
        // Never leave the temporary converted document behind, even on failure.
        Drive.Files.remove(tempFile.id);
      }
    }
    // Returning undefined here would only fail later, inside pdf-lib, with an
    // unrelated-looking parse error.
    throw new Error(`Unsupported file type "${mimeType}" for file ID ${id}; only PDF and DOCX files can be merged.`);
  });

  // Load pdf-lib. The library source is patched so that its setTimeout(...)
  // calls become Utilities.sleep(...) calls before it is evaluated.
  // NOTE(security): this executes code fetched at run time from an unpinned
  // CDN URL. Consider pinning an exact version or bundling the library.
  const cdnjs = "https://cdn.jsdelivr.net/npm/pdf-lib/dist/pdf-lib.min.js";
  eval(UrlFetchApp.fetch(cdnjs).getContentText().replace(/setTimeout\(.*?,.*?(\d*?)\)/g, "Utilities.sleep($1);return t();"));

  // Merge PDFs: copy all pages of each source document in one call.
  const pdfDoc = await PDFLib.PDFDocument.create();
  for (const pdfBytes of data) {
    const sourceDoc = await PDFLib.PDFDocument.load(pdfBytes);
    const pages = await pdfDoc.copyPages(sourceDoc, sourceDoc.getPageIndices());
    pages.forEach((page) => pdfDoc.addPage(page));
  }
  const bytes = await pdfDoc.save();

  // Create the merged PDF file. The saved bytes are viewed as signed 8-bit
  // values and spread into a plain array before building the blob.
  const mergedFile = DriveApp.createFile(Utilities.newBlob([...new Int8Array(bytes)], MimeType.PDF, "sample2.pdf"));
  mergedFile.moveTo(destinationFolder);
}
在这个修改中,当 `ids` 的值是 DOCX 文件的有效文件 ID 时,DOCX 文件会先被转换为 Google 文档,再由 Google 文档转换为 PDF 数据。然后,各个 PDF 数据被合并。
注意:此脚本假定 `ids` 的值是 DOCX 文件的有效文件 ID。请注意这一点。