我想使用 pdf.js 将 pdf 转换为 html 页面。 Pdf.js 在浏览器中执行此操作,但是否可以在后端获取浏览器渲染的 html 页面,从而将 n 个页面的 pdf 转换为 n 个 html 文件。我使用 Node.js 作为后端。我尝试过 pdf2html 和其他类似的 npm 模块,它们效果不佳,并且在某些 pdf 上存在问题。谢谢您的建议。
也许我发现了类似的东西 - 我正在使用本地 PDF 文件和浏览器。我对现成的viewer.js / PDF.js做了一些小改动,应该可以使用Node.js和浏览器进行处理。
下面的脚本会把命令行参数指定的 PDF 嵌入到 viewer.js(Webpack 打包文件)中,然后启动浏览器。
// Embeds the PDF named by the first command-line argument into viewer.js as a
// data URI, adds "file://" to the HOSTED_VIEWER_ORIGINS list and then opens
// viewer.html through the shell.
const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const chp = require('child_process');
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
datauri(pdf, (err, content) => {
  if (err) {
    throw err;
  }
  const viewerJSpath = path.join(__dirname, './viewer.js');
  const wp = fs.readFileSync(viewerJSpath, 'utf-8');
  // Name of the default document that gets replaced by the data URI.
  const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
  const srcStart = wp.indexOf(pdfName);
  if (srcStart < 0) {
    // This script overwrites viewer.js in place, so a second run no longer
    // finds the placeholder; fail loudly instead of corrupting the file.
    throw new Error(`"${pdfName}" not found in ${viewerJSpath}; restore the original viewer.js before running again`);
  }
  const srcEnd = srcStart + pdfName.length;
  const originsStart = wp.indexOf('HOSTED_VIEWER_ORIGINS');
  if (originsStart < 0) {
    throw new Error(`HOSTED_VIEWER_ORIGINS not found in ${viewerJSpath}`);
  }
  // Position of the "]" that closes the origins list.
  const originsEnd = wp.indexOf(']', originsStart);
  if (originsEnd < srcEnd) {
    // The splice below requires the list to end after the PDF placeholder.
    throw new Error(`unexpected layout of ${viewerJSpath}: origins list does not end after the PDF placeholder`);
  }
  const patched = wp.slice(0, srcStart) + content +
    wp.slice(srcEnd, originsEnd) + ', "file://"' +
    wp.slice(originsEnd);
  fs.writeFileSync(viewerJSpath, patched, 'utf-8');
  const c = path.join(__dirname, 'viewer.html');
  // Quote the path so that directories containing spaces still work.
  chp.execSync(`"${c}"`);
});
然后尝试在 renderTextLayer 的 appendText 方法中把原始宽度作为额外的样式参数添加进去,并在 TextLayerBuilder 的 render 方法中,紧接着下面这一行之后按位置对元素进行排序:
this.textLayerDiv.appendChild(textLayerFrag);
。
我在 Github 上提到的所有 PDF.js 改动似乎只需要 web 和 build 两个文件夹(Firefox 示例还需要另外执行 npm i -g datauri)。
使用 puppeteer 和稍作修改的 PDF.js 可以直接完成转换(有头(headful)和无头(headless)模式都可以,但两种模式下得到的元素尺寸略有不同):
// Embeds the PDF named by the first command-line argument into a copy of
// viewerSrc.js (written as viewer.js), loads viewer.html in a puppeteer
// browser and stores whatever the page passes to window.reader() in
// PDFtexts.txt.
const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const puppeteer = require(path.join(process.env.APPDATA, 'npm/node_modules', 'puppeteer'));
datauri(pdf, (err, content) => {
  if (err) {
    throw err;
  }
  const viewerJSpath = path.join(__dirname, './viewer');
  const wp = fs.readFileSync(viewerJSpath + 'Src.js', 'utf-8');
  // Name of the default document that gets replaced by the data URI.
  const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
  const srcStart = wp.indexOf(pdfName);
  if (srcStart < 0) {
    // Without this guard indexOf's -1 would silently produce a broken bundle.
    throw new Error(`"${pdfName}" not found in ${viewerJSpath}Src.js`);
  }
  const srcEnd = srcStart + pdfName.length;
  const originsStart = wp.indexOf('HOSTED_VIEWER_ORIGINS');
  if (originsStart < 0) {
    throw new Error(`HOSTED_VIEWER_ORIGINS not found in ${viewerJSpath}Src.js`);
  }
  // Position of the "]" that closes the origins list.
  const originsEnd = wp.indexOf(']', originsStart);
  if (originsEnd < srcEnd) {
    // The splice below requires the list to end after the PDF placeholder.
    throw new Error(`unexpected layout of ${viewerJSpath}Src.js: origins list does not end after the PDF placeholder`);
  }
  const patched = wp.slice(0, srcStart) + content +
    wp.slice(srcEnd, originsEnd) + ', "file://"' +
    wp.slice(originsEnd);
  fs.writeFileSync(viewerJSpath + '.js', patched, 'utf-8');
  (async () => {
    const browser = await puppeteer.launch({
      // headless: false
    });
    // Set once the page has delivered a result; the browser is then about to
    // be closed on purpose, so a navigation error after that is expected.
    let resultWritten = false;
    try {
      const page = await browser.pages();
      // Register the callback BEFORE navigating (and wait for it), otherwise
      // the page may call window.reader() before the function exists.
      await page[0].exposeFunction('reader', (elLists) => {
        fs.writeFileSync(path.join(__dirname, 'PDFtexts.txt'), JSON.stringify(elLists, null, 4));
        resultWritten = true;
        // Short delay kept from the original: a later reader() call arriving
        // within it can still overwrite the file before the browser closes.
        setTimeout(() => {
          browser.close().catch((closeErr) => console.error(closeErr));
        }, 100);
      });
      const c = path.join(__dirname, 'viewer.html');
      await page[0].goto('file:///' + c);
    } catch (runErr) {
      if (resultWritten) {
        return;
      }
      // Do not leave a stray browser process behind on failure.
      await browser.close();
      throw runErr;
    }
  })().catch((runErr) => {
    console.error(runErr);
    process.exitCode = 1;
  });
});
puppeteer/chromium 所需的修复:
const message = exception?.message; // => exception.message
page: this.pageLabel ?? this.id // => this.pageLabel || this.id
viewer.js => viewerSrc.js 的主要新增内容:
function webViewerPageRendered({
...
if (pageNumber < PDFViewerApplication.pagesCount) {
arguments[0].source.eventBus.dispatch("pagenumberchanged", {
value: pageNumber + 1
}); // generate all remaining pages
}
}
class BaseViewer {
constructor(options) {
this.pageNo = []; // rendered pages array
...
_setCurrentPageNumber(val, resetCurrentPageView = false) {
...
if (this.pageNo.indexOf(val) < 0) {
this.pageNo.push(val);
}
if (this.pagesCount - 1 <= this.pageNo.length) {
window.reader(elLists); // sent result back 2 node.js
}
结果的结构形如 {PageNo:{ElNo:{data}, ...}, ...},可以很容易地转换为网页,或做进一步处理。
{
"1": {
"0": {
"x": 99.9871,
"y": 98.0496,
"w": 557.695,
"h": 22,
"text": "Trace-based Just-in-Time Type Specialization for Dynamic",
"ff": "sans-serif",
"fs": "22.2695px",
"cssText": "left: 99.9871px; top: 98.0496px; width: 557.695px; font-size: 22.2695px; font-family: sans-serif; transform: scaleX(0.970163);"
},
"1": {
"x": 327.478,
"y": 122.793,
"w": 102.707,
"h": 22,
"text": "Languages",
"ff": "sans-serif",
"fs": "22.2695px",
"cssText": "left: 327.478px; top: 122.793px; width: 102.707px; font-size: 22.2695px; font-family: sans-serif; transform: scaleX(0.932262);"
},
...
"2": {
"0": {
"x": 393.677,
"y": 90.3408,
"w": 192.909,
"h": 11,
"text": "1 for (var i = 2; i < 100; ++i) {",
"ff": "monospace",
"fs": "11.1347px",
"cssText": "left: 393.677px; top: 90.3408px; width: 192.909px; font-size: 11.1347px; font-family: monospace; transform: scaleX(0.875232);"
},
"1": {
"x": 67.0588,
"y": 91.7599,
"w": 173.346,
"h": 11,
"text": "Hence, recording and compiling a trace",
"ff": "sans-serif",
"fs": "11.1347px",
"cssText": "left: 67.0588px; top: 91.7599px; width: 173.346px; font-size: 11.1347px; font-family: sans-serif; transform: scaleX(0.895175);"
},
变更摘要(在原始 gh-pages 分支中):
- PDF.js 中的更改
function appendText(task, geom, styles) {
let left, top;
+++ , width;
if (angle === 0) {
left = tx[4];
top = tx[5] - fontAscent;
} else {
left = tx[4] + fontAscent * Math.sin(angle);
top = tx[5] - fontAscent * Math.cos(angle);
}
+++ width = geom.width * task._viewport.transform[0];
textDiv.style.left = `${left}px`;
textDiv.style.top = `${top}px`;
+++ textDiv.style.width = `${width}px`;
- 新的 nodeView.js
// nodeView.js: embeds the PDF named by the first command-line argument into
// the viewer bundle as a data URI, adds "file://" to the
// HOSTED_VIEWER_ORIGINS list, writes the result to viewer.js and opens the
// viewer through the shell.
const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const chp = require('child_process');
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const viewerJSpath = path.join(__dirname, './viewer');
// Use the callback form, like the other scripts in this project, so that
// `content` is the encoded data URI string rather than whatever datauri()
// returns when called without a callback.
datauri(pdf, (err, content) => {
  if (err) {
    throw err;
  }
  // NOTE(review): this reads "viewer" without an extension while the result
  // is written to "viewer.js"; the sibling scripts read viewer.js or
  // viewerSrc.js. Path kept as in the original; confirm the intended source.
  const wp = fs.readFileSync(viewerJSpath, 'utf-8');
  // Name of the default document that gets replaced by the data URI.
  const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
  const srcStart = wp.indexOf(pdfName);
  if (srcStart < 0) {
    // Without this guard indexOf's -1 would silently produce a broken bundle.
    throw new Error(`"${pdfName}" not found in ${viewerJSpath}`);
  }
  const srcEnd = srcStart + pdfName.length;
  const originsStart = wp.indexOf('HOSTED_VIEWER_ORIGINS');
  if (originsStart < 0) {
    throw new Error(`HOSTED_VIEWER_ORIGINS not found in ${viewerJSpath}`);
  }
  // Position of the "]" that closes the origins list.
  const originsEnd = wp.indexOf(']', originsStart);
  if (originsEnd < srcEnd) {
    // The splice below requires the list to end after the PDF placeholder.
    throw new Error(`unexpected layout of ${viewerJSpath}: origins list does not end after the PDF placeholder`);
  }
  const patched = wp.slice(0, srcStart) + content +
    wp.slice(srcEnd, originsEnd) + ', "file://"' +
    wp.slice(originsEnd);
  fs.writeFileSync(viewerJSpath + '.js', patched, 'utf-8');
  // NOTE(review): "viewer.ff" is kept from the original; presumably a file
  // associated with Firefox on the author's machine. Verify.
  const c = path.join(__dirname, 'viewer.ff');
  // Quote the path so that directories containing spaces still work.
  chp.execSync(`"${c}"`);
});
- 新的 openFF.bat:start node nodeView.js %1
- 新的 pdf2sortedMergedTexts.js
// pdf2sortedMergedTexts.js: embeds the PDF named by the first command-line
// argument into a copy of viewerSrc.js (written as viewer.js), loads
// viewer.html in a puppeteer browser and stores whatever the page passes to
// window.reader() in PDFtexts.txt.
const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const puppeteer = require(path.join(process.env.APPDATA, 'npm/node_modules', 'puppeteer'));
datauri(pdf, (err, content) => {
  if (err) {
    throw err;
  }
  const viewerJSpath = path.join(__dirname, './viewer');
  const wp = fs.readFileSync(viewerJSpath + 'Src.js', 'utf-8');
  // Name of the default document that gets replaced by the data URI.
  const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
  const srcStart = wp.indexOf(pdfName);
  if (srcStart < 0) {
    // Without this guard indexOf's -1 would silently produce a broken bundle.
    throw new Error(`"${pdfName}" not found in ${viewerJSpath}Src.js`);
  }
  const srcEnd = srcStart + pdfName.length;
  const originsStart = wp.indexOf('HOSTED_VIEWER_ORIGINS');
  if (originsStart < 0) {
    throw new Error(`HOSTED_VIEWER_ORIGINS not found in ${viewerJSpath}Src.js`);
  }
  // Position of the "]" that closes the origins list.
  const originsEnd = wp.indexOf(']', originsStart);
  if (originsEnd < srcEnd) {
    // The splice below requires the list to end after the PDF placeholder.
    throw new Error(`unexpected layout of ${viewerJSpath}Src.js: origins list does not end after the PDF placeholder`);
  }
  const patched = wp.slice(0, srcStart) + content +
    wp.slice(srcEnd, originsEnd) + ', "file://"' +
    wp.slice(originsEnd);
  fs.writeFileSync(viewerJSpath + '.js', patched, 'utf-8');
  (async () => {
    const browser = await puppeteer.launch({
      // headless: false
    });
    // Set once the page has delivered a result; the browser is then about to
    // be closed on purpose, so a navigation error after that is expected.
    let resultWritten = false;
    try {
      const page = await browser.pages();
      // Register the callback BEFORE navigating (and wait for it), otherwise
      // the page may call window.reader() before the function exists.
      await page[0].exposeFunction('reader', (elLists) => {
        fs.writeFileSync(path.join(__dirname, 'PDFtexts.txt'), JSON.stringify(elLists, null, 4));
        resultWritten = true;
        // Short delay kept from the original: a later reader() call arriving
        // within it can still overwrite the file before the browser closes.
        setTimeout(() => {
          browser.close().catch((closeErr) => console.error(closeErr));
        }, 100);
      });
      const c = path.join(__dirname, 'viewer.html');
      await page[0].goto('file:///' + c);
    } catch (runErr) {
      if (resultWritten) {
        return;
      }
      // Do not leave a stray browser process behind on failure.
      await browser.close();
      throw runErr;
    }
  })().catch((runErr) => {
    console.error(runErr);
    process.exitCode = 1;
  });
});
- 更改了viewer.js -> viewerSrc.js
function webViewerPageRendered({
...
if (pageNumber < PDFViewerApplication.pagesCount) {
arguments[0].source.eventBus.dispatch("pagenumberchanged", {
value: pageNumber + 1
});
}
}
...
class BaseViewer {
constructor(options) {
+++ this.pageNo = [];
...
_setCurrentPageNumber(val, resetCurrentPageView = false) {
...
+++ if (this.pageNo.indexOf(val) < 0) {
+++ this.pageNo.push(val);
+++ console.log(this.pageNo);
+++ }
+++ if (this.pagesCount - 1 <= this.pageNo.length) {
+++ window.reader(elLists);
+++ }
this._currentPageNumber = val;
render(timeout = 0) {
...
this.textLayerRenderTask.promise.then(() => {
this.textLayerDiv.appendChild(textLayerFrag);
+++ this.reorder(this.textLayerDiv);
... new
reorder(_src) {
const src = _src.children;
let els = [];
const elDest = [];
for (let j = 0; j < src.length; j++) {
const i = src[j];
if (i.className === 'endOfContent') continue;
els.push({ x: parseFloat(i.style.left), y: parseFloat(i.style.top), w: parseFloat(i.style.width), h: i.offsetHeight, text: i.innerText, ff: i.style.fontFamily, fs: i.style.fontSize, cssText: i.style.cssText });
}
els.sort((a, b) => {
if (Math.abs(a.y - b.y) <= 1) {
if (Math.abs(a.x - b.x) <= 1) return 0;
else return a.x - b.x;
} else return a.y - b.y;
});
let elMin = els[0];
for (let i = 1; i < els.length; i++) {
if (elMin.x + elMin.w + 1 >= els[i].x &&
Math.abs(elMin.y - els[i].y) < 1 &&
elMin.h === els[i].h &&
elMin.ff === els[i].ff &&
elMin.fs === els[i].fs) {
elMin.text += els[i].text;
elMin.w = els[i].x + els[i].w - elMin.x;
if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
continue;
}
if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
elMin = els[i];
}
if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
els = _src;
while (els.lastChild) els.removeChild(els.lastChild);
const elList = [];
if (window.elLists === undefined) window.elLists = {};
const uqIdx = { x: [], y: [] };
for (let i = 0; i < elDest.length; i++) {
const o = document.createElement('DIV');
o.innerHTML = elDest[i].text;
o.setAttribute('style', elDest[i].cssText + 'width:' + elDest[i].w + 'px;position:absolute;');
els.appendChild(o);
elList.push([elDest[i].x, elDest[i].x + elDest[i].w, o, elDest[i].y, elDest[i].y + elDest[i].h, elDest[i].text]);
if (uqIdx.x.indexOf(elDest[i].x) < 0) uqIdx.x.push(elDest[i].x);
if (uqIdx.y.indexOf(elDest[i].y) < 0) uqIdx.y.push(elDest[i].y);
}
elLists[_src.parentElement.getAttribute("data-page-Number")] = Object.assign({}, elDest);
}
- 更改了 viewer.css
+++ input{padding:0px;border:1px solid #e0e0e0;}input:focus{background-color:red;}