我想将许多PDF转换为文本文件。格式非常重要,只有Adobe Reader才能正确显示格式(PDFMiner或PyPDF2格式不正确。)
是否有一种方法可以从Adobe Reader中自动执行“导出为文本”功能?
以下代码将对一个文件执行所需的操作。我建议将脚本组织成几个小功能,然后循环调用这些功能以处理许多文件。您需要使用keyboard
或其他工具安装pip
库。
import pathlib as pl
import os
import keyboard
import time
import io
KILL_KEY = 'esc'
read_path = pl.Path("C:/Users/Sam/Downloads/WS-1401-IP.pdf")
####################################################################
write_path = pl.Path(str(read_path.parent/read_path.stem) + ".txt")
overwrite_file = os.path.exists(write_path)
# alt -- activate keyboard shortcuts
# `F` -- open file menu
# `v` -- select "save as text" option
# keyboard.write(write_path)
# `alt+s` -- save button
# `ctrl+w` -- close file
os.startfile(read_path)
time.sleep(1)
keyboard.press_and_release('alt')
time.sleep(1)
keyboard.press_and_release('f') # -- open file menu
time.sleep(1)
keyboard.press_and_release('v') # -- select "save as text" option
time.sleep(1)
keyboard.write(str(write_path))
time.sleep(1)
keyboard.press_and_release('alt+s')
time.sleep(2)
if overwrite_file:
keyboard.press_and_release('y')
# wait for program to finish saving
waited_too_long = True
for _ in range(5):
time.sleep(1)
if os.path.exists(write_path):
waited_too_long = False
break
if waited_too_long:
with io.StringIO() as ss:
print(
"program probably saved to somewhere other than",
write_path,
file = ss
)
msg = ss.getvalue()
raise ValueError(msg)
keyboard.press_and_release('ctrl+w') # close the file