我有这个代码:
import os
import traceback
import pdfplumber
from docx import Document
def read_docx(file_path):
    """Return the text of a DOCX file, one paragraph per line.

    Args:
        file_path: Path of the .docx document to open.

    Returns:
        The text of every paragraph joined with newlines. If anything goes
        wrong, a string of the form "Error reading DOCX file: <reason>" is
        returned instead; no exception reaches the caller.
    """
    try:
        document = Document(file_path)
        return '\n'.join(para.text for para in document.paragraphs)
    except Exception as exc:
        # Failures are reported through the return value rather than raised.
        return f"Error reading DOCX file: {exc}"
def read_pdf(file_path):
    """Return the text extracted from every page of a PDF file.

    Args:
        file_path: Path of the PDF document to open.

    Returns:
        The extracted page texts concatenated in page order with no
        separator between pages. If anything goes wrong, a string of the
        form "Error reading PDF file: <reason>" is returned instead; no
        exception reaches the caller.
    """
    try:
        with pdfplumber.open(file_path) as document:
            # A page whose extraction result is falsy contributes nothing.
            return ''.join(
                page.extract_text() or '' for page in document.pages
            )
    except Exception as exc:
        # Failures are reported through the return value rather than raised.
        return f"Error reading PDF file: {exc}"
def read_txt(file_path):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents. A leading UTF-8 byte-order mark, if
        present, is not included in the result. If anything goes wrong, a
        string of the form "Error reading TXT file: <reason>" is returned
        instead; no exception reaches the caller.
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also
        # drops a leading BOM, which would otherwise show up as '\ufeff'
        # at the start of the returned text.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return file.read()
    except Exception as e:
        # Failures are reported through the return value rather than raised,
        # matching the other read_* helpers.
        return f"Error reading TXT file: {e}"
def read_file(folder_path, patent_name=None, file_name=None):
    """Read a document located under ``folder_path`` and return its text.

    The path is built from ``folder_path`` followed by whichever of
    ``patent_name`` and ``file_name`` were supplied, in that order, and the
    file is dispatched to a reader based on its extension.

    Args:
        folder_path: Base directory (or the file path itself when neither
            optional argument is given).
        patent_name: Optional sub-directory (or file) below ``folder_path``.
        file_name: Optional file name below ``patent_name``.

    Returns:
        The text returned by the matching reader, or the string
        "Skipping temporary file." when the final path component starts
        with '~$'.

    Raises:
        ValueError: If the extension is not .docx, .pdf or .txt, or if a
            .txt path was given without ``file_name``.
    """
    # Join only the components that were actually supplied. Passing
    # file_name without patent_name used to hand None to os.path.join and
    # fail with TypeError.
    components = [part for part in (patent_name, file_name) if part]
    file_path = os.path.join(folder_path, *components)

    # Names starting with '~$' are not real documents (for example the lock
    # files Microsoft Word leaves next to an open document), so they are
    # skipped before any reader tries to parse them.
    if os.path.basename(file_path).startswith('~$'):
        return "Skipping temporary file."

    # Compare extensions case-insensitively so 'REPORT.DOCX' is accepted.
    lowered = file_path.lower()
    if lowered.endswith('.docx'):
        return read_docx(file_path)
    if lowered.endswith('.pdf'):
        return read_pdf(file_path)
    # Plain-text files are only read when an explicit file_name was given.
    if lowered.endswith('.txt') and file_name:
        return read_txt(file_path)
    raise ValueError("Unsupported file format or missing file name for .txt file")
def save_text_to_file(folder_path, text, patent_name, file_name):
    """Write ``text`` to ``<folder_path>/<patent_name>/<file_name>`` as UTF-8.

    The target directory is created if it does not exist yet, and an
    existing file with the same name is overwritten.

    Args:
        folder_path: Base directory for the output.
        text: The string to write.
        patent_name: Sub-directory below ``folder_path``.
        file_name: Name of the file to create inside that sub-directory.

    Returns:
        None. Any error is printed together with its traceback instead of
        being raised.
    """
    try:
        target_dir = os.path.join(folder_path, patent_name)
        # exist_ok=True avoids the check-then-create race of testing
        # os.path.exists() first: the directory may appear in between.
        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, file_name), "w", encoding='utf-8') as f:
            f.write(text)
    except Exception as err:
        # Saving is best-effort: report the failure and carry on.
        print(err, traceback.format_exc())
由于某种原因,除了其中一个文件之外,我能够读取所有的 docx 文件。文件是根据 docID 读取的。以下是 docID 目录的结构:
Mode LastWriteTime Length Name
---- ------------- ------ ----
d----- 16/04/2024 18:00 custom_template
-a---- 16/04/2024 17:48 162129 input_disclosure.docx
Mode LastWriteTime Length Name
---- ------------- ------ ----
-a---- 16/04/2024 17:47 39928 input_customTemplate.docx
这是我收到的错误:
raise PackageNotFoundError("Package not found at '%s'" % pkg_file)
docx.opc.exceptions.PackageNotFoundError: Package not found at 'C:\Users\hp\Desktop\Projects\PatentGenie\temp\234\custom_template\~$put_customTemplate.docx'
注:
清除 MS Word 的临时文件和应用程序数据文件之后,代码可以正常工作了。出错的 `~$put_customTemplate.docx` 是 Word 打开文档时生成的临时锁文件,并不是有效的 docx 文件,因此无法被读取。