我创建了一个函数,它将打开目录中的每个文件,并从每个文件中提取文本,并使用Pandas将其输出到Excel工作表中。每个文件类型的索引似乎工作正常。但是当文本从路径目录中的第一个文件中提取时,它似乎是用第一个文件的提取文本替换其他文件中的其他提取文本。请帮帮忙,谢谢!
from pathlib import Path
import shutil
from datetime import datetime
import time
from configparser import ConfigParser
import glob
import fileinput
import pandas as pd
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import docx2txt
from pptx import Presentation
p = Path('C:/Users/XXXX/Desktop/test_folder')
txt_files = list(p.rglob('*txt'))
PDF_files = list(p.rglob('*pdf'))
csv_files = list(p.rglob('*csv'))
docx_files = list(p.rglob('*docx'))
pptx_files = list(p.rglob('*pptx'))
def loader(path):
with open(str(path.resolve()),"r",encoding = "ISO-8859-1") as f:
docx_out,pptx_out = [],[]
data = []
print(pptx_files)
if path.suffix == ".pdf":
for name1 in PDF_files:
data.append(pdf_to_text(name1))
return data
elif path.suffix == ".docx":
for name2 in docx_files:
docx_out = (docx2txt.process(name2))
return docx_out
elif path.suffix == ".pptx":
for file in pptx_files:
prs = Presentation(file)
for slide in prs.slides:
for shape in slide.shapes:
if not shape.has_text_frame:
continue
for paragraph in shape.text_frame.paragraphs:
for run in paragraph.runs:
pptx_out.append(run.text)
return pptx_out
else:
return f.readlines()
文本内容文件名这是测试first_pdf.pdf
这个块
if path.suffix == ".pdf":
for name1 in PDF_files:
data.append(pdf_to_text(name1))
return data
在附加第一个PDF文件后从函数返回。它永远不会到达第二个因为你从for
循环内部返回。这应该解决它:
if path.suffix == ".pdf":
for name1 in PDF_files:
data.append(pdf_to_text(name1))
return data