我想用python3编写一个函数,从有235页、大小为13.6mb的阿拉伯语pdf文件中提取数据,重点是从第51页到第67页(含)提取数据,然后根据特定输入过滤提取的数据,然后导出最终过滤后的数据数据卓越。
现在的问题是 Excel 文件是空的,当我尝试调查时,我发现“df”没有以阿拉伯语形式正确打印,而是以相反的相当混乱的形状打印。
这是我迄今为止编写的代码...
from PyPDF2 import PdfReader
import pandas as pd
from pandas import ExcelWriter
from io import StringIO
from pdfminer.high_level import extract_text
from pathlib import Path
import fitz
import arabic_reshaper
from bidi.algorithm import get_display
def reshape_text(text):
reshaped_text = arabic_reshaper.reshape(text)
bidi_text = get_display(reshaped_text)
return bidi_text
def flat_filter():
# try:
print("Starting PDF processing...")
pdf_path = Path(r"C:\Users\Documents\Personal\Python\pd\char.pdf")
if not pdf_path.exists():
raise FileNotFoundError(f"Error: PDF file not found at path: {pdf_path}")
start_page = 51
end_page = 67
with open(pdf_path, "rb") as pdf_file:
chunks = [] #store the upcoming dfs
for page_num in range(start_page, end_page + 1):
# page = doc.load_page(page_num)
# text = page.get_text("text")
# page = pdf_reader.pages[page_num]
# text = page.extract_text()
text = extract_text(pdf_path, page_numbers=[page_num], maxpages=1, password="")
# print(text)
filtered_text = [line for line in text.splitlines() if
not any(word.lower() in line.lower() for word in ["a", "130", "الأرضي"])]
# print(filtered_text)
chunk_df = pd.DataFrame([line.split() for line in filtered_text])
chunks.append(chunk_df)
df = pd.concat(chunks, ignore_index=True)
# print("DataFrame shape:", df.shape)
# print("DataFrame columns:", df.columns)
#print(df)
columns_to_filter = [2, 4, 5, 7] #(0-based)
filter_conditions = [] #store the filter conditions
while True:
condition = input("Enter a filter condition for column {}:..".format(columns_to_filter[len(filter_conditions)]+1))
filter_conditions.append(condition)
choice = input("Do you want to add another filter condition? (yes/no): ")
if choice.lower() == 'no':
for col_index, condition in zip(columns_to_filter, filter_conditions):
filter_combined = " & ".join(filter_conditions)
df = df[(df.iloc[:, columns_to_filter].fillna('').apply(lambda x: x.str.contains(filter_combined)).all(axis=1))]
print("Exporting DataFrame to Excel...")
# Export DataFrame to Excel file
writer = pd.ExcelWriter("FLAT.xlsx", engine='openpyxl')
df = df.map(reshape_text)
df.to_excel(writer, sheet_name="Data", index=True)
writer.close()
print("Successfully exported filtered data to FLAT.xlsx!")
break
# except Exception as e:
# print(e)
flat_filter()
我需要提取的数据图像:
这应该可以帮助您开始。它将:-
接下来,您可以根据需要对数据进行排序/过滤。
阿拉伯文本的标题存在一些问题,因此您可能需要单独处理这些问题。您可以使用 df.rename() 方法来更新它们。我的假设是标题在提取的所有页面上都相同。
import pandas as pd
import fitz
pdf_file = r'arabic_doc.pdf'
list_of_list = []
doc = fitz.open(pdf_file)
for page in doc.pages(51, 67):
tabs = page.find_tables()
if tabs.tables:
list_of_list.append(tabs[0].extract())
df = pd.concat([pd.DataFrame(d) for d in list_of_list])
df.columns = df.iloc[0]
df = df.drop([0], axis=0)
df = df.rename_axis('page_index').reset_index()
df.head()