我正在使用 Camelot PDF 解析库从 PDF 文件中提取数据,但当 PDF 同时包含表格数据和非表格数据时,我遇到了问题:Camelot 似乎只提取表格数据,而忽略了非表格内容。下面是我正在使用的代码片段:
from langchain.document_loaders.csv_loader import CSVLoader
import camelot
import uuid
from camelot.core import TableList
def export_tables_as_csv(filepath):
    """Extract the tables Camelot finds in a PDF and save each one as CSV.

    Each table is written to ``table_<n>.csv`` in the current working
    directory, where ``<n>`` is the 1-based position of the table in the
    result returned by ``camelot.read_pdf``.

    Args:
        filepath: Path of the PDF file handed to ``camelot.read_pdf``.
    """
    tables = camelot.read_pdf(filepath, backend="ghostscript")
    for index, table in enumerate(tables, start=1):
        # Write only the current table. Calling ``tables.export`` here would
        # re-export the entire table list on every iteration.
        table.to_csv(f'table_{index}.csv')
def generate_random_filename():
    """Return a newly generated random UUID (version 4) as a string."""
    random_id = uuid.uuid4()
    return str(random_id)
from collections import namedtuple
# Lightweight record pairing a piece of text with a metadata mapping.
Document = namedtuple(
    typename='Document',
    field_names=('page_content', 'metadata'),
)
def formChunksForTable(filepath=None, file_type=None, url=None):
    """Turn every row of every table found in a PDF into a ``Document``.

    Tables are read with ``camelot.read_pdf`` using the ``stream`` flavor.
    Each row of a non-empty table becomes one ``Document`` whose
    ``page_content`` is the row's cells joined by single spaces and whose
    ``metadata['source']`` has the form
    ``table-page-<table number>-row-<row number>`` (both 1-based).

    Args:
        filepath: Path of the PDF to read. When missing or empty, an error
            message is printed and an empty list is returned.
        file_type: Not used by this function; kept for existing callers.
        url: Not used by this function; kept for existing callers.

    Returns:
        A list of ``Document`` tuples. The list is empty when the path is
        missing, when no rows could be extracted, or when any exception was
        raised (the exception is printed rather than propagated).
    """
    try:
        if not filepath:
            print("Error: Filepath is missing.")
            return []

        all_docs = []
        tables = camelot.read_pdf(filepath, backend="ghostscript", flavor='stream')

        if not isinstance(tables, TableList):
            print("No tables found in the PDF.")
            return all_docs

        for i, table in enumerate(tables):
            if table.df is None or table.df.empty:
                print(f"Warning: Table {i+1} is empty.")
                continue
            for row_idx, row in enumerate(table.df.values):
                # Coerce every cell to str: a single non-string cell would
                # make ``str.join`` raise TypeError, and the broad handler
                # below would then discard all documents collected so far.
                page_content = ' '.join(str(cell) for cell in row)
                # NOTE(review): the number after "table-page-" is the table's
                # position in the result, not a PDF page number. The format
                # is kept as-is so existing consumers are not broken.
                metadata = {'source': f'table-page-{i+1}-row-{row_idx+1}'}
                all_docs.append(Document(page_content, metadata))

        if all_docs:
            print("Documents:", all_docs)
        else:
            print("No valid tables found in the PDF.")
        return all_docs
    except Exception as e:
        # Best-effort boundary: report the problem and return no documents.
        print(f"Error: {e}")
        return []
如果您的问题是“Camelot 是否会从 PDF 中提取表格以外的内容?”,那么答案是“不会”。Camelot 仅从 PDF 中提取表格。
在其主页上,项目标题就是“Camelot: PDF Table Extraction for Humans”(Camelot:为人类设计的 PDF 表格提取)。