我使用下面的函数来检查 Excel 中的隐藏行和列,在读取时忽略它们,并对数据框做一些基本的预处理。我用示例文件进行了测试:隐藏的列没有被读入,但隐藏的行仍然被读入了,而且我也没有收到任何与行对象有关的错误。
import os
import pandas as pd
from openpyxl import load_workbook
def hide_rows_cols(ws, row, col):
    """Tell whether the cell at (row, col) lies in a hidden row or column.

    Args:
        ws: Worksheet whose ``row_dimensions`` and ``column_dimensions``
            mappings are consulted.
        row: Key used to index ``ws.row_dimensions``.
        col: Key used to index ``ws.column_dimensions``.

    Returns:
        The row's ``hidden`` flag when it is truthy, otherwise the
        column's ``hidden`` flag.
    """
    # Both dimensions are looked up before combining, row first.
    row_flag = ws.row_dimensions[row].hidden
    col_flag = ws.column_dimensions[col].hidden
    if row_flag:
        return row_flag
    return col_flag
def clean_files(sample_files):
    """Rewrite each workbook without its hidden rows and hidden columns.

    For every path in ``sample_files`` the workbook is loaded with cached
    formula values, each sheet is reduced to its visible cells, fully empty
    rows/columns are dropped, and the result is written to ``/tmp/<name>``
    before being copied back over the source location on DBFS.

    Args:
        sample_files: Iterable of workbook paths (expected to start with
            ``/dbfs``, which is rewritten to the ``dbfs:`` scheme for the
            copy step).
    """
    # Function-scope import: the surrounding imports only bring in
    # ``load_workbook`` from openpyxl.
    from openpyxl.utils import column_index_from_string

    for f in sample_files:
        file_name = os.path.basename(f)
        tmp_path = "/tmp/" + file_name
        with pd.ExcelWriter(tmp_path, engine='xlsxwriter') as writer:
            wb = load_workbook(f, read_only=False, data_only=True)
            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]

                # Collect hidden columns once per sheet as 1-based indices.
                # column_dimensions is keyed by column letter, and a run of
                # adjacent hidden columns is stored as a single entry whose
                # min/max give the covered span, so expand that span.
                hidden_cols = set()
                for letter, dim in ws.column_dimensions.items():
                    if not dim.hidden:
                        continue
                    first = dim.min or column_index_from_string(letter)
                    last = max(dim.max or first, first)
                    hidden_cols.update(range(first, last + 1))

                data = []
                for idx, row in enumerate(ws.iter_rows(values_only=True), start=1):
                    if ws.row_dimensions[idx].hidden:
                        continue  # whole row is hidden
                    # Keep only the cells that sit in visible columns.
                    data.append([
                        value
                        for col_idx, value in enumerate(row, start=1)
                        if col_idx not in hidden_cols
                    ])

                df = pd.DataFrame(data)
                df.dropna(how='all', axis=1, inplace=True)  # Remove empty columns
                df.dropna(how='all', inplace=True)  # Remove empty rows
                df.reset_index(drop=True, inplace=True)  # Reset row indices
                df.to_excel(writer, sheet_name=sheet_name, index=False)

        # The writer must be closed (file flushed) before the copy.
        # NOTE(review): ``dbutils`` is not defined in this file; presumably
        # it is the global provided by the Databricks runtime -- confirm.
        dbfs_f = f.replace("/dbfs", "dbfs:")
        dbutils.fs.cp("file:/tmp/" + file_name, dbfs_f)
好问题!可能的解决方案如下:
我从一个看起来像这样的 xlsx 文件开始:
如您所见,第 3 行到第 21 行被隐藏,B 列也被隐藏。 然后我应用了以下函数(这是对您建议的解决方案的简单修改):
import os
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
def is_hidden(ws, row=None, col_letter=None):
    """Return True when the given row or column of *ws* is flagged hidden.

    Args:
        ws: Worksheet exposing ``row_dimensions`` (indexed by row number)
            and ``column_dimensions`` (indexed by column letter); each
            entry carries a ``hidden`` flag.
        row: Row number to test; a falsy value skips the row check.
        col_letter: Column letter such as ``"B"`` to test; a falsy value
            skips the column check.

    Returns:
        True if a requested dimension is hidden, otherwise False.
    """
    if row and ws.row_dimensions[row].hidden:
        return True
    if not col_letter:
        return False
    if ws.column_dimensions[col_letter].hidden:
        return True

    # openpyxl records a run of adjacent hidden columns (e.g. B:D) as ONE
    # entry keyed by the first letter, with min/max holding the span. A
    # direct lookup of "C" or "D" therefore reports "not hidden", so also
    # check whether the column falls inside any hidden span.
    col_number = 0
    for ch in col_letter.upper():
        col_number = col_number * 26 + (ord(ch) - ord("A") + 1)

    # Snapshot the values: indexing above may have inserted a default entry.
    for dim in list(ws.column_dimensions.values()):
        if not dim.hidden:
            continue
        first = getattr(dim, "min", None)
        last = getattr(dim, "max", None)
        if first is not None and last is not None and first <= col_number <= last:
            return True
    return False
def clean_files(sample_files, output_folder):
    """Write copies of the given workbooks with hidden rows/columns removed.

    Each workbook is loaded with cached formula values; for every sheet
    the cells in visible rows and visible columns are collected, fully
    empty rows and columns are dropped, and the result is written under
    the same sheet name to ``output_folder/<original file name>``. Sheets
    that end up with no data are not written.

    Args:
        sample_files: Iterable of paths to the workbooks to process.
        output_folder: Directory that receives the cleaned workbooks; it
            is created if it does not exist.
    """
    # Created once up front; exist_ok makes a separate existence check
    # unnecessary.
    os.makedirs(output_folder, exist_ok=True)

    for f in sample_files:
        output_file_path = os.path.join(output_folder, os.path.basename(f))
        with pd.ExcelWriter(output_file_path, engine='xlsxwriter') as writer:
            wb = load_workbook(f, read_only=False, data_only=True)
            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]
                last_col = ws.max_column

                # A set gives constant-time membership tests in the per-cell
                # filter below.
                visible_cols = {
                    col_idx
                    for col_idx in range(1, last_col + 1)
                    if not is_hidden(ws, col_letter=get_column_letter(col_idx))
                }

                # Row numbers are 1-based, matching row_dimensions keys.
                data = [
                    [
                        cell
                        for col_idx, cell in enumerate(row, start=1)
                        if col_idx in visible_cols
                    ]
                    for idx, row in enumerate(
                        ws.iter_rows(values_only=True, max_col=last_col), start=1
                    )
                    if not is_hidden(ws, row=idx)
                ]

                df = pd.DataFrame(data)
                if not df.empty:
                    df.dropna(how='all', axis=1, inplace=True)
                    df.dropna(how='all', inplace=True)
                    df.reset_index(drop=True, inplace=True)
                    df.to_excel(writer, sheet_name=sheet_name, index=False)
# Directory that will receive the cleaned copies.
output_folder = r"************\Documents\Cleaned Excels"
# Workbooks to process; the list may hold any number of paths.
sample_files = [r"************\Documents\hidden.xlsx"]
clean_files(sample_files, output_folder)
这给了我
您可以根据您的问题调整此解决方案。另请注意,您可以传入一个要处理的 xlsx 文件列表。