Python openpyxl:读取 Excel 文件时忽略隐藏行和隐藏列

问题描述 投票:0回答:1

我使用下面的函数来检查 Excel 中的隐藏行和列,并在读取时忽略它们并对数据框进行一些基本的预处理。我使用示例文件进行了测试,隐藏的列没有被读入,但行仍然可见。我也没有收到任何与行对象有关的错误。

import os
    import pandas as pd
    from openpyxl import load_workbook

def hide_rows_cols(ws, row, col):
    """Report whether the cell at (row, col) sits in a hidden row or column.

    Both the row dimension and the column dimension are looked up before the
    result is computed; the cell counts as hidden when either one is flagged.

    Args:
        ws: Worksheet exposing ``row_dimensions`` and ``column_dimensions``.
        row: Key used to index ``ws.row_dimensions``.
        col: Key used to index ``ws.column_dimensions``.

    Returns:
        The row's ``hidden`` flag if it is truthy, otherwise the column's.
    """
    hidden_flags = (
        ws.row_dimensions[row].hidden,
        ws.column_dimensions[col].hidden,
    )
    return hidden_flags[0] or hidden_flags[1]

def clean_files(sample_files):
    """Rewrite each workbook without hidden cells and copy it back over the source.

    For every path in ``sample_files`` a workbook with the same base name is
    written under ``/tmp/`` (one output sheet per input sheet), keeping only
    the rows for which ``hide_rows_cols`` reports nothing hidden. All-empty
    columns and rows are dropped before writing. The temporary file is then
    copied to the original location via ``dbutils.fs.cp``.

    Args:
        sample_files: Iterable of workbook paths; a ``/dbfs`` substring in a
            path is replaced by ``dbfs:`` to build the copy destination.
    """
    for f in sample_files:
        with pd.ExcelWriter("/tmp/" + os.path.basename(f), engine='xlsxwriter') as writer:
            wb = load_workbook(f, read_only=False, data_only=True)
            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]
                data = []
                # idx is the 1-based position of the row as yielded by iter_rows.
                for idx, row in enumerate(ws.iter_rows(values_only=True), start=1):
                    # A row is kept only if NO column in it is reported hidden, so a
                    # single hidden column would cause every row to be skipped.
                    # NOTE(review): the column key passed here is the 1-based column
                    # number as a string ("1", "2", ...); column_dimensions is
                    # presumably keyed by column letter ("A", "B", ...) - confirm.
                    if not any(hide_rows_cols(ws, row=idx, col=str(col_idx+1)) for col_idx, _ in enumerate(row)):
                        data.append(row)
                df = pd.DataFrame(data)
                df.dropna(how='all', axis=1, inplace=True)  # Remove empty columns
                df.dropna(how='all', inplace=True)  # Remove empty rows
                df.reset_index(drop=True, inplace=True)  # Reset row indices
                df.to_excel(writer, sheet_name=sheet_name, index=False)

        # The writer is closed at this point, so the temporary file is complete.
        # NOTE(review): `dbutils` is not defined in this snippet - presumably the
        # global provided by a Databricks notebook; verify before running elsewhere.
        dbfs_f = f.replace("/dbfs", "dbfs:")
        dbutils.fs.cp("file:/tmp/" + os.path.basename(f), dbfs_f)
python-3.x pandas openpyxl pandas.excelwriter
1个回答
0
投票

好问题!可能的解决方案如下:

我从一个看起来像这样的

xlsx
文件开始

如您所见,第 3 行到第 21 行被隐藏,B 列也被隐藏。 然后我应用了以下函数(这是对您建议的解决方案的简单修改):

import os
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter

def is_hidden(ws, row=None, col_letter=None):
    """Tell whether the given row and/or column of a worksheet is hidden.

    Either argument may be omitted (or falsy), in which case that dimension
    is not looked up at all. The row is checked first; the column is only
    consulted when the row check did not already answer True.

    Args:
        ws: Worksheet exposing ``row_dimensions`` and ``column_dimensions``.
        row: Row key for ``ws.row_dimensions``, or None to skip the row check.
        col_letter: Column key for ``ws.column_dimensions``, or None to skip
            the column check.

    Returns:
        True if a requested row or column is flagged hidden, else False.
    """
    row_is_hidden = bool(row) and bool(ws.row_dimensions[row].hidden)
    if row_is_hidden:
        return True
    return bool(col_letter) and bool(ws.column_dimensions[col_letter].hidden)

def clean_files(sample_files, output_folder):
    """Write copies of Excel workbooks with hidden rows and columns removed.

    For every path in ``sample_files`` a workbook with the same base name is
    written to ``output_folder``. Each sheet is copied without its hidden rows
    and hidden columns; columns and rows that are entirely empty afterwards
    are dropped as well. Sheets that end up with no data are not written.

    Args:
        sample_files: Iterable of paths to the workbooks to clean.
        output_folder: Directory that receives the cleaned copies; it is
            created if it does not exist.
    """
    # Loop-invariant, and exist_ok=True already makes a prior existence check
    # unnecessary.
    os.makedirs(output_folder, exist_ok=True)

    for f in sample_files:
        output_file_path = os.path.join(output_folder, os.path.basename(f))

        with pd.ExcelWriter(output_file_path, engine='xlsxwriter') as writer:
            wb = load_workbook(f, read_only=False, data_only=True)
            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]
                max_col = ws.max_column

                # 1-based indices of hidden columns. A set keeps the per-cell
                # membership test below cheap.
                hidden_cols = set()
                for col_idx in range(1, max_col + 1):
                    col_letter = get_column_letter(col_idx)
                    if is_hidden(ws, col_letter=col_letter):
                        # A run of adjacent hidden columns can be stored as one
                        # dimension keyed by its first letter and spanning
                        # min..max, so expand the whole range rather than
                        # trusting the per-letter lookup for the later columns.
                        dim = ws.column_dimensions[col_letter]
                        group_end = getattr(dim, "max", None) or col_idx
                        last = min(max(col_idx, group_end), max_col)
                        hidden_cols.update(range(col_idx, last + 1))

                data = [
                    [cell for col_idx, cell in enumerate(row, start=1) if col_idx not in hidden_cols]
                    for idx, row in enumerate(ws.iter_rows(values_only=True, max_col=max_col), start=1)
                    if not is_hidden(ws, row=idx)
                ]

                df = pd.DataFrame(data)
                if not df.empty:
                    df.dropna(how='all', axis=1, inplace=True)  # drop all-empty columns
                    df.dropna(how='all', inplace=True)  # drop all-empty rows
                    df.reset_index(drop=True, inplace=True)
                    df.to_excel(writer, sheet_name=sheet_name, index=False)


# Example invocation. The leading path components are masked with asterisks in
# this listing; substitute real locations before running.
# Workbooks to clean (a list, so several files can be processed in one call).
sample_files = [r"************\Documents\hidden.xlsx"]

# Folder that receives the cleaned copies.
output_folder = r"************\Documents\Cleaned Excels"

clean_files(sample_files, output_folder)

这给了我

您可以根据您的问题调整此解决方案。另请注意,您可以传递要处理的

xlsx
文件列表。

© www.soinside.com 2019 - 2024. All rights reserved.