使用 python-docx 以特定格式在 word 文档中填充表格

问题描述 投票:0回答:0

上下文:我正在编写代码来为我的工作创建一个土壤废物分类表。该表使用 excel 文件中的标准进行各种分类,并从 excel 文件中读取实验室结果。这有三种情况

  1. 纯土的结果。我已经有一个适用于此的功能。
  2. 同一个excel文件中有土壤结果和浸出测试结果(如果我们同时订购测试,有时不需要)。这是我现在正在尝试的功能。
  3. 土壤结果在一个 excel 文件中,然后浸出结果在另一个 excel 文件中(如果由于土壤结果超出标准,我们最终不得不对原始样品进行浸出测试)。这就是我接下来要做的,但我希望一旦我解决了场景 2 中的当前问题,就会很简单。

我已经从场景 1 的函数中复制了代码,并尝试修改它以实现我想要的。问题是标准值和结果值现在没有填充 word 文档中的表格。还有关于使代码更简洁/更简洁的任何建议。我以前只学过一点 python,现在正尝试使用它来加快工作流程。

样本数量、分析物数量以及每次使用的分析物可能会有所不同,因此我需要让代码能够应用于不同的情况。

def same_file_waste_class(lab_results_file):
    lab_results_file = file_path_entry.get()
    if lab_results_file:
        envirolab_results = pd.read_excel(
            lab_results_file, sheet_name="Sheet1", header=None, index_col=None
        )

        # Drop TRIPLICATE row(s)
        mask = (
            envirolab_results.iloc[:, 2]
            .astype(str)
            .str.contains(r"\[TRIPLICATE\]", case=False, regex=True)
        )
        mask = mask.fillna(False)  # Replace NaN values with False
        envirolab_results1 = envirolab_results.loc[~mask]

        # Drop Replicate row(s) - where Replicate = 1
        envirolab_results2 = envirolab_results1[envirolab_results1.iloc[:, 4] != 1]

        # Drop unneeded columns
        columns_to_drop = [
            "Date extracted",
            "Date analysed",
            "Date prepared",
            "Reference",
            "Description",
            "Sample.1",
            "Type of sample",
            "Extraction fluid used",
        ]
        mask = envirolab_results2.iloc[0].isin(columns_to_drop)
        envirolab_results3 = envirolab_results2.drop(
            columns=envirolab_results2.columns[mask]
        )

        # Drop Row at position 1 (empty row) and 4 (Method)
        rows_to_drop = [1, 4]
        envirolab_results4 = envirolab_results3.drop(
            envirolab_results3.index[rows_to_drop]
        )

        # Rename first values in rows 1 and 2
        envirolab_results4.iat[1, 0] = "Units"
        envirolab_results4.iat[2, 0] = "PQL"

        # Drop columbs at position 1 (Sample No.) and 2 (Replicate)
        columns_to_drop = [1, 2]
        envirolab_results5 = envirolab_results4.drop(
            columns=envirolab_results4.columns[columns_to_drop]
        )

        # Replace unicode line break (_x000d_) with a space
        envirolab_results5.iloc[[0]] = envirolab_results5.iloc[[0]].applymap(
            lambda x: x.replace("_x000d_", " ") if isinstance(x, str) else x
        )

        # Split into two dataframes, for soil results and leach results
        units_row = envirolab_results5.iloc[1]  # sets the location of the Units row
        leach_mask = units_row.isin(
            ["mg/L", "pH units"]
        )  # Create a boolean mask for leach test results
        soil_mask = ~leach_mask  # Create a boolean mask for soil results

        # Select columns for each DataFrame based on the masks
        soil_results = envirolab_results5.loc[:, soil_mask]
        leach_results = envirolab_results5.loc[:, leach_mask]
        sample_number_col = envirolab_results5.iloc[:, 0]  # set sample number column
        leach_results = pd.concat(
            [sample_number_col, leach_results], axis=1
        )  # add sample number column to leach results df

        soil_results = soil_results.reset_index(
            drop=True
        )  # reset dataframe index to a default integer index

        #### Change name of leach analytes to match soil results analytes
        leach_results.iloc[0] = leach_results.iloc[0].str.replace(" in TCLP", "")

        print(leach_results.head)
        print(soil_results.head)

        # Sum columns function
        def sum_columns(df, columns, new_column, row_start):
            df.loc[0, new_column] = new_column
            df.loc[1, new_column] = df.loc[1, df.columns[df.iloc[0] == columns[0]][0]]

            for i in range(row_start, len(df)):
                float_values = []
                lt_present = False

                for col_name in columns:
                    col = df.columns[df.iloc[0] == col_name][0]
                    value = df.loc[i, col]

                    if "<" in value:
                        lt_present = True
                        value = value.replace("<", "")

                    float_values.append(float(value))

                total = round(sum(float_values), 2)

                if lt_present:
                    df.loc[i, new_column] = f"<{total}"
                else:
                    df.loc[i, new_column] = total

        # Set the columns to add for Scheduled chemicals using the sum_columns function
        SC_columns_to_sum = [
            "Aldrin",
            "alpha-BHC",
            "1,2,4-trichlorobenzene",
            "HCB",
            "beta-BHC",
            "gamma-BHC",
            "Heptachlor",
            "delta-BHC",
            "Heptachlor Epoxide",
            "gamma-Chlordane",
            "alpha-chlordane",
            "pp-DDE",
            "Dieldrin",
            "Endrin",
            "pp-DDD",
            "Endrin Aldehyde",
            "pp-DDT",
        ]
        sum_columns(soil_results, SC_columns_to_sum, "Scheduled Chemicals", 2)

        # Set the columns to add for Endosulfans (total) using the sum_columns function
        ES_columns_to_sum = ["Endosulfan I", "Endosulfan II", "Endosulfan Sulphate"]
        sum_columns(soil_results, ES_columns_to_sum, "Endosulfans (total)", 2)

        ### Extract Data
        sample_numbers = soil_results.iloc[3:, 0].tolist()
        analyte_names = soil_results.iloc[0, 1:].tolist()
        results = soil_results.iloc[3:, 1:].values.tolist()

        # Load criteria sheet
        criteria_file = "Criterias.xlsx"
        criteria_df = pd.read_excel(criteria_file, sheet_name="Sheet1", index_col=None)

        # Print the criteria DataFrame to verify the data is read correctly
        # print(criteria_df)

        # print(soil_results.head(50))

        # Get the list of analyte names in the soil results
        soil_analyte_names = soil_results.iloc[0].tolist()

        # Get the list of analyte names in the criteria DataFrame
        criteria_analyte_names = criteria_df.index.tolist()

        # Find the common analytes between the two lists
        soil_common_analytes = list(
            set(soil_analyte_names).intersection(criteria_analyte_names)
        )
        soil_analyte_positions = {
            analyte: idx for idx, analyte in enumerate(criteria_analyte_names)
        }
        soil_common_analytes = sorted(soil_common_analytes, key=lambda x: soil_analyte_positions[x])

##################################################################### Added these just to easily find this spot again as this is where i started to change the previous working function

        # Get the list of analyte names in the soil results
        leach_analyte_names = leach_results.iloc[0].tolist()

        # Get the list of analyte names in the criteria DataFrame
        criteria_analyte_names = criteria_df.index.tolist()

        # Find the common analytes between the two lists
        leach_common_analytes = list(
            set(leach_analyte_names).intersection(criteria_analyte_names)
        )
        leach_analyte_positions = {
            analyte: idx for idx, analyte in enumerate(criteria_analyte_names)
        }
        leach_common_analytes = sorted(leach_common_analytes, key=lambda x: leach_analyte_positions[x])
#####################################################################

        # Calculate the number of rows and columns needed in the table
        num_samples = len(sample_numbers)
        num_analytes = len(soil_common_analytes)
        num_columns = 6 + 2 * num_samples
        num_rows = 6 + num_analytes

        def should_highlight_cell(result, criteria):
            # Ignore cells with a "-" sign
            if result == "-":
                return False
            if result.startswith("<"):
                result_value = float(result[1:])
            else:
                result_value = float(result)

            if isinstance(criteria, str) and criteria.startswith("<"):
                criteria_value = float(criteria[1:])
            else:
                criteria_value = float(criteria)

            return result_value > criteria_value

        def shade_cell(cell, hex_color):
            tcPr = cell._element.tcPr
            if tcPr is None:
                tcPr = OxmlElement("w:tcPr")
                cell._element.append(tcPr)

            shading_elm = OxmlElement("w:shd")
            shading_elm.set(qn("w:fill"), hex_color)
            tcPr.append(shading_elm)

        # Step 3: Create a Word document
        doc = Document()
        # Change the page orientation to landscape
        section = doc.sections[0]
        section.page_width, section.page_height = (
            section.page_height,
            section.page_width,
        )
        # create table
        table = doc.add_table(rows=num_rows, cols=num_columns)

        # Step 4: Populate the table
        # Add headers and static values
        table.cell(0, 5).text = "Sample Number"
        table.cell(1, 5).text = "Sample Type"
        table.cell(2, 5).text = "Sample Location"
        table.cell(3, 5).text = "Sample Depth from Surface (m)"
        table.cell(4, 0).text = "Units"
        table.cell(4, 1).text = "mg/kg"
        table.cell(4, 2).text = "mg/kg"
        table.cell(4, 3).text = "mg/kg"
        table.cell(4, 4).text = "mg/L"
        table.cell(4, 5).text = "mg/kg"
        table.cell(5, 5).text = "PQL (Without Leach TCLP Test)"
        table.cell(2, 1).text = "WITHOUT Leach (TCLP) Test"
        table.cell(2, 3).text = "WITH Leach (TCLP) Test"
        table.cell(3, 1).text = "General Solid Waste"
        table.cell(3, 2).text = "Restricted Solid Waste"
        table.cell(3, 3).text = "General Solid Waste"
        table.cell(5, 0).text = "ANALYTE"
        table.cell(5, 1).text = "CT1"
        table.cell(5, 2).text = "CT2"
        table.cell(5, 3).text = "SCC1"
        table.cell(5, 4).text = "TCLP1"

        # Add sample numbers
        for i, sample_number in enumerate(sample_numbers):
            table.cell(0, 6 + 2 * i).text = str(sample_number)  # Soil result column
            table.cell(0, 6 + 2 * i + 1).text = (
                str(sample_number) + " (Leach)"
            )  # Leach result column

        # Add units for the results columns
        for i in range(num_samples):
            table.cell(4, 6 + 2 * i).text = "mg/kg"  # Soil result column
            table.cell(4, 6 + 2 * i + 1).text = "mg/L"  # Leach result column

        # Add results and highlight cells if they exceed the CT1 criteria
        for row_idx, analyte in enumerate(soil_common_analytes):
            # Add analyte name
            table.cell(6 + row_idx, 0).text = analyte

            # Get the corresponding row in the soil_results DataFrame
            analyte_col_index_soil = soil_results.columns.get_loc(
                soil_results.columns[soil_results.iloc[0] == analyte][0]
            )
            analyte_col_index_leach = leach_results.columns.get_loc(
                leach_results.columns[leach_results.iloc[0] == analyte][0]
            )
            analyte_data_soil = soil_results.iloc[3:, analyte_col_index_soil]
            analyte_data_leach = leach_results.iloc[3:, analyte_col_index_leach]

            # Add criteria values
            for col_idx, col in enumerate(["CT1", "CT2", "SCC1", "TCLP1", "PQL"]):
                value = criteria_df.loc[analyte, col]
                table.cell(6 + row_idx, 1 + col_idx).text = str(value)

            # Add soil and leach results
            for i in range(num_samples):
                value_soil = analyte_data_soil.iloc[i]
                value_leach = analyte_data_leach.iloc[i]

                # Soil result
                result_cell_soil = table.cell(6 + row_idx, 6 + 2 * i)
                result_cell_soil.text = str(value_soil)

                # Leach result
                result_cell_leach = table.cell(6 + row_idx, 6 + 2 * i + 1)
                result_cell_leach.text = str(value_leach)

                ct1_value = criteria_df.loc[analyte, "CT1"]

                if should_highlight_cell(str(value_soil), str(ct1_value)):
                    shade_cell(result_cell_soil, "#FFAFAF")  # dull red colour

                if should_highlight_cell(str(value_leach), str(ct1_value)):
                    shade_cell(result_cell_leach, "#FFAFAF")  # dull red colour

        # Merge cells
        table.cell(2, 1).merge(table.cell(2, 2))
        table.cell(2, 3).merge(table.cell(2, 4))
        table.cell(3, 3).merge(table.cell(3, 4))

        # Apply the 'Table Grid' style
        table.style = "Table Grid"

        def set_bold(row, start_col, end_col):
            for i in range(start_col, end_col + 1):
                cell = row.cells[i]
                if len(cell.paragraphs) > 0:
                    paragraph = cell.paragraphs[0]
                    if len(paragraph.runs) > 0:
                        run = paragraph.runs[0]
                    else:
                        run = paragraph.add_run()
                    run.font.bold = True
                    run.font.name = "Arial"
                    run.font.size = Pt(8)
                else:
                    paragraph = cell.add_paragraph()
                    run = paragraph.add_run()
                    run.font.bold = True
                    run.font.name = "Arial"
                    run.font.size = Pt(8)

        # Shading cells
        shade_color_1 = "#BFBFBF"
        shade_color_2 = "#D9D9D9"

        # Shade cells from cell(0, 0) to cell(5, 5) and cell(4, 6) to the end of row 4
        for row_idx in range(6):
            for col_idx in range(6):
                shade_cell(table.cell(row_idx, col_idx), shade_color_1)
            if row_idx == 4:
                for col_idx in range(6, num_samples + 6):
                    shade_cell(table.cell(row_idx, col_idx), shade_color_1)

        # Shade cells containing the analyte names and the criteria values
        for row_idx in range(6, 6 + len(soil_common_analytes)):
            for col_idx in range(6):
                shade_cell(table.cell(row_idx, col_idx), shade_color_2)

        # Set text in rows 0 to 5 to Bold
        for row_idx in range(6):
            row = table.rows[row_idx]
            set_bold(row, 0, num_samples + 5)

        def set_font_and_align(cell, font_name, font_size):
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.font.name = font_name
                    run.font.size = Pt(font_size)
                    paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

            # Set vertical alignment
            tcPr = cell._element.tcPr
            if tcPr is None:
                tcPr = OxmlElement("w:tcPr")
                cell._element.append(tcPr)
            vAlign = OxmlElement("w:vAlign")
            vAlign.set(qn("w:val"), "center")
            tcPr.append(vAlign)

        # Apply font settings and alignment to all cells
        for row_idx in range(len(table.rows)):
            for col_idx in range(len(table.columns)):
                set_font_and_align(table.cell(row_idx, col_idx), "Arial", 8)

        def set_row_height(row, height_cm):
            height_twips = int(height_cm * 28.35 * 20)  # Convert cm to twips
            tr = row._tr
            trPr = tr.get_or_add_trPr()
            trHeight = OxmlElement("w:trHeight")
            trHeight.set(qn("w:val"), str(height_twips))
            trHeight.set(qn("w:hRule"), "atLeast")
            trPr.append(trHeight)

        def set_column_width(column, width_cm):
            width_twips = int(width_cm * 28.35 * 20)  # Convert cm to twips
            for cell in column.cells:
                cell.width = width_twips
                cell._element.tcPr.tcW.type = "dxa"
                cell._element.tcPr.tcW.w = width_twips

        # Set the width of column 6 (position 5) to 2.79 cm
        set_column_width(table.columns[5], 2.79)

        # Set the height for each row
        height_cm = 0.53
        for row in table.rows:
            set_row_height(row, height_cm)

        # Save the Word document
        doc.save("output_table.docx")
        os.system("start output_table.docx")

word doc中的表格除了criteria values, soil result values和leach result values外都正确输出了

python-docx
© www.soinside.com 2019 - 2024. All rights reserved.