使用Python从Excel工作表中检索表格

Question

以下屏幕截图是文件“bills.xlsx”的外观。

我的目标是创建一个方法，该方法采用产品名称并检索包含该产品的所有账单。

这是我的代码：


from openpyxl import Workbook, load_workbook
# Load the workbook and walk its active sheet row by row, turning the cells of
# each row into a list of single-key dicts (`products`).
file_path = 'C:/Users/Mohamed Hamdi/Desktop/bills.xlsx'
# NOTE(review): `total_bills` is never appended to in the visible code.
total_bills = []
bills_workbook = load_workbook(file_path)
bills_sheet = bills_workbook.active

for row in range(1, bills_sheet.max_row + 1):
    # NOTE(review): `bill_info` is created for every row but never filled in or
    # stored in the visible code.
    bill_info = {'bill number': None, 'history': None, 'products': [], 'total price': None}
    # Rebuilt from scratch for every row.
    # NOTE(review): nothing visible here stores or prints this list before the
    # next row resets it, so after the loop `products` only holds the entries of
    # the last row -- confirm whether rows were meant to be accumulated.
    products = []

    # Only columns 1..6 of each row are inspected.
    for col in range(1, 7):

        cell_value = str(bills_sheet.cell(row=row, column=col).value)
        # str(None) is 'None', so empty cells are skipped here. This is a
        # substring test, so any cell whose text contains 'None' is skipped too.
        if 'None' not in cell_value:

            # Label cells: the value is read from the cell immediately to the
            # right of the label.
            if "Bill Number" in cell_value:
                products.append({'Bill Number': bills_sheet.cell(row=row, column=col + 1).value})

            elif "History" in cell_value:
                products.append({'History': bills_sheet.cell(row=row, column=col + 1).value})
            elif "Total price" in cell_value:
                products.append({'Total price': bills_sheet.cell(row=row, column=col + 1).value})
            # Reached only when column 6 is non-empty and holds none of the
            # labels above: the row is then read as a product line taken from
            # the fixed columns 3..6.
            elif col==6:

                products.append({'product name' : bills_sheet.cell(row=row, column=3).value}) 
                products.append({'count': bills_sheet.cell(row=row, column=4).value})
                products.append({'price': bills_sheet.cell(row=row, column=5).value})
                products.append({'total price': bills_sheet.cell(row=row, column=6).value})

def _item_has_value(item, wanted):
    """Return True when `item` is a dict holding `wanted` among its values."""
    return isinstance(item, dict) and wanted in item.values()


def find_rows_between_start_and_end(sheet_data, search_key, search_value, start_element, end_element):
    """Collect the start..end slice around every match of a key/value pair.

    Each element of `sheet_data` is treated as one sequence of items. For every
    dict item in a sequence with ``item[search_key] == search_value``:

    1. scan backwards from the match for the nearest item that has
       `start_element` among its dict *values*;
    2. from that start position, scan forwards for the first item that has
       `end_element` among its dict *values*;
    3. if both are found, add the inclusive slice start..end to the rows
       collected for that sequence.

    Items that are not dicts are never treated as a match or as a marker; they
    are skipped by the scans instead of raising ``AttributeError``.

    Args:
        sheet_data: iterable of indexable sequences (e.g. lists) of items.
        search_key: dict key to look up in each item.
        search_value: value that `search_key` must map to for a match.
        start_element: marker value that opens a range.
        end_element: marker value that closes a range.

    Returns:
        A list with one entry per sequence that contained at least one match.
        Each entry is the list of collected items, and is empty when no
        complete start..end range was found around the matches. A sequence
        with several matches contributes its range once per match.
    """
    result_rows_list = []

    for entries in sheet_data:
        found_search_value = False
        temp_result_rows = []

        for idx, item in enumerate(entries):
            if not (isinstance(item, dict) and search_key in item):
                continue
            if item[search_key] == search_value:
                found_search_value = True

                # Nearest start marker at or before the matching item.
                start_index = next(
                    (j for j in range(idx, -1, -1)
                     if _item_has_value(entries[j], start_element)),
                    None,
                )
                if start_index is None:
                    continue

                # First end marker at or after the start marker (it is looked
                # up from the start position, not from the match).
                end_index = next(
                    (k for k in range(start_index, len(entries))
                     if _item_has_value(entries[k], end_element)),
                    None,
                )
                if end_index is not None:
                    temp_result_rows.extend(entries[start_index:end_index + 1])

        # A sequence with a match is reported even if no range was extracted.
        if found_search_value:
            result_rows_list.append(temp_result_rows)

    return result_rows_list


# Search parameters: find items whose 'product name' equals 'product' and
# extract everything between the start and end markers around them.
search_key = "product name"
search_value = "product"
# NOTE(review): find_rows_between_start_and_end looks for these two markers
# among dict *values*; in the sample data shown in the question they appear as
# dict *keys* ('Bill number', 'Total price') -- confirm which is intended.
start_element = "Bill number"
end_element = "Total price"

# `products` is whatever the row-scanning loop above left bound to that name.
result_rows_list = find_rows_between_start_and_end(products, search_key, search_value, start_element, end_element)
# Print every collected item, one per line.
for result_rows in result_rows_list:
    for row in result_rows:
        print(row)

这就是数组 products 的样子：

[{'Bill number': 31}, {'Date': '2024-04-16 04:39:44'}]
[{'product name': 'product1'}, {'count': 1}, {'price': 10}, {'total price': 10}]
[{'product name': 'product1'}, {'count': 1}, {'price': 10}, {'total price': 10}]
[{'Total price': 20}]
[]
[{'Bill number': 32}, {'Date': '2024-04-16 04:41:51'}]
[{'product name': 'product'}, {'count': 1}, {'price': 10}, {'total price': 10}]
[{'product name': 'product'}, {'count': 1}, {'price': 10}, {'total price': 10}]
[{'product name': 'product'}, {'count': 1}, {'price': 10}, {'total price': 10}]
[{'Total price': 30}]
[]
[{'Bill number': 33}, {'Date': '2024-04-16 04:44:10'}]
[{'product name': 'product'}, {'count': 1}, {'price': 10}, {'total price': 10}]
[{'product name': 'product'}, {'count': 1}, {'price': 10}, {'total price': 10}]
[{'product name': 'product'}, {'count': 1}, {'price': 10}, {'total price': 10}]
[{'Total price': 30}]

当我尝试检索账单表格时，它要么不返回任何内容，要么返回不完整的数据。
如果您有能按我期望的方式直接检索数据的方法，请分享。

如果我搜索 product1 关键字，我期望的输出：

[{'Bill number': 31}, {'Date': '2024-04-16 04:39:44'}]
[{'product name': 'product1'}, {'count': 1}, {'price': 10}, {'total price': 10}]
[{'product name': 'product1'}, {'count': 1}, {'price': 10}, {'total price': 10}]
[{'Total price': 20}]

Answer 1

我已更新以包含“账单编号”31、32、33 的部分。为了使提取的数据更加清晰，我更改了某些行中的值以区分彼此。

假设产品名称是唯一的，您可以轻松使用 Pandas 提取产品“表”，使用该名称作为 Pandas 数据帧中的搜索词。

例如：搜索“product1”，获取匹配行的索引，然后将这些行连同其上方的“账单编号”行和下方的“总价格”行一起提取为数据框。

然后可以使用 Pandas 的 'to_excel' 将提取出的表格直接写入新的或现有的 Excel 工作表。

但是，如果您希望数据是使用特定键名称的字典列表，则可以按需处理数据框的每一行，并将其放入字典列表中。
输出的数据如下所示（请注意，部分值已被更改以便区分各行）。

Data List
[{'Bill Number': '31.0'}, {'Date': '2024-04-16 04:39:44'}]
[{'product name': 'product1'}, {'count': '1'}, {'price': '10.0'}, {'total price': '20.0'}]
[{'product name': 'product1'}, {'count': '2'}, {'price': '20.0'}, {'total price': '40.0'}]
[{'Total price': '20'}]

代码示例

import pandas as pd

def search_df_table(xl_file, sheet, search):
    """Extract the rows for `search` from an Excel sheet as a DataFrame.

    The sheet is read with ``pd.read_excel``. Rows whose 'History' column
    equals `search` are located, and the span from one row above the first
    match to one row below the last match is extracted. The first row of that
    span is promoted to the column header and dropped from the data.

    Args:
        xl_file: path (or other input accepted by ``pd.read_excel``) of the
            workbook.
        sheet: sheet name passed to ``pd.read_excel`` as ``sheet_name``.
        search: value to look for in the 'History' column.

    Returns:
        DataFrame holding the matching rows plus the row below the last match,
        with the row above the first match used as column labels.

    Raises:
        IndexError: if `search` does not occur in the 'History' column, or if
            the first match is the first data row so there is no row above it
            to promote to the header.
    """
    dframe = pd.read_excel(xl_file, sheet_name=sheet)

    ### Extract indexes where the search term is in the column 'History'
    idx_list = dframe.index[dframe['History'] == search].tolist()
    if not idx_list:
        ### Fail with a clear message instead of "list index out of range"
        raise IndexError(f"search term {search!r} not found in column 'History'")

    first_idx, last_idx = idx_list[0], idx_list[-1]
    if first_idx == 0:
        ### first_idx - 1 would be -1, which iloc treats as "last row"
        raise IndexError(
            f"search term {search!r} matches the first data row; "
            "there is no row above it to use as the header"
        )

    ### Also get the rows 1 above and below for Header and Total Price row
    dframe = dframe.iloc[first_idx - 1: last_idx + 2]

    ### Promote the first extracted row to the Header
    dframe.columns = dframe.iloc[0]
    dframe = dframe[1:]

    return dframe


def format_dataframe(dframe):
    """Convert an extracted bill DataFrame into a list of rows of one-key dicts.

    The frame is rendered to CSV text (header included, index excluded) and
    each line is processed as follows:

    * a line containing 'Bill Number' is read as alternating key, value fields;
      the key 'History' is renamed to 'Date';
    * a line containing 'Total price' becomes a single ``{label: value}`` dict;
    * any other line is read as a product row and its fields are paired with
      the keys 'product name', 'count', 'price', 'total price' in order.

    All values are strings because they come from the CSV text. Leading and
    trailing empty fields are stripped from each line; a value lost that way
    is reported as ``''`` instead of raising ``IndexError``. Lines left empty
    after stripping are skipped. Fields are split on plain commas, so values
    that contain a comma are not supported.

    Args:
        dframe: DataFrame whose column labels form the 'Bill Number' line.

    Returns:
        List of rows, each a list of single-key dicts.
    """
    ### List of keys for the product name rows
    product_keys = ['product name', 'count', 'price', 'total price']

    def _pad(values, size):
        ### Restore trailing empty fields that strip() removed
        return values + [''] * (size - len(values))

    ### List to hold the extracted table
    data_list = []

    ### Convert the extracted dataframe to comma separated lines
    csv_lines = dframe.to_csv(header=True, index=False).strip('\n').split('\n')

    for line in csv_lines:
        ### strip() takes a set of characters: removes leading/trailing
        ### ',', '|' and '\r'
        stripped = line.strip(",|\r")
        if not stripped:
            continue  # Nothing but empty fields on this line
        fields = stripped.split(",")

        if 'Bill Number' in line:
            ### Even positions are keys, odd positions are their values
            keys = fields[::2]
            values = _pad(fields[1::2], len(keys))
            data_list.append([
                {('Date' if key == 'History' else key): value}  # Change name to 'Date' if 'History'
                for key, value in zip(keys, values)
            ])
        elif 'Total price' in line:
            label, total = _pad(fields, 2)[:2]
            data_list.append([{label: total}])
        else:
            ### zip() stops after the four product keys, extra fields are ignored
            padded = _pad(fields, len(product_keys))
            data_list.append([{key: value} for key, value in zip(product_keys, padded)])
    return data_list


### Inputs: workbook, sheet name and the product name to look for
file_name = 'bills.xlsx'
worksheet = 'Sheet1'
search_term = 'product1'

### Extract the table for the search term as a dataframe
df = search_df_table(file_name, worksheet, search_term)
### The extracted dataframe can be written to Excel Sheet as is
print("\nWrite data to Excel sheet")
### Writes 'new.xlsx' with the header row and without the index column
df.to_excel('new.xlsx', index=False, header=True)

### Or convert data into a list of dictionaries and print one row per line
print("Data List")
for row in format_dataframe(df):
    print(row)

使用Python从Excel工作表中检索表格

问题描述投票：0回答：1

1个回答

最新问题

使用Python从Excel工作表中检索表格

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1