PY - 比较多个 CSV 文件

问题描述 投票:0回答:2

我有几个 .csv 文件,其中一些文件比其他文件有更多的标题和更多的列。

我想将缺失的列填充到没有它的文件中

我试过这个:

import os
import csv

# Script: give every ";"-delimited CSV file next to this script the header of
# the file with the most columns, filling the columns a file lacks with "0".


def insert_missing_columns(row, missing_positions, fill_value="0"):
    """Insert ``fill_value`` into ``row`` at each 1-based target position.

    Positions are applied in ascending order. That order matters: every
    insert shifts the cells to its right, so inserting in arbitrary (e.g. set
    iteration) order puts the fill values at the wrong indices as soon as
    more than one column is missing.

    Args:
        row: List of cell values; modified in place.
        missing_positions: Iterable of 1-based positions (in the target
            header) of the columns that the row does not have.
        fill_value: Value stored in each newly created cell.

    Returns:
        The same ``row`` list, for convenience.
    """
    for position in sorted(missing_positions):
        row.insert(position - 1, fill_value)
    return row


# Get the directory where the Python script is located
script_directory = os.path.dirname(__file__)

# Define the folder containing the CSV files
folder_path = script_directory

# Get a list of CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

# Read the header of every file exactly once, closing each file afterwards.
# An empty file yields an empty header instead of raising StopIteration.
headers = {}
for file_name in csv_files:
    with open(os.path.join(folder_path, file_name), "r", newline="") as file:
        headers[file_name] = next(csv.reader(file, delimiter=";"), [])

# Find the file with the most columns (maximum header length); None when the
# folder contains no CSV files at all.
max_columns_file = max(headers, key=lambda name: len(headers[name]), default=None)

# Header of the file with the most columns
header_to_copy = headers.get(max_columns_file, [])

# Iterate through each CSV file and copy the header
for file_name in csv_files:
    file_path = os.path.join(folder_path, file_name)

    with open(file_path, "r", newline="") as file:
        data = list(csv.reader(file, delimiter=";"))

    # Nothing to align in a file without even a header row.
    if not data:
        print(f"Skipped empty file: {file_name}")
        continue

    # Update the header in the current file
    original_header = data[0]
    data[0] = header_to_copy

    # Print the differences between the original header and the copied header
    # (sorted so the report is deterministic).
    differences = set(original_header) ^ set(header_to_copy)
    print(f"Processed: {file_name}")
    print("Differences:")
    print("+----------------+----------------+")
    print("| Original Header | Copied Header  |")
    print("+----------------+----------------+")

    for item in sorted(differences):
        original_present = "Yes" if item in original_header else "No"
        copied_present = "Yes" if item in header_to_copy else "No"
        print(f"| {item:<16}| {original_present:<16}| {copied_present:<16}|")

    print("+----------------+----------------+")

    # Columns of the copied header that the original header lacks, as
    # (1-based position, name) pairs in ascending position order.
    original_columns = set(original_header)
    missing_columns = [
        (position, element)
        for position, element in enumerate(header_to_copy, start=1)
        if element not in original_columns
    ]
    for position, element in missing_columns:
        print(f"Element '{element}' is at position {position} in the header.")

    # Add a new column with the value "0" from the second row onwards at each
    # missing position.
    missing_positions = [position for position, _ in missing_columns]
    for row in data[1:]:
        insert_missing_columns(row, missing_positions)

    # Write the modified data back to the CSV file
    with open(file_path, "w", newline="") as file:
        writer = csv.writer(file, delimiter=";")
        writer.writerows(data)

下面是一个例子。我已经成功找到了要插入的位置,但在某些特定情况下它不起作用:

表1.csv

|a|b|c|d|e|f|g|
|-|-|-|-|-|-|---|
|1|2|3|5|6|7|888|

表2.csv

|a|b|c|c (c)|d|e|f|g|
|-|-|-|-----|-|-|-|---|
|1|2|3| 4 |5|6|7|888|

结果: 表1.csv

|a|b|c|c (c)|d|e|f|g|
|-|-|-|-----|-|-|-|---|
|1|2|3| 4 |5|6|7|888|

预期结果: 表1.csv

|a|b|c|c (c)|d|e|f|g|
|-|-|-|-----|-|-|-|---|
|1|2|3| 5 |0|6|7|888|

它不起作用,我不知道为什么,因为我使用相同的位置编号

python csv compare
2个回答
0
投票

我建议在记录文件名的同时,也构建一个标题列表,这样您就不必重新打开文件,并且可以直接用该列表来构建输出。再次遍历各个文件时,只需使用 `csv.DictReader` 而不是 `csv.reader`,然后用 `row.get(header, 0)`(缺失的列默认为 `0`)来构建每一行的元素列表,再把该行写入对应文件的输出。您甚至不需要为此保持“主”文件处于打开状态。

# Sketch: rewrite every CSV file so it has the full set of columns, filling
# the columns a file lacks with 0.
#
# HEADERS_FROM_FIRST_PASS, LIST_OF_FILES, MAX_HEADER_FILE_NAME and folder_path
# are placeholders for values produced by your first pass (the one that finds
# the file with the most headers).
headers_list = HEADERS_FROM_FIRST_PASS

for file in LIST_OF_FILES:
    # The file that supplied the full header already has every column.
    if file != MAX_HEADER_FILE_NAME:
        data = [headers_list]
        with open(os.path.join(folder_path, file), "r", newline="") as infile:
            # The files in the question are ";"-delimited.
            csv_file = csv.DictReader(infile, delimiter=";")
            for row in csv_file:
                # Build one output row per input row, in full-header order;
                # a column this file does not have defaults to 0.
                data.append([row.get(header, 0) for header in headers_list])
        # Now reopen the same file for writing and replace its contents.
        with open(os.path.join(folder_path, file), "w", newline="") as outfile:
            csv_writer = csv.writer(outfile, delimiter=";")
            csv_writer.writerows(data)

 

0
投票

有什么具体原因不使用 Pandas 吗?它会更快/更有效率。


import pandas as pd
import os

# Script: align the columns of every ";"-delimited CSV file next to this
# script by concatenating them with pandas, filling the gaps with 0 and
# writing each file back in place.

# get file path
folder_path = os.path.dirname(__file__)

# get list of files
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

# read each file once; collect the frames and concatenate a single time
# instead of growing a DataFrame inside the loop
frames = []
for file in csv_files:
    # `sep` is the read_csv keyword for the delimiter (there is no `delim`).
    # dtype=str keeps cell text as-is, so integer-looking values are not
    # turned into floats when missing cells appear after the concat.
    dfx = pd.read_csv(os.path.join(folder_path, file), sep=";", dtype=str)
    # remember which file each row came from so it can be split out again
    dfx["filename"] = file
    frames.append(dfx)

# columns missing from a file come out of the concat as NaN for its rows;
# with no CSV files, fall back to an empty frame that still has "filename"
df = pd.concat(frames) if frames else pd.DataFrame(columns=["filename"])

# fill nulls with 0s
df = df.fillna(0)
# reset index
df = df.reset_index(drop=True).copy()

# split into component dataframes
for file in list(df["filename"].unique()):
    dfx = df.loc[df["filename"] == file].reset_index(drop=True).copy()
    # drop the bookkeeping column and keep the original ";" delimiter
    dfx.drop(columns="filename").to_csv(
        os.path.join(folder_path, file), sep=";", index=False
    )
© www.soinside.com 2019 - 2024. All rights reserved.