我有几个 .csv 文件,其中一些文件比其他文件有更多的标题和更多的列。
我想把缺失的列(连同标题)补充到缺少这些列的文件中,并用 0 填充这些列的值。
我试过这个:
import os
import csv
def read_header(file_path, delimiter=";"):
    """Return the first row of the CSV file at *file_path*.

    Returns an empty list when the file has no rows, instead of letting
    ``next()`` raise ``StopIteration``.
    """
    # The context manager guarantees the handle is closed after the peek.
    with open(file_path, "r", newline="") as file:
        return next(csv.reader(file, delimiter=delimiter), [])


def merge_headers(headers):
    """Return the longest header extended with every column name it lacks.

    The longest header in *headers* fixes the column order. Column names
    that only occur in other headers are appended in first-seen order, so
    no file loses a column when it is rewritten against the result.
    """
    headers = [list(header) for header in headers]
    if not headers:
        return []
    merged = list(max(headers, key=len))
    seen = set(merged)
    for header in headers:
        for column in header:
            if column not in seen:
                seen.add(column)
                merged.append(column)
    return merged


def align_rows(original_header, rows, target_header, fill_value="0"):
    """Re-map *rows* laid out per *original_header* onto *target_header*.

    Cells are matched by column NAME rather than by position, so the result
    does not depend on how many columns are missing or in which order they
    are handled. Columns absent from *original_header* get *fill_value*.

    Blank rows (empty lists) are passed through untouched. If a column name
    is repeated in *original_header*, the right-most cell wins.
    """
    aligned = []
    for row in rows:
        if not row:
            aligned.append(row)
            continue
        # zip() stops at the shorter sequence, so a ragged row cannot raise;
        # its missing cells simply fall back to fill_value below.
        cells = dict(zip(original_header, row))
        aligned.append([cells.get(column, fill_value) for column in target_header])
    return aligned


def print_differences(file_name, original_header, header_to_copy):
    """Print which columns differ between a file's header and the target."""
    original_columns = set(original_header)
    copied_columns = set(header_to_copy)
    # Walk the headers (not a set) so the report order is stable between runs.
    differences = list(dict.fromkeys(
        [column for column in header_to_copy if column not in original_columns]
        + [column for column in original_header if column not in copied_columns]
    ))
    print(f"Processed: {file_name}")
    print("Differences:")
    print("+----------------+----------------+")
    print("| Original Header | Copied Header |")
    print("+----------------+----------------+")
    for item in differences:
        original_present = "Yes" if item in original_columns else "No"
        copied_present = "Yes" if item in copied_columns else "No"
        print(f"| {item:<16}| {original_present:<16}| {copied_present:<16}|")
    print("+----------------+----------------+")
    # Report the 1-based position of every column that is being added.
    position_dict = {element: position for position, element in enumerate(header_to_copy, start=1)}
    for element in differences:
        if element in position_dict:
            print(f"Element '{element}' is at position {position_dict[element]} in the header.")


def fill_missing_columns(folder_path, delimiter=";", fill_value="0"):
    """Give every ``.csv`` file in *folder_path* the same set of columns.

    The target header is the longest header found in the folder, extended
    with any column names it lacks (see ``merge_headers``). Each file whose
    header differs is rewritten in place with the target header; columns it
    did not have are filled with *fill_value*. Empty files are skipped.

    Returns the target header.
    """
    # Sorted so that ties for "longest header" resolve the same way each run.
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

    # First pass: headers only, so at most one file's data is held in memory.
    headers = {}
    for file_name in csv_files:
        header = read_header(os.path.join(folder_path, file_name), delimiter)
        if header:
            headers[file_name] = header
    header_to_copy = merge_headers(headers.values())

    # Second pass: realign and rewrite each file that needs it.
    for file_name, original_header in headers.items():
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r", newline="") as file:
            reader = csv.reader(file, delimiter=delimiter)
            next(reader, None)  # skip the header row read in the first pass
            rows = list(reader)

        print_differences(file_name, original_header, header_to_copy)

        if original_header == header_to_copy:
            # Already complete; leave the file byte-for-byte untouched.
            continue

        data = [header_to_copy]
        data.extend(align_rows(original_header, rows, header_to_copy, fill_value))
        with open(file_path, "w", newline="") as file:
            csv.writer(file, delimiter=delimiter).writerows(data)

    return header_to_copy


if __name__ == "__main__":
    # abspath() because dirname(__file__) can be "" when the script is started
    # from its own directory, which os.listdir() rejects.
    fill_missing_columns(os.path.dirname(os.path.abspath(__file__)))
下面是一个例子。我已经能够根据列的位置插入缺失的列,但在某些特定情况下它不起作用:
Table1.csv

|a|b|c|d|e|f|g|
|-|-|-|-|-|-|---|
|1|2|3|5|6|7|888|

Table2.csv

|a|b|c|c (c)|d|e|f|g|
|-|-|-|-----|-|-|-|---|
|1|2|3| 4 |5|6|7|888|

结果:Table1.csv

|a|b|c|c (c)|d|e|f|g|
|-|-|-|-----|-|-|-|---|
|1|2|3| 4 |5|6|7|888|

预期结果:Table1.csv

|a|b|c|c (c)|d|e|f|g|
|-|-|-|-----|-|-|-|---|
|1|2|3| 5 |0|6|7|888|
它不起作用,我不知道为什么,因为我使用相同的位置编号
我建议在第一遍查找最长标题时,除了记录文件名之外,同时把标题列表保存下来,这样就不必重新打开那个文件,并且可以直接用这个列表来构建输出。第二遍遍历文件时,只需使用 `csv.DictReader` 而不是 `csv.reader`,然后对每个标题调用 `row.get(header, 0)`(缺失的列默认为 0)来构建该行的元素列表,再把该行写入该文件的输出。这样甚至不需要一直保持“主”文件处于打开状态。
def rows_for_headers(headers, records, default=0):
    """Build one output row per record, ordered by *headers*.

    *records* is an iterable of mappings such as the rows yielded by
    ``csv.DictReader``. A column that is absent from a record, or present
    with the value ``None``, is replaced by *default*.
    """
    rows = []
    for record in records:
        # A fresh list per record; sharing one list would merge all rows.
        row = []
        for header in headers:
            value = record.get(header)
            row.append(default if value is None else value)
        rows.append(row)
    return rows


def rewrite_with_headers(folder_path, file_names, headers, skip_file=None, delimiter=";"):
    """Rewrite each CSV in *file_names* so that its columns match *headers*.

    *headers* is the list of column names collected during the first pass
    (the longest header). *skip_file* names the file that header came from;
    it already has every column and is left untouched. Each remaining file
    is read and written with the same *delimiter*, and its missing columns
    are filled with 0.
    """
    for file_name in file_names:
        if file_name == skip_file:
            continue
        path = os.path.join(folder_path, file_name)
        # Read everything first; the file is reopened for writing below.
        with open(path, "r", newline="") as infile:
            data = rows_for_headers(headers, csv.DictReader(infile, delimiter=delimiter))
        with open(path, "w", newline="") as outfile:
            csv_writer = csv.writer(outfile, delimiter=delimiter)
            csv_writer.writerow(headers)
            csv_writer.writerows(data)


# Usage, with the names collected during the first pass:
# rewrite_with_headers(folder_path, LIST_OF_FILES, headers_list, skip_file=MAX_HEADER_FILE_NAME)
有什么具体原因不使用 Pandas 吗?它会更快/更有效率。
import pandas as pd
import os
def union_columns(headers):
    """Return the widest header extended with every column name it lacks.

    The widest header fixes the column order; names that only occur in
    other headers are appended in first-seen order.
    """
    headers = [list(header) for header in headers]
    if not headers:
        return []
    columns = list(max(headers, key=len))
    seen = set(columns)
    for header in headers:
        for name in header:
            if name not in seen:
                seen.add(name)
                columns.append(name)
    return columns


def align_frames(frames, fill_value="0"):
    """Reindex every frame in *frames* onto the same set of columns.

    *frames* maps a file name to its DataFrame. Returns a new dict with the
    same keys. Columns that a frame did not have are created and filled
    with *fill_value*; existing cells are left as they are.
    """
    columns = union_columns(frame.columns for frame in frames.values())
    return {
        name: frame.reindex(columns=columns, fill_value=fill_value)
        for name, frame in frames.items()
    }


def fill_missing_columns_pandas(folder_path, sep=";", fill_value="0"):
    """Give every ``.csv`` file in *folder_path* the same columns, in place.

    Files are read as text (``dtype=str``, ``keep_default_na=False``) so
    existing values are written back as they were read; this avoids integer
    columns turning into floats. Empty files are skipped.
    """
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

    frames = {}
    for file in csv_files:
        try:
            # Join with folder_path so this works from any working directory.
            frames[file] = pd.read_csv(
                os.path.join(folder_path, file),
                sep=sep,
                dtype=str,
                keep_default_na=False,
            )
        except pd.errors.EmptyDataError:
            # A completely empty file has no header to align.
            continue

    for file, frame in align_frames(frames, fill_value).items():
        # Write with the same separator that was used for reading.
        frame.to_csv(os.path.join(folder_path, file), sep=sep, index=False)


if __name__ == "__main__":
    # abspath() because dirname(__file__) can be "" when the script is started
    # from its own directory.
    fill_missing_columns_pandas(os.path.dirname(os.path.abspath(__file__)))