我有几个 .csv 文件,其中一些文件比其他文件有更多的标题和更多的列。
我想将缺失的列填充到没有它的文件中
我试过这个:
import os
import csv
# Get the directory where the Python script is located
script_directory = os.path.dirname(__file__)
# Define the folder containing the CSV files
folder_path = script_directory
# Get a list of CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]
# Find the file with the most columns (maximum header length)
max_columns = 0
max_columns_file = ""
for file_name in csv_files:
file_path = os.path.join(folder_path, file_name)
with open(file_path, "r", newline="") as file:
reader = csv.reader(file, delimiter=";")
header = next(reader)
if len(header) > max_columns:
max_columns = len(header)
max_columns_file = file_name
# Read the header from the file with the most columns
header_to_copy = []
header_file_path = os.path.join(folder_path, max_columns_file)
with open(header_file_path, "r", newline="") as file:
reader = csv.reader(file, delimiter=";")
header_to_copy = next(reader)
# Define a function to ensure data rows have the same number of columns as the header
def ensure_same_columns(data, header_length):
for i in range(1, len(data)):
while len(data[i]) < header_length:
data[i].append("0")
while len(data[i]) > header_length:
data[i].pop()
# Iterate through each CSV file and copy the header
for file_name in csv_files:
file_path = os.path.join(folder_path, file_name)
data = []
with open(file_path, "r", newline="") as file:
reader = csv.reader(file, delimiter=";")
for row in reader:
data.append(row)
# Ensure data rows have the same number of columns as the header
ensure_same_columns(data, len(header_to_copy))
# Update the header in the current file
original_header = data[0]
data[0] = header_to_copy
# Print the differences between the original header and the copied header
differences = set(original_header).symmetric_difference(header_to_copy)
if differences:
print(f"Processed: {file_name}")
print("Differences:")
print("+----------------+----------------+")
print("| Original Header | Copied Header |")
print("+----------------+----------------+")
for item in differences:
original_present = "Yes" if item in original_header else "No"
copied_present = "Yes" if item in header_to_copy else "No"
print(f"| {item:<16}| {original_present:<16}| {copied_present:<16}|")
print("+----------------+----------------+")
else:
print(f"Processed: {file_name}")
print("No difference")
# Print the position of each element in the header if it was not in the original header
position_dict = {element: position for position,
element in enumerate(header_to_copy, start=1)}
for element in differences:
if element in header_to_copy:
position = position_dict[element]
print(
f"Element '{element}' is at position {position} in the header.")
# Write the modified data back to the CSV file
with open(file_path, "w", newline="") as file:
writer = csv.writer(file, delimiter=";")
writer.writerows(data)
# Print the differences between the original header and the copied header
differences = set(original_header).symmetric_difference(header_to_copy)
例如(我所期望的)
文件1.csv
a;c;d;e
1;3;4;5
文件 2.csv
a;b;c;d;e;f
1;2;3;4;5
结果一定是:
文件 1.csv
a;b;c;d;e;f
1;0;3;4;5;0
我得到了什么:
但实际上是这样的:
a;b;c;d;e;f
1;3;4;5;0;0
我尝试复制标题(最简单的方法)
但对于我接下来的所有行(从第二行开始)值,我不知道如何解决它(例如在 Excel 中添加列)
我想利用新旧列之间差异的位置,并从第 2 行开始添加“0”列,但仍然遇到麻烦。
我建议除了跟踪文件名之外还构建一个标题列表,这样您就不必重新打开文件,并且可以使用该列表来构建输出。再次迭代文件时,只需使用
csv.DictReader
而不是 csv.reader
,您可以使用 row.get(header)
(默认为 0
)来构建该行的元素列表。然后将该行写入该文件的输出。您甚至不需要为此保持“主”文件打开。
headers_list = <the list of headers built during your first pass to find the longest number of headers>
for file in LIST_OF_FILES:
if file != MAX_HEADER_FILE_NAME:
data=list()
data.append(headers)
with open(os.join(folder_path, file), "r") as infile:
mydata = list()
csv.DictReader csv_file(infile)
for row in csv_file:
for header in headers:
mydata.append[row.get(header, 0)]
data.append(mydata)
//now open for writing as csv_write
with open(os.join(folder_path, file), "w") as outfile:
csv_writer = csv.writer(outfile, delimiter=',')
for row in data:
csv_writer.writerow(row);
小更新:
import os
import csv
# Get the directory where the Python script is located
script_directory = os.path.dirname(__file__)
# Define the folder containing the CSV files
folder_path = script_directory
# Get a list of CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]
# Find the file with the most columns (maximum header length)
max_columns_file = max(csv_files, key=lambda file_name: len(next(csv.reader(open(os.path.join(folder_path, file_name), "r", newline=""), delimiter=";"))))
# Read the header from the file with the most columns
with open(os.path.join(folder_path, max_columns_file), "r", newline="") as file:
header_to_copy = next(csv.reader(file, delimiter=";"))
# Iterate through each CSV file and copy the header
for file_name in csv_files:
file_path = os.path.join(folder_path, file_name)
data = []
with open(file_path, "r", newline="") as file:
reader = csv.reader(file, delimiter=";")
for row in reader:
data.append(row)
# Update the header in the current file
original_header = data[0]
data[0] = header_to_copy
# Print the differences between the original header and the copied header
differences = set(original_header) ^ set(header_to_copy)
print(f"Processed: {file_name}")
print("Differences:")
print("+----------------+----------------+")
print("| Original Header | Copied Header |")
print("+----------------+----------------+")
for item in differences:
original_present = "Yes" if item in original_header else "No"
copied_present = "Yes" if item in header_to_copy else "No"
print(f"| {item:<16}| {original_present:<16}| {copied_present:<16}|")
print("+----------------+----------------+")
# Print the position of each element in the header if it was not in the original header
position_dict = {element: position for position, element in enumerate(header_to_copy, start=1)}
for element in differences:
if element in header_to_copy:
position = position_dict[element]
print(f"Element '{element}' is at position {position} in the header.")
# Add a new column with the value "0" from the second row onwards at the specified position
for i in range(1, len(data)):
for element in differences:
if element in header_to_copy:
position = position_dict[element]
data[i].insert(position - 1, "0")
# Write the modified data back to the CSV file
with open(file_path, "w", newline="") as file:
writer = csv.writer(file, delimiter=";")
writer.writerows(data)
还有一个例子。 我成功地找到了使用该位置,但是当有特定情况时它不起作用:
Talbe1.csv a;b;c;c (c);d;e;e-f;f;g 1;0;2;0.2;3;4;0;0;0
表2.csv a;b;c;d;e;e-f;f;g 1;2;3;4;5;6;5.4;999
结果:
a;b;c;c (c);d;e;e-f;f;g 1;2;3;0;4;5;6;5.4;999
预期结果:
a;b;c;c (c);d;e;e-f;f;g 1;2;**0;**3;4;5;6;5.4;999
它不起作用,我不知道为什么,因为我使用相同的位置编号