我有两个
xlsx
文件如下:
value1 value2 value3
0.456 3.456 0.4325436
6.24654 0.235435 6.376546
4.26545 4.264543 7.2564523
和
value1 value2 value3
0.456 3.456 0.4325436
6.24654 0.23546 6.376546
4.26545 4.264543 7.2564523
我需要比较所有单元格,如果来自
file1 !=
的单元格来自 file2
print
的单元格。
import xlrd
rb = xlrd.open_workbook('file1.xlsx')
rb1 = xlrd.open_workbook('file2.xlsx')
sheet = rb.sheet_by_index(0)
for rownum in range(sheet.nrows):
row = sheet.row_values(rownum)
for c_el in row:
print c_el
如何添加
file1
和file2
的比较单元格?
pandas
,你可以像这样简单地完成:
import pandas as pd
df1 = pd.read_excel('excel1.xlsx')
df2 = pd.read_excel('excel2.xlsx')
difference = df1[df1!=df2]
print difference
结果将如下所示:
以下方法应该可以帮助您开始:
from itertools import zip_longest
import xlrd
rb1 = xlrd.open_workbook('file1.xlsx')
rb2 = xlrd.open_workbook('file2.xlsx')
sheet1 = rb1.sheet_by_index(0)
sheet2 = rb2.sheet_by_index(0)
for rownum in range(max(sheet1.nrows, sheet2.nrows)):
if rownum < sheet1.nrows:
row_rb1 = sheet1.row_values(rownum)
row_rb2 = sheet2.row_values(rownum)
for colnum, (c1, c2) in enumerate(zip_longest(row_rb1, row_rb2)):
if c1 != c2:
print("Row {} Col {} - {} != {}".format(rownum+1, colnum+1, c1, c2))
else:
print("Row {} missing".format(rownum+1))
这将显示两个文件之间不同的任何单元格。对于给定的两个文件,将显示:
Row 3 Col 2 - 0.235435 != 0.23546
xlrd.formular.colname()
:
print "Cell {}{} {} != {}".format(rownum+1, xlrd.formula.colname(colnum), c1, c2)
给你:
Cell 3B 0.235435 != 0.23546
import pandas as pd
import numpy as np
from xlsxwriter.utility import xl_rowcol_to_cell
template = pd.read_excel("template.xlsx",na_values=np.nan,header=None)
testSheet = pd.read_excel("test.xlsx",na_values=np.nan,header=None)
rt,ct = template.shape
rtest,ctest = testSheet.shape
df = pd.DataFrame(columns=['Cell_Location','BaseTemplate_Value','CurrentFile_Value'])
for rowNo in range(max(rt,rtest)):
for colNo in range(max(ct,ctest)):
# Fetching the template value at a cell
try:
template_val = template.iloc[rowNo,colNo]
except:
template_val = np.nan
# Fetching the testsheet value at a cell
try:
testSheet_val = testSheet.iloc[rowNo,colNo]
except:
testSheet_val = np.nan
# Comparing the values
if (str(template_val)!=str(testSheet_val)):
cell = xl_rowcol_to_cell(rowNo, colNo)
dfTemp = pd.DataFrame([[cell,template_val,testSheet_val]],
columns=['Cell_Location','BaseTemplate_Value','CurrentFile_Value'])
df = df.append(dfTemp)
df 是所需的数据框
2023更新
使用
df = pd.concat([df,dfTemp], ignore_index=True)
代替 df.append(dfTemp)
下面是程序的修改版本,修改为接受多个文件并将结果输出到txt文件。
import pandas as pd
import numpy as np
from xlsxwriter.utility import xl_rowcol_to_cell
wbList1 = ["tpl-excel1","tpl-excel2","tpl-excel3"]
wbList2 = ["test-excel1","test-excel2","test-excel3"]
output = open("workbook-compare-results.txt", "w")
for index in range(len(wbList1)):
fileName1 = f'{wbList1[index]}.xlsx'
fileName2 = f'{wbList2[index]}.xlsx'
woorkBook1 = pd.read_excel(fileName1,na_values=np.nan,header=None)
woorkBook2 = pd.read_excel(fileName2,na_values=np.nan,header=None)
rt,ct = woorkBook1.shape
rtest,ctest = woorkBook2.shape
df = pd.DataFrame(columns=['Cell_Location','BaseTemplate_Value','CurrentFile_Value'])
output.write(f"DIFFERENCE: {fileName1} | {fileName2}\n")
dfOut = ""
for rowNo in range(max(rt,rtest)):
for colNo in range(max(ct,ctest)):
# Fetching the woorkBook1 value at a cell
try:
woorkBook1_val = woorkBook1.iloc[rowNo,colNo]
except:
woorkBook1_val = np.nan
# Fetching the woorkBook2 value at a cell
try:
woorkBook2_val = woorkBook2.iloc[rowNo,colNo]
except:
woorkBook2_val = np.nan
# Comparing the values
if (str(woorkBook1_val)!=str(woorkBook2_val)):
cell = xl_rowcol_to_cell(rowNo, colNo)
dfTemp = pd.DataFrame([[cell,woorkBook1_val,woorkBook2_val]],
columns=['Cell_Location','BaseTemplate_Value','CurrentFile_Value'])
df = pd.concat([df,dfTemp], ignore_index=True)
if df.empty:
output.write("NO DIFFERENCE IN DATA\n")
else:
dfOut = df.to_string(index=False)
output.write(f"{dfOut}\n")
output.write("\n")
output.close()
from openpyxl import load_workbook
wb1 = load_workbook('test.xlsx')
wb2 = load_workbook('validation.xlsx')
for worksheet in wb1.sheetnames:
sheet1 = wb1[worksheet]
sheet2 = wb2[worksheet]
# iterate through the rows and columns of both worksheets
for row in range(1, sheet1.max_row + 1):
for col in range(1, sheet1.max_column + 1):
cell1 = sheet1.cell(row, col)
cell2 = sheet2.cell(row, col)
if cell1.value != cell2.value:
print("Sheet {0} -> Row {1} Column {2} - {3} != {4}".format(worksheet ,row, col, cell1.value, cell2.value))
df_file1 = pd.read_csv("Source_data.csv")
df_file1 = df_file1.replace(np.nan, '', regex=True) #Replacing Nan with space
df_file2 = pd.read_csv("Target_data.csv")
df_file2 = df_file2.replace(np.nan, '', regex=True)
df_i = pd.concat([df_file1, df_file2], axis='columns', keys=['file1', 'file2'])
df_f = df_i.swaplevel(axis='columns')[df_s.columns[0:]]
def highlight_diff(data, color='yellow'):
attr = 'background-color: {}'.format(color)
other = data.xs('file1', axis='columns', level=-1)
return pd.DataFrame(np.where(data.ne(other, level=0), attr, ''), index=data.index, columns=data.columns)
df_final = df_f.style.apply(highlight_diff, axis=None)
writer = pd.ExcelWriter('comparexcels.xlsx')
df_final.to_excel(writer)
writer.save()
import pandas as pd
import numpy as np
df1=pd.read_excel('Product_Category_Jan.xlsx')
df2=pd.read_excel('Product_Category_Feb.xlsx')
df1.equals(df2)
comparison_values = df1.values == df2.values
print (comparison_values)
rows,cols=np.where(comparison_values==False)
for item in zip(rows,cols):
df1.iloc[item[0], item[1]] = '{} --> {}'.format(df1.iloc[item[0],
item[1]],df2.iloc[item[0], item[1]])
改编 Martin Evans 上面的答案,因为 xlrd 不再支持 xlsx 和 openpyxl,这是我当前的解决方案:
from itertools import zip_longest
from openpyxl import load_workbook
def xlsx_same(fn1, fn2):
same = True
wb1 = load_workbook(filename = fn1, read_only=True)
wb2 = load_workbook(filename = fn2, read_only=True)
if len(wb1.sheetnames) != len(wb2.sheetnames):
print(f" Difference in number of sheets {len(wb1.sheetnames)} != {len(wb2.sheetnames)}")
same = False
for sheet in wb1.sheetnames:
sheet1 = wb1[sheet]
try:
sheet2 = wb2[sheet]
except KeyError:
print(f"{fn2} does not have sheet {sheet} which is in {fn1}")
same = False
break
if sheet1.max_row != sheet2.max_row:
print(f" Difference in number of rows {len(sheet1.rows)} != {len(sheet2.rows)}")
same = False
if sheet1.max_column != sheet2.max_column:
print(f" Difference in number of columns {sheet1.max_column} != {sheet2.max_column}")
same = False
for rownum,row_wb1 in enumerate(sheet1.rows):
if rownum < sheet2.max_row:
row_wb2 = sheet2.rows[rownum]
for colnum, (c1, c2) in enumerate(zip_longest(row_wb1, row_wb2)):
if c1 != c2:
same = False
print(
"Row {} Col {} - {} != {}".format(
rownum + 1, colnum + 1, c1, c2
)
)
else:
same = False
print("Row {} missing".format(rownum + 1))
return same