我想获取出现坏行(bad line)的文件的文件名。我有一个函数可以收集所有出错的行并将其写入 .txt 文件,但是当我传入文件名参数时,它只会打印出所有的文件名。
这是处理坏行的函数:
def badlines_collect(self, bad_line: list[str]) -> None:
    """Record a malformed CSV row and rewrite today's bad-line report.

    Used as the ``on_bad_lines`` callable for ``pandas.read_csv``. The row is
    appended to ``badline_lst`` and the complete list is then written to
    ``bad_line1_<YYYYMMDD>.txt`` in the current working directory.

    Args:
        bad_line: Fields of the row that could not be parsed.

    Returns:
        None (per the pandas documentation, a ``None`` return from an
        ``on_bad_lines`` callable makes ``read_csv`` skip the row).
    """
    # NOTE(review): badline_lst and currentfile are not defined in this
    # method; they are looked up as module-level names at call time. Confirm
    # that whoever triggers this callback keeps a module-level currentfile up
    # to date -- a local variable in the caller is not visible from here.
    badline_lst.append(bad_line)

    # Read the clock once so the date in the file name and the date written
    # on each line cannot disagree around midnight.
    now = datetime.datetime.now()
    today = now.date()
    report_path = "bad_line1_{}.txt".format(now.strftime("%Y%m%d"))

    # 'w' truncates, so the report is rewritten in full on every call.
    # NOTE(review): every stored row is labelled with the *current* value of
    # currentfile, including rows collected while parsing earlier files.
    with open(report_path, "w", encoding="utf-8") as fp:
        for line in badline_lst:
            fp.write(
                "Today's date: {} {}: {}\n".format(today, currentfile, line)
            )
    print(badline_lst)
这是我调用它并传入参数以获取文件名的函数:
def getCSV(self, cur_publisher):
    """Download every CSV blob under a publisher prefix and merge them.

    Each blob whose name contains ``.csv`` is downloaded and parsed with
    pandas; rows pandas cannot parse are handed to ``self.badlines_collect``.
    Non-empty frames get a leading ``filename`` column and are appended to a
    running DataFrame, which is printed after each file.

    Args:
        cur_publisher: Prefix passed to ``self.bucket.list_blobs``.

    Returns:
        ``self.stack``. The merged DataFrame is only printed; nothing is
        added to the stack by this method.
    """
    # badlines_collect reads the module-level name ``currentfile`` (assuming
    # it lives in this module). Without this declaration the assignment in
    # the loop would only create a local that the callback can never see.
    global currentfile

    print(bucket_name + '/' + cur_publisher)
    dfm = pd.DataFrame()
    blobs = list(self.bucket.list_blobs(prefix=cur_publisher))
    print(blobs)
    for file_name in blobs:
        if '.csv' not in str(file_name.name):
            continue

        print("Crawling on File {} ......\n".format(file_name.name))
        currentfile = file_name.name
        print(currentfile)
        blop = self.bucket.blob(blob_name=file_name.name)
        # NOTE(review): recent google-cloud-storage releases document
        # download_as_string as a deprecated alias of download_as_bytes --
        # confirm against the installed version before switching.
        data = blop.download_as_string()
        df = pd.read_csv(
            io.BytesIO(data),
            encoding='utf-8',
            sep=",",
            engine='python',
            on_bad_lines=self.badlines_collect,
        )

        if df.count().sum() > 0:
            df.insert(0, "filename", file_name.name)
            # Append each file exactly once; concatenating twice would
            # duplicate every row of df in the merged frame.
            dfm = pd.concat([dfm, df], ignore_index=True)
            dfm = dfm.rename_axis(index='', columns="index")
            print(dfm)
        else:
            print("{} is empty \n".format(file_name.name))
    return self.stack
我得到的结果是:写入 bad_line1.txt 的是 GCS 存储桶中的所有文件名,而不是出错的坏行。
我假设 `badline_lst` 是一个全局变量。
我的建议如下:只用 `badlines_collect` 函数来收集各个 `file_name` 和 `bad_line`,
然后在 getCSV 的末尾一次性把它们写入文件。
试试这个:
def getCSV(self, cur_publisher):
    """Merge a publisher's CSV blobs and write one bad-line report.

    Each blob whose name contains ``.csv`` is downloaded and parsed with
    pandas. Rows pandas cannot parse are collected together with the name of
    the blob they came from, and the whole collection is written once, after
    all blobs are processed, to ``bad_line1_<YYYYMMDD>.txt`` in the current
    working directory.

    Args:
        cur_publisher: Prefix passed to ``self.bucket.list_blobs``.

    Returns:
        ``self.stack``. The merged DataFrame is only printed; nothing is
        added to the stack by this method.
    """
    print(bucket_name + '/' + cur_publisher)
    dfm = pd.DataFrame()
    blobs = list(self.bucket.list_blobs(prefix=cur_publisher))
    # (bad_line, blob_name) pairs gathered across every file in this call.
    badline_lst = []
    print(blobs)
    for file_name in blobs:
        if '.csv' not in str(file_name.name):
            continue

        print("Crawling on File {} ......\n".format(file_name.name))
        currentfile = file_name.name
        print(currentfile)
        blop = self.bucket.blob(blob_name=file_name.name)
        data = blop.download_as_string()

        # pandas calls the on_bad_lines callable with the bad row only, so
        # the blob name is bound here as a default argument. Returning None
        # (implicitly) tells read_csv to skip the row.
        def collect_bad_line(bad_line, source_name=currentfile):
            badline_lst.append((bad_line, source_name))

        df = pd.read_csv(
            io.BytesIO(data),
            encoding='utf-8',
            sep=",",
            engine='python',
            on_bad_lines=collect_bad_line,
        )

        if df.count().sum() > 0:
            df.insert(0, "filename", file_name.name)
            # Append each file exactly once; concatenating twice would
            # duplicate every row of df in the merged frame.
            dfm = pd.concat([dfm, df], ignore_index=True)
            dfm = dfm.rename_axis(index='', columns="index")
            print(dfm)
        else:
            print("{} is empty \n".format(file_name.name))

    # Read the clock once so the date in the file name and the date written
    # on each line cannot disagree around midnight.
    now = datetime.datetime.now()
    today = now.date()
    report_path = "bad_line1_{}.txt".format(now.strftime("%Y%m%d"))
    with open(report_path, "w", encoding="utf-8") as fp:
        for bad_line, source_name in badline_lst:
            fp.write(
                "Today's date: {} {}: {}\n".format(today, source_name, bad_line)
            )
    print(badline_lst)
    return self.stack