我有两个用于分组(groupby)的列(患者、就诊)、一个时间戳列,以及一个需要前向填充的变量列(值)。
只有当当前行的时间戳与组内最后一个实际录入值(而不是最后一个前向填充得到的值)所在行的时间戳之差小于某个阈值(例如 60 分钟)时,我才希望对该行进行前向填充。
这是我们进行前向填充之前的一个示例:
下面是我期望的前向填充后的结果:
下面是我的代码,其中使用了自定义的 apply 函数;但我有 1.23 亿行数据,以及大约 20 个需要前向填充的变量列(C)。逐列前向填充耗时太长,我想知道能否改用向量化方法来缩短运行时间。
这是我的分组方法:
def ffill_across_episodes(group, targetCol, maxMinsDiff):
    """Forward-fill ``targetCol`` within each patient/encounter run, time-limited.

    A missing value is replaced by the most recent *recorded* (originally
    non-missing) value of the same run only when the time elapsed since that
    recorded value is strictly less than ``maxMinsDiff`` minutes. Values that
    were themselves filled never extend the window.

    A "run" is a stretch of consecutive rows sharing the same ``PATIENT_ID``
    and ``ENCNTR_ID``; a change in either one resets the fill state.

    Args:
        group: DataFrame with ``PATIENT_ID``, ``ENCNTR_ID``,
            ``Event_timestamp`` and ``targetCol`` columns. Rows are processed
            in their existing order. ``Event_timestamp`` is assumed to be a
            datetime64 column (the ``.dt`` accessor is used on the
            differences) -- TODO confirm against the caller.
        targetCol: Name of the column to fill.
        maxMinsDiff: Exclusive upper bound, in minutes, on the gap between a
            row and the last recorded value for the row to be filled.

    Returns:
        A copy of ``group`` with ``targetCol`` filled. The input frame is left
        untouched, because mutating the object handed over by
        ``groupby(...).apply`` is not supported by pandas.
    """
    if group.empty:
        return group.copy()

    # Compare identifiers on the raw arrays so integer ids are never routed
    # through a float conversion (which a shift-based comparison would do).
    patient_ids = group['PATIENT_ID'].to_numpy()
    encounter_ids = group['ENCNTR_ID'].to_numpy()
    starts_run = np.ones(len(group), dtype=bool)
    starts_run[1:] = (
        (patient_ids[1:] != patient_ids[:-1])
        | (encounter_ids[1:] != encounter_ids[:-1])
    )
    # Positional run label: increments at every patient/encounter change.
    run_id = np.cumsum(starts_run)

    values = group[targetCol]
    timestamps = group['Event_timestamp']
    recorded = values.notna()

    # Last recorded value and its timestamp, carried forward inside each run.
    last_value = values.groupby(run_id).ffill()
    last_time = timestamps.where(recorded).groupby(run_id).ffill()

    # Rows with no earlier recorded value in their run yield NaN minutes,
    # which compares False below and therefore stays unfilled.
    minutes_since = (timestamps - last_time).dt.total_seconds() / 60
    fillable = ~recorded & (minutes_since < maxMinsDiff)

    result = group.copy()
    result[targetCol] = values.mask(fillable, last_value)
    return result
这是我的前向填充代码:
# Forward-fill each vital-sign column in turn, accumulating the result in
# `df` so that every column's fill is kept (not only the last one processed).
# The time limit does not depend on the column, so compute it once.
maxMinsDiff = MAX_HR_INTERVAL_BTW_VITALS * 60
for aVar in VITALS_COLS:
    if aVar not in df.columns:
        # Skip unknown columns without disturbing the frame built so far.
        print(f"error: {aVar} is not a column within the dataframe")
        continue
    print(f"Fwd filling {aVar} column by patient, encounter and across episode boundaries")
    # group_keys=False keeps the original row index; otherwise the group keys
    # can be added as index levels and clash with the same-named columns on
    # the next pass through this loop.
    df = df.groupby(GROUP_BY_ENCOUNTER_COLS, group_keys=False).apply(
        ffill_across_episodes, aVar, maxMinsDiff
    )
# Kept so the module-level name ends up in the same state as before.
df_updated = None
# Time-limited forward fill on a flat frame with "timestamp" and "value"
# columns, using two temporary helper columns that are dropped at the end.

# Timestamp of the most recent row that actually holds a value.
has_value = df["value"].notna()
df["last_valid_timestamp"] = df["timestamp"].where(has_value).ffill()

# Elapsed time between each row and that most recent real value.
df["timedelta"] = df["timestamp"] - df["last_valid_timestamp"]

# Overwrite "value" with its forward-filled version only on rows inside the
# window. NOTE: `.dt.days` is the whole-day component of the difference, so
# any gap shorter than two days satisfies `<= 1`.
within_window = df["timedelta"].dt.days <= 1
df.loc[within_window, "value"] = df["value"].ffill()

# Remove the helper columns again.
df = df.drop(columns=["last_valid_timestamp", "timedelta"])
Original DataFrame:
timestamp value
0 2022-01-01 NaN
1 2022-01-02 cat
2 2022-01-03 NaN
3 2022-01-04 NaN
4 2022-01-05 dog
New DataFrame:
timestamp value
0 2022-01-01 NaN
1 2022-01-02 cat
2 2022-01-03 cat
3 2022-01-04 NaN
4 2022-01-05 dog