我是 Python 新手,仍在摸索这个分析,任何帮助都将不胜感激。
样本数据:
import pandas as pd

# Sample data: four confidence columns, two name columns, and an upload Time
# (MM:SS.s strings). NOTE: in the original paste the final Time literal was
# split across two lines ('30:08' + '.8'); it is rejoined here as '30:08.8'.
df = pd.DataFrame({
    'A_confidence': [100, 100, 100, 100, 100, 100, 100, 100, 85, 85, 85, 0, 100, 95, 100, 80, 100,
                     100, 100, 100, 85, 100, 100, 100, 100, 100, 100, 100, 0, 100, 100, 100, 100,
                     100, 100, 100, 100, 0, 0, 0, 0, 100, 100, 100, 100, 100, 0, 100, 100],
    'B_confidence': [100, 100, 100, 99.972, 100, 100, 100, 100, 100, 100, 100, 0, 100, 100, 100,
                     70, 100, 100, 0, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
                     100, 100, 100, 100, 98, 100, 0, 0, 0, 0, 100, 0, 0, 100, 0, 85, 0, 0],
    'C_confidence': [40, 40, 0, 91.3, 93, 93, 93, 40, 0, 0, 0, 0, 99, 0, 0, 0, 97.6, 96.3, 0,
                     96.3, 0, 0, 0, 98.8, 0, 99.2, 99.2, 99.2, 0, 0, 97.8, 97.8, 40, 0, 0, 97.5,
                     97.2, 0, 0, 0, 0, 0, 0, 0, 99, 0, 0, 0, 0],
    'D_confidence': [30, 30, 30, 0, 30, 30, 30, 97.5, 0, 0, 0, 0, 98.5, 99.6, 0, 30, 98.7, 30, 0,
                     30, 30, 30, 30, 99.2, 30, 30, 30, 30, 30, 91.5, 30, 30, 0, 30, 30, 93.8, 99,
                     0, 0, 0, 30, 97.8, 0, 0, 99.3, 0, 0, 0, 0],
    'A_name': ['EEEEE', 'EEEEE', 'EEEEE', 'EEEEE', 'EEEEE', 'EEEEE', 'EEEEE', 'DDDDD', 'DDDDD',
               'DDDDD', 'DDDDD', 'NULL', 'CCCCC', 'DDDDD', 'CCCCC', 'EEEEE', 'CCCCC', 'EEEEE',
               'EEEEE', 'EEEEE', 'BBBBB', 'CCCCC', 'CCCCC', 'CCCCC', 'CCCCC', 'DDDDD', 'DDDDD',
               'DDDDD', 'NULL', 'EEEEE', 'AAAAA', 'AAAAA', 'EEEEE', 'CCCCC', 'CCCCC', 'CCCCC',
               'CCCCC', 'NULL', 'NULL', 'NULL', 'NULL', 'CCCCC', 'EEEEE', 'EEEEE', 'AAAAA',
               'DDDDD', 'NULL', 'BBBBB', 'BBBBB'],
    'B_name': ['zzzzz', 'zzzzz', 'xxxxx', 'yyyyy', 'mmmmm', 'mmmmm', 'mmmmm', 'nnnnn', 'qqqqq',
               'qqqqq', 'qqqqq', 'NULL', 'ppppp', 'sssss', 'qqqqq', 'qqqqq', 'eeeee', 'jjjjj',
               'NULL', 'jjjjj', 'qqqqq', 'qqqqq', 'qqqqq', 'ppppp', 'qqqqq', 'kkkkk', 'kkkkk',
               'kkkkk', 'qqqqq', 'jjjjj', 'iiiii', 'iiiii', 'adfgh', 'qqqqq', 'qqqqq', 'adfgh',
               'ppppp', 'NULL', 'NULL', 'NULL', 'NULL', 'ppppp', 'NULL', 'NULL', 'adfgh', 'NULL',
               'adfgh', 'NULL', 'NULL'],
    'Time': ['39:14.3', '39:14.3', '44:14.2', '53:23.3', '48:09.2', '48:09.2', '48:09.2',
             '59:33.1', '45:56.9', '45:56.9', '45:56.9', '10:18.9', '30:47.0', '52:10.7',
             '02:06.4', '27:05.7', '34:05.0', '43:25.7', '44:10.1', '43:25.7', '02:55.0',
             '08:54.1', '08:54.1', '04:24.3', '08:54.1', '26:53.7', '26:53.7', '26:53.7',
             '54:46.6', '58:07.0', '58:52.2', '58:52.2', '36:40.9', '43:55.4', '43:55.4',
             '07:48.8', '25:37.3', '10:01.3', '21:11.9', '21:11.9', '42:52.3', '56:04.8',
             '59:47.3', '59:47.3', '00:58.5', '01:44.1', '12:28.4', '30:08.8', '30:08.8'],
})
我一直在搜索论坛,但还没有找到按 7 天窗口计算滚动平均值的方法。预期步骤:
1. group unique combinations of A_name and B_name(wrt to date)
2. Calculate the rolling average for 7 days based on time(uploaded time) for
a. "A_confidence",
b. "B_confidence",
c. "C_confidence",
d. "D_confidence",
e. "E_confidence",
f. "F_confidence"
网上的示例没有同时演示 groupby 与多列滚动平均值的组合用法。在此先感谢!
确保在(call_dt_key,aes_raw)上有索引。
CTE_Dates 返回表中所有日期的列表,并计算每天的平均值。第一天需要这个 average_current_day。无论如何服务器都会扫描整个索引,因此顺带计算这个平均值的开销很小。
然后,对于每个不同的日子,我都使用自加入来计算前40天的平均值。这将在第一天返回NULL,在主查询中将其替换为average_current_day。
您不必在这里使用CTE,它只是使查询更易于阅读。
-- Rolling 40-day average of aes_raw per call_dt_key.
-- CTE_Dates lists every date with its window bounds and same-day average;
-- the LATERAL subquery averages the preceding 40 days, and COALESCE falls
-- back to the current day's average when no prior rows exist.
-- FIX: the outer SELECT's FROM keyword had been machine-translated to the
-- Chinese character "从", which is not valid SQL.
WITH
CTE_Dates
AS
(
    SELECT
        call_dt_key
        ,call_dt_key - INTERVAL '41 day' AS dt_from
        ,call_dt_key - INTERVAL '1 day' AS dt_to
        ,AVG(test_aes.aes_raw) AS average_current_day
    FROM test_aes
    GROUP BY call_dt_key
)
SELECT
    CTE_Dates.call_dt_key
    ,COALESCE(prev40.average_40, CTE_Dates.average_current_day) AS average_40
FROM
    CTE_Dates
LEFT JOIN LATERAL
(
    SELECT AVG(test_aes.aes_raw) AS average_40
    FROM test_aes
    WHERE
        test_aes.call_dt_key >= CTE_Dates.dt_from
        AND test_aes.call_dt_key <= CTE_Dates.dt_to
) AS prev40 ON true
ORDER BY call_dt_key;
结果:
| call_dt_key | average_40 |
| ----------- | ---------- |
| 2016年1月1日00:00:00 | 15 |
| 2016年1月5日00:00:00 | 15 |
| 2016年1月10日00:00:00 | 15.333333333333334 |
| 2016年1月15日00:00:00 | 15.294117647058824 |
| 2016年1月16日00:00:00 | 15.5 |
| 2016年1月20日00:00:00 | 15.652173913043478 |
| 2016年1月21日00:00:00 | 15.6 |
| 2016年1月31日00:00:00 | 15.555555555555555 |
| 2016年2月1日00:00:00 | 15.517241379310345 |
| 2016年2月10日00:00:00 | 15.483870967741936 |
| 2016年2月15日00:00:00 | 15.652173913043478 |
| 2016年2月26日00:00:00 | 15.333333333333334 |
| 2016年3月4日00:00:00 | 15 |
| 2016年3月18日00:00:00 | 15 |
这里是结果。