目标是在剔除超出均值 ±2 倍标准差的离群值之后,再计算均值。在我的表 ACTUAL_OUTPUT 中,EQUIP 为 E1、MODEL 为 M1 的记录共有 6 个数据,即 10、10、100000、10、10、10。因此,预期结果为 10,因为在这种情况下 100000 是一个离群值。
我的表结构和数据如下。我为此编写了一个查询,有没有更好的办法呢?
-- Table and data
-- Sample table holding one measured value (VAL) per lot, keyed by equipment and model.
CREATE TABLE ACTUAL_OUTPUT
(
    EQUIP VARCHAR2(15),  -- equipment code, e.g. 'E1'
    MODEL VARCHAR2(15),  -- model code, e.g. 'M1'
    LOT   VARCHAR2(15),  -- lot code, e.g. 'L1'
    VAL   NUMBER         -- value that the queries below average per EQUIP/MODEL
);
-- Reset the sample data. The unfiltered DELETE is intentional: every existing row
-- is removed before the fixture rows are inserted.
DELETE FROM ACTUAL_OUTPUT;
-- E1/M1: five readings of 10 plus one extreme reading (100000) that the
-- outlier filter is expected to discard.
INSERT INTO ACTUAL_OUTPUT (EQUIP, MODEL, LOT, VAL) VALUES ('E1', 'M1', 'L1', 10);
INSERT INTO ACTUAL_OUTPUT (EQUIP, MODEL, LOT, VAL) VALUES ('E1', 'M1', 'L2', 10);
INSERT INTO ACTUAL_OUTPUT (EQUIP, MODEL, LOT, VAL) VALUES ('E1', 'M1', 'L3', 100000);
INSERT INTO ACTUAL_OUTPUT (EQUIP, MODEL, LOT, VAL) VALUES ('E1', 'M1', 'L4', 10);
INSERT INTO ACTUAL_OUTPUT (EQUIP, MODEL, LOT, VAL) VALUES ('E1', 'M1', 'L5', 10);
INSERT INTO ACTUAL_OUTPUT (EQUIP, MODEL, LOT, VAL) VALUES ('E1', 'M1', 'L6', 10);
-- E1/M2: a single row in a different EQUIP/MODEL group.
INSERT INTO ACTUAL_OUTPUT (EQUIP, MODEL, LOT, VAL) VALUES ('E1', 'M2', 'L7', 50);
-- Is there a better way than this?
-- Mean of VAL for EQUIP 'E1' / MODEL 'M1', ignoring values that fall outside
-- two standard deviations of their EQUIP/MODEL group mean.
WITH group_stats AS (
    -- Per-group mean and standard deviation are computed over the whole table;
    -- the EQUIP/MODEL filter is applied afterwards, in the outer query.
    SELECT
        EQUIP,
        MODEL,
        VAL,
        AVG(VAL)    OVER (PARTITION BY EQUIP, MODEL) AS grp_mean,
        STDDEV(VAL) OVER (PARTITION BY EQUIP, MODEL) AS grp_sd
    FROM ACTUAL_OUTPUT
)
SELECT
    -- Out-of-band values become NULL, which AVG skips.
    AVG(
        CASE
            WHEN VAL BETWEEN grp_mean - 2 * grp_sd AND grp_mean + 2 * grp_sd
                THEN VAL
        END
    ) AS OUTLIER_REMOVED
FROM group_stats
WHERE EQUIP = 'E1'
  AND MODEL = 'M1';
我将其写为:
-- Mean of VAL for EQUIP 'E1' / MODEL 'M1' after dropping rows that lie more than
-- two standard deviations from the group mean.
-- The outer query averages VAL directly: the subquery exposes no VAL_2 column,
-- and outliers are removed by the outer WHERE rather than nulled by a CASE.
SELECT AVG(stats.VAL) AS OUTLIER_REMOVED
FROM (
    -- Filter to the requested EQUIP/MODEL first, then attach the group's
    -- standard deviation and mean to every remaining row.
    SELECT
        ao.VAL,
        STDDEV(ao.VAL) OVER (PARTITION BY ao.EQUIP, ao.MODEL) AS val_stddev,
        AVG(ao.VAL)    OVER (PARTITION BY ao.EQUIP, ao.MODEL) AS val_avg
    FROM ACTUAL_OUTPUT ao
    WHERE ao.EQUIP = 'E1'
      AND ao.MODEL = 'M1'
) stats
WHERE stats.VAL >= stats.val_avg - 2 * stats.val_stddev
  AND stats.VAL <= stats.val_avg + 2 * stats.val_stddev;
但是,如果您的数据库具有良好的优化器,则两种写法的性能应该相同。
有什么区别?区别在于 `WHERE` 子句位于子查询内部。大多数优化器本来也应该会这样处理,但我希望尽早进行过滤,以防优化器处理不当。