Thursday, May 24, 2012

Using STDEVP in TSQL to eliminate outliers

If you have a bell curve distribution of events, then only 68% of values will be within 1 standard deviation away from the mean (95% are covered by 2 standard deviations).
Using stdev / stdevp sql function I can limit the original data set and find an average for the values that are within the appropriate number of standard deviations excluding outliers.


declare @stdtest table (colname varchar(20), colvalue int)

insert into @stdtest (colname, colvalue) values ('x', 377942)
insert into @stdtest (colname, colvalue) values ('a', 377943)
insert into @stdtest (colname, colvalue) values ('a', 2041)
insert into @stdtest (colname, colvalue) values ('h', 5)
insert into @stdtest (colname, colvalue) values ('h', 6)
insert into @stdtest (colname, colvalue) values ('b', 5)
insert into @stdtest (colname, colvalue) values ('b', 8)
insert into @stdtest (colname, colvalue) values ('h', 7)

insert into @stdtest (colname, colvalue) values ('z', 377943)
insert into @stdtest (colname, colvalue) values ('z', 50000)
insert into @stdtest (colname, colvalue) values ('z', 2041)
insert into @stdtest (colname, colvalue) values ('z', 524)
insert into @stdtest (colname, colvalue) values ('z', 50)
insert into @stdtest (colname, colvalue) values ('z', 4)
insert into @stdtest (colname, colvalue) values ('z', -100000)


select test.colname, avg(test.colvalue) stat_average, count(*) effective_cnt, min(bounds.cnt) total_cnt
from @stdtest test
inner join
    (
        select colname, STDEVP(colvalue) cstdev, AVG(colvalue) cavg, count(*) cnt
        from @stdtest
        group by colname
    ) bounds on bounds.colname = test.colname
where abs(bounds.cavg - test.colvalue) <= bounds.cstdev * 1   -- 1stdev = 68%   2stdev = 95%
group by test.colname

2 comments: