Etsy's skyline port to perl
First, skyline is an anomaly detection tool published by Etsy. It use numpy/scipy to implement 9 detection algorithms. You can find it at http://github.com/etsy/skyline.git
I have heard PDL for a long time, and then I thought: maybe I can re-implement those algorithms using PDL? This must be the best way to learn PDL!
Now, I got it done.
use warnings; | |
use strict; | |
use 5.010; | |
#use Data::Dumper; | |
use PDL; | |
use PDL::Fit::Polynomial; | |
use PDL::Finance::Talib; | |
use Statistics::Distributions qw/tdistr/; | |
use Statistics::Normality qw/shapiro_wilk_test/; | |
use JSON; | |
use File::Slurp; | |
use constant FULL_DURATION => 86400; | |
use constant MAX_RESOLUTION => 3600; | |
#$PDL::toolongtoprint = 20000; | |
sub tail_avg { | |
my $timeseries = shift; | |
return $timeseries->[-1][1] if $#$timeseries < 3; | |
my $t = | |
( $timeseries->[-1][1] + $timeseries->[-2][1] + $timeseries->[-3][1] ) / | |
3; | |
return $t; | |
} | |
sub median_absolute_deviation { | |
my $timeseries = shift; | |
my $series = pdl( map { $_->[1] } @$timeseries ); | |
my $median = $series->median(); | |
my $demedianed = abs( $series - $median ); | |
my $median_deviation = $demedianed->median(); | |
return undef if $median_deviation == 0; | |
my $test_statistic = $demedianed->at(-1) / $median_deviation; | |
return 1 if $test_statistic > 6; | |
} | |
sub first_hour_average { | |
my $timeseries = shift; | |
my $last_hour_threshold = time() - ( FULL_DURATION - 3600 ); | |
my $series = pdl( map { $_->[1] } | |
grep { $_->[0] < $last_hour_threshold } @$timeseries ); | |
my $mean = ( $series->stats )[0]; | |
my $stdDev = ( $series->stats )[6]; | |
my $t = tail_avg($timeseries); | |
return abs( $t - $mean ) > 3 * $stdDev; | |
} | |
sub stddev_from_average { | |
my $timeseries = shift; | |
my $series = pdl( map { $_->[1] } @$timeseries ); | |
my $mean = ( $series->stats )[0]; | |
my $stdDev = ( $series->stats )[6]; | |
my $t = tail_avg($timeseries); | |
return abs( $t - $mean ) > 3 * $stdDev; | |
} | |
sub least_squares { | |
my $timeseries = shift; | |
my $x = pdl( map { $_->[0] } @$timeseries ); | |
my $y = pdl( map { $_->[1] } @$timeseries ); | |
#my $A = $x->dummy(0)->append(1); | |
my ( $yfit, $coeffs ) = fitpoly1d( $x, $y, 2 ); | |
my @errors; | |
for my $i ( 0 .. $y->nelem - 1 ) { | |
my $projected = $coeffs->index(1) * $x->index($i) + $coeffs->index(0); | |
my $error = $y->index($i) - $projected; | |
push @errors, $error; | |
} | |
return undef if $#errors < 3; | |
my $std_dev = ( pdl(@errors)->stats )[6]; | |
my $t = ( $errors[-1] + $errors[-2] + $errors[-3] ) / 3; | |
return abs($t) > $std_dev * 3 and int($std_dev) != 0 and int($t) != 0; | |
} | |
sub histogram_bins { | |
my $timeseries = shift; | |
my $t = tail_avg($timeseries); | |
my $series = pdl( map { $_->[1] } @$timeseries ); | |
# Attention: there is some different between pdl hist and numpy histogram! | |
my ( $bins, $h ) = | |
hist( $series, $series->min, $series->max, $series->nelem / 15 ); | |
for my $index ( 0 .. $h->nelem - 1 ) { | |
if ( $h->index($index) <= 20 ) { | |
if ( $index == 0 ) { | |
if ( $t <= $bins->index(0) ) { | |
return 1; | |
} | |
} | |
elsif ( $t >= $bins->index($index) | |
and $t < $bins->index( $index + 1 ) ) | |
{ | |
return 1; | |
} | |
} | |
} | |
return undef; | |
} | |
sub mean_subtraction_cumulation { | |
my $timeseries = shift; | |
my $series = pdl( map { $_->[1] || 0 } @$timeseries ); | |
my $mean = ( $series->stats )[0]; | |
$series = $series - $mean; | |
my $stdDev = ( $series->stats )[6]; | |
return abs( $series->at(-1) ) > 3 * $stdDev; | |
} | |
sub stddev_from_moving_average { | |
my $timeseries = shift; | |
my $series = pdl( map { $_->[1] } @$timeseries ); | |
my $expAverage = ta_ema( $series, 101 ); | |
my $stdDev = ta_stddev( $series, 101, 1 ); | |
return abs( $series->at(-1) - $expAverage->at(-1) ) > 3 * $stdDev->at(-1); | |
} | |
sub grubbs { | |
my $timeseries = shift; | |
my $series = pdl( map { $_->[1] } @$timeseries ); | |
my $mean = ( $series->stats )[0]; | |
my $stdDev = ( $series->stats )[6]; | |
my $tail_average = tail_avg($timeseries); | |
my $z_score = ( $tail_average - $mean ) / $stdDev; | |
my $len_series = $series->nelem; | |
my $threshold = tdistr( $len_series-2, 0.05/(2*$len_series) ); | |
my $threshold_squared = $threshold * $threshold; | |
my $grubbs_score = ( ( $len_series - 1 ) / sqrt($len_series) ) * | |
sqrt( $threshold_squared / ( $len_series - 2 + $threshold_squared ) ); | |
return $z_score > $grubbs_score; | |
} | |
sub sw_test { | |
my $timeseries = shift; | |
my $hour_ago = time() - 3600; | |
my $reference = | |
[ map { $_->[1] } grep { $_->[0] >= $hour_ago } @$timeseries ]; | |
return undef unless $#$reference > 6 and $#$reference < 5000; | |
my ( $pval, $w_statistic ) = shapiro_wilk_test($reference); | |
return $pval < 0.05 ? 1 : undef; | |
} | |
sub main { | |
my $data = from_json( read_file('data.json') ); | |
my $initial = time - MAX_RESOLUTION; | |
my $min_pass = 7; | |
for my $datapoint ( @{ $data->{'results'} } ) { | |
$datapoint->[0] = $initial; | |
$initial += 1; | |
} | |
my $ret = grep { $_->( $data->{'results'} ) } ( | |
\&median_absolute_deviation, \&first_hour_average, | |
\&stddev_from_average, \&least_squares, | |
\&histogram_bins, \&mean_subtraction_cumulation, | |
\&stddev_from_moving_average, \&grubbs, | |
\&sw_test | |
); | |
say $ret > $min_pass ? "Your data don't pass test." : "OK"; | |
} | |
main(); |
I should say that PDL is not so easy to write as numpy. For example: you must write '$p->index($p->nelem - 1)' to just get the last one of an array! In numpy, the numpy object has methods almost totally like original object. But in PDL, we didn't get these easily things...you can't write '$p->[-1]' or even '$p->index(-1)'!tks, I change this to '$p->at(-1)' now.
BTW, I didn't find any K-S test implement at CPAN(well, except Statisic::R),so I use S-W test instead. S-W test is better when the length of array is less than 5000. I think the check values in latest one hour should be less than this, yes?
$p->slice(-1)
and $p->at(-1)
so much thanks~~Those functions distribute in too many PODs(and the `->at()` examples in PDL::Slice just show howto get array but not one item), I just can't find the right thing quickly...
This is my first PDL script, may you tell me some overall documents or books about PDL?
http://sourceforge.net/projects/pdl/files/PDL_2013/PDL-Book/
Great book to get you started in PDL.
so much tks!
Do you have an example data.json file?