Introduction
The purpose of this post is to illustrate different filtering techniques for calculating the actual power curve of a wind turbine using SCADA data. Typically data in which the turbine is experiencing downtime can be filtered out using status/fault/error codes provided by the Original Equipment Manufacturer’s (OEM) SCADA system. However, it has been my experience that these data are not of sufficient quality to properly filter out all ‘bad’ data and in many cases when analyzing a wind plant these data are not provided. As such it is useful to employ statistical filtering techniques to the power curve data which approximate what I would call the ‘ideal’ normal operations of the wind turbine. We’ll employ a couple different methods in this post using R.
Setup / Import
Let’s load the necessary libraries and create a bin-wise power curve function which we’ll be calling a few times here.
#libraries
library(MASS)
library(tidyverse)
library(knitr)
library(viridis)
# binwise power curve function which we'll use a few times
calc_binwise_pc<-function(df){
df %>%
mutate(ws_bin=factor(round(dc_ws_norm,2))) %>%
group_by(ws_bin) %>%
summarise(n=n(),kw_norm=mean(kw_norm)) %>%
mutate(ws_bin_real=as.numeric(as.character(ws_bin)))
}
#show data
names(df)
## [1] "timestamp_local" "timestamp_utc"
## [3] "data_source_name" "ws_mean"
## [5] "dir_mean" "amb_temp_mean"
## [7] "wind_speed_turbulence" "kw"
## [9] "consumed_energy" "produced_energy"
## [11] "reactive_power_avg" "state"
## [13] "state.name" "density"
## [15] "dc_ws_mean"
Tidy
Let’s tidy up the data a bit. It’s useful for applying filtering techniques consistently across turbine types with different capacities to normalize the wind speed and power measurements.
#tidy
df<-mutate_if(df,is.character,as.factor)
# normalise wind speed and power
df<-mutate(df,dc_ws_norm=dc_ws_mean/max(dc_ws_mean,na.rm=TRUE),kw_norm=kw/max(kw,na.rm=TRUE))
# create container for power curves
pc<-list()
Transform
Status Code Filtering
First, what are the turbine states?
state.count<-df %>%
group_by(state.name) %>%
summarise(n=n(),isProducing=sum(kw_norm>0)/n) %>%
arrange(desc(isProducing))
kable(state.count)
state.name | n | isProducing |
---|---|---|
Initialization | 22 | 1.0000000 |
Production | 21072 | 1.0000000 |
No state data | 86 | 0.7790698 |
Start up | 383 | 0.5326371 |
Setup | 101 | 0.5148515 |
Error | 188 | 0.3085106 |
Ready | 2467 | 0.0417511 |
Health check | 52 | 0.0384615 |
Standby | 14048 | 0.0239892 |
Service | 13698 | 0.0188349 |
Low temperature | 60 | 0.0166667 |
Grid loss | 77 | 0.0129870 |
Unknown | 3 | 0.0000000 |
What does the power curve look like?
ggplot(df,aes(dc_ws_norm,kw_norm,fill=state.name))+
geom_point(color='black',pch=21,alpha=.5)+
theme_bw()+
scale_x_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
scale_y_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
labs(title='All data')
So looks like we should just filter for periods of ‘Production’… I know not all turbines are this clean, but leave me alone, it’s a sample dataset. Of course I cherry picked!
df<-mutate(df,filter.avail=state.name=='Production')
Status Code Power Curve
So by filtering the power curve for status codes, we get this:
pc[['status']]<-df %>%
filter(filter.avail & !is.na(dc_ws_norm) & !is.na(kw_norm)) %>%
calc_binwise_pc() %>%
mutate(method='status code')
# show plot
df %>% filter(filter.avail) %>%
ggplot(aes(dc_ws_norm,kw_norm))+
geom_point(data=filter(df,!filter.avail),aes(dc_ws_norm,kw_norm),color='gray',alpha=.5)+
geom_point(color='black',fill='blue',pch=21,alpha=.5)+
geom_line(data=pc[['status']],aes(ws_bin_real,kw_norm),lwd=1.5,color='red')+
theme_bw()+
scale_x_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
scale_y_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
labs(title='Filtered for Production State Name')
Statistical Filtering
For all methods below, first apply some ‘sanity’ filters which filter out obvious low/no production periods
#assume cutout wind speed = 25
df <-mutate(df,filter.basic_stat=kw>=10 & dc_ws_mean>0 & dc_ws_mean<25 & !is.na(dc_ws_mean) & !is.na(kw))
pc[['basic_stat']]<-df %>% filter(filter.basic_stat) %>%
calc_binwise_pc() %>%
mutate(method='basic statistical')
# show plot
df %>% filter(filter.basic_stat) %>%
ggplot(aes(dc_ws_norm,kw_norm))+
geom_point(data=filter(df,!filter.basic_stat),aes(dc_ws_norm,kw_norm),color='gray',alpha=.5)+
geom_point(color='black',fill='blue',pch=21,alpha=.5)+
geom_line(data=pc[['basic_stat']],aes(ws_bin_real,kw_norm),lwd=1.5,color='red')+
theme_bw()+
scale_x_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
scale_y_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
labs(title='Filtered for Basic Stat Filters')
So… better than nothing, but not very good. Let’s try and get rid of some of those outliers.
Smoothing Spline Filtering
After applying the basic statistical filters, fit a smoothing spline to the data and define any point with a residual error >10% Power as an outlier. 10% is a rule of thumb of course, but seems to be a conservative filter in that it only removes obvious outliers.
# create smoothing spline
fit.spl<-df %>%
filter(filter.basic_stat) %>%
dplyr::select(dc_ws_norm,kw_norm) %>%
smooth.spline()
# predict power using the fitted spline
df$kw.spline<-predict(object=fit.spl,df$dc_ws_norm)$y
#define outliers
df<-mutate(df,spline.resid=abs(kw_norm-kw.spline),
filter.spline=spline.resid<.1)
# calculate power curve
pc[['spline']]<-df %>%
filter(filter.basic_stat & filter.spline) %>%
calc_binwise_pc()%>%
mutate(method='spline')
# show plot
df %>% filter(filter.basic_stat & filter.spline) %>%
ggplot(aes(dc_ws_norm,kw_norm,color=spline.resid))+
geom_point(data=filter(df,!filter.basic_stat | !filter.spline),aes(dc_ws_norm,kw_norm),color='gray',alpha=.5)+
geom_point(alpha=.5)+
scale_color_viridis()+
geom_line(data=pc[['spline']],aes(ws_bin_real,kw_norm),color='red',lwd=1.5)+
theme_bw()+
scale_x_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
scale_y_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
labs(title='Filtered for Basic Stat Filters and Smoothing Spline')
OK, I’m pretty happy with that method. Let’s look at another.
Mahalanobis Filtering
After applying the basic statistical filters, use a k-means approach to define centroids to the data, then define any point where the mahalanobis distance from it’s centroid is greater than a certain threshold. I’m basing this method on a paper I found where 15 centroids seems to fit turbine power curves well when using all the data and outliers are defined where the mahalanobis distance is >2.5. I’ve found that a mahalanobis threshold of 4.0 is a bit more conservative.
# calculate centroids using k-means
set.seed(1)
k<- df %>%
filter(filter.basic_stat) %>%
dplyr::select(dc_ws_norm,kw_norm) %>%
kmeans(centers = 15,iter.max = 1000)
# format and append output
df<-mutate(df,cluster=NA)
df$cluster[df$filter.basic_stat]<-k$cluster
k_centroids<-as.data.frame(k$centers)
k_centroids$cluster<-c(1:nrow(k_centroids))
names(k_centroids)<-c('ws_centroid','kw_centroid','cluster')
df<-left_join(df,k_centroids,by='cluster')
# calculate the mahalanobis distance for points in each centroid
df<-mutate(df,mahala=NA)
for (i in 1:nrow(k_centroids)){
x<-as.matrix(df[df$cluster==i & df$filter.basic_stat,c('dc_ws_norm','kw_norm')])
if(length(x)>10){
df$mahala[df$cluster==i & !is.na(df$cluster)]<-
mahalanobis(x = x,center = c(k_centroids$ws_centroid[i],k_centroids$kw_centroid[i]),cov=cov(x))
}
}
# apply mahalanobis filter
df<-mutate(df,filter.mahala=mahala<4)
# calculate power curve
pc[['mahala']]<-df %>%
filter(filter.basic_stat & filter.mahala) %>%
calc_binwise_pc()%>%
mutate(method='mahalanobis')
# show plot
df %>% filter(filter.basic_stat & filter.mahala) %>%
ggplot(aes(dc_ws_norm,kw_norm,color=mahala))+
geom_point(data=filter(df,!filter.basic_stat),aes(dc_ws_norm,kw_norm),color='gray',alpha=.5)+
geom_point(data=filter(df,filter.basic_stat | !filter.mahala),aes(dc_ws_norm,kw_norm),color='gray',alpha=.5)+
geom_point(alpha=.5)+
geom_point(data=k_centroids,aes(ws_centroid,kw_centroid),size=3,color='black',fill='red',pch=21)+
scale_color_viridis()+
geom_line(data=pc[['mahala']],aes(ws_bin_real,kw_norm),color='red',lwd=1.5)+
theme_bw()+
scale_x_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
scale_y_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
labs(title='Filtered for Basic Stat Filters and Mahalanobis Distance')
OK, I’m happy with that as well. How about binning by power and removing statistical outliers?
Power Binning
Let’s bin the data by power (2% bin of normalized power) and filter out points where wind speed > +/- 2 standard deviations)
#create power bins
df<-mutate(df,kw_norm_bin=cut(kw_norm,seq(0,1,.02)))
# calculate normal distribution of wind speed in each power bin
power_bins<- df %>%
group_by(kw_norm_bin) %>%
summarise(power_bin_mean=mean(dc_ws_norm,na.rm=TRUE),
power_bin_sd=sd(dc_ws_norm,na.rm=TRUE))
# filter data in each bin >2sd of the mean
df<-left_join(df,power_bins,'kw_norm_bin') %>%
mutate(power_bin_pnorm=pnorm(dc_ws_norm,power_bin_mean,power_bin_sd),
filter.powerBin=between(power_bin_pnorm,1-.9545,.9545))
# show plot of bins
df %>%
filter(filter.basic_stat) %>%
ggplot(aes(kw_norm_bin,dc_ws_norm))+
geom_boxplot()+
coord_flip()+
theme_bw()+
scale_y_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
labs(title='Power Bin Illustration')
# calculate power curve
pc[['powerBin']]<-df %>%
filter(filter.basic_stat & filter.powerBin) %>%
calc_binwise_pc()%>%
mutate(method='power bin')
# show plot of filter
df %>% filter(filter.basic_stat & filter.powerBin) %>%
ggplot(aes(dc_ws_norm,kw_norm,color=power_bin_pnorm))+
geom_point(data=filter(df,!filter.basic_stat | !filter.powerBin),aes(dc_ws_norm,kw_norm),color='gray',alpha=.5)+
geom_point(alpha=.5)+
scale_color_viridis()+
geom_line(data=pc[['powerBin']],aes(ws_bin_real,kw_norm),color='red',lwd=1.5)+
theme_bw()+
scale_x_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
scale_y_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
labs(title='Filtered for Basic Stat Filters and Power Bin')
All right, we’ve got some good candidates, at least for this dataset.
Power Curves Side-by-Side
pc<-bind_rows(pc)
ggplot(pc,aes(ws_bin_real,kw_norm,color=method))+
geom_line(lwd=1.5,alpha=.5)+
theme_bw()+
scale_x_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
scale_y_continuous(breaks=seq(0,1,.1),labels=scales::percent)+
labs(title='Power Curve Comparisons')
More to come here. Thanks for reading!
Share this post
Twitter
Google+
Facebook
Reddit
LinkedIn
StumbleUpon
Pinterest
Email