# Title: Peer Group Analysis Generate Synthetic Dataset # Author: David Weston Birkbeck College, University of London # Date: January 2013 # This software is provided "as is" and licenced under GPL2, see # http://www.r-project.org/Licenses/GPL-2 # If you use this software we ask that you include the following citation: # Weston DJ, Hand DJ, Adams NM, Whitrow C and Juszczak P. Plastic card fraud detection using peer # group analysis. Advances in Data Analysis and Classification, 2(1), (2008), 45-62 ##### Peer Group Demo Preliminaries - Generate a simple dataset ##### # Create 50 times series from 2 peer groups, # where each peer group follows a random walk. # A time series is constructed by adding Gaussian noise to the position on a random walk. # All time series from the same peer group share the same random walk. #### Generate p-dimensional random walk #### pD.Random.Walk <- function(start.loc, n, max.step.size){ p= dim(start.loc)[2]; x<-matrix(0, nrow = n, ncol=p) r<-apply(x,2,OneD.Random.Walk,max.step.size=max.step.size) r<-sweep(r,2,start.loc,'+') return(r) } OneD.Random.Walk <- function (x, start.value=0, max.step.size=1){ n = length(x) return(start.value + cumsum(runif(n, min=-max.step.size, max=max.step.size))) } GenerateSyntheticData <- function(m,n,p, display.graphs = TRUE){ start.loc1<-matrix(rep(10,p), nrow=1); # start location of first peer group peer1 <- pD.Random.Walk(start.loc1,n,1); start.loc2<-matrix(rep(0,p), nrow=1); peer2 <- pD.Random.Walk(start.loc2,n,1); #Generate the 3 dimensional matrix of time-aligned time series x <-array(0, dim = c(m,n,p), dimnames = c('time series ID','observation time','observation feature')); # First half of time series are from peer group 1 # Standard deviation is 0.1 for(ct in 1:(m/2)){ x[ct,,]<-peer1+matrix(rnorm(p*n,0,0.1),nrow = n,ncol=p); } #Second half of time-series are from peer group 2 for(ct in ((m/2)+1):m){ x[ct,,]<-peer2+matrix(runif(p*n,0,0.1),nrow = n,ncol=p); } if (display.graphs){ # Display the two random walks. Show the only the first dimension x11() plot(peer1[, 1],type="b",ylim=range(c(peer1[, 1], peer2[, 1])),xlab="", ylab="", main = "Two Random Walks") par(new=TRUE) plot(peer2[, 1],type="b", ylim=range(c(peer1[, 1], peer2[, 1])), axes=FALSE, xlab="", ylab="") if (p>1){ # Display the two random walks in 2D (using the first 2 dimensions) if p >=2. x11() plot(peer1[, 1],peer1[, 2],pch=20,xlab="",ylab="",xlim=range(c(peer1[, 1], peer2[, 1])),ylim=range(c(peer1[, 2], peer2[, 2])), main = "Two Random Walks (2D)") par(new=TRUE) plot(peer2[, 1],peer2[, 2],pch=20,xlab="",ylab="",xlim=range(c(peer1[, 1], peer2[, 1])),ylim=range(c(peer1[, 2], peer2[, 2]))) segments(head(peer1[, 1], -1), head(peer1[, 2], -1), tail(peer1[, 1], -1), tail(peer1[,2], -1), col ="blue") segments(head(peer2[, 1], -1), head(peer2[, 2], -1), tail(peer2[, 1], -1), tail(peer2[,2], -1), col ="red") } } names.array <- list(mapply(paste,rep("time series ID = ",m),1:m), mapply(paste,rep("time t = ",n),1:n), mapply(paste,rep("feature ",p),1:p)) names(names.array) <- c('time series ID','observation time','observation feature') attr(x, "dimnames") <- names.array return(x) } DisplayPeerTimeSeries <- function(x,m,n){ #Display peer group and outlier time series x11() for(ct in 2:(m/2)){ plot(x[ct ,1:n,1],type="b", ylim=range(x[ ,1:n,1]), axes=FALSE, xlab="", ylab="",lty=2,pch =20, col="gray") par(new=TRUE) } #Display "outlier" time series plot(x[1 ,1:n,1],type="b", ylim=range(x[ ,1:n,1]), axes=FALSE, xlab="", ylab="",col ="blue" ,main =" Outlier Time Series - Blue") }