Run Chart Check 16 Jan 2015
Kevin Little, Ph.D.
Friday, January 16, 2015
This R script uses simulation to estimate the probability of seeing k consecutive values above or below the median in a series of independent observations of length n. I model observations above the median with the value +1 and observations below the median with the value -1. I discuss the application of the simulation in the blog post dated 2 February 2015, here.
set.seed(1234
#Functions needed for the simulation
#Function 1: vec_maker
#make a vector of n/2 1's and n/2 -1s. Input value nl should be even vec_maker <- function(nl){ vec0 <- c(rep(1,nl/2),rep(-1,nl/2)) }
#Function 2: vec_permute
#take a vector of nl/2 1's and nl/2 -1s and randomly permute the entries. vec_permute <- function(vec_input,nl){ vec1 <- sample(vec_input,nl,replace=FALSE) }
#Function 3: df_maker_perm
#make a data frame with a given number of columns, each a permuted vector of a base vector with exactly
#n1/2 +1s and nl/2 -1s. df_maker_perm <- function(num_of_cols,nl) { base_vec <- vec_maker(nl) df1 <- vec_permute(base_vec,nl) for(i in 2:num_of_cols){ df1 <- cbind.data.frame(df1,vec_permute(base_vec,nl)) } names(df1) <- paste0("v",c(1:num_of_cols)) return(df1) }
#Function 4: count_one_sid
#takes a vector of +1s and -1s and counts the number of times there are more than
#k-1 consecutive -1s or +1s count_one_side <- function(x2,k) { n <- length(x2) index <- 0 for(j in 1:(n-k+1)) { vsub <- x2[j:(j+k-1)] if(abs(sum(vsub))==k) { index <- 1+index } } return(index) }
#Function 5: count_non_zero
#function to convert the runs count vector to just presence or absence of at least one run of at least length k count_non_zero <-function(x){ if(x>0){ cnz <- 1 } else { cnz <- 0 } }
#Function 6: pct_trials out
#function to output a result of ntrials with vectors of length nl and
#length k_one_side consecutive valus on one side of the median.
#Code uses the fact that a dataframe is a list of columns and so we use lapply function to apply the
#count runs function to the columns and assemble the answer into a vector. Then use count_non_zero and
#sapply to convert any non-zero count of runs to the value 1 to compute the percent of vectors with at
#least one run of length k pct_trials_out <- function(ntrials,nl,k_one_side) { dfx <- df_maker_perm(ntrials,nl) count_all <- unlist((lapply(dfx,count_one_side,k=k_one_side))) count_gt0 <- sapply(count_all,count_non_zero) pct_out <- sum(count_gt0)/ntrials }
#Let the series have nl = 20 values and we want to estimate the chances of seeing k_one_side = 6
#consecutive values. p_six_20 <- pct_trials_out(10000,20,6)
p_six_20
## [1] 0.1122
#Check the example at http://mathworld.wolfram.com/Run.html that calculates the probability of seeing six
#consecutive red cards or six consecutive black cards in a well-shuffled deck of 52 cards. This is identical
#to seeing a run of six consecutive values above the median or below the median in a series of 52 independent
#observations that have zero probability of any two values being identical (no ties). prob_six_red_or_black <- pct_trials_out(10000,52,6)
prob_six_red_or_black
## [1] 0.4665
#compare to value shown on mathematica site: 0.46424...