##### STAT 6640 ASSIGNMENT NO.3 ##### # AUTHOR: HAIBAO TANG # # DATE : FEB-24-2007 # ##################################### # Given a given sequence training dataset, the program calculates # the probability of given test data and outputs the probability # PLEASE RUN "lab1starter.R" FIRST!! ### Global settings # a,g,c,t, corresponds to 1,2,3,4 nuclst <- c(1,2,3,4) names(nuclst) <- c('a','g','c','t') # event logger repTime <- function (message) { cat(paste(date(),":",message,"\n")) } ### Question 1 repTime("Count transitions for nucleotide and store in $counts.") # Initialize counts array to store transitions of each type counts <- array(0, dim=c(r,r,4,4)) # Loops below scan the data sequentially while adding to counts array for (entry in 1:N) { # Only fill the upper triangle where position1 < postion2 for (position1 in 1:(r-1)) { for (position2 in max(1,position1+1):r) { # Retrieve the nucleotide id (1-4) for position1 and position2 nuc1 <- nuclst[data1[entry,position1]] nuc2 <- nuclst[data1[entry,position2]] # Increment the transition cell by one counts[position1,position2,nuc1,nuc2] <- counts[position1,position2,nuc1,nuc2] + 1 } } } ### Question 2 repTime("Calculate chisquare statistics and store in $assocs and $pvals.") # Initialize array to store chisq statistics assocs <- array(0, dim=c(r,r)) # chisq values pvals <- array(0, dim=c(r,r)) # p-values # Loops to scan counts array, upper triangle for (position1 in 1:(r-1)) { for (position2 in max(1,position1+1):r) { chi_test <- chisq.test(counts[position1,position2,,]) assocs[position1,position2] <- chi_test$statistic pvals[position1,position2] <- chi_test$p.value } } ### Question 3 repTime("Identify most influential position $wmax.") wmax <- 0 # position with largest effect assocmax <- 0 for (j in 1:r) { # Sum both rows and cols for a particular position in assocs rowsum <- sum(assocs[j,]) + sum(assocs[,j]) if (rowsum > assocmax) {wmax <- j; assocmax <- rowsum} } ### Question 4 repTime("Generate position weight matrices $weight(1-4).") # Generate model consists of weight matrices of positions dependent on wmax # weight1,2,3,4 represent a,g,c,t at wmax position weights <- array(0, dim=c(4,4,r)) # Scan the sequences again for (entry in 1:N) { nuc_h <- nuclst[data1[entry,wmax]] # nucleotide at wmax position for (position in 1:r) { nuc_p <- nuclst[data1[entry,position]] weights[nuc_h,nuc_p,position] <- weights[nuc_h,nuc_p,position]+1 } } # We need frequencies rather than counts weights <- weights weight1 <- weights[1,,]/weights[1,1,wmax] weight2 <- weights[2,,]/weights[2,2,wmax] weight3 <- weights[3,,]/weights[3,3,wmax] weight4 <- weights[4,,]/weights[4,4,wmax] prob_h <- c(weights[1,1,wmax],weights[2,2,wmax],weights[3,3,wmax],weights[4,4,wmax])/N weights[1,,] <- weight1 weights[2,,] <- weight2 weights[3,,] <- weight3 weights[4,,] <- weight4 ### Question 5 repTime("Calculate probability for testing data.") # A local procedure that accepts a sequence and outputs the probability signalProb <- function (seq, weights, prob_h, wmax) { seq <- tolower(seq) seq <- strsplit(seq,'')[[1]] # Check 1)equal length with the training set; 2)valid DNA seq seq_length <- length(seq) if (seq_length!=dim(weights)[3]) stop(paste("Input length (",seq_length,") does not match.")) for (s in seq) { if (!(s %in% names(nuclst))) stop(paste("Input sequence contains invalid symbol",s)) } nuc_h <- nuclst[seq[wmax]] # nucleotide at wmax position weighth <- weights[nuc_h,,] # one of weight(1-4) to use as weight matrix # The final prob. is the product of weights prob <- prob_h[nuc_h] for (s in 1:seq_length) prob <- prob*weighth[nuclst[seq[s]],s] prob # return value } # MODIFY THIS LINE TO TEST DIFFERENT DATA testdata <- c("aggctttgaa","cttgatagac") for (d in testdata) cat(paste("P(",d,") =",signalProb(d,weights,prob_h,wmax),'\n')) ### DONE