# Helm et al. - Characterization of differential transcript abundance through time during Nematostella vectensis development # this file contains R scripts and regular expressions used to build the full data matrix #----------------------------------------------------------------------------------------- # Loading count files into R. The count files were generated using the # in-house script bowtie_map_to_counts.py, available as an additional file a20 <- read.table("20_2.counts", header = TRUE) a21<- read.table("21-7.counts", header = TRUE) a22<- read.table("22-12.counts", header = TRUE) a23<- read.table("23-24.counts", header = TRUE) a24<- read.table("24-5.counts", header = TRUE) a25<- read.table("25-10.counts", header = TRUE) b40<- read.table("40-2.counts", header = TRUE) b41<- read.table("41-6.counts", header = TRUE) b42<- read.table("42-12.counts", header = TRUE) b43<- read.table("43-24.counts", header = TRUE) b44<- read.table("44-5.counts", header = TRUE) b45<- read.table("45-10.counts", header = TRUE) ### merge data ### Md <- merge(a20, b40, by.x="reference", by.y="reference", all=TRUE) names(Md) <- c("reference", "count_rep1_2h", "count_rep2_2h") Md <- merge(Md, a21, by.x="reference", by.y="reference", all=TRUE) names(Md) <- c("reference", "count_rep1_2h", "count_rep2_2h", "count_rep1_7h") Md <- merge(Md, b41, by.x="reference", by.y="reference", all=TRUE) names(Md) <- c("reference", "count_rep1_2h", "count_rep2_2h", "count_rep1_7h", "count_rep2_7h") Md <- merge(Md, a22, by.x="reference", by.y="reference", all=TRUE) names(Md) <- c("reference", "count_rep1_2h", "count_rep2_2h", "count_rep1_7h", "count_rep2_7h", "count_rep1_12h") Md <- merge(Md, b42, by.x="reference", by.y="reference", all=TRUE) names(Md) <- c("reference", "count_rep1_2h", "count_rep2_2h", "count_rep1_7h", "count_rep2_7h", "count_rep1_12h", "count_rep2_12h") Md <- merge(Md, a23, by.x="reference", by.y="reference", all=TRUE) names(Md) <- c("reference", "count_rep1_2h", "count_rep2_2h", "count_rep1_7h", "count_rep2_7h", "count_rep1_12h", "count_rep2_12h", "count_rep1_24h") Md <- merge(Md, b43, by.x="reference", by.y="reference", all=TRUE) names(Md) <- c("reference", "count_rep1_2h", "count_rep2_2h", "count_rep1_7h", "count_rep2_7h", "count_rep1_12h", "count_rep2_12h", "count_rep1_24h", "count_rep2_24h") Md <- merge(Md, a24, by.x="reference", by.y="reference", all=TRUE) names(Md) <- c("reference", "count_rep1_2h", "count_rep2_2h", "count_rep1_7h", "count_rep2_7h", "count_rep1_12h", "count_rep2_12h", "count_rep1_24h", "count_rep2_24h", "count_rep1_5d") Md <- merge(Md, b44, by.x="reference", by.y="reference", all=TRUE) names(Md) <- c("reference", "count_rep1_2h", "count_rep2_2h", "count_rep1_7h", "count_rep2_7h", "count_rep1_12h", "count_rep2_12h", "count_rep1_24h", "count_rep2_24h", "count_rep1_5d", "count_rep2_5d") Md <- merge(Md, a25, by.x="reference", by.y="reference", all=TRUE) names(Md) <- c("reference", "count_rep1_2h", "count_rep2_2h", "count_rep1_7h", "count_rep2_7h", "count_rep1_12h", "count_rep2_12h", "count_rep1_24h", "count_rep2_24h", "count_rep1_5d", "count_rep2_5d", "count_rep1_10d") Md <- merge(Md, b45, by.x="reference", by.y="reference", all=TRUE) names(Md) <- c("transcript", "count_rep1_2h", "count_rep2_2h", "count_rep1_7h", "count_rep2_7h", "count_rep1_12h", "count_rep2_12h", "count_rep1_24h", "count_rep2_24h", "count_rep1_5d", "count_rep2_5d", "count_rep1_10d", "count_rep2_10d") # Note at the names call, the word “reference” is changed to the word “transcript” rownames(Md) <- Md[,1] ### creating RPM and pruning data in R ### # get rid of non-numeric column names: n <- data.matrix(Md[,2:13]) # set NAs to 0 n[is.na(n)] <- 0 # load edgeR library(edgeR) nf <- calcNormFactors(n) lib.size <- colSums(n) lib.effective.size <- lib.size * nf norm.multiplier <- 1000000 / lib.effective.size # Create the normalized matrix q <- n * norm.multiplier # Rename the columns colnames(q) <- c("normalized_count_rep1_2h","normalized_count_rep2_2h","normalized_count_rep1_7h","normalized_count_rep2_7h","normalized_count_rep1_12h","normalized_count_rep2_12h","normalized_count_rep1_24h","normalized_count_rep2_24h","normalized_count_rep1_5d","normalized_count_rep2_5d","normalized_count_rep1_10d","normalized_count_rep2_10d") #These data should not be used as inpute into edgeR for future analysis, but instead can be used to plot, or to average for STEM analysis (below) # Bind the normalized counts to the dataframe Md2 <- cbind(Md,q) # Need to get rid of NAs: Md2[is.na(Md2)] <- 0 ### Creating average normalized data for STEM analysis ### average_normalized_count_2h <- (Md2$normalized_count_rep1_2h+Md2$normalized_count_rep2_2h)/2 average_normalized_count_7h <- (Md2$normalized_count_rep1_7h+Md2$normalized_count_rep2_7h)/2 average_normalized_count_12h <- (Md2$normalized_count_rep1_12h+Md2$normalized_count_rep2_12h)/2 average_normalized_count_24h <- (Md2$normalized_count_rep1_24h+Md2$normalized_count_rep2_24h)/2 average_normalized_count_5d <- (Md2$normalized_count_rep1_5d+Md2$normalized_count_rep2_5d)/2 average_normalized_count_10d <- (Md2$normalized_count_rep1_10d+Md2$normalized_count_rep2_10d)/2 avgMd2<- cbind(average_normalized_count_2h,average_normalized_count_7h,average_normalized_count_12h,average_normalized_count_24h,average_normalized_count_5d,average_normalized_count_10d) rownames(avgMd2) <- Md2[,1] Md3 <- merge(Md2, avgMd2, by.x="transcript", by.y="row.names", all=TRUE) ### Adding Blast data ### G <- read.table("hit_definitions_all.txt", header=TRUE, sep="\t") g <- G rownames(g) <-G[,1] B <- merge(Md3, g, by.x = "transcript", by.y = "Reference", all=TRUE) ### adding cluster data ### C <- read.table("reference_names_clusters.txt", header=TRUE) Z <- merge(B, C, by.x= "transcript", by.y = "reference", all=TRUE) ### output of STEM analysis, requiering both R and regular expressions (in the program TextWrangler) ### # Building STEM analysis in R # average <- Z[,26:31] rownames(average) <- Z[,1] write.table(average, "average") ### regular expression to format in TextWranger ### # SEARCH REPLACE # " # + \t # NA 0 # added the word “transcript”, to the column names ### launch STEM and build plots ### # At the command line: # cd /Applications/stem # java -mx1024M -jar stem.jar # Run on the standard settings ### edit STEM output file with Regular expressions ### # SEARCH REPLACE # ^\t # Selected # \tID_\d+\t([\d+;]+).* \t\1 # text > change case > all lower ### Getting STEM data into matrix with R ### stem <- read.table("stem_plot_data", header = TRUE) Zlower <- tolower(Z$transcript) Zlower <- as.data.frame(Zlower) rownames(Z) <- Zlower[,1] colnames(Zs)[1] <- "lowercase_transcript" Zs <- merge(Z, stem, by.x= "row.names", by.y = "lowercase_transcript", all=TRUE) #lowercase_transcript was a temporary column to merge STEM data. # Saving full database in R: write.table(Zs, "matrix.R") #For the statistical analysis using edgeR see file #20121213_Helm_et_al_exactTest_goseq.R# #We provide a matrix file (Additional_file_7_matrix.txt) that contains data described above and the results of the statistical analysis. #The three ribosomal sequences (18S,28S, mitochondrial genome) used in the mapping were added to the final matrix.