# Ancestral Reconstruction using Threshold Models # 1) Make a working directory (e.g., discrete) and select it as your working. setwd("/Users/jcsantos/Desktop/R_class_winter_2015_home/0_Bio559R_course_final_files/week_9_office/data") # 2) Several packages and functions are available for this purpose. library(ape) library(phytools) library(geiger) # 3) Load our trees and data from last class and check concordance: bird_pruned_tree <- read.tree("bird_pruned_MB.newick") bird_pruned_tree birds_traits_data <- read.table("bird_data_pruned.txt", header = TRUE, sep = "\t") birds_traits_data ############ Ancestral Reconstruction for Multistate Discrete traits using Threshold model ################ # 4) only Food birds_food_data <- as.data.frame(birds_traits_data$Food) # Select this binary data and make it a vector row.names(birds_food_data) <- birds_traits_data$Genus_Species birds_food_data # 5) use the 'name.check' function to determine if we have concordance name.check (bird_pruned_tree, birds_food_data) # we check the concordance between a data file and a phylogenetic tree. # [1] "OK" # 6) We make concordant need to prepare our data using the function 'treedata' birds_food <- treedata(bird_pruned_tree, birds_food_data) birds_food$phy # a class phylo birds_food$data # a matrix # 7) We are going to prepare our data for analyses: birds_food_phy <- birds_food$phy # assign phylogeny to object birds_food_dat <- as.character(birds_food$data) # assign data to character vector names(birds_food_dat) <- rownames(birds_food$data) # give names to entries from original data birds_food_dat #birds_food_dat_matrix <- as.matrix(birds_food_dat) #birds_food_dat_matrix # Struthio_camelus Dromaius_novaehollandiae Apteryx_owenii Apteryx_haastii # "omnivore" "omnivore" "carnivore" "carnivore" # 8) We can do ancestral character estimation under the threshold model using 'phytools' ?ancThresh # ancThresh(tree, x, ngen=1000, sequence=NULL, method="mcmc", model=c("BM","OU","lambda"), control=list(), ...) # tree -- phylogeny class phylo # X -- vector containing discrete or continuous character states # ngen -- number of generations for the MCMC, the more the better # model -- model for the evolution of the liability. # control -- is a list of control parameters for the MCMC # 'sample', the sampling interval for the MCMC # other control variables and priors # 9) we are going to set up MCMC parameters and run the model ngen <- 1e+05 # number of generations to run burnin <- 0.2 * ngen # approximate number of initial generations to eliminate sample <- 1000 #sample posterior distribution every 1000 samples # 10) we run the MCMC on our data using the 'ancThresh' and saving it to an mcmc object mcmc_food <- ancThresh(birds_food_phy, birds_food_dat[1:length(birds_food_phy$tip.label)], ngen = ngen, sequence = c("vegetarian", "omnivore", "carnivore"), model = "lambda", control = list(sample = sample, plot = TRUE)) # Requirements # birds_food_phy: is a phylogeny of clas phylo # birds_food_dat[1:length(birds_food_phy$tip.label)]: is a vector of traits in the order of the phylogeny labels # ngen: number of generations # sequence: the exact sequence of trait states order (easier for binary) # model of trait evolution (in this case lambda, see last class) # control: priors and other controls of the mcmc chains # if running correctly this will look like this (but it will take a long time) # MCMC starting.... # gen 1000 # gen 2000 # gen 3000 # gen 4000 # ... # gen 100000 # 11) Let's explore our mcmc runs by plotting the likelihood profile plot(mcmc_food$par[, "logLik"], type = "l", xlab = "generation", ylab = "logLik") # NOT STATIONARY: We need to run this chain for way more time. ## get the estimates of thresholds of our data colMeans(mcmc_food$par[(0.2 * ngen/sample):(ngen/sample) + 1, c("vegetarian", "omnivore", "carnivore")]) # vegetarian omnivore carnivore # 0.000000 1.213957 Inf # 12) Prepare for plotting ancestral reconstructions # create a matrix of colors matrix_food_colors <-to.matrix(birds_food_dat, c("vegetarian", "omnivore", "carnivore")) # vegetarian omnivore carnivore # Struthio_camelus 0 1 0 # Dromaius_novaehollandiae 0 1 0 # Apteryx_owenii 0 0 1 # Apteryx_haastii 0 0 1 # Apteryx_australis 0 0 1 # 13) CRITICAL STEP FOR THE PLOT: Reorder the matrix to match the phylogeny name order phylogeny_name_order <- as.data.frame (birds_food_phy$tip.label) # creates a data frame for species names in correct order names(phylogeny_name_order) <- c("species_names") # changes the name of the column rownames(phylogeny_name_order) <- phylogeny_name_order$species_names # assigns the names to rows matrix_food_colors_ordered <- matrix_food_colors[rownames(phylogeny_name_order),,drop=FALSE] # reorders matrix names # 14) plot of the ancestral reconstructions plotTree(birds_food_phy, setEnv = TRUE, ftype = "off") tiplabels(pie = matrix_food_colors_ordered, piecol = palette()[1:3], cex = 0.2) nodelabels(pie = mcmc_food$ace, piecol = palette()[1:3], cex = 0.2) ############ Ancestral Reconstruction for Binary using Threshold model ################ # 15) select only a binary variable # only migration birds_migration_temp <- as.data.frame(birds_traits_data$Migration) # Select this binary data and make it a vector birds_migration_temp$species <- birds_traits_data$Genus_Species birds_migration_temp # 16) eliminate incomplete cases and make data frame birds_migration_data <- birds_migration_temp[complete.cases(birds_migration_temp),] row.names(birds_migration_data) <- birds_migration_data$species names(birds_migration_data) <- c("migration", "genus_species") head(birds_migration_data) # migration genus_species # Struthio_camelus No Struthio_camelus # Dromaius_novaehollandiae No Dromaius_novaehollandiae # Apteryx_owenii No Apteryx_owenii # Apteryx_haastii No Apteryx_haastii # Apteryx_australis No Apteryx_australis # Nothoprocta_perdicaria No Nothoprocta_perdicaria # 17) Prepare the data as binary # make factors to characters birds_migration_data$migration <- as.character(birds_migration_data$migration) # make data truly binary (0-no migratory and 1-migratory) birds_migration_data$migration[birds_migration_data$migration == "No"] <- "0" birds_migration_data$migration[birds_migration_data$migration == "Yes"] <- "1" birds_migration_data # 18) use the 'name.check' function to determine if we have concordance name.check (bird_pruned_tree, birds_migration_data) # we check the concordance between a data file and a phylogenetic tree. # NOT CONCORDANT # Provides a list of species in tree with no data ($tree_not_data) # 19) We make concordant need to prepare our data using the function 'treedata' birds_migration <- treedata(bird_pruned_tree, birds_migration_data) birds_migration$phy # a class phylo birds_migration$data # a matrix # We are going to prepare our data for analyses: birds_migration_phy <- birds_migration$phy birds_migration_dat <- as.numeric(birds_migration$data[,1]) names(birds_migration_dat) <- birds_migration$data[,2] birds_migration_dat_matrix <- as.matrix(birds_migration_dat) # 20) We can do ancestral character estimation under the threshold model using 'phytools' # We are going to set up MCMC parameters (based on example by Revell) ngen <- 5e+05 # number of generations to run burnin <- 0.2 * ngen # approximate number of initial generations to eliminate sample <- 1000 #sample posterior distribution every 1000 samples ## we run the MCMC on our data using the 'ancThresh' and saving it to an mcmc object mcmc_migration <- ancThresh(birds_migration_phy, birds_migration_dat[1:length(birds_migration_phy$tip.label)], ngen = ngen, sequence = c("0", "1"), model = "lambda", control = list(sample = sample, plot = FALSE)) # 21) Let's explore our mcmc runs ## plot our likelihood profile plot(mcmc_migration$par[, "logLik"], type = "l", xlab = "generation", ylab = "logLik") # STATIONARY: We need to run this chain for way more time. ## get the estimates of thresholds of our data colMeans(mcmc$par[(0.2 * ngen/sample):(ngen/sample) + 1, c("0", "1")]) # 0 1 # 0.000000 Inf # 22) Prepare for plotting our ancestral reconstructions # create a matrix of colors matrix_migration_colors <-to.matrix(birds_migration_dat, c("non migratory", "migratory")) # 23) CRITICAL STEP FOR PLOT: Reorder the matrix to match the phylogeny name order phylogeny_name_order <- as.data.frame (birds_migration_phy$tip.label) # creates a data frame for species names in correct order names(phylogeny_name_order) <- c("species_names") # changes the name of the column rownames(phylogeny_name_order) <- phylogeny_name_order$species_names # assigns the names to rows matrix_food_colors_ordered <- matrix_food_colors[rownames(phylogeny_name_order),,drop=FALSE] # re orders matrix names # 24) plot the reconstructions plotTree(birds_migration_phy, setEnv = TRUE, ftype = "off") tiplabels(pie = matrix_food_colors_ordered, piecol = palette()[1:2], cex = 0.2) nodelabels(pie = mcmc_migration$ace, piecol = palette()[1:2], cex = 0.2) ### Ancestral Reconstruction for a Discretized Continuous traits using Threshold model ### # 25) Load our trees and data from last class and check concordance: bird_pruned_tree <- read.tree("bird_pruned_MB.newick") bird_pruned_tree birds_traits_data <- read.table("bird_data_pruned.txt", header = TRUE, sep = "\t") birds_traits_data # 26) only log10mass_g birds_traits_data$log10mass <- log10(birds_traits_data$Mass_g) # transform data birds_log10mass_data <- as.data.frame(birds_traits_data$log10mass) # Select log10mass data and make it a vector row.names(birds_log10mass_data) <- birds_traits_data$Genus_Species birds_log10mass_data # 27) use the 'name.check' function to determine if we have concordance name.check (bird_pruned_tree, birds_log10mass_data) # we check the concordance between a data file and a phylogenetic tree. # [1] "OK" # 28) We prepare data using function 'treedata' birds_log_mass <- treedata(bird_pruned_tree, birds_log10mass_data) birds_log_mass$phy # a class phylo birds_log_mass$data # a matrix birds_log_mass_phy <- birds_log_mass$phy birds_log_mass_dat <- as.character(birds_log_mass$data) names(birds_log_mass_dat) <- rownames(birds_log_mass$data) # give names to entries from original data birds_log_mass_dat # 29) Discretizing log_mass_data (a priori thresholds) using Kernel Density Estimation density_birds_log_mass_dat <- density(as.numeric(birds_log_mass_dat)) # returns the density data density_birds_log_mass_dat # Call: # density.default(x = as.numeric(birds_log_mass_dat)) #Data: as.numeric(birds_log_mass_dat) (451 obs.); Bandwidth 'bw' = 0.2005 # x y # Min. :0.1145 Min. :0.0000498 # 1st Qu.:1.4777 1st Qu.:0.0083575 # Median :2.8408 Median :0.0888518 # Mean :2.8408 Mean :0.1832174 # 3rd Qu.:4.2040 3rd Qu.:0.3572858 # Max. :5.5671 Max. :0.5633794 # 30) Estimate modes and plot them in density graph ######################### create a function to find modes ############################### find_modes<- function(x) { modes <- NULL for ( i in 2:(length(x)-1) ) { if ( (x[i] > x[i-1]) & (x[i] > x[i+1]) ) { modes <- c(modes,i) } } if ( length(modes) == 0 ) { modes = 'This is a monotonic distribution' } return(modes) } ############################### run function to find modes ############################### birds_log_mass_dat_modes_indices <- find_modes(density_birds_log_mass_dat$y) birds_log_mass_dat_modes_indices # get the actual modes modes_values <- density(as.numeric(birds_log_mass_dat))$x[birds_log_mass_dat_modes_indices] #[1] 1.266946 2.590082 # plot modes plot(density_birds_log_mass_dat) # plots the results abline( v = modes_values[1], col=4, lty=2) abline( v = modes_values[2], col=4, lty=2) # 32) Input modes as thresholds # we include our modes to discretize our continuous data in order a <- b <- c <- d th_log_mass_dat <- sapply(as.numeric(birds_log_mass_dat), threshState, thresholds = setNames(c(0, 1.266946, 2.590082, Inf), letters[1:4])) names(th_log_mass_dat) <- rownames(birds_log_mass$data) # give names to entries from original data th_log_mass_dat # 33) we are going to set up MCMC parameters and run the model ngen <- 1e+05 # number of generations to run burnin <- 0.2 * ngen # approximate number of initial generations to eliminate sample <- 1000 #sample posterior distribution every 1000 samples # we run the MCMC on our data using the 'ancThresh' and saving it to an mcmc object mcmc_log_mass <- ancThresh(birds_log_mass_phy, th_log_mass_dat[1:length(birds_log_mass_phy$tip.label)], ngen = ngen, sequence = letters[1:4], model = "lambda", control = list(sample = sample, plot = TRUE)) # Requirements # birds_log_mass_phy: is a phylogeny of clas phylo # th_log_mass_dat[1:length(birds_log_mass_phy$tip.label)]: is a vector of traits in the order of the phylogeny labels # ngen: number of generations # sequence: the exact sequence of trait states order (easier for binary) # model of trait evolution (in this case lambda, see last class) # control: priors and other controls of the mcmc chains # if running correctly this will look like this (but it will take a long time) # MCMC starting.... # gen 1000 # gen 2000 # gen 3000 # gen 4000 # ... # gen 100000 # 34) Let's explore our mcmc runs ## plot our likelihood profile plot(mcmc_log_mass$par[, "logLik"], type = "l", xlab = "generation", ylab = "logLik") # STATIONARY # 34) get the estimates of thresholds of our data parameter_means <- colMeans(mcmc_log_mass$par[(0.2 * ngen/sample):(ngen/sample) + 1, letters[1:4]]) parameter_means # "very small-a", "small-b", "medium-c", "large-d") # a b c d # 0.0000000 0.9998883 2.0926211 Inf lambda_mean <- mean(mcmc_log_mass$par$lambda[(0.2 * ngen/sample):(ngen/sample) + 1]) lambda_mean # [1] 0.9989808 # 35) Prepare for plotting ancestral reconstructions # create a matrix of colors matrix_log_mass_dat <-to.matrix(th_log_mass_dat, letters[1:4]) # a b c d # Struthio_camelus 0 0 0 1 # Dromaius_novaehollandiae 0 0 0 1 # Apteryx_owenii 0 0 0 1 # 36) CRITICAL STEP FOR PLOT: Reorder the matrix to match the phylogeny name order phylogeny_name_order <- as.data.frame (birds_log_mass_phy$tip.label) # creates a data frame for species names in correct order names(phylogeny_name_order) <- c("species_names") # changes the name of the column rownames(phylogeny_name_order) <- phylogeny_name_order$species_names # assigns the names to rows matrix_log_mass_ordered <- matrix_log_mass_dat[rownames(phylogeny_name_order),,drop=FALSE] # re orders matrix names matrix_log_mass_ordered # 37) plot the reconstructions plotTree(birds_log_mass_phy, setEnv = TRUE, ftype = "off") tiplabels(pie = matrix_log_mass_ordered, piecol = palette()[1:4], cex = 0.2) nodelabels(pie = mcmc_log_mass$ace, piecol = palette()[1:4], cex = 0.2) # 14) Plot threshold parameter space, preparation library(ggplot2) install.packages("reshape") library(reshape) histogram <- as.data.frame(rbind (mcmc_log_mass$par$a[(0.2 * ngen/sample):(ngen/sample) + 1], mcmc_log_mass$par$b[(0.2 * ngen/sample):(ngen/sample) + 1], mcmc_log_mass$par$c[(0.2 * ngen/sample):(ngen/sample) + 1])) mdata <- melt(histogram) dim(mdata) # 243 rows mdata$phenotypes <- rep.int(c("a", "b", "c"), 81) # name column with phenotypes head(mdata) # variable value phenotypes # 1 V1 0.000000 a # 2 V1 1.000000 b # 3 V1 2.006712 c # 4 V2 0.000000 a # 5 V2 1.000000 b # 6 V2 2.006712 c df <- as.data.frame(parameter_means) ggplot(mdata,aes(x=value)) + geom_histogram(data=subset(mdata,phenotypes == 'a'),fill = "red", alpha = 0.2, position="identity", binwidth =.05) + geom_histogram(data=subset(mdata,phenotypes == 'b'),fill = "blue", alpha = 0.2, position="identity", binwidth =.05) + geom_histogram(data=subset(mdata,phenotypes == 'c'),fill = "green", alpha = 0.2, position="identity", binwidth =.05) + geom_vline(data=df, aes(xintercept=parameter_means),linetype="dashed", size=1)