########################## Correlation between continuous traits ################################# # read 'ape’, ‘geiger’, and ‘caper’ packages library(ape) library(geiger) library(caper) # set working directory setwd("/Users/jcsantos/Desktop/R_class_winter_2015_home/0_Bio559R_course_final_files/week_11_office_2/data") # 32) our starting data bird_pruned_tree <- read.tree("bird_pruned_MB.newick") bird_pruned_tree birds_traits_data <- read.table("bird_data_pruned.txt", header = TRUE, sep = "\t") # 33) Let’s get a nice mixture of continuous and discrete variables: body mass, metabolic rate, migration, flight birds_BayesTraits_temp <- subset(birds_traits_data, select = c(Genus_Species,Mass_g,BMR_kJ_per_h,Migration, Flightless)) # Select this binary data and make it a vector birds_BayesTraits_data <- birds_BayesTraits_temp[complete.cases(birds_BayesTraits_temp),] # only complete cases birds_BayesTraits_data$log_mass <- log10(birds_BayesTraits_data$Mass_g) birds_BayesTraits_data$log_BMR <- log10(birds_BayesTraits_data$BMR_kJ_per_h) birds_BayesTraits_data$Migration <- as.numeric(birds_BayesTraits_data$Migration) birds_BayesTraits_data$Migration[birds_BayesTraits_data$Migration == 1] <- "0" birds_BayesTraits_data$Migration[birds_BayesTraits_data$Migration == 2] <- "1" birds_BayesTraits_data$Flightless <- as.numeric(birds_BayesTraits_data$Flightless) birds_BayesTraits_data$Flightless[birds_BayesTraits_data$Flightless == 1] <- "0" birds_BayesTraits_data$Flightless[birds_BayesTraits_data$Flightless == 2] <- "1" birds_BayesTraits_data_end <- subset(birds_BayesTraits_data, select = c(Genus_Species,Migration, Flightless, log_mass, log_BMR)) head(birds_BayesTraits_data_end) # 34) use the 'name.check' function to determine if we have concordance name.check (bird_pruned_tree, birds_BayesTraits_data_end) # we check the concordance between a data file and a phylogenetic tree. # NOT CONCORDANT # Provides a list of species in tree with no data ($tree_not_data) # 35) We make concordant need to prepare our data using the function 'treedata' birds_continuous_discrete <- treedata(bird_pruned_tree, birds_BayesTraits_data_end) birds_continuous_discrete$phy # a class phylo birds_continuous_discrete$data # a matrix # 36) We are going to prepare our data to contain both continuous and discrete data in a data.frame birds_continuous_discrete_phy <- birds_continuous_discrete$phy birds_continuous_discrete_dat <- birds_continuous_discrete$data birds_continuous_discrete_dat_df <- as.data.frame(birds_continuous_discrete$data) write.table(birds_continuous_discrete_dat_df, file ="birds_continuous_discrete_data.txt", sep ="\t") birds_continuous_discrete_dat_df <- read.table("birds_continuous_discrete_data.txt") str(birds_continuous_discrete_dat_df) # 'data.frame': 399 obs. of 5 variables: # $ Genus_Species: Factor w/ 399 levels "Accipiter_nisus",..: 352 130 42 41 40 248 69 68 194 364 ... # $ Migration : int 0 0 0 0 0 0 0 0 0 0 ... # $ Flightless : int 1 1 1 1 1 0 0 0 0 0 ... # $ log_mass : num 4.97 4.59 3.14 3.4 3.5 ... # $ log_BMR : num 2.32 2.1 1.15 1.28 1.22 ... name.check (birds_continuous_discrete_phy, birds_continuous_discrete_dat_df) # concordant i.e., "OK" ############### Using caper -- Phylogenetic linear models (generalized least squares-GLS) ######################### # 37) We need to make a comparative data object in caper # requires a phylogeny class phylo: birds_migration_flight_phy # requires a data class data.frame with a column with species names: birds_migration_flight_dat_df # we will include the vcv (the variance-covariance) matrix of the phylogeny birds_continuous_discrete_caper <- comparative.data (birds_continuous_discrete_phy, birds_continuous_discrete_dat_df, names.col = Genus_Species, vcv=TRUE, vcv.dim=3) # Comparative dataset of 399 taxa: # Phylogeny: birds_continuous_discrete_phy # 399 tips, 398 internal nodes # chr [1:399] "Nothoprocta_perdicaria" "Apteryx_australis" "Apteryx_haastii" "Apteryx_owenii" ... # VCV matrix present: # VCV.array [1:399, 1:399, 1:29] 89.2 30.3 30.3 30.3 30.3 ... # Data: birds_continuous_discrete_dat_df # $ Migration : int [1:399] 0 0 0 0 0 0 0 0 0 0 ... # $ Flightless: int [1:399] 0 1 1 1 1 1 0 0 0 0 ... # $ log_mass : num [1:399] 2.66 3.5 3.4 3.14 4.59 ... # $ log_BMR : num [1:399] 0.803 1.22 1.279 1.153 2.101 ... # 38) determine the association between log_mass and log_BMR (two continuous traits) using pgls # lambda log_BMR_log_mass_lambda <- pgls(log_BMR ~ log_mass, birds_continuous_discrete_caper, lambda='ML') summary(log_BMR_log_mass_lambda) # Call: # pgls(formula = log_BMR ~ log_mass, data = birds_continuous_discrete_caper, lambda = "ML") # Residuals: # Min 1Q Median 3Q Max # -0.037447 -0.007895 -0.000579 0.008341 0.038068 # Branch length transformations: # kappa [Fix] : 1.000 # lambda [ ML] : 0.729 # lower bound : 0.000, p = < 2.22e-16 # upper bound : 1.000, p = < 2.22e-16 # 95.0% CI : (0.614, 0.818) # delta [Fix] : 1.000 # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) -1.029599 0.060232 -17.094 < 2.2e-16 *** # log_mass 0.700371 0.011672 60.007 < 2.2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 0.01218 on 397 degrees of freedom # Multiple R-squared: 0.9007, Adjusted R-squared: 0.9004 # F-statistic: 3601 on 1 and 397 DF, p-value: < 2.2e-16 # lambda = 0 (no signal) log_BMR_log_mass_lambda_0 <- pgls(log_BMR ~ log_mass, birds_continuous_discrete_caper, lambda=0.0001) summary(log_BMR_log_mass_lambda_0) # Call: # pgls(formula = log_BMR ~ log_mass, data = birds_continuous_discrete_caper, # lambda = 1e-04) # Residuals: # Min 1Q Median 3Q Max # -0.067915 -0.005525 -0.000172 0.005657 0.058366 # Branch length transformations: # kappa [Fix] : 1.000 # lambda [Fix] : 0.000 # delta [Fix] : 1.000 # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) -0.8564504 0.0167448 -51.147 < 2.2e-16 *** # log_mass 0.6588844 0.0078372 84.071 < 2.2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 0.011 on 397 degrees of freedom # Multiple R-squared: 0.9468, Adjusted R-squared: 0.9467 # F-statistic: 7068 on 1 and 397 DF, p-value: < 2.2e-16 # lambda and kappa log_BMR_log_mass_lambda_kappa <- pgls(log_BMR ~ log_mass, birds_continuous_discrete_caper, lambda='ML', kappa = 'ML') summary(log_BMR_log_mass_lambda_kappa) # Call: # pgls(formula = log_BMR ~ log_mass, data = birds_continuous_discrete_caper, lambda = "ML", kappa = "ML") # Residuals: # Min 1Q Median 3Q Max # -0.0246390 -0.0041248 0.0008323 0.0052119 0.0193141 # Branch length transformations: # kappa [ ML] : 1.319 # lower bound : 0.000, p = < 2.22e-16 # upper bound : 3.000, p = < 2.22e-16 # 95.0% CI : (1.033, 1.616) # lambda [ ML] : 0.707 # lower bound : 0.000, p = < 2.22e-16 # upper bound : 1.000, p = < 2.22e-16 # 95.0% CI : (0.592, 0.798) # delta [Fix] : 1.000 # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) -1.019644 0.060918 -16.738 < 2.2e-16 *** # log_mass 0.699671 0.011539 60.637 < 2.2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 0.007634 on 397 degrees of freedom # Multiple R-squared: 0.9026, Adjusted R-squared: 0.9023 # F-statistic: 3677 on 1 and 397 DF, p-value: < 2.2e-16 # lambda, kappa and delta log_BMR_log_mass_lambda_kappa_delta <- pgls(log_BMR ~ log_mass, birds_continuous_discrete_caper, lambda='ML', kappa = 'ML', delta = 'ML') summary(log_BMR_log_mass_lambda_kappa_delta) # Call: # pgls(formula = log_BMR ~ log_mass, data = birds_continuous_discrete_caper, lambda = "ML", kappa = "ML", delta = "ML") # Residuals: # Min 1Q Median 3Q Max # -0.0210659 -0.0048325 -0.0000999 0.0036841 0.0196281 # Branch length transformations: # kappa [ ML] : 1.315 # lower bound : 0.000, p = < 2.22e-16 # upper bound : 3.000, p = < 2.22e-16 # 95.0% CI : (1.040, 1.598) # lambda [ ML] : 0.716 # lower bound : 0.000, p = < 2.22e-16 # upper bound : 1.000, p = < 2.22e-16 # 95.0% CI : (0.604, 0.804) # delta [ ML] : 1.062 # lower bound : 0.000, p = 1.0284e-09 # upper bound : 3.000, p = 4.9347e-08 # 95.0% CI : (0.480, 1.703) # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) -1.017455 0.058926 -17.267 < 2.2e-16 *** # log_mass 0.699455 0.011533 60.648 < 2.2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 0.006362 on 397 degrees of freedom # Multiple R-squared: 0.9026, Adjusted R-squared: 0.9023 # F-statistic: 3678 on 1 and 397 DF, p-value: < 2.2e-16 # 39) plot profile in a tree transformation parameters mod.l <- pgls.profile(log_BMR_log_mass_lambda_kappa_delta, 'lambda') mod.k <- pgls.profile(log_BMR_log_mass_lambda_kappa_delta, 'kappa') mod.d <- pgls.profile(log_BMR_log_mass_lambda_kappa_delta, 'delta') par(mfrow=c(2,2)) plot(mod.l) plot(mod.d) plot(mod.k) # 40) model comparison: you can use anova or aic. It is better if branch length transformations # were the same AIC(log_BMR_log_mass_lambda,log_BMR_log_mass_lambda_0, log_BMR_log_mass_lambda_kappa, log_BMR_log_mass_lambda_kappa_delta) # df AIC # log_BMR_log_mass_lambda 2 -751.6870 # log_BMR_log_mass_lambda_0 2 -555.9929 # log_BMR_log_mass_lambda_kappa 2 -756.3216 # log_BMR_log_mass_lambda_kappa_delta 2 -756.3368 <-- lowest AIC score # 41) model diagnostic # The first two show the fit of the residuals to a normal distribution: # a density plot of the distribution of the residuals and a normal Q-Q plot # the distribution of the residuals against their expected distribution under a normal distribution. # The second two show the fitted values against both the residuals and the observed value to look # for pattern in the residuals within the model. par(mfrow=c(2,2)) plot(log_BMR_log_mass_lambda_kappa_delta) ############### Using caper -- Phylogenetic independent contrasts (PICs) ######################### This is probably the most basic approach in comparative methods. # 42) As usual, we need to prepare the data for PICs birds_continuous_discrete_caper_pics <- comparative.data (birds_continuous_discrete_phy, birds_continuous_discrete_dat_df, names.col = Genus_Species) birds_continuous_discrete_caper_pics # Comparative dataset of 399 taxa: # Phylogeny: birds_continuous_discrete_phy # 399 tips, 398 internal nodes # chr [1:399] "Nothoprocta_perdicaria" "Apteryx_australis" "Apteryx_haastii" "Apteryx_owenii" ... # Data: birds_continuous_discrete_dat_df # $ Migration : int [1:399] 0 0 0 0 0 0 0 0 0 0 ... # $ Flightless: int [1:399] 0 1 1 1 1 1 0 0 0 0 ... # $ log_mass : num [1:399] 2.66 3.5 3.4 3.14 4.59 ... # $ log_BMR : num [1:399] 0.803 1.22 1.279 1.153 2.101 ... # 43) We use the 'crunch' function original from the CAIC package (Purvis and Rambaut, 1995b), # based on the method of Pagel (1992) to calculate contrasts at polytomies. birds_log_BMR_log_mass_pic <- crunch(log_BMR ~ log_mass, data=birds_continuous_discrete_caper) summary(birds_log_BMR_log_mass_pic) # Call: # lm(log_BMR ~ log_mass - 1, data = contrData) # Residuals: # Min 1Q Median 3Q Max # -1.73020 0.00630 0.02239 0.04251 0.24862 # Coefficients: # Estimate Std. Error t value Pr(>|t|) # log_mass 0.26157 0.06007 4.355 1.7e-05 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 0.09657 on 397 degrees of freedom # Multiple R-squared: 0.04559, Adjusted R-squared: 0.04318 # F-statistic: 18.96 on 1 and 397 DF, p-value: 1.699e-05 # 44) We can also run PICs diagnostics for outliers and influencing contrasts, # Three types of diagnostics are available and all should be non-significant # NV or estimated Nodal Value: This tests if there is no relationship between the # magnitude of estimated contrasts and the estimated nodal value. # SD or estimated Standard Deviation at the Node: Under PICs, we assume that variance # accumulates in continuous characters equally along all branches and that the amount # of variation is proportional to branch length. The square root of the expected variance # calculated at each node can therefore be used to standardize the absolute diference # (`raw contrasts') between sister taxa. If this standardization is successful, # there should be no relationship between absolute values of standardized contrasts and the # estimated standard deviation. # AGE or Age of the node: If the phylogeny is ultrametric, then the node age, # as time from the present, provides a further test of the effectiveness of the # standardization of the contrasts. # If significant, we better use PGLS or transform branch lengths to make the test non-significant par(mfrow=c(2,2)) caic.diagnostics(birds_log_BMR_log_mass_pic) # log_mass : # Estimate Std. Error t value Pr(>|t|) # NV 0.0242598 0.0038012 6.3822 4.887e-10 *** # SD -0.0025444 0.0013367 -1.9035 0.05769 . # AGE -0.0105206 0.0025408 -4.1407 4.234e-05 *** # --- # Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #45) robust crunch with its diagnostics birds_log_BMR_log_mass_pic <- crunch(log_BMR ~ log_mass, data=birds_continuous_discrete_caper) birds_log_BMR_log_mass_pic_Robust <- caic.robust(birds_log_BMR_log_mass_pic) summary(birds_log_BMR_log_mass_pic_Robust) # Call: # lm(log_BMR ~ log_mass - 1, data = contrData) # Residuals: # Min 1Q Median 3Q Max # -0.079619 -0.013141 0.000365 0.012593 0.222263 # Coefficients: # Estimate Std. Error t value Pr(>|t|) # log_mass 0.70845 0.01708 41.48 <2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 0.0256 on 396 degrees of freedom # Multiple R-squared: 0.8129, Adjusted R-squared: 0.8124 # F-statistic: 1721 on 1 and 396 DF, p-value: < 2.2e-16 par(mfrow=c(2,2)) caic.diagnostics(birds_log_BMR_log_mass_pic_Robust, outlier=2) # Excluding 1 contrast with absolute studentised residuals > 3 # log_mass : # Estimate Std. Error t value Pr(>|t|) # NV 0.0225679 0.0033466 6.7435 5.507e-11 *** # SD -0.0010300 0.0011953 -0.8617 0.3894 # AGE -0.0027373 0.0024309 -1.1260 0.2608 # --- # Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1