########################## Correlation between continuous traits #################################

# read 'ape’, ‘geiger’, and ‘caper’ packages

library(ape)
library(geiger)
library(caper)

# set working directory
setwd("/Users/jcsantos/Desktop/R_class_winter_2015_home/0_Bio559R_course_final_files/week_11_office_2/data")

# 32) our starting data

bird_pruned_tree <- read.tree("bird_pruned_MB.newick")
bird_pruned_tree
birds_traits_data <- read.table("bird_data_pruned.txt", header = TRUE, sep = "\t")

# 33) Let’s get a nice mixture of continuous and discrete variables: body mass, metabolic rate, migration, flight

birds_BayesTraits_temp <- subset(birds_traits_data, select = c(Genus_Species,Mass_g,BMR_kJ_per_h,Migration, Flightless)) # Select this binary data and make it a vector
birds_BayesTraits_data <- birds_BayesTraits_temp[complete.cases(birds_BayesTraits_temp),] # only complete cases
birds_BayesTraits_data$log_mass <- log10(birds_BayesTraits_data$Mass_g)
birds_BayesTraits_data$log_BMR <- log10(birds_BayesTraits_data$BMR_kJ_per_h)
birds_BayesTraits_data$Migration <- as.numeric(birds_BayesTraits_data$Migration)
birds_BayesTraits_data$Migration[birds_BayesTraits_data$Migration == 1] <- "0"
birds_BayesTraits_data$Migration[birds_BayesTraits_data$Migration == 2] <- "1"
birds_BayesTraits_data$Flightless <- as.numeric(birds_BayesTraits_data$Flightless)
birds_BayesTraits_data$Flightless[birds_BayesTraits_data$Flightless == 1] <- "0"
birds_BayesTraits_data$Flightless[birds_BayesTraits_data$Flightless == 2] <- "1"
birds_BayesTraits_data_end <- subset(birds_BayesTraits_data, select = c(Genus_Species,Migration, Flightless, log_mass, log_BMR))


head(birds_BayesTraits_data_end)

# 34) use the 'name.check' function to determine if we have concordance

name.check (bird_pruned_tree, birds_BayesTraits_data_end) # we check the concordance between a data file and a phylogenetic tree.
# NOT CONCORDANT
# Provides a list of species in tree with no data ($tree_not_data)

# 35) We make concordant need to prepare our data using the function 'treedata'

birds_continuous_discrete <- treedata(bird_pruned_tree, birds_BayesTraits_data_end)
birds_continuous_discrete$phy # a class phylo
birds_continuous_discrete$data # a matrix

# 36) We are going to prepare our data to contain both continuous and discrete data in a data.frame 

birds_continuous_discrete_phy <- birds_continuous_discrete$phy
birds_continuous_discrete_dat <- birds_continuous_discrete$data
birds_continuous_discrete_dat_df <- as.data.frame(birds_continuous_discrete$data)
write.table(birds_continuous_discrete_dat_df, file ="birds_continuous_discrete_data.txt", sep ="\t")
birds_continuous_discrete_dat_df <- read.table("birds_continuous_discrete_data.txt")
str(birds_continuous_discrete_dat_df)

# 'data.frame':	399 obs. of  5 variables:
# $ Genus_Species: Factor w/ 399 levels "Accipiter_nisus",..: 352 130 42 41 40 248 69 68 194 364 ...
# $ Migration    : int  0 0 0 0 0 0 0 0 0 0 ...
# $ Flightless   : int  1 1 1 1 1 0 0 0 0 0 ...
# $ log_mass     : num  4.97 4.59 3.14 3.4 3.5 ...
# $ log_BMR      : num  2.32 2.1 1.15 1.28 1.22 ...

name.check (birds_continuous_discrete_phy, birds_continuous_discrete_dat_df) # concordant i.e., "OK"

###############    Using caper -- Phylogenetic linear models (generalized least squares-GLS) #########################


# 37) We need to make a comparative data object in caper

# requires a phylogeny class phylo: birds_migration_flight_phy
# requires a data class data.frame with a column with species names: birds_migration_flight_dat_df
# we will include the vcv (the variance-covariance) matrix of the phylogeny

birds_continuous_discrete_caper <- comparative.data (birds_continuous_discrete_phy, birds_continuous_discrete_dat_df, names.col = Genus_Species, vcv=TRUE, vcv.dim=3)

#  Comparative dataset of 399 taxa:
# Phylogeny: birds_continuous_discrete_phy 
#   399 tips, 398 internal nodes
#   chr [1:399] "Nothoprocta_perdicaria" "Apteryx_australis" "Apteryx_haastii" "Apteryx_owenii" ...
# VCV matrix present:
#   VCV.array [1:399, 1:399, 1:29] 89.2 30.3 30.3 30.3 30.3 ...
# Data: birds_continuous_discrete_dat_df 
#   $ Migration : int [1:399] 0 0 0 0 0 0 0 0 0 0 ...
#   $ Flightless: int [1:399] 0 1 1 1 1 1 0 0 0 0 ...
#   $ log_mass  : num [1:399] 2.66 3.5 3.4 3.14 4.59 ...
#   $ log_BMR   : num [1:399] 0.803 1.22 1.279 1.153 2.101 ...

# 38) determine the association between log_mass and log_BMR (two continuous traits) using pgls


# lambda

log_BMR_log_mass_lambda <- pgls(log_BMR ~ log_mass, birds_continuous_discrete_caper, lambda='ML')
summary(log_BMR_log_mass_lambda)

# Call:
# pgls(formula = log_BMR ~ log_mass, data = birds_continuous_discrete_caper, lambda = "ML")

# Residuals:
#       Min        1Q    Median        3Q       Max 
# -0.037447 -0.007895 -0.000579  0.008341  0.038068 

# Branch length transformations:

# kappa  [Fix]  : 1.000
# lambda [ ML]  : 0.729
#   lower bound : 0.000, p = < 2.22e-16
#   upper bound : 1.000, p = < 2.22e-16
#   95.0% CI   : (0.614, 0.818)
# delta  [Fix]  : 1.000

# Coefficients:
#             Estimate Std. Error t value  Pr(>|t|)    
# (Intercept) -1.029599   0.060232 -17.094 < 2.2e-16 ***
# log_mass     0.700371   0.011672  60.007 < 2.2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 0.01218 on 397 degrees of freedom
# Multiple R-squared: 0.9007,	Adjusted R-squared: 0.9004 
# F-statistic:  3601 on 1 and 397 DF,  p-value: < 2.2e-16 


# lambda = 0 (no signal)

log_BMR_log_mass_lambda_0 <- pgls(log_BMR ~ log_mass, birds_continuous_discrete_caper, lambda=0.0001)
summary(log_BMR_log_mass_lambda_0)


# Call:
# pgls(formula = log_BMR ~ log_mass, data = birds_continuous_discrete_caper, 
#    lambda = 1e-04)

# Residuals:
#       Min        1Q    Median        3Q       Max 
# -0.067915 -0.005525 -0.000172  0.005657  0.058366 

# Branch length transformations:

# kappa  [Fix]  : 1.000
# lambda [Fix]  : 0.000
# delta  [Fix]  : 1.000

# Coefficients:
#              Estimate Std. Error t value  Pr(>|t|)    
# (Intercept) -0.8564504  0.0167448 -51.147 < 2.2e-16 ***
# log_mass     0.6588844  0.0078372  84.071 < 2.2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 0.011 on 397 degrees of freedom
# Multiple R-squared: 0.9468,	Adjusted R-squared: 0.9467 
# F-statistic:  7068 on 1 and 397 DF,  p-value: < 2.2e-16 

# lambda and kappa

log_BMR_log_mass_lambda_kappa <- pgls(log_BMR ~ log_mass, birds_continuous_discrete_caper, lambda='ML', kappa = 'ML')
summary(log_BMR_log_mass_lambda_kappa)

# Call:
# pgls(formula = log_BMR ~ log_mass, data = birds_continuous_discrete_caper, lambda = "ML", kappa = "ML")

# Residuals:
#       Min         1Q     Median         3Q        Max 
# -0.0246390 -0.0041248  0.0008323  0.0052119  0.0193141 

# Branch length transformations:

# kappa  [ ML]  : 1.319
#   lower bound : 0.000, p = < 2.22e-16
#   upper bound : 3.000, p = < 2.22e-16
#   95.0% CI   : (1.033, 1.616)
# lambda [ ML]  : 0.707
#   lower bound : 0.000, p = < 2.22e-16
#   upper bound : 1.000, p = < 2.22e-16
#   95.0% CI   : (0.592, 0.798)
# delta  [Fix]  : 1.000

# Coefficients:
#             Estimate Std. Error t value  Pr(>|t|)    
# (Intercept) -1.019644   0.060918 -16.738 < 2.2e-16 ***
# log_mass     0.699671   0.011539  60.637 < 2.2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 0.007634 on 397 degrees of freedom
# Multiple R-squared: 0.9026,	Adjusted R-squared: 0.9023 
# F-statistic:  3677 on 1 and 397 DF,  p-value: < 2.2e-16 

# lambda, kappa and delta

log_BMR_log_mass_lambda_kappa_delta <- pgls(log_BMR ~ log_mass, birds_continuous_discrete_caper, lambda='ML', kappa = 'ML', delta = 'ML')
summary(log_BMR_log_mass_lambda_kappa_delta)

# Call:
# pgls(formula = log_BMR ~ log_mass, data = birds_continuous_discrete_caper, lambda = "ML", kappa = "ML", delta = "ML")

# Residuals:
#       Min         1Q     Median         3Q        Max 
# -0.0210659 -0.0048325 -0.0000999  0.0036841  0.0196281 

# Branch length transformations:

# kappa  [ ML]  : 1.315
#   lower bound : 0.000, p = < 2.22e-16
#   upper bound : 3.000, p = < 2.22e-16
#   95.0% CI   : (1.040, 1.598)
# lambda [ ML]  : 0.716
#   lower bound : 0.000, p = < 2.22e-16
#   upper bound : 1.000, p = < 2.22e-16
#   95.0% CI   : (0.604, 0.804)
# delta  [ ML]  : 1.062
#   lower bound : 0.000, p = 1.0284e-09
#   upper bound : 3.000, p = 4.9347e-08
#   95.0% CI   : (0.480, 1.703)

# Coefficients:
#             Estimate Std. Error t value  Pr(>|t|)    
# (Intercept) -1.017455   0.058926 -17.267 < 2.2e-16 ***
# log_mass     0.699455   0.011533  60.648 < 2.2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 0.006362 on 397 degrees of freedom
# Multiple R-squared: 0.9026,	Adjusted R-squared: 0.9023 
# F-statistic:  3678 on 1 and 397 DF,  p-value: < 2.2e-16 


# 39)  plot profile in a tree transformation parameters

mod.l <- pgls.profile(log_BMR_log_mass_lambda_kappa_delta, 'lambda')
mod.k <- pgls.profile(log_BMR_log_mass_lambda_kappa_delta, 'kappa')
mod.d <- pgls.profile(log_BMR_log_mass_lambda_kappa_delta, 'delta')

par(mfrow=c(2,2))
plot(mod.l)
plot(mod.d) 
plot(mod.k)

# 40) model comparison: you can use anova or aic. It is better if branch length transformations
# were the same

AIC(log_BMR_log_mass_lambda,log_BMR_log_mass_lambda_0, log_BMR_log_mass_lambda_kappa, log_BMR_log_mass_lambda_kappa_delta)


#                                     df       AIC
# log_BMR_log_mass_lambda              2 -751.6870
# log_BMR_log_mass_lambda_0            2 -555.9929
# log_BMR_log_mass_lambda_kappa        2 -756.3216
# log_BMR_log_mass_lambda_kappa_delta  2 -756.3368 <-- lowest AIC score


# 41) model diagnostic

# The first two show the fit of the residuals to a normal distribution: 
# a density plot of the distribution of the residuals and a normal Q-Q plot
# the distribution of the residuals against their expected distribution under a normal distribution.
# The second two show the fitted values against both the residuals and the observed value to look
# for pattern in the residuals within the model.

par(mfrow=c(2,2))
plot(log_BMR_log_mass_lambda_kappa_delta)


###############    Using caper -- Phylogenetic independent contrasts (PICs) #########################

This is probably the most basic approach in comparative methods.

# 42) As usual, we need to prepare the data for PICs

birds_continuous_discrete_caper_pics <- comparative.data (birds_continuous_discrete_phy, birds_continuous_discrete_dat_df, names.col = Genus_Species)
birds_continuous_discrete_caper_pics 

# Comparative dataset of 399 taxa:
# Phylogeny: birds_continuous_discrete_phy 
#   399 tips, 398 internal nodes
#   chr [1:399] "Nothoprocta_perdicaria" "Apteryx_australis" "Apteryx_haastii" "Apteryx_owenii" ...
# Data: birds_continuous_discrete_dat_df 
#   $ Migration : int [1:399] 0 0 0 0 0 0 0 0 0 0 ...
#   $ Flightless: int [1:399] 0 1 1 1 1 1 0 0 0 0 ...
#   $ log_mass  : num [1:399] 2.66 3.5 3.4 3.14 4.59 ...
#   $ log_BMR   : num [1:399] 0.803 1.22 1.279 1.153 2.101 ...

# 43) We use the 'crunch' function original from the CAIC package (Purvis and Rambaut, 1995b), 
# based on the method of Pagel (1992) to calculate contrasts at polytomies.

birds_log_BMR_log_mass_pic <- crunch(log_BMR ~ log_mass, data=birds_continuous_discrete_caper)
summary(birds_log_BMR_log_mass_pic)

# Call:
# lm(log_BMR ~ log_mass - 1, data = contrData)

# Residuals:
#     Min       1Q   Median       3Q      Max 
# -1.73020  0.00630  0.02239  0.04251  0.24862 

# Coefficients:
#         Estimate Std. Error t value Pr(>|t|)    
# log_mass  0.26157    0.06007   4.355  1.7e-05 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 0.09657 on 397 degrees of freedom
# Multiple R-squared:  0.04559,	Adjusted R-squared:  0.04318 
# F-statistic: 18.96 on 1 and 397 DF,  p-value: 1.699e-05

# 44) We can also run PICs diagnostics for outliers and influencing contrasts,

# Three types of diagnostics are available and all should be non-significant

# NV or estimated Nodal Value: This tests if there is no relationship between the 
# magnitude of estimated contrasts and the estimated nodal value. 

# SD or estimated Standard Deviation at the Node: Under PICs, we assume that variance 
# accumulates in continuous characters equally along all branches and that the amount 
# of variation is proportional to branch length. The square root of the expected variance 
# calculated at each node can therefore be used to standardize the absolute diference 
# (`raw contrasts') between sister taxa. If this standardization is successful,
# there should be no relationship between absolute values of standardized contrasts and the
# estimated standard deviation.

# AGE or Age of the node: If the phylogeny is ultrametric, then the node age, 
# as time from the present, provides a further test of the effectiveness of the 
# standardization of the contrasts.

# If significant, we better use PGLS or transform branch lengths to make the test non-significant

par(mfrow=c(2,2))
caic.diagnostics(birds_log_BMR_log_mass_pic)

# log_mass :
#       Estimate Std. Error t value  Pr(>|t|)    
# NV   0.0242598  0.0038012  6.3822 4.887e-10 ***
# SD  -0.0025444  0.0013367 -1.9035   0.05769 .  
# AGE -0.0105206  0.0025408 -4.1407 4.234e-05 ***

# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


#45) robust crunch with its diagnostics


birds_log_BMR_log_mass_pic <- crunch(log_BMR ~ log_mass, data=birds_continuous_discrete_caper)
birds_log_BMR_log_mass_pic_Robust <- caic.robust(birds_log_BMR_log_mass_pic)
summary(birds_log_BMR_log_mass_pic_Robust)

# Call:
# lm(log_BMR ~ log_mass - 1, data = contrData)

# Residuals:
#       Min        1Q    Median        3Q       Max 
# -0.079619 -0.013141  0.000365  0.012593  0.222263 

# Coefficients:
#          Estimate Std. Error t value Pr(>|t|)    
# log_mass  0.70845    0.01708   41.48   <2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 0.0256 on 396 degrees of freedom
# Multiple R-squared:  0.8129,	Adjusted R-squared:  0.8124 
# F-statistic:  1721 on 1 and 396 DF,  p-value: < 2.2e-16


par(mfrow=c(2,2))
caic.diagnostics(birds_log_BMR_log_mass_pic_Robust, outlier=2)

# Excluding 1 contrast with absolute studentised residuals > 3

# log_mass :
#      Estimate Std. Error t value  Pr(>|t|)    
# NV   0.0225679  0.0033466  6.7435 5.507e-11 ***
# SD  -0.0010300  0.0011953 -0.8617    0.3894    
# AGE -0.0027373  0.0024309 -1.1260    0.2608    

# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1