This tutorial will explain how to create phylogenetic trees using parsimony, maximum likelihood and bayesian approaches

0. Prerequisits

  1. RAxML needs to be installed in your PATH – you can download it from: here
  2. Libraries: ape, ips and phangorn installed
  3. ClustalW needs to be installed in your PATH – download it from: here
library("ape")
library("phangorn")
library("ips")
library("ggplot2")
library("ggtree")

1. Loading data and aligning the sequences

Data in fasta format
Check the alignment – are there any gaps?

data<-read.dna("/Users/martafarrebelmonte/Documents/WORK/Langurs/christian_OnlyFrancois.fa", format="fasta")
dataAli<-clustal(data)
checkAlignment(dataAli)
## 
## Number of sequences: 31 
## Number of sites: 387 
## 
## Some gap lengths are not multiple of 3: 1
## 
## Frequencies of gap lengths:
##  1 
## 93 
##    => no gap on the left border of the alignment
##    => no gap on the right border of the alignment
## 
## Number of unique contiguous base segments defined by gaps: 4 
## Number of segment lengths not multiple of 3: 2 
##     => on the left border of the alignement: 0 
##     => on the right border                 : 1 
##     => positions of these segments inside the alignment: 17..194 
## 
## Number of segregating sites (including gaps): 52
## Number of sites with at least one substitution: 52
## Number of sites with 1, 2, 3 or 4 observed bases:
##   1   2   3   4 
## 335  51   1   0

datPhy <- phyDat(dataAli, type = "DNA", levels = NULL)
alview(dataAli,file="msa.fa", uppercase = TRUE, showpos = FALSE)

2. Distance methods

Here we will use distance methods to produce the tree – only as an example
You’ll need to select between F81 or JC69 models of base frequencies

dm <- dist.ml(datPhy, "F81") #F81 uses empirical base frequencies
treeUPGMA <-upgma(dm)
treeNJ <- NJ(dm)
layout(matrix(c(1,2), 2, 1), height=c(1,2))
par(mar = c(0,0,2,0)+ 0.1)
plot(treeUPGMA, main="UPGMA")
plot(treeNJ, "unrooted", main="NJ")

3. Parsimony

Here we will use parsimony

parsimony(treeUPGMA, datPhy)
## [1] 89
parsimony(treeNJ, datPhy)
## [1] 87
treePars <- optim.parsimony(treeNJ, datPhy)
## Final p-score 85 after  2 nni operations
treeRatchet  <- pratchet(datPhy, start=treeNJ, trace = 0)
#parsimony(c(treePars, treeRatchet), datPhy)

4. Maximum likelihood

First, we will use phangorn package

a. Model selection

Using modelTest. You can select the best model using AICc from the table or let R do it

mt <- modelTest(datPhy, multicore=TRUE)
## negative edges length changed to 0!
mt[order(mt$AICc),]
##      Model df    logLik      AIC         AICw     AICc        AICcw
## 14   HKY+I 64 -1016.996 2161.993 3.336830e-01 2187.831 5.631007e-01
## 16 HKY+G+I 65 -1016.342 2162.684 2.362338e-01 2189.413 2.554068e-01
## 15   HKY+G 64 -1018.776 2165.552 5.629969e-02 2191.390 9.500752e-02
## 22   GTR+I 68 -1013.500 2163.000 2.016441e-01 2192.510 5.428831e-02
## 24 GTR+G+I 69 -1012.884 2163.767 1.374322e-01 2194.240 2.285246e-02
## 23   GTR+G 68 -1015.260 2166.519 3.470728e-02 2196.029 9.344184e-03
## 10   K80+I 61 -1039.701 2201.401 9.244994e-10 2224.675 5.624296e-09
## 12 K80+G+I 62 -1039.109 2202.217 6.148367e-10 2226.328 2.460999e-09
## 18   SYM+I 65 -1035.564 2201.129 1.059449e-09 2227.858 1.145435e-09
## 20 SYM+G+I 66 -1034.883 2201.766 7.703826e-10 2229.404 5.288258e-10
## 11   K80+G 61 -1042.508 2207.016 5.582072e-11 2230.289 3.395916e-10
## 19   SYM+G 65 -1037.732 2205.464 1.212510e-10 2232.193 1.310919e-10
## 13     HKY 63 -1055.085 2236.170 2.605479e-17 2261.136 6.801656e-17
## 21     GTR 67 -1051.508 2237.016 1.707378e-17 2265.580 7.373805e-18
## 9      K80 60 -1080.174 2280.348 6.649510e-27 2302.802 6.095105e-26
## 17     SYM 64 -1074.507 2277.014 3.522540e-26 2302.852 5.944399e-26
## 6    F81+I 63 -1080.287 2286.574 2.957642e-28 2311.540 7.720985e-28
## 8  F81+G+I 64 -1080.011 2288.022 1.433597e-28 2313.861 2.419240e-28
## 7    F81+G 63 -1082.509 2291.019 3.204225e-29 2315.985 8.364695e-29
## 2     JC+I 60 -1101.585 2323.170 3.343027e-36 2345.624 3.064301e-35
## 4   JC+G+I 61 -1101.309 2324.618 1.620494e-36 2347.892 9.858459e-36
## 3     JC+G 60 -1103.926 2327.853 3.216381e-37 2350.307 2.948214e-36
## 5      F81 62 -1118.636 2361.272 1.780427e-44 2385.383 7.126493e-44
## 1       JC 59 -1140.297 2398.595 1.399208e-52 2420.246 1.915837e-51
##         BIC
## 14 2415.332
## 16 2419.981
## 15 2418.891
## 22 2432.173
## 24 2436.898
## 23 2435.692
## 10 2442.865
## 12 2447.639
## 18 2458.426
## 20 2463.022
## 11 2448.479
## 19 2462.762
## 13 2485.551
## 21 2502.230
## 9  2517.854
## 17 2530.353
## 6  2535.955
## 8  2541.361
## 7  2540.400
## 2  2560.676
## 4  2566.082
## 3  2565.358
## 5  2606.694
## 1  2632.142
# choose best model from the table according to AICc
bestmodel <- mt$Model[which.min(mt$AICc)]
env <- attr(mt, "env")
bestmodel
## [1] "HKY+I"
# let R search the table
fitStart <- eval(get(bestmodel, env), env)
fitStart
## 
##  loglikelihood: -1016.996 
## 
## unconstrained loglikelihood: -809.369 
## Proportion of invariant sites: 0.8009637 
## 
## Rate matrix:
##          a        c        g        t
## a  0.00000  1.00000 20.13343  1.00000
## c  1.00000  0.00000  1.00000 20.13343
## g 20.13343  1.00000  0.00000  1.00000
## t  1.00000 20.13343  1.00000  0.00000
## 
## Base frequencies:  
## 0.3244634 0.2362314 0.1322922 0.307013
b. Running ml and bootstrapping

WARNING: You need to type the model and optGamma accordingly!
If bestModel is “xxx + G”, then optGamma = TRUE

fit <- optim.pml(fitStart, rearrangement = "stochastic",optGamma=FALSE, optInv=TRUE, model="HKY")
bs <- bootstrap.pml(fit, bs=100, optNni=TRUE, multicore=TRUE)
treeML<-plotBS(midpoint(fit$tree), bs, p = 50, type="p")

write.tree(treeML, file="phanghorn_tree.tre")

Using RAxML

We will try RAxML.

execRAxML<-"/Users/martafarrebelmonte/software/standard-RAxML-master/raxmlHPC-SSE3"
# 1. Create best tree
bestTree<-raxml(dataAli, m="GTRGAMMA",p=12345, N=20, exec=execRAxML, file = "T1")
# 2. Bootstraping
bootstraping<-raxml(dataAli, m="GTRGAMMA",p=12345, b=12345, N=100, exec=execRAxML, file = "T2")

Running the bipartitions outside R

cd /Users/martafarrebelmonte/tests/
/Users/martafarrebelmonte/software/standard-RAxML-master/raxmlHPC-SSE3 -m GTRGAMMA -p 12345 -f b -t RAxML_bestTree.T1 -z RAxML_bootstrap.T2 -n T3

Plotting the tree using ggtree

tree<-read.tree(file="RAxML_bipartitions.T3")
ggtree(tree) + geom_treescale() + geom_tiplab(size=2) + geom_nodelab(size=2)