This tutorial will explain how to create phylogenetic trees using parsimony, maximum likelihood and bayesian approaches
library("ape")
library("phangorn")
library("ips")
library("ggplot2")
library("ggtree")
Data in fasta format
Check the alignment – are there any gaps?
data<-read.dna("/Users/martafarrebelmonte/Documents/WORK/Langurs/christian_OnlyFrancois.fa", format="fasta")
dataAli<-clustal(data)
checkAlignment(dataAli)
##
## Number of sequences: 31
## Number of sites: 387
##
## Some gap lengths are not multiple of 3: 1
##
## Frequencies of gap lengths:
## 1
## 93
## => no gap on the left border of the alignment
## => no gap on the right border of the alignment
##
## Number of unique contiguous base segments defined by gaps: 4
## Number of segment lengths not multiple of 3: 2
## => on the left border of the alignement: 0
## => on the right border : 1
## => positions of these segments inside the alignment: 17..194
##
## Number of segregating sites (including gaps): 52
## Number of sites with at least one substitution: 52
## Number of sites with 1, 2, 3 or 4 observed bases:
## 1 2 3 4
## 335 51 1 0
datPhy <- phyDat(dataAli, type = "DNA", levels = NULL)
alview(dataAli,file="msa.fa", uppercase = TRUE, showpos = FALSE)
Here we will use distance methods to produce the tree – only as an example
You’ll need to select between F81 or JC69 models of base frequencies
dm <- dist.ml(datPhy, "F81") #F81 uses empirical base frequencies
treeUPGMA <-upgma(dm)
treeNJ <- NJ(dm)
layout(matrix(c(1,2), 2, 1), height=c(1,2))
par(mar = c(0,0,2,0)+ 0.1)
plot(treeUPGMA, main="UPGMA")
plot(treeNJ, "unrooted", main="NJ")
Here we will use parsimony
parsimony(treeUPGMA, datPhy)
## [1] 89
parsimony(treeNJ, datPhy)
## [1] 87
treePars <- optim.parsimony(treeNJ, datPhy)
## Final p-score 85 after 2 nni operations
treeRatchet <- pratchet(datPhy, start=treeNJ, trace = 0)
#parsimony(c(treePars, treeRatchet), datPhy)
Using modelTest. You can select the best model using AICc from the table or let R do it
mt <- modelTest(datPhy, multicore=TRUE)
## negative edges length changed to 0!
mt[order(mt$AICc),]
## Model df logLik AIC AICw AICc AICcw
## 14 HKY+I 64 -1016.996 2161.993 3.336830e-01 2187.831 5.631007e-01
## 16 HKY+G+I 65 -1016.342 2162.684 2.362338e-01 2189.413 2.554068e-01
## 15 HKY+G 64 -1018.776 2165.552 5.629969e-02 2191.390 9.500752e-02
## 22 GTR+I 68 -1013.500 2163.000 2.016441e-01 2192.510 5.428831e-02
## 24 GTR+G+I 69 -1012.884 2163.767 1.374322e-01 2194.240 2.285246e-02
## 23 GTR+G 68 -1015.260 2166.519 3.470728e-02 2196.029 9.344184e-03
## 10 K80+I 61 -1039.701 2201.401 9.244994e-10 2224.675 5.624296e-09
## 12 K80+G+I 62 -1039.109 2202.217 6.148367e-10 2226.328 2.460999e-09
## 18 SYM+I 65 -1035.564 2201.129 1.059449e-09 2227.858 1.145435e-09
## 20 SYM+G+I 66 -1034.883 2201.766 7.703826e-10 2229.404 5.288258e-10
## 11 K80+G 61 -1042.508 2207.016 5.582072e-11 2230.289 3.395916e-10
## 19 SYM+G 65 -1037.732 2205.464 1.212510e-10 2232.193 1.310919e-10
## 13 HKY 63 -1055.085 2236.170 2.605479e-17 2261.136 6.801656e-17
## 21 GTR 67 -1051.508 2237.016 1.707378e-17 2265.580 7.373805e-18
## 9 K80 60 -1080.174 2280.348 6.649510e-27 2302.802 6.095105e-26
## 17 SYM 64 -1074.507 2277.014 3.522540e-26 2302.852 5.944399e-26
## 6 F81+I 63 -1080.287 2286.574 2.957642e-28 2311.540 7.720985e-28
## 8 F81+G+I 64 -1080.011 2288.022 1.433597e-28 2313.861 2.419240e-28
## 7 F81+G 63 -1082.509 2291.019 3.204225e-29 2315.985 8.364695e-29
## 2 JC+I 60 -1101.585 2323.170 3.343027e-36 2345.624 3.064301e-35
## 4 JC+G+I 61 -1101.309 2324.618 1.620494e-36 2347.892 9.858459e-36
## 3 JC+G 60 -1103.926 2327.853 3.216381e-37 2350.307 2.948214e-36
## 5 F81 62 -1118.636 2361.272 1.780427e-44 2385.383 7.126493e-44
## 1 JC 59 -1140.297 2398.595 1.399208e-52 2420.246 1.915837e-51
## BIC
## 14 2415.332
## 16 2419.981
## 15 2418.891
## 22 2432.173
## 24 2436.898
## 23 2435.692
## 10 2442.865
## 12 2447.639
## 18 2458.426
## 20 2463.022
## 11 2448.479
## 19 2462.762
## 13 2485.551
## 21 2502.230
## 9 2517.854
## 17 2530.353
## 6 2535.955
## 8 2541.361
## 7 2540.400
## 2 2560.676
## 4 2566.082
## 3 2565.358
## 5 2606.694
## 1 2632.142
# choose best model from the table according to AICc
bestmodel <- mt$Model[which.min(mt$AICc)]
env <- attr(mt, "env")
bestmodel
## [1] "HKY+I"
# let R search the table
fitStart <- eval(get(bestmodel, env), env)
fitStart
##
## loglikelihood: -1016.996
##
## unconstrained loglikelihood: -809.369
## Proportion of invariant sites: 0.8009637
##
## Rate matrix:
## a c g t
## a 0.00000 1.00000 20.13343 1.00000
## c 1.00000 0.00000 1.00000 20.13343
## g 20.13343 1.00000 0.00000 1.00000
## t 1.00000 20.13343 1.00000 0.00000
##
## Base frequencies:
## 0.3244634 0.2362314 0.1322922 0.307013
WARNING: You need to type the model and optGamma accordingly!
If bestModel is “xxx + G”, then optGamma = TRUE
fit <- optim.pml(fitStart, rearrangement = "stochastic",optGamma=FALSE, optInv=TRUE, model="HKY")
bs <- bootstrap.pml(fit, bs=100, optNni=TRUE, multicore=TRUE)
treeML<-plotBS(midpoint(fit$tree), bs, p = 50, type="p")
write.tree(treeML, file="phanghorn_tree.tre")
We will try RAxML.
execRAxML<-"/Users/martafarrebelmonte/software/standard-RAxML-master/raxmlHPC-SSE3"
# 1. Create best tree
bestTree<-raxml(dataAli, m="GTRGAMMA",p=12345, N=20, exec=execRAxML, file = "T1")
# 2. Bootstraping
bootstraping<-raxml(dataAli, m="GTRGAMMA",p=12345, b=12345, N=100, exec=execRAxML, file = "T2")
Running the bipartitions outside R
cd /Users/martafarrebelmonte/tests/
/Users/martafarrebelmonte/software/standard-RAxML-master/raxmlHPC-SSE3 -m GTRGAMMA -p 12345 -f b -t RAxML_bestTree.T1 -z RAxML_bootstrap.T2 -n T3
Plotting the tree using ggtree
tree<-read.tree(file="RAxML_bipartitions.T3")
ggtree(tree) + geom_treescale() + geom_tiplab(size=2) + geom_nodelab(size=2)