library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.5
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-2
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(coefplot)
Create a directory called Data
inside of your working directory. Then copy the Floristic Survey Data into your data folder.
Split your dataframe into two dataframes, your metadata, so information about your quadrate (Garlic mustard composition, population, etc) and then your flotistic survey
MyData<-FloristicSurvey[,1:10]
MyData$Population<-as.factor(MyData$Population)
SpeciesComp<-FloristicSurvey[,11:length(FloristicSurvey)]
rownames(SpeciesComp)<-MyData[,1] # keep the quadrat names as part of your floristic survey dataframe.
Species diversity is a complex concept. Here we are speaking about alpha diversity which concentrates on only the site-specific diversity (http://www.metagenomics.wiki/pdf/definition/alpha-beta-diversity).
Different types of alpha diversity exist, and they all have their advantages and drawbacks. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4224527/
Richness measures the number of species present. ex. chaos1 measurement.
Eveness measures the distribution of individuals in the species categories (from having one dominant species and singletons to having only evenly distributed abundances of species)
Diversity indexes combine these measures.
The Shannon diversity index is sensitive to rare species, The Simpson’s diversity index is sensitive to abundant species, and the Berger–Parker dominance is sensitive to only the most abundant species.
What are we trying to do here? This is an important question BEFORE choosing an index.
Ranking sites: Diversity index
Detect effects: Richness
MyData$Richness<-rowSums(SpeciesComp!=0)
MyData$Shannon<-diversity(SpeciesComp, index = "shannon", MARGIN = 1, base = exp(1))
Now we model this.
STEP 1. Fit the response data to a known distribution
mod1<-lm(Richness~ Population + Location + Rosettes + Bolting+ Budding + Bud_Flw, data = MyData)
car::qqp(MyData$Richness, "norm")
## [1] 2 23
plot(density(MyData$Richness)) # does it fit a normal distribution?
shapiro.test(MyData$Richness)
##
## Shapiro-Wilk normality test
##
## data: MyData$Richness
## W = 0.92447, p-value = 0.03507
bptest(mod1) # are the residuals homoskedastic?
##
## studentized Breusch-Pagan test
##
## data: mod1
## BP = 16.547, df = 8, p-value = 0.03519
plot(mod1) # visual check
STEP 2. Likelihood ratio test (test parameters)
To test each parameter, remove it and see if the model is as ‘likely’ without it.
mod1.mPopulation<-update(mod1,.~. - Population)
anova(mod1,mod1.mPopulation)
mod1.mLocation<-update(mod1.mPopulation,.~. - Location)
anova(mod1.mPopulation,mod1.mLocation)
mod1.mRosettes<-update(mod1.mLocation,.~. - Rosettes)
anova(mod1.mLocation,mod1.mRosettes)
mod1.mBolting<-update(mod1.mRosettes,.~. - Bolting)
anova(mod1.mRosettes,mod1.mBolting)
mod1.mBudding<-update(mod1.mBolting,.~. - Budding)
anova(mod1.mBolting,mod1.mBudding)
mod1.mBud_Flw<-update(mod1.mBudding,.~. - Bud_Flw)
anova(mod1.mBudding,mod1.mBud_Flw)
mod1<-mod1.mBud_Flw
summary(mod1) # this is not done to look at the "stars" but rather to look at the Estimate value.
##
## Call:
## lm(formula = Richness ~ 1, data = MyData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8 -2.8 0.2 2.2 5.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.8000 0.5977 9.704 1.3e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.274 on 29 degrees of freedom
ggplot(MyData, aes(y=Richness,x=Location))+
geom_boxplot(aes(color=Location))+
labs(y="Richness")
We just did a linear model based on one measurement: Richness. But after having discusssed the problems with diversity indexes, you understand why that might not be that telling of what is truely happening in our system. What if the second year garlic mustards cause a turnover in plant species, but the same number of species are present? A linear model on richness wouldnt detect that, but a RDA could.
This is a linear model but instead of having a vector as a response variable, we have a matrix (hence the “multi” in multivariate!)
Transform the data from count. Its better to use data that is scaled so it doesnt violate the RDA assumptions of normal variability (which count data doesnt always follow).
For more information please see: http://adn.biol.umontreal.ca/~numericalecology/Reprints/Legendre_&_Gallagher.pdf
SpeciesComp.hell <- decostand(SpeciesComp, "hell")
Now we do the RDA:
var <-MyData[,c(2:6)] # Isolate my variables to make this whole code easier to read, since theres been no flowering we wont include it (all 0's will just mess with the analysis)
formulaRDA<- rda(SpeciesComp.hell ~ Location+Rosettes+Bolting+Budding+Population, data=var, scale=T) # notice my predictive variable are being scaled too.
head(summary(formulaRDA))
##
## Call:
## rda(formula = SpeciesComp.hell ~ Location + Rosettes + Bolting + Budding + Population, data = var, scale = T)
##
## Partitioning of correlations:
## Inertia Proportion
## Total 34.00 1.0000
## Constrained 14.95 0.4398
## Unconstrained 19.05 0.5602
##
## Eigenvalues, and their contribution to the correlations
##
## Importance of components:
## RDA1 RDA2 RDA3 RDA4 RDA5 RDA6
## Eigenvalue 4.0743 3.10888 2.4345 1.90164 1.63860 0.94374
## Proportion Explained 0.1198 0.09144 0.0716 0.05593 0.04819 0.02776
## Cumulative Proportion 0.1198 0.21127 0.2829 0.33880 0.38700 0.41475
## RDA7 RDA8 PC1 PC2 PC3 PC4
## Eigenvalue 0.63029 0.222784 3.06471 2.60018 2.29136 1.95726
## Proportion Explained 0.01854 0.006552 0.09014 0.07648 0.06739 0.05757
## Cumulative Proportion 0.43329 0.439844 0.52998 0.60646 0.67385 0.73142
## PC5 PC6 PC7 PC8 PC9 PC10
## Eigenvalue 1.68533 1.41905 1.07048 1.05366 0.83616 0.68163
## Proportion Explained 0.04957 0.04174 0.03148 0.03099 0.02459 0.02005
## Cumulative Proportion 0.78099 0.82272 0.85421 0.88520 0.90979 0.92984
## PC11 PC12 PC13 PC14 PC15 PC16
## Eigenvalue 0.64415 0.53203 0.35457 0.322234 0.20672 0.170141
## Proportion Explained 0.01895 0.01565 0.01043 0.009477 0.00608 0.005004
## Cumulative Proportion 0.94878 0.96443 0.97486 0.984339 0.99042 0.995423
## PC17 PC18 PC19
## Eigenvalue 0.091951 0.050543 0.0131293
## Proportion Explained 0.002704 0.001487 0.0003862
## Cumulative Proportion 0.998127 0.999614 1.0000000
##
## Accumulated constrained eigenvalues
## Importance of components:
## RDA1 RDA2 RDA3 RDA4 RDA5 RDA6 RDA7
## Eigenvalue 4.0743 3.1089 2.4345 1.9016 1.6386 0.94374 0.63029
## Proportion Explained 0.2724 0.2079 0.1628 0.1272 0.1096 0.06311 0.04215
## Cumulative Proportion 0.2724 0.4803 0.6431 0.7703 0.8798 0.94296 0.98510
## RDA8
## Eigenvalue 0.2228
## Proportion Explained 0.0149
## Cumulative Proportion 1.0000
##
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores: 5.603627
##
##
## Species scores
##
## RDA1 RDA2 RDA3 RDA4 RDA5
## Claytonia_virginiana -0.60959 -0.44706 0.1462 0.25740 0.05763
## Anemone_hepatica -0.69219 -0.15007 -0.2992 0.06439 -0.30256
## Grass_tuft -0.48590 0.03869 0.2121 0.02759 -0.30302
## Trillium_grandifolium -0.69730 -0.15118 -0.3014 0.06486 -0.30479
## Erythronium.trout.lily. -0.28272 -0.35729 0.3250 -0.17059 0.15218
## Acer_saccharum 0.01268 0.73028 -0.4676 0.03506 0.28808
## ....
## RDA6
## Claytonia_virginiana -0.13130
## Anemone_hepatica 0.06181
## Grass_tuft -0.07682
## Trillium_grandifolium 0.06226
## Erythronium.trout.lily. 0.26998
## Acer_saccharum -0.11611
## ....
##
##
## Site scores (weighted sums of species scores)
##
## RDA1 RDA2 RDA3 RDA4 RDA5 RDA6
## 7o3 -2.224 -0.74371 -0.8003 -0.2049 -1.1553 0.28693
## 7o1 -3.474 -0.83239 -1.0484 0.7362 -2.0861 0.96394
## 7o2 -1.886 -0.14466 -1.2266 0.1142 -0.9153 -0.03611
## 7i3 -1.407 -0.11930 2.3925 -0.7342 0.9569 -0.86520
## 7i2 -1.428 -0.15997 1.8727 -0.5637 0.9456 -0.80560
## 7i1 -1.081 -0.09481 1.9391 -0.5646 1.1964 -0.78500
## ....
##
##
## Site constraints (linear combinations of constraining variables)
##
## RDA1 RDA2 RDA3 RDA4 RDA5 RDA6
## 7o3 -2.326 -0.5043 -1.005 0.2164 -1.0167 0.2077
## 7o1 -2.326 -0.5043 -1.005 0.2164 -1.0167 0.2077
## 7o2 -2.326 -0.5043 -1.005 0.2164 -1.0167 0.2077
## 7i3 -1.356 -0.2570 2.487 -0.2920 0.4661 -1.0941
## 7i2 -1.384 -0.3122 2.188 -0.3745 1.0430 -1.1252
## 7i1 -1.783 -0.0128 1.471 -1.1997 0.4833 0.3552
## ....
##
##
## Biplot scores for constraining variables
##
## RDA1 RDA2 RDA3 RDA4 RDA5 RDA6
## Locationo -0.21329 0.1546 -0.4775805 -0.03473 -0.09610 0.7207
## Rosettes 0.14894 -0.1056 0.3843656 0.21297 -0.08963 -0.5990
## Bolting 0.32841 -0.3951 0.1454138 0.36202 0.06608 -0.7018
## Budding -0.21652 -0.1303 0.8247868 -0.28405 0.19915 -0.3250
## Population3 0.07258 0.4477 -0.5276056 -0.33768 0.48648 -0.3887
## Population7 -0.93674 -0.1706 0.2548633 -0.09913 -0.08615 -0.1011
## Population13 0.45356 -0.5242 -0.0000398 -0.57187 -0.40918 0.1002
## Population14 0.19039 -0.4254 -0.0057301 0.63318 0.51767 0.3093
##
##
## Centroids for factor constraints
##
## RDA1 RDA2 RDA3 RDA4 RDA5 RDA6
## Locationi 0.2182 -0.1582 4.886e-01 0.03553 0.09832 -0.7373
## Locationo -0.2182 0.1582 -4.886e-01 -0.03553 -0.09832 0.7373
## Population1 0.4506 1.3759 5.699e-01 0.76834 -1.04111 0.1642
## Population3 0.1485 0.9161 -1.080e+00 -0.69094 0.99541 -0.7952
## Population7 -1.9167 -0.3491 5.215e-01 -0.20284 -0.17629 -0.2068
## Population13 0.9281 -1.0726 -8.143e-05 -1.17014 -0.83726 0.2050
## Population14 0.3896 -0.8703 -1.172e-02 1.29558 1.05924 0.6329
anova(formulaRDA, permutations=1000) # Is the RDA meaningful?
anova.cca(formulaRDA, by="margin", permutations=1000) # Are any predictor variables meaningful?
Depending on whether some things were meaningful, we can see how much of the variation types of variables caused (note that if they wernt meaningful including them here is meaningless):
varpart(SpeciesComp.hell, ~Population, ~ Location, ~Rosettes + Bolting + Budding, data=var)
##
## Partition of variance in RDA
##
## Call: varpart(Y = SpeciesComp.hell, X = ~Population, ~Location,
## ~Rosettes + Bolting + Budding, data = var)
##
## Explanatory tables:
## X1: ~Population
## X2: ~Location
## X3: ~Rosettes + Bolting + Budding
##
## No. of explanatory tables: 3
## Total variation (SS): 19.54
## Variance: 0.67378
## No. of observations: 30
##
## Partition table:
## Df R.square Adj.R.square Testable
## [a+d+f+g] = X1 4 0.54252 0.46933 TRUE
## [b+d+e+g] = X2 1 0.03475 0.00028 TRUE
## [c+e+f+g] = X3 3 0.17609 0.08102 TRUE
## [a+b+d+e+f+g] = X1+X2 5 0.57728 0.48921 TRUE
## [a+c+d+e+f+g] = X1+X3 7 0.64170 0.52769 TRUE
## [b+c+d+e+f+g] = X2+X3 4 0.22402 0.09987 TRUE
## [a+b+c+d+e+f+g] = All 8 0.65327 0.52118 TRUE
## Individual fractions
## [a] = X1 | X2+X3 4 0.42132 TRUE
## [b] = X2 | X1+X3 1 -0.00651 TRUE
## [c] = X3 | X1+X2 3 0.03198 TRUE
## [d] 0 0.02535 FALSE
## [e] 0 0.02639 FALSE
## [f] 0 0.06761 FALSE
## [g] 0 -0.04495 FALSE
## [h] = Residuals 0.47882 FALSE
## Controlling 1 table X
## [a+d] = X1 | X3 4 0.44666 TRUE
## [a+f] = X1 | X2 4 0.48893 TRUE
## [b+d] = X2 | X3 1 0.01884 TRUE
## [b+e] = X2 | X1 1 0.01988 TRUE
## [c+e] = X3 | X1 3 0.05836 TRUE
## [c+f] = X3 | X2 3 0.09959 TRUE
## ---
## Use function 'rda' to test significance of fractions of interest
smry <- summary(formulaRDA)
df1 <- data.frame(smry$sites[,1:2]) # RDA1 and RDA2 (for each quadrate)
df2 <- data.frame(smry$biplot[,1:2]) # loadings for RDA1 and RDA2 (for each variable)
rda.plot <- ggplot(df1, aes(x=RDA1, y=RDA2)) +
geom_text(aes(label=rownames(df1)),size=3,position=position_jitter(width=.2,height=.2)) +
geom_point(aes(alpha=0.3)) +
geom_hline(yintercept=0, linetype="dotted") +
geom_vline(xintercept=0, linetype="dotted")
(formulaRDA.PLOT<-rda.plot +
geom_segment(data=df2, aes(x=0, xend=RDA1, y=0, yend=RDA2),
color="red", arrow=arrow(length=unit(0.01,"npc"))) +
geom_text(data=df2,
aes(x=RDA1,y=RDA2,label=rownames(df2),
hjust=0.5*(1-sign(RDA1)),vjust=0.5*(1-sign(RDA2))),
color="red", size=4)+ theme(legend.position="none"))
What is an NMDS?
Usually, when viewing data in space (Principle component analysis, RDA, etc), many axes are calculated (eigenvectors) but since visualization is only useful in a few dimension a lot of the variation is ignored. In NMDS, the number of axes are explicitly chosen prior to the analysis and the data are fitted to those dimensions. The “true” distance, or the overall difference in how two plots differ, is displayed on the graph. One of the limitations, however, is that unlike RDA the axis are not truely meaningful (since NMDS is not an eigenvalue-eigenvector technique) and thus they cannot be associated with the variance in predictors.
We need to produce the betadiversity matrix now. Betadiversity is a measure of difference between sites.
M <- as.matrix(SpeciesComp)
dist_M <- vegdist(M, method = "bray", binary = T)
The metaMDS analysis could have done the distance matrix internally but i would rather control it.
meta.nmds <- metaMDS(dist_M, k=2, trymax = 1000) # ok so k=2 means I want 2 dimensions
## Run 0 stress 0.1308598
## Run 1 stress 0.1425695
## Run 2 stress 0.1890539
## Run 3 stress 0.1303213
## ... New best solution
## ... Procrustes: rmse 0.01090199 max resid 0.05223528
## Run 4 stress 0.1362695
## Run 5 stress 0.1309801
## Run 6 stress 0.1308525
## Run 7 stress 0.1473352
## Run 8 stress 0.1358731
## Run 9 stress 0.1539389
## Run 10 stress 0.147345
## Run 11 stress 0.1381301
## Run 12 stress 0.1407537
## Run 13 stress 0.1364188
## Run 14 stress 0.1425684
## Run 15 stress 0.1364188
## Run 16 stress 0.13587
## Run 17 stress 0.1494478
## Run 18 stress 0.1441935
## Run 19 stress 0.1472597
## Run 20 stress 0.1372004
## Run 21 stress 0.1303174
## ... New best solution
## ... Procrustes: rmse 0.001109632 max resid 0.004437517
## ... Similar to previous best
## *** Solution reached
#data for plotting
##NMDS points
NMDS.data<-data.frame(Location=MyData$Location,Population=MyData$Population)
NMDS.data$NMDS1<-meta.nmds$points[ ,1]
NMDS.data$NMDS2<-meta.nmds$points[ ,2]
Notice how this analysis was iterative?
The metaMDS function seeks a solution and stops computation when an acceptable solution has been found, or it stops after some pre-specified number of attempts (trymax = 1000). This is another difference in how the NMDS works. Unlike other multivariate analyses, it is not a direct analysis, instead it is a numerical technique that will search for a likely solution (with a low stress level).
For more information on the NMDS see https://strata.uga.edu/software/pdf/mdsTutorial.pdf.
ggplot(data = NMDS.data, aes(y = NMDS2, x = NMDS1))+
geom_point( aes(color = NMDS.data$Population,shape = NMDS.data$Location), size = 1.5,alpha=0.6)