Main statistical analyses

library(PopGenome)
library(bigmemory)
library(plyr)
library(scales)
library(asbio) #R2
library(ppcor) #correlation
library(stringr) #reged
library(reshape2)
library(lmodel2)
library(boot)
library(tidyverse)

go <- read.csv("data/unclustered_go.csv",header = T)
annotation <- read.csv("data/annotation.csv",header=T)
kegg <- read.csv("data/kegg.csv",header = F)
kegg <- merge(annotation[,c(1,3),drop=F], kegg, by.x = "NCBI", by.y = 1)[,2:3]
colnames(kegg) <- c("gene", "kegg")

Fst computed using PopGenome on haplotype data

az_early <- as.character(read.table("data/az_early.txt")[[1]])
az_late  <- as.character(read.table("data/az_late.txt")[[1]])
tx_early  <- as.character(read.table("data/tx_early.txt")[[1]])
tx_late  <- as.character(read.table("data/tx_late.txt")[[1]])
variant <- read.table("data/variant.txt", header=F, col.names = c("name"))

Code run on the OIST cluster, and re-loaded

vcf <- readData("data/vcf/16", format="VCF", gffpath="data/gff/16")

vcf.test <- set.populations(vcf,list(az_early=az_early,az_late=az_late, tx_early=tx_early,tx_late=tx_late), diploid=TRUE)
vcf.test@populations[[1]]

vcf.test <- splitting.data(vcf.test, subsites="gene")
length(vcf.test@region.names)
vcf.test <- F_ST.stats(vcf.test, mode="nucleotide")

gene_names <- c()
fst <- c()
dxy <- c()
dx <- c()
for (i in 1:16) {
  load(paste0("data/fst/",i,".RData"))
  fst <- cbind(fst, genes1@hap.F_ST.pairwise)
  dxy <- cbind(dxy, genes1@hap.diversity.between)
  dx <- cbind(dx, t(genes1@hap.diversity.within))
  info <- get.feature.names(genes1, gff.file=paste0("/Users/sasha/Dropbox (OIST)/projects/bee varroa/data/gff/",i,"/Group",i,".gff"), chr=paste0("LG",i))
  gene_names <- c(gene_names,str_match(info, "ID=(GB[0-9]+)")[,2])
}

fst <- as.data.frame(t(fst))
colnames(fst) <- c("az_early_az_late", "az_early_tx_early", "az_early_tx_late", "az_late_tx_early", "az_late_tx_late", "tx_early_tx_late")
fst$names <- gene_names
fst <- filter(fst, names %in% variant$name) 
full.pops <- lm(az_early_az_late ~ tx_early_tx_late + az_late_tx_late, data=fst)
summary(full.pops)

## 
## Call:
## lm(formula = az_early_az_late ~ tx_early_tx_late + az_late_tx_late, 
##     data = fst)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.211377 -0.007209 -0.005599  0.003796  0.243532 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.0055994  0.0004951  11.310  < 2e-16 ***
## tx_early_tx_late 0.8700692  0.0093083  93.472  < 2e-16 ***
## az_late_tx_late  0.5649043  0.0870577   6.489 9.51e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02845 on 4863 degrees of freedom
##   (30 observations deleted due to missingness)
## Multiple R-squared:  0.6612, Adjusted R-squared:  0.6611 
## F-statistic:  4746 on 2 and 4863 DF,  p-value: < 2.2e-16

noafr <- lm(az_early_az_late ~ tx_early_tx_late , data=as.data.frame(fst))
partial.R2(noafr, full.pops)*100

## [1] 0.9062168

The similarity between post-Varroa population, i.e., the African contribution to similarity explains only a small percentage of the variance

with(fst, cor.test(az_early_az_late, tx_early_tx_late, method="s"))

## 
##  Spearman's rank correlation rho
## 
## data:  az_early_az_late and tx_early_tx_late
## S = 2267300000, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8820745

ggplot(subset(as.data.frame(fst), az_early_az_late != 0 | tx_early_tx_late !=0),aes(az_early_az_late,tx_early_tx_late))+geom_point()+theme_bw()+xlab(expression(paste(F["st"]," AZ pre vs. post")))+ylab(expression(paste(F["st"]," TX pre vs. post")))+geom_smooth(method="lm",se=F)+annotate("text",.05,.4,label = "r = 0.88")

with(as.data.frame(fst), cor.test(az_early_az_late, tx_early_tx_late, method="s"))

## 
##  Spearman's rank correlation rho
## 
## data:  az_early_az_late and tx_early_tx_late
## S = 2267300000, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8820745

Re-analysis using an absolute distance measure

dxy <- as.data.frame(t(dxy))
colnames(dxy) <-  c("az_early_az_late", "az_early_tx_early", "az_early_tx_late", "az_late_tx_early", "az_late_tx_late", "tx_early_tx_late")
dxy$names <- gene_names
dxy <- filter(dxy, names %in% variant$name) 

dx <- as.data.frame(t(dx))
colnames(dx) <-  c("az_early", "az_late", "tx_early", "tx_late")
dx$names <- gene_names
dx <- filter(dx, names %in% variant$name) 

cor(dxy[,1] / dx[,1], dxy[,6] / dx[,3], method="s", use = "complete.obs") # Re-doing with dxy

## [1] 0.8407106

Comparing degree of Africanization using the filtred dat set

filtered <- read.csv("data/filtered.csv", header=F, col.names = c("name","count")) 
# the maximum is 660, of which 282 are in the pre- populations and 278 are in the post
filtered$african <- (382-(filtered$count-282))/382

with(left_join(data.frame(fst=fst[,1], name=fst[,"names"]), filtered, by=c("name")), cor.test(fst, african, method ="s"))

## 
##  Spearman's rank correlation rho
## 
## data:  fst and african
## S = 3.3505e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.7298048

left_join(data.frame(fst=fst[,1], name=fst[,"names"]), filtered, by=c("name")) %>% ggplot(aes(african, fst))+geom_point()+geom_smooth()+xlab("Proportion African genes per locus")

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Fst computed using PopGenome on nucleotide frequency data using data from before imputation.

Just to check that we’re not getting some sort of spurious imputation artifact.

fst_pre <- c()
for (i in 1:16) {
  load(paste0("data/fst_pre_beagle/",i,"_nuc.RData"))
  fst_pre <- cbind(fst_pre, genes2@nuc.F_ST.pairwise)
}

fst_pre <- t(fst_pre)
colnames(fst_pre) <- c("az_early_az_late", "az_early_tx_early", "az_early_tx_late", "az_late_tx_early", "az_late_tx_late", "tx_early_tx_late")
summary(glm(az_early_az_late ~ tx_early_tx_late + az_late_tx_late, data=as.data.frame(fst_pre)))

## 
## Call:
## glm(formula = az_early_az_late ~ tx_early_tx_late + az_late_tx_late, 
##     data = as.data.frame(fst_pre))
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -0.243948  -0.000781  -0.000781  -0.000781   0.263291  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.0007810  0.0001309   5.966 2.49e-09 ***
## tx_early_tx_late 0.9233039  0.0042879 215.330  < 2e-16 ***
## az_late_tx_late  0.7102010  0.0516320  13.755  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.0002099234)
## 
##     Null deviance: 14.272  on 13245  degrees of freedom
## Residual deviance:  2.780  on 13243  degrees of freedom
##   (39 observations deleted due to missingness)
## AIC: -74582
## 
## Number of Fisher Scoring iterations: 2

summary(glm(az_early_az_late ~ tx_early_tx_late + az_late_tx_late, data=as.data.frame(fst)))

## 
## Call:
## glm(formula = az_early_az_late ~ tx_early_tx_late + az_late_tx_late, 
##     data = as.data.frame(fst))
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -0.211377  -0.007209  -0.005599   0.003796   0.243532  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.0055994  0.0004951  11.310  < 2e-16 ***
## tx_early_tx_late 0.8700692  0.0093083  93.472  < 2e-16 ***
## az_late_tx_late  0.5649043  0.0870577   6.489 9.51e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.0008096245)
## 
##     Null deviance: 11.6215  on 4865  degrees of freedom
## Residual deviance:  3.9372  on 4863  degrees of freedom
##   (30 observations deleted due to missingness)
## AIC: -20827
## 
## Number of Fisher Scoring iterations: 2

with(as.data.frame(fst_pre), plot(az_early_az_late, tx_early_tx_late ))

Analsis with novel haplotypes removed

only run on Sango

filtered_fst <- read.csv("data/filtered_fst.csv")

with(filtered_fst, cor.test(az_early_az_late, tx_early_tx_late,method="s"))

## 
##  Spearman's rank correlation rho
## 
## data:  az_early_az_late and tx_early_tx_late
## S = 1.2287e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.3367131

with(filtered_fst, plot(az_early_az_late, tx_early_tx_late))

summary(glm(az_early_az_late ~ tx_early_tx_late, data=filtered_fst))

## 
## Call:
## glm(formula = az_early_az_late ~ tx_early_tx_late, data = filtered_fst)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.20738  -0.03832  -0.03096  -0.00157   0.46630  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.038315   0.001731   22.13   <2e-16 ***
## tx_early_tx_late 0.274149   0.014241   19.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.01148057)
## 
##     Null deviance: 59.430  on 4807  degrees of freedom
## Residual deviance: 55.176  on 4806  degrees of freedom
##   (81 observations deleted due to missingness)
## AIC: -7829.3
## 
## Number of Fisher Scoring iterations: 2

with(filtered_fst, cor.test(az_early_az_late,  tx_early_tx_late,method="s"))

## 
##  Spearman's rank correlation rho
## 
## data:  az_early_az_late and tx_early_tx_late
## S = 1.2287e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.3367131

Effect of novel haplotype removal on Fst

Sanity check plots. We expect that removing novel haplotypes, which are presumably due to African immigration, will decrease the relatedness in post-varroa populaions

filtered_fst.melt <- melt(filtered_fst[,c("az_early_az_late","az_early_tx_early","tx_early_tx_late","az_late_tx_late")])

## No id variables; using all as measure variables

filtered_fst.melt$variable <- mapvalues(filtered_fst.melt$variable, levels(filtered_fst.melt$variable), c("AZ pre vs. post", "AZ & TX pre", "TX pre vs. post", "AZ & TX post"))

filtered_fst.melt$variable <- factor(filtered_fst.melt$variable, levels=levels(filtered_fst.melt$variable)[c(2,4,1,3)])

filtered_fst.melt$filtered <- "yes"

fst.melt <- melt(fst[,c("az_early_az_late","az_early_tx_early","tx_early_tx_late","az_late_tx_late")])

## No id variables; using all as measure variables

fst.melt$filtered <- "no"
ggplot(na.omit(subset(rbind(fst.melt,filtered_fst.melt),fst>10^-9)),aes(x=variable,y=value))+geom_boxplot()+scale_y_log10(labels=trans_format("log10", math_format(10^.x)))+theme_bw()+xlab("Population comparison")+ylab(expression(F["st"])) +theme(axis.title=element_text(face="bold"))+facet_grid(.~filtered, scales = "free_x")

Allele frequency shifts

Allelic frequencies computed using vcftools --freq2 in each population

freqs <- read_tsv("data/frequencies.txt.gz") %>% transmute(AZ = az_late - az_early, TX = tx_late - tx_early) %>% gather(key="pop", value = "late vs early")

## Parsed with column specification:
## cols(
##   chrom = col_character(),
##   pos = col_integer(),
##   az_early = col_double(),
##   az_late = col_double(),
##   tx_early = col_double(),
##   tx_late = col_double()
## )

colorCodes <- list("Arizona"=rgb(244,191,100, maxColorValue=255), "Texas"=rgb(99,177,244,  maxColorValue=255))
ggplot(freqs, aes(`late vs early`, fill=pop))+geom_histogram(bins=50) + theme_bw()+scale_fill_manual(values=c(colorCodes[["Arizona"]], colorCodes[["Texas"]]))+ xlab("Post- vs. pre-Varroa allele frequency")+ylab("Number of sites")

ggplot(freqs, aes(`late vs early`/IQR(`late vs early`)/1.349, fill=pop))+geom_histogram(aes(y=..density..),binwidth=0.1) + theme_bw()+scale_fill_manual(values=c(colorCodes[["Arizona"]], colorCodes[["Texas"]]))+ xlab("Post- vs. pre-Varroa allele frequency")+ylab("Number of sites")+ stat_function(fun = dnorm, color="blue") + xlim(-5, 5)

Functional analysis

GO

go$az_early_az_late <- fst[go$gene,"az_early_az_late"]
go$tx_early_tx_late <- fst[go$gene,"tx_early_tx_late"]
subgo<- ddply(go, .(go), subset, length(gene) >=10 ) # remove any rare terms 


lmfit <- function(formula, data, indices) {
    d<-data[indices, ]
    fit <- (lm(formula, data = d))
    return(coef(fit))
}

Code below run on cluster

results <- boot(data = subgo, statistic = lmfit, R = 10000, formula = az_early_az_late~go+0, strata=subgo$go, parallel="multicore", ncpus=10)
results <- boot(data = subgo, statistic = lmfit, R = 10000, formula = tx_early_tx_late~go+0, strata=subgo$go, parallel="multicore", ncpus=10)

# load pre-computer bootstrapped data
load("data/az.RData")
az_go_boot <- results
load("data/tx.RData")
tx_go_boot <- results

az_go.lm <- lm(az_early_az_late~go+0, data=subgo)
az_go.sig <- data.frame(coef = numeric())
for (i in 1:length(az_go.lm$coefficients)) {
    bci <- boot.ci(az_go_boot, type = "basic", index = i)
    if (bci$basic[4]>0 & bci$basic[4] > 0) az_go.sig[gsub("go","",names(az_go.lm$coefficients)[i]),"coef"] = results$t0[i]
#    print(sprintf("%s,%.4f,%.4f,%.4f", names(az_go.lm$coefficients)[i], results$t0[i], bci$basic[4], bci$basic[5]))
}

tx_go.lm <- lm(tx_early_tx_late~go+0, data=subgo)
tx_go.sig <- data.frame(coef = numeric())
for (i in 1:length(tx_go.lm$coefficients)) {
    bci <- boot.ci(tx_go_boot, type = "basic", index = i)
    if (bci$basic[4]>0 & bci$basic[4] > 0) tx_go.sig[gsub("go","",names(tx_go.lm$coefficients)[i]),"coef"] = results$t0[i]
}

all_go <- na.omit(merge(az_go.sig,tx_go.sig,by="row.names",all.x=TRUE))
all_go$aveEff <- rowMeans(all_go[,c(2,3)])

go.mlm <- lm(cbind(az_early_az_late, tx_early_tx_late)~go+0, data=subgo)

plot(summary(go.mlm)[[1]]$coefficients[,1], summary(go.mlm)[[2]]$coefficients[,1])

cor.test(summary(go.mlm)[[1]]$coefficients[,1], summary(go.mlm)[[2]]$coefficients[,1])

## 
##  Pearson's product-moment correlation
## 
## data:  summary(go.mlm)[[1]]$coefficients[, 1] and summary(go.mlm)[[2]]$coefficients[, 1]
## t = 20.655, df = 199, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7760801 0.8652878
## sample estimates:
##       cor 
## 0.8257815

KEGG pathway enrichment

kegg <- merge(kegg, fst, by.x = "gene", by.y = "names", all.x = T)
subkegg<- ddply(kegg, .(kegg), subset, length(gene) >= 10 ) # remove any rare terms

az_kegg_boot <-  boot(data = subkegg, statistic = lmfit, R = 1000, formula = az_early_az_late~kegg+0, strata=subkegg$kegg, parallel="multicore", ncpus=2)
tx_kegg_boot <- boot(data = subkegg, statistic = lmfit, R = 1000, formula = tx_early_tx_late~kegg+0, strata=subkegg$kegg, parallel="multicore", ncpus=2)
save( az_kegg_boot, tx_kegg_boot, file="kegg_boot.RData")

load("data/kegg_boot.RData")

az_kegg.sig <- data.frame(coef = numeric())
for (i in 1:length(az_kegg_boot$t0)) {
    bci <- boot.ci(az_kegg_boot, type = "basic", index = i)
    if (bci$basic[4]>0 & bci$basic[4] > 0) az_kegg.sig[gsub("kegg","",names(az_kegg_boot$t0)[i]),"coef"] = az_kegg_boot$t0[i]
}

tx_kegg.sig <- data.frame(coef = numeric())
for (i in 1:length(az_kegg_boot$t0)) {
    bci <- boot.ci(tx_kegg_boot, type = "basic", index = i)
    if (bci$basic[4]>0 & bci$basic[4] > 0) tx_kegg.sig[gsub("kegg","",names(az_kegg_boot$t0)[i]),"coef"] = tx_kegg_boot$t0[i]
}

kegg.mlm <- lm(cbind(az_early_az_late, tx_early_tx_late)~kegg+0, data=subkegg)

plot(summary(kegg.mlm)[[1]]$coefficients[,1], summary(kegg.mlm)[[2]]$coefficients[,1])

cor.test(summary(kegg.mlm)[[1]]$coefficients[,1], summary(kegg.mlm)[[2]]$coefficients[,1])

## 
##  Pearson's product-moment correlation
## 
## data:  summary(kegg.mlm)[[1]]$coefficients[, 1] and summary(kegg.mlm)[[2]]$coefficients[, 1]
## t = 15.833, df = 105, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7728418 0.8878603
## sample estimates:
##       cor 
## 0.8395174

Morphometrics

This analysis was written by Jatin Arora

library(ggplot2)
library(shapes)

## 
## Attaching package: 'shapes'

## The following object is masked from 'package:ff':
## 
##     add

library(asbio)
library(Hmisc)

## Loading required package: lattice

## 
## Attaching package: 'lattice'

## The following object is masked from 'package:boot':
## 
##     melanoma

## Loading required package: survival

## 
## Attaching package: 'survival'

## The following object is masked from 'package:boot':
## 
##     aml

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:plyr':
## 
##     is.discrete, summarize

## The following object is masked from 'package:bigmemory':
## 
##     describe

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

# convert from 2d matrix of coordinates to 3d matrix
make3dMatrix = function(dat){

   k = 19 # number of landmarks
   m = 2  # number of dimensions
   ns = nrow(dat)    # number of samples

   input = array(0, dim=c(k,m,ns))
   for(n in 1:ns){
      i = 1
      for(k in 1:19){
         for(m in 1:2){
            input[k,m,n] =  unlist(dat[n,], use.names=F)[i]
            i = i + 1
         }
      }
   }

   return(input)
}

#---------------------------------------------------------------------------------
# load africanisation data
admix = read.table("Tx_Az_morpho_Jatin/admixture_K3.txt",header=T)

file.create("stat.txt")

## [1] TRUE

to.plot = c()
for(pop in c("arizona", "texas")){

   # load raw coordinates data in _format and _db formats
   raw = read.delim(paste("Tx_Az_morpho_Jatin/", pop, "_coord_Jun_Format.txt", sep=""), header = F, sep = "\t", skip = 1)
   db = read.delim(paste("Tx_Az_morpho_Jatin/", pop, "_coord_Jun_DB.txt", sep=""))
   db = db[,-ncol(db)]

   # load samples year information
   yr = read.delim(paste("Tx_Az_morpho_Jatin/", pop, "_yrs.txt", sep=""), header = F)

   # convert coordinate matrix into 3d one and do procrustes anaylsis
   mat = make3dMatrix(dat = raw)
   p = procGPA(x = mat, scale = T, distances = T)

   # extract centroid sizes
   centroid.size = data.frame(size = p$size)
   centroid.size$id = sub("-[1-2]\\..*","",db$image)
   centroid.size$id = sub("\\.[1-2]\\..*","",centroid.size$id)

   centroid.admix = merge(x = centroid.size, y = admix, by = "id", all.x = T, all.y = F, sort = F)
   oldnames <- colnames(centroid.admix)
   centroid.admix <- aggregate(centroid.admix[,c(2:5)],by=list(centroid.admix$id),FUN=mean)
   colnames(centroid.admix) <- oldnames
   t = cor.test(centroid.admix$size,centroid.admix$POP2, method = "s", exact = F)
   print(paste(pop,t$estimate,t$p.value))

   # store centroid size thereby excluding drones
   frame = merge(x = centroid.size, y = yr, by.x = "id", by.y = "V1", all.x = T, sort = F)
   frame = frame[frame$size < 900,]
   frame$pop = capitalize(pop)
   to.plot = rbind(to.plot, frame)

}

## [1] "To speed up use option distances=FALSE"
## [1] "To speed up use option pcaoutput=FALSE"
## [1] "arizona -0.424589432273319 1.89597842553382e-07"
## [1] "To speed up use option distances=FALSE"
## [1] "To speed up use option pcaoutput=FALSE"
## [1] "texas -0.491836381207708 8.8563762311328e-16"

to.plot2 <- na.omit(aggregate(to.plot$size, by=list(to.plot$id),FUN=mean))
colnames(to.plot2) <- c("id", "size")
metadata <- unique(na.omit(to.plot[,c(1,3,4)]))
to.plot2 <- na.omit(merge(x = to.plot2, y = metadata, by = "id", all.x = T, all.y = T, sort = F))

with(subset(to.plot2, pop == "Arizona"), cor.test(size,V2,method="s"))

## 
##  Spearman's rank correlation rho
## 
## data:  size and V2
## S = 641400, p-value = 1.014e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## -0.433034

with(subset(to.plot2, pop == "Texas"), cor.test(size,V2,method="s"))

## 
##  Spearman's rank correlation rho
## 
## data:  size and V2
## S = 3030800, p-value = 1.672e-10
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.4012539

#Figure S1
to.plot3 <- na.omit(join(to.plot2,admix))

## Joining by: id

to.plot3$pop <- factor(to.plot3$pop, levels = c("Texas", "Arizona"))
colnames(to.plot3)[3] <- "year"
colnames(to.plot3)[6] <- "African"
to.plot3$epoch <- "pre"
to.plot3$epoch[to.plot3$year>1995] <- "post"

ggplot(to.plot3,aes(as.factor(year),size,color=African))+facet_grid(pop~.,scales = "free_y")+geom_jitter(width = .3)+ xlab("Year") + ylab("Centroid size") + theme_bw()+theme(panel.grid.major = element_blank(), strip.background = element_rect(fill = "#FFFFFF", colour = "#FFFFFF"), axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),legend.position="top")+stat_smooth(color="blue",method="lm", aes(group=epoch)) + scale_colour_gradient()

size.pop.lm <- lm(size~POP2+pop, data=subset(na.omit(join(to.plot2,admix)),V2>1996))

## Joining by: id

summary(size.pop.lm)

## 
## Call:
## lm(formula = size ~ POP2 + pop, data = subset(na.omit(join(to.plot2, 
##     admix)), V2 > 1996))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.703 -11.911   0.389   9.355  52.105 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  772.619      4.202 183.859  < 2e-16 ***
## POP2         -13.732      5.810  -2.363   0.0191 *  
## popTexas     -12.597      2.347  -5.368 2.31e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.16 on 190 degrees of freedom
## Multiple R-squared:  0.1515, Adjusted R-squared:  0.1426 
## F-statistic: 16.97 on 2 and 190 DF,  p-value: 1.659e-07

partial.R2(lm(size~pop, data=subset(na.omit(join(to.plot2,admix))),V2>1996), size.pop.lm)

## Joining by: id

## [1] 0.02855891

summary(lm(size~POP2, data=subset(na.omit(join(to.plot2,admix)),V2==1995)))

## Joining by: id

## 
## Call:
## lm(formula = size ~ POP2, data = subset(na.omit(join(to.plot2, 
##     admix)), V2 == 1995))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.174 -11.198   0.399  10.774  29.320 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   770.48       2.82 273.185   <2e-16 ***
## POP2          -15.77      16.17  -0.976    0.336    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.64 on 37 degrees of freedom
## Multiple R-squared:  0.02508,    Adjusted R-squared:  -0.001271 
## F-statistic: 0.9517 on 1 and 37 DF,  p-value: 0.3356

library(ggpubr) #ggarrange

## Loading required package: magrittr

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:shapes':
## 
##     add

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

## The following object is masked from 'package:ff':
## 
##     add

climate <- read_csv("data/tx az climate data.csv") %>% mutate(site = ifelse(STATION == "USC00419559", "TX", "AZ"))

## Parsed with column specification:
## cols(
##   STATION = col_character(),
##   NAME = col_character(),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   ELEVATION = col_double(),
##   DATE = col_date(format = ""),
##   DAPR = col_integer(),
##   MDPR = col_double(),
##   PRCP = col_double(),
##   SNOW = col_double(),
##   SNWD = col_double(),
##   TMAX = col_double(),
##   TMIN = col_double(),
##   TOBS = col_double(),
##   WT01 = col_character(),
##   WT05 = col_character()
## )

climateGrouped <- climate %>% mutate(month=gsub("\\d+-","", gsub("-\\d+$", "", DATE)), year = gsub("-\\d+-\\d+$", "", DATE)) %>% 
    group_by(year, month, site)

rainPlot <- ggplot(climateGrouped, aes(year, PRCP, color=month))+geom_violin()+facet_grid(site~., scales = "free_y") + ylab("Precipitation (cm)")+ggtitle("Rainfall")

maxTempPlot <- ggplot(climateGrouped, aes(year, TMAX, color=month))+geom_violin()+facet_grid(site~., scales = "free_y")+ ylab(expression('High temperature ('*~degree*C*')'))+ggtitle("Temperature")

ggarrange(rainPlot, maxTempPlot, ncol=1, nrow=2, labels = c("A", "B"), common.legend = TRUE, legend = "right" )

Granger causality

Granger causality tests whether information content in one signal can predict another signal using time series data.

popData <- to.plot3 %>% select(pop, year, African, size) %>% group_by(pop, year) %>% summarise(Afr = mean(African), wingSize = mean(size) ) %>% left_join(read.table("data/occupancy.txt", header=T) %>%  select(year, pop=population, occupied))

## Joining, by = c("pop", "year")

with(filter(popData, pop == "Arizona"), grangertest(Afr ~ occupied, order = 1))

## Granger causality test
## 
## Model 1: Afr ~ Lags(Afr, 1:1) + Lags(occupied, 1:1)
## Model 2: Afr ~ Lags(Afr, 1:1)
##   Res.Df Df     F   Pr(>F)   
## 1      4                     
## 2      5 -1 64.22 0.001315 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

with(filter(popData, pop == "Texas"), grangertest(Afr ~ occupied, order = 1))

## Granger causality test
## 
## Model 1: Afr ~ Lags(Afr, 1:1) + Lags(occupied, 1:1)
## Model 2: Afr ~ Lags(Afr, 1:1)
##   Res.Df Df      F Pr(>F)
## 1      6                 
## 2      7 -1 0.1033 0.7588

with(filter(popData, pop == "Arizona"), grangertest(occupied ~ Afr, order = 1))

## Granger causality test
## 
## Model 1: occupied ~ Lags(occupied, 1:1) + Lags(Afr, 1:1)
## Model 2: occupied ~ Lags(occupied, 1:1)
##   Res.Df Df      F Pr(>F)
## 1      4                 
## 2      5 -1 0.2642 0.6344

with(filter(popData, pop == "Texas"), grangertest(occupied ~ Afr, order = 1))

## Granger causality test
## 
## Model 1: occupied ~ Lags(occupied, 1:1) + Lags(Afr, 1:1)
## Model 2: occupied ~ Lags(occupied, 1:1)
##   Res.Df Df      F Pr(>F)
## 1      6                 
## 2      7 -1 0.0213 0.8887

with(filter(popData, pop == "Arizona"), grangertest(wingSize ~ Afr, order = 1))

## Granger causality test
## 
## Model 1: wingSize ~ Lags(wingSize, 1:1) + Lags(Afr, 1:1)
## Model 2: wingSize ~ Lags(wingSize, 1:1)
##   Res.Df Df      F Pr(>F)
## 1      4                 
## 2      5 -1 0.0246  0.883

with(filter(popData, pop == "Texas"), grangertest(wingSize ~ Afr, order = 1))

## Granger causality test
## 
## Model 1: wingSize ~ Lags(wingSize, 1:1) + Lags(Afr, 1:1)
## Model 2: wingSize ~ Lags(wingSize, 1:1)
##   Res.Df Df      F  Pr(>F)  
## 1      7                    
## 2      8 -1 4.5271 0.07091 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1