library(PopGenome)
library(bigmemory)
library(plyr)
library(scales)
library(asbio) #R2
library(ppcor) #correlation
library(stringr) #reged
library(reshape2)
library(lmodel2)
library(boot)
library(tidyverse)
go <- read.csv("data/unclustered_go.csv",header = T)
annotation <- read.csv("data/annotation.csv",header=T)
kegg <- read.csv("data/kegg.csv",header = F)
kegg <- merge(annotation[,c(1,3),drop=F], kegg, by.x = "NCBI", by.y = 1)[,2:3]
colnames(kegg) <- c("gene", "kegg")
az_early <- as.character(read.table("data/az_early.txt")[[1]])
az_late <- as.character(read.table("data/az_late.txt")[[1]])
tx_early <- as.character(read.table("data/tx_early.txt")[[1]])
tx_late <- as.character(read.table("data/tx_late.txt")[[1]])
variant <- read.table("data/variant.txt", header=F, col.names = c("name"))
Code run on the OIST cluster, and re-loaded
vcf <- readData("data/vcf/16", format="VCF", gffpath="data/gff/16")
vcf.test <- set.populations(vcf,list(az_early=az_early,az_late=az_late, tx_early=tx_early,tx_late=tx_late), diploid=TRUE)
vcf.test@populations[[1]]
vcf.test <- splitting.data(vcf.test, subsites="gene")
length(vcf.test@region.names)
vcf.test <- F_ST.stats(vcf.test, mode="nucleotide")
gene_names <- c()
fst <- c()
dxy <- c()
dx <- c()
for (i in 1:16) {
load(paste0("data/fst/",i,".RData"))
fst <- cbind(fst, genes1@hap.F_ST.pairwise)
dxy <- cbind(dxy, genes1@hap.diversity.between)
dx <- cbind(dx, t(genes1@hap.diversity.within))
info <- get.feature.names(genes1, gff.file=paste0("/Users/sasha/Dropbox (OIST)/projects/bee varroa/data/gff/",i,"/Group",i,".gff"), chr=paste0("LG",i))
gene_names <- c(gene_names,str_match(info, "ID=(GB[0-9]+)")[,2])
}
fst <- as.data.frame(t(fst))
colnames(fst) <- c("az_early_az_late", "az_early_tx_early", "az_early_tx_late", "az_late_tx_early", "az_late_tx_late", "tx_early_tx_late")
fst$names <- gene_names
fst <- filter(fst, names %in% variant$name)
full.pops <- lm(az_early_az_late ~ tx_early_tx_late + az_late_tx_late, data=fst)
summary(full.pops)
##
## Call:
## lm(formula = az_early_az_late ~ tx_early_tx_late + az_late_tx_late,
## data = fst)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.211377 -0.007209 -0.005599 0.003796 0.243532
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0055994 0.0004951 11.310 < 2e-16 ***
## tx_early_tx_late 0.8700692 0.0093083 93.472 < 2e-16 ***
## az_late_tx_late 0.5649043 0.0870577 6.489 9.51e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02845 on 4863 degrees of freedom
## (30 observations deleted due to missingness)
## Multiple R-squared: 0.6612, Adjusted R-squared: 0.6611
## F-statistic: 4746 on 2 and 4863 DF, p-value: < 2.2e-16
noafr <- lm(az_early_az_late ~ tx_early_tx_late , data=as.data.frame(fst))
partial.R2(noafr, full.pops)*100
## [1] 0.9062168
The similarity between post-Varroa population, i.e., the African contribution to similarity explains only a small percentage of the variance
with(fst, cor.test(az_early_az_late, tx_early_tx_late, method="s"))
##
## Spearman's rank correlation rho
##
## data: az_early_az_late and tx_early_tx_late
## S = 2267300000, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8820745
ggplot(subset(as.data.frame(fst), az_early_az_late != 0 | tx_early_tx_late !=0),aes(az_early_az_late,tx_early_tx_late))+geom_point()+theme_bw()+xlab(expression(paste(F["st"]," AZ pre vs. post")))+ylab(expression(paste(F["st"]," TX pre vs. post")))+geom_smooth(method="lm",se=F)+annotate("text",.05,.4,label = "r = 0.88")
with(as.data.frame(fst), cor.test(az_early_az_late, tx_early_tx_late, method="s"))
##
## Spearman's rank correlation rho
##
## data: az_early_az_late and tx_early_tx_late
## S = 2267300000, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8820745
dxy <- as.data.frame(t(dxy))
colnames(dxy) <- c("az_early_az_late", "az_early_tx_early", "az_early_tx_late", "az_late_tx_early", "az_late_tx_late", "tx_early_tx_late")
dxy$names <- gene_names
dxy <- filter(dxy, names %in% variant$name)
dx <- as.data.frame(t(dx))
colnames(dx) <- c("az_early", "az_late", "tx_early", "tx_late")
dx$names <- gene_names
dx <- filter(dx, names %in% variant$name)
cor(dxy[,1] / dx[,1], dxy[,6] / dx[,3], method="s", use = "complete.obs") # Re-doing with dxy
## [1] 0.8407106
filtered <- read.csv("data/filtered.csv", header=F, col.names = c("name","count"))
# the maximum is 660, of which 282 are in the pre- populations and 278 are in the post
filtered$african <- (382-(filtered$count-282))/382
with(left_join(data.frame(fst=fst[,1], name=fst[,"names"]), filtered, by=c("name")), cor.test(fst, african, method ="s"))
##
## Spearman's rank correlation rho
##
## data: fst and african
## S = 3.3505e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.7298048
left_join(data.frame(fst=fst[,1], name=fst[,"names"]), filtered, by=c("name")) %>% ggplot(aes(african, fst))+geom_point()+geom_smooth()+xlab("Proportion African genes per locus")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Just to check that we’re not getting some sort of spurious imputation artifact.
fst_pre <- c()
for (i in 1:16) {
load(paste0("data/fst_pre_beagle/",i,"_nuc.RData"))
fst_pre <- cbind(fst_pre, genes2@nuc.F_ST.pairwise)
}
fst_pre <- t(fst_pre)
colnames(fst_pre) <- c("az_early_az_late", "az_early_tx_early", "az_early_tx_late", "az_late_tx_early", "az_late_tx_late", "tx_early_tx_late")
summary(glm(az_early_az_late ~ tx_early_tx_late + az_late_tx_late, data=as.data.frame(fst_pre)))
##
## Call:
## glm(formula = az_early_az_late ~ tx_early_tx_late + az_late_tx_late,
## data = as.data.frame(fst_pre))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.243948 -0.000781 -0.000781 -0.000781 0.263291
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0007810 0.0001309 5.966 2.49e-09 ***
## tx_early_tx_late 0.9233039 0.0042879 215.330 < 2e-16 ***
## az_late_tx_late 0.7102010 0.0516320 13.755 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.0002099234)
##
## Null deviance: 14.272 on 13245 degrees of freedom
## Residual deviance: 2.780 on 13243 degrees of freedom
## (39 observations deleted due to missingness)
## AIC: -74582
##
## Number of Fisher Scoring iterations: 2
summary(glm(az_early_az_late ~ tx_early_tx_late + az_late_tx_late, data=as.data.frame(fst)))
##
## Call:
## glm(formula = az_early_az_late ~ tx_early_tx_late + az_late_tx_late,
## data = as.data.frame(fst))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.211377 -0.007209 -0.005599 0.003796 0.243532
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0055994 0.0004951 11.310 < 2e-16 ***
## tx_early_tx_late 0.8700692 0.0093083 93.472 < 2e-16 ***
## az_late_tx_late 0.5649043 0.0870577 6.489 9.51e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.0008096245)
##
## Null deviance: 11.6215 on 4865 degrees of freedom
## Residual deviance: 3.9372 on 4863 degrees of freedom
## (30 observations deleted due to missingness)
## AIC: -20827
##
## Number of Fisher Scoring iterations: 2
with(as.data.frame(fst_pre), plot(az_early_az_late, tx_early_tx_late ))
only run on Sango
filtered_fst <- read.csv("data/filtered_fst.csv")
with(filtered_fst, cor.test(az_early_az_late, tx_early_tx_late,method="s"))
##
## Spearman's rank correlation rho
##
## data: az_early_az_late and tx_early_tx_late
## S = 1.2287e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.3367131
with(filtered_fst, plot(az_early_az_late, tx_early_tx_late))
summary(glm(az_early_az_late ~ tx_early_tx_late, data=filtered_fst))
##
## Call:
## glm(formula = az_early_az_late ~ tx_early_tx_late, data = filtered_fst)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.20738 -0.03832 -0.03096 -0.00157 0.46630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.038315 0.001731 22.13 <2e-16 ***
## tx_early_tx_late 0.274149 0.014241 19.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.01148057)
##
## Null deviance: 59.430 on 4807 degrees of freedom
## Residual deviance: 55.176 on 4806 degrees of freedom
## (81 observations deleted due to missingness)
## AIC: -7829.3
##
## Number of Fisher Scoring iterations: 2
with(filtered_fst, cor.test(az_early_az_late, tx_early_tx_late,method="s"))
##
## Spearman's rank correlation rho
##
## data: az_early_az_late and tx_early_tx_late
## S = 1.2287e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.3367131
Sanity check plots. We expect that removing novel haplotypes, which are presumably due to African immigration, will decrease the relatedness in post-varroa populaions
filtered_fst.melt <- melt(filtered_fst[,c("az_early_az_late","az_early_tx_early","tx_early_tx_late","az_late_tx_late")])
## No id variables; using all as measure variables
filtered_fst.melt$variable <- mapvalues(filtered_fst.melt$variable, levels(filtered_fst.melt$variable), c("AZ pre vs. post", "AZ & TX pre", "TX pre vs. post", "AZ & TX post"))
filtered_fst.melt$variable <- factor(filtered_fst.melt$variable, levels=levels(filtered_fst.melt$variable)[c(2,4,1,3)])
filtered_fst.melt$filtered <- "yes"
fst.melt <- melt(fst[,c("az_early_az_late","az_early_tx_early","tx_early_tx_late","az_late_tx_late")])
## No id variables; using all as measure variables
fst.melt$filtered <- "no"
ggplot(na.omit(subset(rbind(fst.melt,filtered_fst.melt),fst>10^-9)),aes(x=variable,y=value))+geom_boxplot()+scale_y_log10(labels=trans_format("log10", math_format(10^.x)))+theme_bw()+xlab("Population comparison")+ylab(expression(F["st"])) +theme(axis.title=element_text(face="bold"))+facet_grid(.~filtered, scales = "free_x")
Allelic frequencies computed using vcftools --freq2
in each population
freqs <- read_tsv("data/frequencies.txt.gz") %>% transmute(AZ = az_late - az_early, TX = tx_late - tx_early) %>% gather(key="pop", value = "late vs early")
## Parsed with column specification:
## cols(
## chrom = col_character(),
## pos = col_integer(),
## az_early = col_double(),
## az_late = col_double(),
## tx_early = col_double(),
## tx_late = col_double()
## )
colorCodes <- list("Arizona"=rgb(244,191,100, maxColorValue=255), "Texas"=rgb(99,177,244, maxColorValue=255))
ggplot(freqs, aes(`late vs early`, fill=pop))+geom_histogram(bins=50) + theme_bw()+scale_fill_manual(values=c(colorCodes[["Arizona"]], colorCodes[["Texas"]]))+ xlab("Post- vs. pre-Varroa allele frequency")+ylab("Number of sites")
ggplot(freqs, aes(`late vs early`/IQR(`late vs early`)/1.349, fill=pop))+geom_histogram(aes(y=..density..),binwidth=0.1) + theme_bw()+scale_fill_manual(values=c(colorCodes[["Arizona"]], colorCodes[["Texas"]]))+ xlab("Post- vs. pre-Varroa allele frequency")+ylab("Number of sites")+ stat_function(fun = dnorm, color="blue") + xlim(-5, 5)
go$az_early_az_late <- fst[go$gene,"az_early_az_late"]
go$tx_early_tx_late <- fst[go$gene,"tx_early_tx_late"]
subgo<- ddply(go, .(go), subset, length(gene) >=10 ) # remove any rare terms
lmfit <- function(formula, data, indices) {
d<-data[indices, ]
fit <- (lm(formula, data = d))
return(coef(fit))
}
Code below run on cluster
results <- boot(data = subgo, statistic = lmfit, R = 10000, formula = az_early_az_late~go+0, strata=subgo$go, parallel="multicore", ncpus=10)
results <- boot(data = subgo, statistic = lmfit, R = 10000, formula = tx_early_tx_late~go+0, strata=subgo$go, parallel="multicore", ncpus=10)
# load pre-computer bootstrapped data
load("data/az.RData")
az_go_boot <- results
load("data/tx.RData")
tx_go_boot <- results
az_go.lm <- lm(az_early_az_late~go+0, data=subgo)
az_go.sig <- data.frame(coef = numeric())
for (i in 1:length(az_go.lm$coefficients)) {
bci <- boot.ci(az_go_boot, type = "basic", index = i)
if (bci$basic[4]>0 & bci$basic[4] > 0) az_go.sig[gsub("go","",names(az_go.lm$coefficients)[i]),"coef"] = results$t0[i]
# print(sprintf("%s,%.4f,%.4f,%.4f", names(az_go.lm$coefficients)[i], results$t0[i], bci$basic[4], bci$basic[5]))
}
tx_go.lm <- lm(tx_early_tx_late~go+0, data=subgo)
tx_go.sig <- data.frame(coef = numeric())
for (i in 1:length(tx_go.lm$coefficients)) {
bci <- boot.ci(tx_go_boot, type = "basic", index = i)
if (bci$basic[4]>0 & bci$basic[4] > 0) tx_go.sig[gsub("go","",names(tx_go.lm$coefficients)[i]),"coef"] = results$t0[i]
}
all_go <- na.omit(merge(az_go.sig,tx_go.sig,by="row.names",all.x=TRUE))
all_go$aveEff <- rowMeans(all_go[,c(2,3)])
go.mlm <- lm(cbind(az_early_az_late, tx_early_tx_late)~go+0, data=subgo)
plot(summary(go.mlm)[[1]]$coefficients[,1], summary(go.mlm)[[2]]$coefficients[,1])
cor.test(summary(go.mlm)[[1]]$coefficients[,1], summary(go.mlm)[[2]]$coefficients[,1])
##
## Pearson's product-moment correlation
##
## data: summary(go.mlm)[[1]]$coefficients[, 1] and summary(go.mlm)[[2]]$coefficients[, 1]
## t = 20.655, df = 199, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7760801 0.8652878
## sample estimates:
## cor
## 0.8257815
kegg <- merge(kegg, fst, by.x = "gene", by.y = "names", all.x = T)
subkegg<- ddply(kegg, .(kegg), subset, length(gene) >= 10 ) # remove any rare terms
az_kegg_boot <- boot(data = subkegg, statistic = lmfit, R = 1000, formula = az_early_az_late~kegg+0, strata=subkegg$kegg, parallel="multicore", ncpus=2)
tx_kegg_boot <- boot(data = subkegg, statistic = lmfit, R = 1000, formula = tx_early_tx_late~kegg+0, strata=subkegg$kegg, parallel="multicore", ncpus=2)
save( az_kegg_boot, tx_kegg_boot, file="kegg_boot.RData")
load("data/kegg_boot.RData")
az_kegg.sig <- data.frame(coef = numeric())
for (i in 1:length(az_kegg_boot$t0)) {
bci <- boot.ci(az_kegg_boot, type = "basic", index = i)
if (bci$basic[4]>0 & bci$basic[4] > 0) az_kegg.sig[gsub("kegg","",names(az_kegg_boot$t0)[i]),"coef"] = az_kegg_boot$t0[i]
}
tx_kegg.sig <- data.frame(coef = numeric())
for (i in 1:length(az_kegg_boot$t0)) {
bci <- boot.ci(tx_kegg_boot, type = "basic", index = i)
if (bci$basic[4]>0 & bci$basic[4] > 0) tx_kegg.sig[gsub("kegg","",names(az_kegg_boot$t0)[i]),"coef"] = tx_kegg_boot$t0[i]
}
kegg.mlm <- lm(cbind(az_early_az_late, tx_early_tx_late)~kegg+0, data=subkegg)
plot(summary(kegg.mlm)[[1]]$coefficients[,1], summary(kegg.mlm)[[2]]$coefficients[,1])
cor.test(summary(kegg.mlm)[[1]]$coefficients[,1], summary(kegg.mlm)[[2]]$coefficients[,1])
##
## Pearson's product-moment correlation
##
## data: summary(kegg.mlm)[[1]]$coefficients[, 1] and summary(kegg.mlm)[[2]]$coefficients[, 1]
## t = 15.833, df = 105, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7728418 0.8878603
## sample estimates:
## cor
## 0.8395174
This analysis was written by Jatin Arora
library(ggplot2)
library(shapes)
##
## Attaching package: 'shapes'
## The following object is masked from 'package:ff':
##
## add
library(asbio)
library(Hmisc)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:plyr':
##
## is.discrete, summarize
## The following object is masked from 'package:bigmemory':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, units
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# convert from 2d matrix of coordinates to 3d matrix
make3dMatrix = function(dat){
k = 19 # number of landmarks
m = 2 # number of dimensions
ns = nrow(dat) # number of samples
input = array(0, dim=c(k,m,ns))
for(n in 1:ns){
i = 1
for(k in 1:19){
for(m in 1:2){
input[k,m,n] = unlist(dat[n,], use.names=F)[i]
i = i + 1
}
}
}
return(input)
}
#---------------------------------------------------------------------------------
# load africanisation data
admix = read.table("Tx_Az_morpho_Jatin/admixture_K3.txt",header=T)
file.create("stat.txt")
## [1] TRUE
to.plot = c()
for(pop in c("arizona", "texas")){
# load raw coordinates data in _format and _db formats
raw = read.delim(paste("Tx_Az_morpho_Jatin/", pop, "_coord_Jun_Format.txt", sep=""), header = F, sep = "\t", skip = 1)
db = read.delim(paste("Tx_Az_morpho_Jatin/", pop, "_coord_Jun_DB.txt", sep=""))
db = db[,-ncol(db)]
# load samples year information
yr = read.delim(paste("Tx_Az_morpho_Jatin/", pop, "_yrs.txt", sep=""), header = F)
# convert coordinate matrix into 3d one and do procrustes anaylsis
mat = make3dMatrix(dat = raw)
p = procGPA(x = mat, scale = T, distances = T)
# extract centroid sizes
centroid.size = data.frame(size = p$size)
centroid.size$id = sub("-[1-2]\\..*","",db$image)
centroid.size$id = sub("\\.[1-2]\\..*","",centroid.size$id)
centroid.admix = merge(x = centroid.size, y = admix, by = "id", all.x = T, all.y = F, sort = F)
oldnames <- colnames(centroid.admix)
centroid.admix <- aggregate(centroid.admix[,c(2:5)],by=list(centroid.admix$id),FUN=mean)
colnames(centroid.admix) <- oldnames
t = cor.test(centroid.admix$size,centroid.admix$POP2, method = "s", exact = F)
print(paste(pop,t$estimate,t$p.value))
# store centroid size thereby excluding drones
frame = merge(x = centroid.size, y = yr, by.x = "id", by.y = "V1", all.x = T, sort = F)
frame = frame[frame$size < 900,]
frame$pop = capitalize(pop)
to.plot = rbind(to.plot, frame)
}
## [1] "To speed up use option distances=FALSE"
## [1] "To speed up use option pcaoutput=FALSE"
## [1] "arizona -0.424589432273319 1.89597842553382e-07"
## [1] "To speed up use option distances=FALSE"
## [1] "To speed up use option pcaoutput=FALSE"
## [1] "texas -0.491836381207708 8.8563762311328e-16"
to.plot2 <- na.omit(aggregate(to.plot$size, by=list(to.plot$id),FUN=mean))
colnames(to.plot2) <- c("id", "size")
metadata <- unique(na.omit(to.plot[,c(1,3,4)]))
to.plot2 <- na.omit(merge(x = to.plot2, y = metadata, by = "id", all.x = T, all.y = T, sort = F))
with(subset(to.plot2, pop == "Arizona"), cor.test(size,V2,method="s"))
##
## Spearman's rank correlation rho
##
## data: size and V2
## S = 641400, p-value = 1.014e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.433034
with(subset(to.plot2, pop == "Texas"), cor.test(size,V2,method="s"))
##
## Spearman's rank correlation rho
##
## data: size and V2
## S = 3030800, p-value = 1.672e-10
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.4012539
#Figure S1
to.plot3 <- na.omit(join(to.plot2,admix))
## Joining by: id
to.plot3$pop <- factor(to.plot3$pop, levels = c("Texas", "Arizona"))
colnames(to.plot3)[3] <- "year"
colnames(to.plot3)[6] <- "African"
to.plot3$epoch <- "pre"
to.plot3$epoch[to.plot3$year>1995] <- "post"
ggplot(to.plot3,aes(as.factor(year),size,color=African))+facet_grid(pop~.,scales = "free_y")+geom_jitter(width = .3)+ xlab("Year") + ylab("Centroid size") + theme_bw()+theme(panel.grid.major = element_blank(), strip.background = element_rect(fill = "#FFFFFF", colour = "#FFFFFF"), axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),legend.position="top")+stat_smooth(color="blue",method="lm", aes(group=epoch)) + scale_colour_gradient()
size.pop.lm <- lm(size~POP2+pop, data=subset(na.omit(join(to.plot2,admix)),V2>1996))
## Joining by: id
summary(size.pop.lm)
##
## Call:
## lm(formula = size ~ POP2 + pop, data = subset(na.omit(join(to.plot2,
## admix)), V2 > 1996))
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.703 -11.911 0.389 9.355 52.105
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 772.619 4.202 183.859 < 2e-16 ***
## POP2 -13.732 5.810 -2.363 0.0191 *
## popTexas -12.597 2.347 -5.368 2.31e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.16 on 190 degrees of freedom
## Multiple R-squared: 0.1515, Adjusted R-squared: 0.1426
## F-statistic: 16.97 on 2 and 190 DF, p-value: 1.659e-07
partial.R2(lm(size~pop, data=subset(na.omit(join(to.plot2,admix))),V2>1996), size.pop.lm)
## Joining by: id
## [1] 0.02855891
summary(lm(size~POP2, data=subset(na.omit(join(to.plot2,admix)),V2==1995)))
## Joining by: id
##
## Call:
## lm(formula = size ~ POP2, data = subset(na.omit(join(to.plot2,
## admix)), V2 == 1995))
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.174 -11.198 0.399 10.774 29.320
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 770.48 2.82 273.185 <2e-16 ***
## POP2 -15.77 16.17 -0.976 0.336
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.64 on 37 degrees of freedom
## Multiple R-squared: 0.02508, Adjusted R-squared: -0.001271
## F-statistic: 0.9517 on 1 and 37 DF, p-value: 0.3356
library(ggpubr) #ggarrange
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:shapes':
##
## add
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:ff':
##
## add
climate <- read_csv("data/tx az climate data.csv") %>% mutate(site = ifelse(STATION == "USC00419559", "TX", "AZ"))
## Parsed with column specification:
## cols(
## STATION = col_character(),
## NAME = col_character(),
## LATITUDE = col_double(),
## LONGITUDE = col_double(),
## ELEVATION = col_double(),
## DATE = col_date(format = ""),
## DAPR = col_integer(),
## MDPR = col_double(),
## PRCP = col_double(),
## SNOW = col_double(),
## SNWD = col_double(),
## TMAX = col_double(),
## TMIN = col_double(),
## TOBS = col_double(),
## WT01 = col_character(),
## WT05 = col_character()
## )
climateGrouped <- climate %>% mutate(month=gsub("\\d+-","", gsub("-\\d+$", "", DATE)), year = gsub("-\\d+-\\d+$", "", DATE)) %>%
group_by(year, month, site)
rainPlot <- ggplot(climateGrouped, aes(year, PRCP, color=month))+geom_violin()+facet_grid(site~., scales = "free_y") + ylab("Precipitation (cm)")+ggtitle("Rainfall")
maxTempPlot <- ggplot(climateGrouped, aes(year, TMAX, color=month))+geom_violin()+facet_grid(site~., scales = "free_y")+ ylab(expression('High temperature ('*~degree*C*')'))+ggtitle("Temperature")
ggarrange(rainPlot, maxTempPlot, ncol=1, nrow=2, labels = c("A", "B"), common.legend = TRUE, legend = "right" )
Granger causality tests whether information content in one signal can predict another signal using time series data.
popData <- to.plot3 %>% select(pop, year, African, size) %>% group_by(pop, year) %>% summarise(Afr = mean(African), wingSize = mean(size) ) %>% left_join(read.table("data/occupancy.txt", header=T) %>% select(year, pop=population, occupied))
## Joining, by = c("pop", "year")
with(filter(popData, pop == "Arizona"), grangertest(Afr ~ occupied, order = 1))
## Granger causality test
##
## Model 1: Afr ~ Lags(Afr, 1:1) + Lags(occupied, 1:1)
## Model 2: Afr ~ Lags(Afr, 1:1)
## Res.Df Df F Pr(>F)
## 1 4
## 2 5 -1 64.22 0.001315 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
with(filter(popData, pop == "Texas"), grangertest(Afr ~ occupied, order = 1))
## Granger causality test
##
## Model 1: Afr ~ Lags(Afr, 1:1) + Lags(occupied, 1:1)
## Model 2: Afr ~ Lags(Afr, 1:1)
## Res.Df Df F Pr(>F)
## 1 6
## 2 7 -1 0.1033 0.7588
with(filter(popData, pop == "Arizona"), grangertest(occupied ~ Afr, order = 1))
## Granger causality test
##
## Model 1: occupied ~ Lags(occupied, 1:1) + Lags(Afr, 1:1)
## Model 2: occupied ~ Lags(occupied, 1:1)
## Res.Df Df F Pr(>F)
## 1 4
## 2 5 -1 0.2642 0.6344
with(filter(popData, pop == "Texas"), grangertest(occupied ~ Afr, order = 1))
## Granger causality test
##
## Model 1: occupied ~ Lags(occupied, 1:1) + Lags(Afr, 1:1)
## Model 2: occupied ~ Lags(occupied, 1:1)
## Res.Df Df F Pr(>F)
## 1 6
## 2 7 -1 0.0213 0.8887
with(filter(popData, pop == "Arizona"), grangertest(wingSize ~ Afr, order = 1))
## Granger causality test
##
## Model 1: wingSize ~ Lags(wingSize, 1:1) + Lags(Afr, 1:1)
## Model 2: wingSize ~ Lags(wingSize, 1:1)
## Res.Df Df F Pr(>F)
## 1 4
## 2 5 -1 0.0246 0.883
with(filter(popData, pop == "Texas"), grangertest(wingSize ~ Afr, order = 1))
## Granger causality test
##
## Model 1: wingSize ~ Lags(wingSize, 1:1) + Lags(Afr, 1:1)
## Model 2: wingSize ~ Lags(wingSize, 1:1)
## Res.Df Df F Pr(>F)
## 1 7
## 2 8 -1 4.5271 0.07091 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1