require(Biostrings)
require(phyloseq)
require(tidyverse)
require(ggpubr)
setwd("~/storage/nal/")
Preprocessing:
Basic dada2 pipeline:
truncLen=c(185,185)
trimLeft=20
maxEE=c(6,8)
pool=FALSE
silva_nr_v138
delete bad taxa by decontam +
annosighted +
chloroplasts/mitho +
low reads/richness +
reads lenght check
tree from SEPP-QIIME2-plagin
sed -i ‘1s/^/#OTU_ID/’ otu_table.txt biom convert -i otu_table.txt -o otu_table.biom –to-hdf5 qiime tools import
–input-path rep_seq.fasta
–output-path rep_seq.qza
–type ‘FeatureData[Sequence]’ qiime fragment-insertion sepp
–i-representative-sequences rep_seq.qza
–i-reference-database ~/storage/somebases/sepp-refs-silva-128.qza
–o-tree insertion-tree.qza
–o-placements insertion-placements.qza
–p-threads 20
qiime tools import
–input-path otu_table.biom
–type ‘FeatureTable[Frequency]’
–input-format BIOMV210Format
–output-path feature-table.qza qiime fragment-insertion filter-features
–i-table feature-table.qza
–i-tree insertion-tree.qza
–o-filtered-table filtered_table.qza
–o-removed-table removed_table.qza  unzip -p filtered_table.qza > filtered_table.biom
biom convert -i filtered_table.biom -o filtered_table.txt
–to-tsv
sed -i ‘1d’ filtered_table.txt
unzip -p insertion-tree.qza /data/ > tree.nwk
I also did the PLSDA, but the result was a nightmare to interpret.
Decontam package also give me the irrevalent result.
And there were considerably more beta-plots and heatmaps in first version of this data analysis.
The quantitative data from DeSeq2 do not exactly match the data in the article supplement because of the different filtering across samples. This does not lead to a principal change in the ASVs composition and it did not result in any changes in the article itself, but plots look match better. The ILR analysis (philR) gives more unstable results, as a consequence, this analysis is discussed in the context of the taxonomic level of changing taxa in the article. With this approach, the results are constant with different strategies of filtering minors/samples with low representation.
Removing nonidentified on phylum level phylotypes, mithohondria/chloroplasts.
track <- read_tsv("track.tsv")
ps <- read_rds("ps.rds")
ps <- merge_phyloseq(ps, refseq(readDNAStringSet("rep_seq.fasta")))
d <- density(width(ps@refseq))
plot(d)
pop_taxa <- function(physeq, badTaxa){
allTaxa = taxa_names(physeq)
myTaxa <- allTaxa[!(allTaxa %in% badTaxa)]
return(prune_taxa(myTaxa, physeq))
}
delete_mit_chl_eu <- function(ps){
badTaxa <- taxa_names(subset_taxa(ps, Order=="Chloroplast"))
ps <- pop_taxa(ps, badTaxa)
badTaxa <- taxa_names(subset_taxa(ps, Family=="Mitochondria"))
ps <- pop_taxa(ps, badTaxa)
badTaxa <- taxa_names(subset_taxa(ps, Kingdom == "Eukaryota"))
ps <- pop_taxa(ps, badTaxa)
badTaxa <- taxa_names(subset_taxa(ps, is.na(Phylum)))
ps <- pop_taxa(ps, badTaxa)
return(ps)
}
ps.f <- delete_mit_chl_eu(ps)
d <- density(width(ps.f@refseq))
plot(d)
adding full metadata
map <- read_tsv("mapping_nalchic_new.tsv",
col_types = cols(
pH = col_character()
)
)
map <- column_to_rownames(map, "Sample")
site_column <- ps.f@sam_data %>%
as.data.frame() %>%
pull('Site')
map <- map %>%
mutate(Type = str_replace(Type, " /", "/")) %>%
mutate(`Corg,_%` = str_replace(`Corg,_%`, ",", ".")) %>%
mutate(`NO3_mg/kg` = str_replace(`NO3_mg/kg` , "," , ".")) %>%
mutate(pH = str_replace(pH, ",", ".")) %>%
mutate(final_concentration = str_replace(final_concentration, ",", ".")) %>%
mutate(pH = as.numeric(pH)) %>%
mutate(final_concentration = as.numeric(final_concentration)) %>%
mutate(`NO3_mg/kg` = as.numeric(`NO3_mg/kg`)) %>%
mutate(`Corg,_%` = as.numeric(`Corg,_%`)) %>%
mutate(`Day_sampling` = as.factor(`Day_sampling`)) %>%
mutate(
Quarry_or_Benchmark = case_when(
grepl('Quarry', What) ~ 'Quarry',
grepl('Benchmark', What) ~ 'Benchmark'
)
) %>%
mutate(
Region = case_when(
grepl('meadow_humus', Soil) ~ 'Urvan',
grepl('Carbonate_chernozem', Soil) ~ 'Progress',
grepl('Forest_gray', Soil) ~ 'Gerpegesh',
)
) %>%
relocate(Quarry_or_Benchmark, .before = Insolation) %>%
mutate(Repeat = site_column) %>%
relocate(Repeat, .after = Site)
map <- map %>% mutate_if(sapply(map, is.character), as.factor)
map_m <- map
colnames(map_m)
## [1] "Site" "Repeat" "Soil"
## [4] "Type" "Colour" "What"
## [7] "Quarry_or_Benchmark" "Insolation" "Feature"
## [10] "Temperature" "Humidity" "Height"
## [13] "Longitude" "Latitude" "pH"
## [16] "C,_%" "Corg,_%" "P_mg/kg"
## [19] "K_mg/kg" "NH4_mg/kg" "NO3_mg/kg"
## [22] "Weight_g" "SL1_mkl" "SX_mkl"
## [25] "EB_mkl" "Date_isolation" "final_concentration"
## [28] "Day_sampling" "Region"
colnames(map_m)[16] <- "C"
colnames(map_m)[17] <- "C_org"
colnames(map_m)[18] <- "P"
colnames(map_m)[19] <- "K"
colnames(map_m)[20] <- "NH4"
colnames(map_m)[21] <- "NO3"
colnames(map)
## [1] "Site" "Repeat" "Soil"
## [4] "Type" "Colour" "What"
## [7] "Quarry_or_Benchmark" "Insolation" "Feature"
## [10] "Temperature" "Humidity" "Height"
## [13] "Longitude" "Latitude" "pH"
## [16] "C,_%" "Corg,_%" "P_mg/kg"
## [19] "K_mg/kg" "NH4_mg/kg" "NO3_mg/kg"
## [22] "Weight_g" "SL1_mkl" "SX_mkl"
## [25] "EB_mkl" "Date_isolation" "final_concentration"
## [28] "Day_sampling" "Region"
the temp feature added - the feature of normalized temperature by days
ps.f <- prune_samples(!sample_data(ps)$Soil %in% c("Forest_gray"), ps.f)
ps.f <- prune_taxa(taxa_sums(ps.f) > 0, ps.f)
sample_data(ps.f) <- map_m
ps.nc <- prune_samples(sample_data(ps.f)$Day_sampling %in% c("1"), ps.f)
map3 <- data.frame(ps.nc@sam_data) %>% mutate(temp = scale(Temperature))
ps.nc <- merge_phyloseq(ps.nc, sample_data(map3))
ps.nc1 <- prune_samples(sample_data(ps.f)$Day_sampling %in% c("2", "3"), ps.f)
map3 <- data.frame(ps.nc1@sam_data) %>% mutate(temp = scale(Temperature))
ps.nc1 <- merge_phyloseq(ps.nc1, sample_data(map3))
ps.f <- merge_phyloseq(ps.nc, ps.nc1)
Richness/depth correlation./ Removing oultlier samples.
plot_rich_reads_samlenames_lm <- function(physeq){
rish <- estimate_richness(physeq, measures = "Observed")
reads.sum <- as.data.frame(sample_sums(physeq))
reads.summary <- cbind(rish, reads.sum)
colnames(reads.summary) <- c("otus","reads")
reads.summary["Repeat"] <-unlist(purrr::map(stringr::str_split(rownames(physeq@sam_data), "N_", 2), function(x) x[[2]]))
reads.summary["Site"] <- physeq@sam_data$Site
library(ggrepel)
require(ggforce)
p1 <- ggplot(data=reads.summary) +
geom_point(aes(y=otus, x=log2(reads), color=Site),size=3) +
geom_text_repel(aes(y=otus, x=log2(reads), label=paste0(Site, "_", Repeat))) +
theme_bw() +
geom_smooth(aes(y=otus, x=log2(reads), fill=Site, color=Site),method=lm, se=FALSE, ymin = 1) +
scale_x_continuous(sec.axis = sec_axis(sec.axis ~ 2**.))
# geom_mark_ellipse(aes(y = otus, x=reads, group = Repeats, label = Repeats, color = Repeats), label.fontsize = 10, label.buffer = unit(2, "mm"), label.minwidth = unit(5, "mm"),con.cap = unit(0.1, "mm"))
return(p1)
}
p.rich1 <- plot_rich_reads_samlenames_lm(ps.f)
ps.f2 <- subset_samples(ps.f, sample_names(ps.f) != 'N_95')
ps.f2 <- subset_samples(ps.f2, sample_names(ps.f2) != 'N_96')
ps.f2 <- subset_samples(ps.f2, sample_names(ps.f2) != 'N_89')
ps.f2 <- subset_samples(ps.f2, sample_names(ps.f2) != 'N_31')
# ps.f2 <- subset_samples(ps.f2, sample_names(ps.f2) != 'N_70')
ps.f2 <- subset_samples(ps.f2, Site != 'N8-2')
ps.f2 <- subset_samples(ps.f2, Site != 'N8-3')
p.rich <- plot_rich_reads_samlenames_lm(ps.f)
p.rich2 <- plot_rich_reads_samlenames_lm(ps.f2)
ggarrange(p.rich, p.rich2)
ps.f2 <- prune_taxa(taxa_sums(ps.f2) > 0, ps.f2)
ps.f <- ps.f2
Variance stabilization by DESeq2 package.
ps_vst <- function(ps, factor){
require(DESeq2)
require(phyloseq)
diagdds = phyloseq_to_deseq2(ps, as.formula(paste( "~", factor)))
diagdds = estimateSizeFactors(diagdds, type="poscounts")
diagdds = estimateDispersions(diagdds, fitType = "local")
pst <- varianceStabilizingTransformation(diagdds)
pst.dimmed <- t(as.matrix(assay(pst)))
pst.dimmed[pst.dimmed < 0.0] <- 0.0
ps.varstab <- ps
otu_table(ps.varstab) <- otu_table(pst.dimmed, taxa_are_rows = FALSE)
return(ps.varstab)
}
ps.varstab <- ps_vst(ps.f, "Repeat")
beta_custom_norm_NMDS_elli <- function(ps, seed = 7888, normtype="vst", Color="What", Group="Repeat"){
require(phyloseq)
require(ggplot2)
require(ggpubr)
require(DESeq2)
library(ggforce)
#normalisation. unifrac - rarefaction; wunifrac,bray - varstab
#variant with only bray
if (exists("ps.varstab") == FALSE){
diagdds = phyloseq_to_deseq2(ps, ~ Site)
diagdds = estimateSizeFactors(diagdds, type="poscounts")
diagdds = estimateDispersions(diagdds, fitType = "local")
if (normtype =="vst")
pst <- varianceStabilizingTransformation(diagdds)
if (normtype =="log")
pst <- rlogTransformation(diagdds)
pst.dimmed <- t(as.matrix(assay(pst)))
pst.dimmed[pst.dimmed < 0.0] <- 0.0
ps.varstab <- ps
otu_table(ps.varstab) <- otu_table(pst.dimmed, taxa_are_rows = FALSE)
}
ordination.b <- ordinate(ps.varstab, "NMDS", "bray")
p <- plot_ordination(ps.varstab,
ordination.b,
type="sample",
color = Color,
# title="NMDS - Bray-Curtis",
title=NULL,
axes = c(1,2) ) +
theme_bw() +
theme(text = element_text(size = 10)) +
geom_point(size = 3) +
geom_mark_ellipse(aes_string(group = Group, label = Group),
label.fontsize = 10,
label.buffer = unit(2, "mm"),
label.minwidth = unit(5, "mm"),
con.cap = unit(0.1, "mm"),
con.colour='gray') +
theme(legend.position = "none")
return(p)
}
p.beta <- beta_custom_norm_NMDS_elli(ps.f, Group="Site", Color="Site")
p.beta
Heatmaps with relative abundance by ampvis2 package
phyloseq_to_amp <- function(ps){
require(ampvis2)
require(tibble)
require(phyloseq)
colnames(ps@tax_table@.Data) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
OTU1 = as(otu_table(ps), "matrix")
OTU1 <- t(OTU1)
OTUdf = as.data.frame(OTU1)
taxa.ps <- as(tax_table(ps), "matrix")
taxa.df = as.data.frame(taxa.ps)
my_otu_table <- merge(OTUdf, taxa.df, by=0)
my_otu_table <- column_to_rownames(my_otu_table, var="Row.names")
my_metadata <- as_tibble(sample_data(ps), rownames=NA)
my_metadata <- rownames_to_column(my_metadata,var = "SampleID")
my_tree <- phy_tree(ps)
amp.ps <- amp_load(otutable = my_otu_table, metadata = my_metadata, tree = my_tree)
return(amp.ps)
}
amp <- phyloseq_to_amp(ps.f)
amp
## ampvis2 object with 4 elements.
## Summary of OTU table:
## Samples OTUs Total#Reads Min#Reads Max#Reads Median#Reads
## 83 10960 1759579 7397 36229 20959
## Avg#Reads
## 21199.75
##
## Assigned taxonomy:
## Kingdom Phylum Class Order Family
## 10960(100%) 10950(99.91%) 10607(96.78%) 9568(87.3%) 7678(70.05%)
## Genus Species
## 4388(40.04%) 232(2.12%)
##
## Metadata variables: 31
## SampleID, Site, Repeat, Soil, Type, Colour, What, Quarry_or_Benchmark, Insolation, Feature, Temperature, Humidity, Height, Longitude, Latitude, pH, C, C_org, P, K, NH4, NO3, Weight_g, SL1_mkl, SX_mkl, EB_mkl, Date_isolation, final_concentration, Day_sampling, Region, temp
p.heat.phyla <- amp_heatmap(amp,
tax_show = 15,
group_by = "Site",
tax_aggregate = "Phylum",
tax_class = "Proteobacteria") +
theme_bw() +
theme(text = element_text(size=15),
legend.position = "none",
axis.text.x=element_text(angle=45,hjust=1))
p.heat.phyla
Spltting phyloseq object by factors
ps.nc <- prune_samples(sample_data(ps.f)$Region %in% c("Urvan"), ps.f)
ps.nc <- prune_taxa(taxa_sums(ps.nc) > 0, ps.nc)
ps.cc <- prune_samples(sample_data(ps.f)$Region %in% c("Progress"), ps.f)
ps.cc <- prune_taxa(taxa_sums(ps.cc) > 0, ps.cc)
ps.cc2 <- prune_samples(sample_data(ps.f)$Site %in% c("N1", "N3", "N5", "N6", "N7"), ps.f)
ps.cc2 <- prune_taxa(taxa_sums(ps.cc2) > 0, ps.cc2)
ps.n13 <- prune_samples(sample_data(ps.f)$Site %in% c("N1", "N3"), ps.f)
ps.n13 <- prune_taxa(taxa_sums(ps.n13) > 0, ps.n13)
ps.cc2 <- prune_samples(sample_data(ps.f)$Site %in% c("N2", "N4", ""), ps.f)
ps.cc2 <- prune_taxa(taxa_sums(ps.cc2) > 0, ps.cc2)
PERMANOVA - Urvan
library(vegan)
physeq <- ps.nc
dist <- phyloseq::distance(physeq, "bray")
metadata <- as(sample_data(physeq@sam_data), "data.frame")
ad <- adonis2(dist ~ Quarry_or_Benchmark, data = metadata)
ad
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist ~ Quarry_or_Benchmark, data = metadata)
## Df SumOfSqs R2 F Pr(>F)
## Quarry_or_Benchmark 1 3.0915 0.26572 13.39 0.001 ***
## Residual 37 8.5429 0.73428
## Total 38 11.6344 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
PERMANOVA - Progress
library(vegan)
physeq <- ps.cc
dist <- phyloseq::distance(physeq, "bray")
metadata <- as(sample_data(physeq@sam_data), "data.frame")
ad <- adonis2(dist ~ Quarry_or_Benchmark, data = metadata)
ad
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist ~ Quarry_or_Benchmark, data = metadata)
## Df SumOfSqs R2 F Pr(>F)
## Quarry_or_Benchmark 1 0.7377 0.10962 5.1706 0.001 ***
## Residual 42 5.9920 0.89038
## Total 43 6.7297 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
More Progress - UniFrac
dist <- phyloseq::distance(ps.cc2, "unifrac")
metadata <- as(sample_data(ps.cc2@sam_data), "data.frame")
adonis( dist ~ Soil, data = metadata)
##
## Call:
## adonis(formula = dist ~ Soil, data = metadata)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Soil 1 0.7148 0.71485 5.1936 0.22392 0.001 ***
## Residuals 18 2.4775 0.13764 0.77608
## Total 19 3.1924 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Distances caclulation from article https://doi.org/10.1080/00029890.2003.11919989
Bray-Curtis distance
Urvan
dist <- phyloseq::distance(ps.nc, "bray")
names.nc.b <- subset(ps.nc@sam_data, ps.nc@sam_data$Quarry_or_Benchmark %in% "Benchmark") %>%
rownames()
names.nc.q <- subset(ps.nc@sam_data, ps.nc@sam_data$Quarry_or_Benchmark %in% "Quarry") %>%
rownames()
usedist::dist_between_centroids(dist, names.nc.b, names.nc.q, squared = TRUE)
## [1] 0.3721286
Distances caclulation from article https://doi.org/10.1080/00029890.2003.11919989
Bray-Curtis distance
Progress
dist <- phyloseq::distance(ps.cc2, "bray")
names.cc.m <- subset(ps.cc2@sam_data, ps.cc2@sam_data$Soil %in% "meadow_humus-cryptogley") %>%
rownames()
names.cc.c <- subset(ps.cc2@sam_data, ps.cc2@sam_data$Soil %in% "Carbonate_chernozem") %>%
rownames()
usedist::dist_between_centroids(dist, names.cc.m, names.cc.c, squared = TRUE)
## [1] 0.3233268
Progress - Permanova
UniFrac
dist <- phyloseq::distance(ps.cc, "unifrac")
metadata <- as(sample_data(ps.cc@sam_data), "data.frame")
adonis( dist ~ Quarry_or_Benchmark, data = metadata)
##
## Call:
## adonis(formula = dist ~ Quarry_or_Benchmark, data = metadata)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Quarry_or_Benchmark 1 0.4995 0.49948 3.1287 0.06933 0.001 ***
## Residuals 42 6.7051 0.15965 0.93067
## Total 43 7.2046 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Boxplot
dist <- phyloseq::distance(ps.f, method = "bray")
groups <- ps.f@sam_data$Region
mod <- vegan::betadisper(dist, groups)
mod.box <- boxplot(mod)
Tukey post-hoc
TukeyHSD(mod)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = distances ~ group, data = df)
##
## $group
## diff lwr upr p adj
## Progress-Urvan -0.158795 -0.1817297 -0.1358603 0
library(ggpubr)
alpha.custom <- function(ps.f, arrga = "PD"){
require(picante)
pd <- pd(ps.f@otu_table@.Data, ps.f@phy_tree)
pd1 <- estimate_richness(ps.f, measures=c("Observed", "InvSimpson", "Shannon"))
pd <- cbind(pd, ps.f@sam_data, pd1 )
bmi <- levels(as.factor(pd$Quarry_or_Benchmark))
bmi.pairs <- combn(seq_along(bmi), 2, simplify = FALSE, FUN = function(i)bmi[i])
p1 <- ggviolin(pd, x = "Quarry_or_Benchmark", y = arrga,
add = "boxplot", fill = "Quarry_or_Benchmark") + stat_compare_means(comparisons = bmi.pairs, method = "wilcox.test", size=3) +
scale_x_discrete(limits = c("Quarry", "Benchmark")) +
theme(axis.title.x = element_blank(),
legend.title = element_blank(),
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1),
text = element_text(size=9))
return(p1)
return(p1)
}
alpha.custom2 <- function(ps.f, arrga = "PD"){
require(picante)
pd <- pd(ps.f@otu_table@.Data, ps.f@phy_tree)
pd1 <- estimate_richness(ps.f, measures=c("Observed", "InvSimpson", "Shannon"))
pd <- cbind(pd, ps.f@sam_data, pd1 )
bmi <- levels(as.factor(pd$Quarry_or_Benchmark))
bmi.pairs <- combn(seq_along(bmi), 2, simplify = FALSE, FUN = function(i)bmi[i])
p1 <- ggviolin(pd, x = "Quarry_or_Benchmark", y = arrga,
add = "boxplot", fill = "Quarry_or_Benchmark") +
stat_compare_means(comparisons = bmi.pairs, method = "wilcox.test", size=3) +
scale_x_discrete(limits = c("Quarry", "Benchmark")) +
theme(axis.title.x = element_blank(),
legend.title = element_blank(),
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1),
text = element_text(size=9))
return(p1)
}
p.alpha.Cdt.oo <- alpha.custom(ps.cc, arrga = "Observed")
p.alpha.Cdt.sh <- alpha.custom(ps.cc, arrga = "PD")
p.alpha.Cdt.is <- alpha.custom(ps.cc, arrga = "Shannon")
p.alpha.Cdt.pd <- alpha.custom(ps.cc, arrga = "InvSimpson")
p.alpha.Cdt.oo2 <- alpha.custom2(ps.nc, arrga = "Observed")
p.alpha.Cdt.sh2 <- alpha.custom2(ps.nc, arrga = "PD")
p.alpha.Cdt.is2 <- alpha.custom2(ps.nc, arrga = "Shannon")
p.alpha.Cdt.pd2 <- alpha.custom2(ps.nc, arrga = "InvSimpson")
p1 <- ggarrange(p.alpha.Cdt.oo, p.alpha.Cdt.sh,p.alpha.Cdt.is, p.alpha.Cdt.pd ,
p.alpha.Cdt.oo2, p.alpha.Cdt.sh2, p.alpha.Cdt.is2, p.alpha.Cdt.pd2,
ncol = 4 ,label.x = 0.3, nrow = 2, font.label = c(size = 10), labels = c("Progress", "", "", "", "Urvan"))
p1
ps.varstab.merged <- phyloseq::merge_samples(ps.varstab, "Repeat")
ps.no3 <- subset_taxa(ps.varstab.merged, taxa_names(ps.varstab) %in% c("Seq70", "Seq71", "Seq161", "Seq170", "Seq171"))
otus.no3 <- as.data.frame(ps.no3@otu_table)
otus.no3$NO3 <- ps.no3@sam_data$NO3
cor.test(otus.no3$Seq170, otus.no3$NO3, method="spearman")
##
## Spearman's rank correlation rho
##
## data: otus.no3$Seq170 and otus.no3$NO3
## S = 260.22, p-value = 0.000003078
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8310254
des_w_simper <- function(ps, factor){
require(DESeq2)
require(vegan)
require(tibble)
require(phyloseq)
diagdds = phyloseq_to_deseq2(ps, as.formula(paste( "~", factor)))
diagdds = estimateSizeFactors(diagdds, type="poscounts")
diagdds = estimateDispersions(diagdds, fitType = "local")
diagdds = DESeq(diagdds)
samp <-sample_data(ps)
aggdata <- t(aggregate.data.frame(as.data.frame(as.data.frame(t(diagdds@assays@data$mu))), by=list(samp[[factor]]), median))
colnames(aggdata) <- aggdata[1,]
aggdata <- aggdata[-1,]
res = results(diagdds)
res.df <- as.data.frame(res)
nice <- cbind(res.df, as.data.frame(ps@tax_table@.Data[rownames(res.df),]), as.data.frame(aggdata)[rownames(res.df),])
}
des.nc <- des_w_simper(ps.nc, "Quarry_or_Benchmark")
des.cc <- des_w_simper(ps.cc, "Quarry_or_Benchmark")
des.so <- des_w_simper(ps.f, "Region")
des.nc.filt <- des.nc %>%
filter(baseMean > 10) %>%
filter(padj < 0.05) %>%
arrange(log2FoldChange)
des.cc.filt <- des.cc %>%
filter(baseMean > 10) %>%
filter(padj < 0.05) %>%
arrange(log2FoldChange)
des.so.filt <- des.so %>%
filter(baseMean > 10) %>%
filter(padj < 0.05) %>%
arrange(log2FoldChange)
f1 <- ggplot(des.nc.filt, aes(x=log2FoldChange, y=baseMean)) +
geom_vline(xintercept = 0, size = 0.75, color = "red", alpha=0.3) +
labs(title="Urvan, Benchmark/Quarry") +
geom_point() +
theme_bw() +
theme(text = element_text(size=8))
f2 <- ggplot(des.cc.filt, aes(x=log2FoldChange, y=baseMean)) +
geom_vline(xintercept = 0, size = 0.75, color = "red", alpha=0.3) +
labs(title="Progress, Benchmark/Quarry") +
geom_point() +
theme_bw() +
theme(text = element_text(size=8))
f3 <- ggplot(des.so.filt, aes(x=log2FoldChange, y=baseMean)) +
geom_vline(xintercept = 0, size = 0.75, color = "red", alpha=0.3) +
labs(title="Urvan/Progress") +
geom_point() +
theme_bw() +
theme(text = element_text(size=8))
p.des <- ggarrange(f1,f2,f3,nrow = 1)
ggarrange(f1,f2,f3,nrow = 1)
library(phyloseq)
library(vegan)
library(ggrepel)
ps.merged <- phyloseq::merge_samples(ps.f, "Repeat")
physeq <- ps.merged
veganifyOTU <- function(physeq){
require(phyloseq)
if(taxa_are_rows(physeq)){physeq <- t(physeq)}
return(as(otu_table(physeq), "matrix"))
}
otus.ps.vegan <- veganifyOTU(physeq)
metadata <- as(sample_data(physeq), "data.frame")
cca <- vegan::cca(otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data=metadata)
kate.ggcca.sites <- function(vare.cca){
require(tidyverse)
biplot <- as.data.frame(vare.cca$CCA$biplot)
wa <- as.data.frame(vare.cca$CCA$wa)
biplot <- rownames_to_column(biplot, "Label") %>%
add_column(Score = rep("biplot", length(rownames(biplot))))
wa <- rownames_to_column(wa, "Label") %>%
add_column(Score = rep("sites", length(rownames(wa))))
fdat_amazing <- rbind(biplot, wa)
fdat_amazing <- fdat_amazing %>%
mutate(label_size = ifelse(Score == "biplot", 4.5, 3))
p <- ggplot(fdat_amazing %>% filter(Score %in% c("sites","biplot"))) +
# geom_abline(intercept=seq(-100, 100, 25), slope=1, colour="grey", alpha=0.5) +
# geom_abline(intercept=seq(-100, 100, 25), slope=-1, colour="grey", alpha=0.5) +
geom_vline(xintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
geom_hline(yintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
geom_point(data = fdat_amazing %>% dplyr::filter(Score == "sites"), mapping = aes(x=CCA1, y=CCA2, colour = factor(Score))) +
geom_segment(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), aes(x = 0, xend = CCA1, y = 0, yend = CCA2),
size = 0.6, alpha=0.8, color = "red",arrow = arrow(angle = 3)) +
geom_text_repel(aes(x=CCA1, y=CCA2, label= Label),size=fdat_amazing$label_size) +
xlab(paste0(round(cca$CCA$eig[1], 3)*100, "% CCA1")) +
ylab(paste0(round(cca$CCA$eig[2], 3)*100, "% CCA2")) +
grids(linetype = "dashed") +
theme(legend.position = "none", panel.background = element_rect(fill = "white", colour = "grey50"))
return(p)
}
cca.sites <- kate.ggcca.sites(cca)
cca.sites
anova(cca, parallel = 10)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data = metadata)
## Df ChiSquare F Pr(>F)
## Model 8 2.3090 1.309 0.004 **
## Residual 12 2.6458
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca, by="terms", parallel = 10)
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data = metadata)
## Df ChiSquare F Pr(>F)
## pH 1 0.38347 1.7392 0.007 **
## C_org 1 0.38090 1.7275 0.010 **
## C 1 0.28553 1.2950 0.098 .
## K 1 0.28565 1.2955 0.087 .
## NH4 1 0.36507 1.6557 0.006 **
## NO3 1 0.13327 0.6044 0.904
## temp 1 0.30609 1.3882 0.029 *
## P 1 0.16901 0.7665 0.849
## Residual 12 2.64585
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca, by="mar", parallel = 10)
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data = metadata)
## Df ChiSquare F Pr(>F)
## pH 1 0.17681 0.8019 0.833
## C_org 1 0.22899 1.0385 0.447
## C 1 0.16358 0.7419 0.885
## K 1 0.18311 0.8305 0.752
## NH4 1 0.26038 1.1809 0.225
## NO3 1 0.17298 0.7845 0.797
## temp 1 0.30376 1.3777 0.026 *
## P 1 0.16901 0.7665 0.837
## Residual 12 2.64585
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif.cca(cca)
## pH C_org C K NH4 NO3 temp P
## 4.650906 2.856380 3.301995 2.502405 3.336293 3.941545 1.517910 2.196035
Reduce model
cca_w <- vegan::cca(otus.ps.vegan ~ pH + C_org + C + NH4 + NO3 + temp, data=metadata)
kate.ggcca.sites(cca_w)
anova(cca_w, parallel = 10)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + NH4 + NO3 + temp, data = metadata)
## Df ChiSquare F Pr(>F)
## Model 6 1.9110 1.4649 0.002 **
## Residual 14 3.0439
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca_w, by="terms", parallel = 10)
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + NH4 + NO3 + temp, data = metadata)
## Df ChiSquare F Pr(>F)
## pH 1 0.38347 1.7638 0.002 **
## C_org 1 0.38090 1.7519 0.006 **
## C 1 0.28553 1.3133 0.066 .
## NH4 1 0.37965 1.7462 0.004 **
## NO3 1 0.17677 0.8130 0.724
## temp 1 0.30466 1.4013 0.023 *
## Residual 14 3.04386
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca_w, by="mar", parallel = 10)
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + NH4 + NO3 + temp, data = metadata)
## Df ChiSquare F Pr(>F)
## pH 1 0.33436 1.5378 0.010 **
## C_org 1 0.27531 1.2663 0.111
## C 1 0.22773 1.0474 0.362
## NH4 1 0.37050 1.7041 0.005 **
## NO3 1 0.22242 1.0230 0.466
## temp 1 0.30466 1.4013 0.034 *
## Residual 14 3.04386
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif.cca(cca_w)
## pH C_org C NH4 NO3 temp
## 2.342850 2.070081 2.528777 2.173317 3.195453 1.513385
ps.ff <- phyloseq::filter_taxa(ps.f, function(x) sum(x > 3) > (0.1*length(x)), TRUE)
ps.varstab <- ps_vst(ps.ff, "Repeat")
ps.varstab.merged <- phyloseq::merge_samples(ps.varstab, "Repeat")
ps.varstab.merged.family <- tax_glom(ps.varstab.merged, taxrank="Family")
physeq <- ps.varstab.merged.family
veganifyOTU <- function(physeq){
require(phyloseq)
if(taxa_are_rows(physeq)){physeq <- t(physeq)}
return(as(otu_table(physeq), "matrix"))
}
otus.ps.vegan <- veganifyOTU(physeq)
metadata <- as(sample_data(physeq), "data.frame")
cca_w_varstab_family <- vegan::cca(otus.ps.vegan ~ temp + pH + C_org + NH4, data=metadata)
kate.ggcca.species <- function(vare.cca, physeq){
biplot <- as.data.frame(vare.cca$CCA$biplot)
wa <- as.data.frame(vare.cca$CCA$v) %>%
rownames_to_column("ID")
seq <- wa
f <- as.data.frame(physeq@tax_table@.Data) %>%
dplyr::select("Family") %>%
rownames_to_column("ID")
wa <- full_join(wa, f) %>%
dplyr::select(-ID) %>%
dplyr::rename(Label = Family ) %>%
mutate(Score = "sites")
biplot <- rownames_to_column(biplot, "Label") %>%
add_column(Score = rep("biplot", length(rownames(biplot))))
fdat_amazing <- rbind(biplot, wa) %>%
mutate(Label = as.factor(Label))
p <- ggplot(fdat_amazing %>% filter(Score %in% c("sites","biplot"))) +
geom_point(data = fdat_amazing %>% dplyr::filter(Score == "sites"), mapping = aes(x=CCA1, y=CCA2, colour = factor(Score))) +
geom_segment(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), aes(x = 0, xend = CCA1, y = 0, yend = CCA2), alpha=0.8, color = "red",arrow = arrow(angle = 3)) +
geom_text_repel(aes(x=CCA1, y=CCA2, label= Label),size=4) +
theme(legend.position = "none", panel.background = element_rect(fill = "white", colour = "grey50"))
# geom_mark_ellipse(aes(group = Site, label = Site), label.fontsize = 10, label.buffer = unit(2, "mm"), label.minwidth = unit(5, "mm"),con.cap = unit(0.1, "mm"))
return(p)
}
kate.ggcca.species(cca_w_varstab_family, physeq)
ps.ff <- phyloseq::filter_taxa(ps.f, function(x) sum(x > 3) > (0.1*length(x)), TRUE)
ps.varstab <- ps_vst(ps.ff, "Repeat")
ps.varstab.merged <- phyloseq::merge_samples(ps.varstab, "Repeat")
physeq <- ps.varstab.merged
veganifyOTU <- function(physeq){
require(phyloseq)
if(taxa_are_rows(physeq)){physeq <- t(physeq)}
return(as(otu_table(physeq), "matrix"))
}
otus.ps.vegan <- veganifyOTU(physeq)
metadata <- as(sample_data(physeq), "data.frame")
# different vars of fichas
# cca_w_varstab_asv <- vegan::cca(otus.ps.vegan ~ Humidity + pH + C_org + C + K + NH4 + NO3 + temp, data=metadata)
# cca_w_varstab_asv <- vegan::cca(otus.ps.vegan ~ pH + C_org + NH4 + temp, data=metadata)
cca_w_varstab_asv <- vegan::cca(otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data=metadata)
wa <- as.data.frame(cca_w_varstab_asv$CCA$v) %>%
rownames_to_column("ID")
taxa.pruned <- as.data.frame(ps.varstab@tax_table@.Data) %>%
rownames_to_column("ID")
taxa.pruned <- taxa.pruned %>%
mutate_all(as.character)
# old.tips <- tree$tip.label
# matches <- regmatches(tree$tip.label, gregexpr("[[:digit:]]+", tree$tip.label))
# taxa.pruned$number <- as.numeric(unlist(matches))
# Shitty taxa formating part
taxa.pruned$taxa <- ifelse(is.na(taxa.pruned$Genus),
ifelse(is.na(taxa.pruned$Family),
ifelse(is.na(taxa.pruned$Order),
ifelse(is.na(taxa.pruned$Class), taxa.pruned$Phylum, taxa.pruned$Class) , taxa.pruned$Order), taxa.pruned$Family), taxa.pruned$Genus)
taxa.pruned[taxa.pruned == "Burkholderia-Caballeronia-Paraburkholderia"] <- "Burkholderia"
taxa.pruned[taxa.pruned == "Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium"] <- "Pararhizobium"
taxa.pruned$taxa2 <- ifelse(is.na(taxa.pruned$Species),
with(taxa.pruned, paste0(taxa)),
with(taxa.pruned, paste0(taxa, " ", Species )))
taxa.pruned$phylum <- ifelse(taxa.pruned$Phylum == "Proteobacteria", with(taxa.pruned, paste0(taxa.pruned$Class)), with(taxa.pruned, paste0(taxa.pruned$Phylum)))
taxa.pruned$Label <- paste0(taxa.pruned$ID, "_", taxa.pruned$taxa2)
wa <- full_join(wa, taxa.pruned, by="ID") %>%
mutate(Score = "sites")
#For the top 50 most abundant taxa, skip if use des res
sw <- summarize_all(as.data.frame(ps.varstab@otu_table), sum)
head_asv <- as_tibble(t(sw), rownames = "ID") %>%
arrange(desc(V1)) %>% top_n(n = 100) %>%
pull(ID)
wa <- filter(wa, ID %in% head_asv)
# maybe only different by des? Picture will be nice.
# wa <- filter(wa, ID %in% row.des)
# wa <- filter(wa, ID %in% row.des.so)
biplot <- as.data.frame(cca_w_varstab_asv$CCA$biplot)
biplot <- rownames_to_column(biplot, "Label") %>%
add_column(Score = rep("biplot", length(rownames(biplot))))
fdat_amazing <- plyr::rbind.fill(biplot, wa) %>%
mutate(Label = as.factor(Label))
fdat_amazing <- fdat_amazing %>%
mutate(label_size = ifelse(Score == "biplot", 4.5, 3))
p.cca.species <- ggplot(fdat_amazing %>% filter(Score %in% c("sites","biplot"))) +
geom_point(data = fdat_amazing %>% dplyr::filter(Score == "sites"), mapping = aes(x=CCA1, y=CCA2)) +
geom_segment(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), aes(x = 0, xend = CCA1, y = 0, yend = CCA2), alpha=0.8, color = "red", arrow = arrow(angle = 3)) +
geom_text_repel(aes(x=CCA1, y=CCA2, label= Label, colour = phylum), size=fdat_amazing$label_size) +
xlab(paste0(round(cca$CCA$eig[1], 3)*100, "% CCA1")) +
ylab(paste0(round(cca$CCA$eig[2], 3)*100, "% CCA2")) +
grids(linetype = "dashed") +
geom_vline(xintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
geom_hline(yintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
theme(legend.position = "top", panel.background = element_rect(fill = "white", colour = "grey50"))
p.cca.species
anova(cca_w_varstab_asv, parallel = 10)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data = metadata)
## Df ChiSquare F Pr(>F)
## Model 8 0.146738 2.6301 0.001 ***
## Residual 12 0.083686
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca_w_varstab_asv, by="terms", parallel = 10)
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data = metadata)
## Df ChiSquare F Pr(>F)
## pH 1 0.031458 4.5109 0.001 ***
## C_org 1 0.024206 3.4709 0.002 **
## C 1 0.022726 3.2588 0.002 **
## K 1 0.021816 3.1282 0.003 **
## NH4 1 0.019506 2.7970 0.002 **
## NO3 1 0.007373 1.0573 0.369
## temp 1 0.013866 1.9883 0.038 *
## P 1 0.005786 0.8297 0.630
## Residual 12 0.083686
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca_w_varstab_asv, by="mar", parallel = 10)
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data = metadata)
## Df ChiSquare F Pr(>F)
## pH 1 0.006969 0.9993 0.449
## C_org 1 0.012179 1.7463 0.056 .
## C 1 0.006591 0.9451 0.462
## K 1 0.013264 1.9020 0.036 *
## NH4 1 0.012674 1.8174 0.044 *
## NO3 1 0.007950 1.1399 0.305
## temp 1 0.013969 2.0031 0.046 *
## P 1 0.005786 0.8297 0.600
## Residual 12 0.083686
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif.cca(cca_w_varstab_asv)
## pH C_org C K NH4 NO3 temp P
## 4.293374 2.760598 3.263601 2.449316 3.376127 3.765782 1.543511 2.181794
wa <- as.data.frame(cca_w_varstab_asv$CCA$v) %>%
rownames_to_column("ID")
taxa.pruned <- as.data.frame(ps.varstab@tax_table@.Data) %>%
rownames_to_column("ID")
taxa.pruned <- taxa.pruned %>% mutate_all(as.character)
taxa.pruned$phylum_pro <- ifelse(taxa.pruned$Phylum == "Proteobacteria", with(taxa.pruned, paste0(taxa.pruned$Class)), with(taxa.pruned, paste0(taxa.pruned$Phylum)))
wa <- full_join(wa, taxa.pruned, by="ID") %>%
mutate(Score = "sites") %>%
dplyr::rename(Label = ID)
biplot <- as.data.frame(cca_w_varstab_asv$CCA$biplot)
biplot <- rownames_to_column(biplot, "Label") %>%
add_column(Score = rep("biplot", length(rownames(biplot))))
fdat_amazing <- plyr::rbind.fill(biplot, wa) %>%
mutate(Label = as.factor(Label))
p.cca.too.much <- ggplot(fdat_amazing %>% filter(Score %in% c("sites","biplot"))) +
geom_point(data = fdat_amazing %>% dplyr::filter(Score == "sites"), mapping = aes(x=CCA1, y=CCA2, color = phylum_pro)) +
geom_segment(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), aes(x = 0, xend = CCA1, y = 0, yend = CCA2), alpha=0.8, color = "red",arrow = arrow(angle = 3)) +
geom_label(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), mapping = aes(x=CCA1, y=CCA2, label=Label)) +
theme(legend.position = "top", panel.background = element_rect(fill = "white", colour = "grey50"))
# geom_mark_ellipse(aes(group = Site, label = Site), label.fontsize = 10, label.buffer = unit(2, "mm"), label.minwidth = unit(5, "mm"),con.cap = unit(0.1, "mm"))
p.cca.too.much
library(rnaturalearth)
library(rnaturalearthdata)
library(tidyverse)
library(sf)
library(rgeos)
library(ggrepel)
world <- ne_countries(scale = "large", returnclass = "sf")
spdf_france_geounit <- ne_states(geounit = 'russia', returnclass = "sf")
probdata = tibble(site = c("Quarry on Chernozem", "Quarry on Gleyic"),
longitude = c(43.51137, 43.77252),
latitude = c(43.82965, 43.35022),
xlabel = c(40, 47),
ylabel = c(44.3, 43))
ggplot(data = world) +
geom_sf() +
geom_sf(data = spdf_france_geounit, fill="green", alpha = 0.1 ) +
# geom_sf(data = lakes ) +
coord_sf(xlim = c(35, 50), ylim = c(40, 50), expand = FALSE) +
geom_point(data = probdata, aes(x = longitude, y = latitude), size = 3,
shape = 20, colour = "red") +
geom_text(data = probdata, aes(x = xlabel, y = ylabel, label = site), size = 5, colour="red") +
theme_bw()
library(ggVennDiagram)
get_list_from_ps <- function(physeq, amaz_fac) {
my_keys <- levels(sample_data(physeq)[[amaz_fac]])
mylist <- vector(mode="list", length=length(my_keys))
names(mylist) <- my_keys
for (i in levels(sample_data(physeq)[[amaz_fac]])) {
x <- prune_samples(sample_data(physeq)[[amaz_fac]] %in% i, physeq)
x <- prune_taxa(taxa_sums(x) > 0, x)
mylist[[i]] <- taxa_names(x)
}
return(mylist)
}
physeq <- ps.f
ggVennDiagram(get_list_from_ps(physeq, "What")) +
ggplot2::scale_fill_viridis_c(option = "D", begin=0.2)
amp.cc <- phyloseq_to_amp(ps.cc)
p.venn.cc <- amp_venn(amp.cc,
group_by = "Quarry_or_Benchmark",
normalise = TRUE,
cut_a = 0.000001,
cut_f=0.000001,
)
p.venn.cc
library(philr)
library(glmnet)
GP <- ps.cc
GP.f <- phyloseq::filter_taxa(GP, function(x) sum(x > 3) > (0.1*length(x)), TRUE)
GP <- GP.f
phy_tree(GP) <- makeNodeLabel(phy_tree(GP), method="number", prefix='n')
GP <- transform_sample_counts(GP, function(x) x+1)
otu.table <- otu_table(GP)
tree <- phy_tree(GP)
metadata <- sample_data(GP)
tax <- tax_table(GP)
# is.binary(tree)
# is.rooted(tree)
gp.philr <- philr(otu.table, tree,
part.weights='enorm.x.gm.counts',
ilr.weights='blw.sqrt')
gp.dist <- dist(gp.philr, method="euclidean")
glmmod <- glmnet(gp.philr, sample_data(GP)$Quarry_or_Benchmark , alpha=1, family="binomial")
cv.res <- cv.glmnet(gp.philr, as.double(sample_data(GP)$Quarry_or_Benchmark), nfolds = 3, family = "binomial", type.measure = "mse")
top.coords <- as.matrix(coefficients(glmmod, cv.res$lambda.min))
top.coords <- rownames(top.coords)[which(top.coords != 0)]
top.coords <- top.coords[2:length(top.coords)]
tc.names <- sapply(top.coords, function(x) name.balance(tree, tax, x))
tc.names
## n35
## "Genus_Massilia/Species_timonae"
## n113
## "Species_aerolata/Genus_Skermanella"
## n221
## "Family_Gemmatimonadaceae/Family_Gemmatimonadaceae"
## n367
## "Order_Blfdi19/Genus_P3OB-42"
## n549
## "Genus_Lechevalieria/Genus_Lechevalieria"
## n583
## "Order_Bacillales/Order_Bacillales"
## n1043
## "Family_Gemmatimonadaceae/Family_Gemmatimonadaceae"
## n1136
## "Family_Chitinophagaceae/Family_Chitinophagaceae"
## n1150
## "Family_Chitinophagaceae/Family_Chitinophagaceae"
gp.philr.long <- convert_to_long(gp.philr, get_variable(GP, 'Quarry_or_Benchmark')) %>%
filter(coord %in% top.coords)
# gp.philr.long$taxa <- paste0("n",str_sub(gp.philr.long$coord, 2)) %>%
# as.factor() %>%
# recode(!!!tc.names)
balance_taxa <- levels(gp.philr.long$taxa)
library(naturalsort)
ggplot(gp.philr.long, aes(x=labels, y=value)) +
geom_boxplot(fill='lightgrey') +
facet_grid(.~coord, scales='free_x') +
xlab('Quarry or Benchmark') + ylab('Balance Value') +
theme_bw()
library(ggtree)
plot_philr <- function(){
tc.nn <- name.to.nn(tree, top.coords)
tc.colors <- scales::viridis_pal()(length(top.coords))
p <- ggtree(tree, layout='fan')
top.seq <- seq(1, length(top.coords))
for (i in top.seq){
p <- p + geom_balance(node=tc.nn[i], fill=tc.colors[i], alpha=0.6)
}
for (i in top.coords){
p <- annotate_balance(tree, i, p=p,
labels = c(paste0(i, "+"),
paste0(i, "-")),
offset.text=0.15,
bar=FALSE)
}
p
}
plot_philr()
votes <- philr::name.balance(tree, tax, 'n583', return.votes = c('up', 'down'))
message("up.votes")
sort(votes[[c('up.votes', 'Order')]])
## Bacillales
## 2
message("down.votes")
sort(votes[[c('down.votes', 'Order')]])
## Bacillales
## 1
library(philr)
library(glmnet)
GP <- ps.nc
GP.f <- phyloseq::filter_taxa(GP, function(x) sum(x > 3) > (0.1*length(x)), TRUE)
GP <- GP.f
phy_tree(GP) <- makeNodeLabel(phy_tree(GP), method="number", prefix='n')
GP <- transform_sample_counts(GP, function(x) x+1)
otu.table <- otu_table(GP)
tree <- phy_tree(GP)
metadata <- sample_data(GP)
tax <- tax_table(GP)
# is.binary(tree)
# is.rooted(tree)
gp.philr <- philr(otu.table, tree,
part.weights='enorm.x.gm.counts',
ilr.weights='blw.sqrt')
gp.dist <- dist(gp.philr, method="euclidean")
glmmod <- glmnet(gp.philr, sample_data(GP)$Quarry_or_Benchmark , alpha=1, family="binomial")
cv.res <- cv.glmnet(gp.philr, as.double(sample_data(GP)$Quarry_or_Benchmark), nfolds = 3, family = "binomial", type.measure = "mse")
top.coords <- as.matrix(coefficients(glmmod, cv.res$lambda.min))
top.coords <- rownames(top.coords)[which(top.coords != 0)]
top.coords <- top.coords[2:length(top.coords)]
tc.names <- sapply(top.coords, function(x) name.balance(tree, tax, x))
tc.names
## n7
## "Family_Nitrososphaeraceae/Family_Nitrososphaeraceae"
## n23
## "Family_Nitrososphaeraceae/Family_Nitrososphaeraceae"
## n115
## "Genus_Sphingomonas/Genus_Sphingomonas"
## n334
## "Genus_Nitrospira/Genus_Nitrospira"
## n358
## "Order_Gaiellales/Order_Solirubrobacterales"
## n386
## "Order_Gaiellales/Order_Gaiellales"
## n387
## "Order_Gaiellales/Genus_Gaiella"
## n612
## "Family_Microbacteriaceae/Genus_Agromyces"
## n742
## "Genus_Bacillus/Class_Bacilli"
## n787
## "Kingdom_Bacteria/Kingdom_Bacteria"
## n931
## "Order_Rokubacteriales/Order_Rokubacteriales"
## n1040
## "Class_Planctomycetes/Genus_Pir4_lineage"
## n1125
## "Phylum_Acidobacteriota/Phylum_Acidobacteriota"
gp.philr.long <- convert_to_long(gp.philr, get_variable(GP, 'Quarry_or_Benchmark')) %>%
filter(coord %in% top.coords)
# gp.philr.long$taxa <- paste0("n",str_sub(gp.philr.long$coord, 2)) %>%
# as.factor() %>%
# recode(!!!tc.names)
balance_taxa <- levels(gp.philr.long$taxa)
library(naturalsort)
ggplot(gp.philr.long, aes(x=labels, y=value)) +
geom_boxplot(fill='lightgrey') +
facet_grid(.~coord, scales='free_x') +
xlab('Quarry or Benchmark') + ylab('Balance Value') +
theme_bw()
plot_philr()
Some trick to understand what going on - philr output is not easy walk.
votes <- name.balance(tree, tax, 'n358', return.votes = c('up', 'down'))
message("up.votes")
sort(votes[[c('up.votes', 'Order')]])
## Gaiellales
## 33
message("down.votes")
sort(votes[[c('down.votes', 'Order')]])
## Solirubrobacterales
## 58
n358 - splitting inside Actinomycetotas order Thermoleophilia. Solirubrobacterales more quarry specific than Gaiellales.
Visualisations of abundance inside Thermoleophilia order.
ps.merged <- phyloseq::merge_samples(GP, "Quarry_or_Benchmark")
ps.merged@sam_data$Quarry_or_Benchmark <- as.factor(c("Benchmark", "Quarry"))
colorspalette <- c("#fde725", "#35b779")
part_tree_ph <- function(order){
ps.strange <- prune_taxa(
rownames(filter(
as.data.frame(ps.merged@tax_table@.Data),Class==order)),
ps.merged)
p <- plot_tree(ps.strange,
ladderize="FALSE",
color="Quarry_or_Benchmark",
label.tips="Genus",
size="abundance",
sizebase=2,
plot.margin=0.5,
base.spacing=0.03) +
scale_color_manual(values=c(colorspalette))
return(p)
}
part_tree_ph("Thermoleophilia")
ps.strange <- prune_taxa(
rownames(filter(
as.data.frame(GP@tax_table@.Data),Class=="Thermoleophilia")),
ps.merged)
amp <- phyloseq_to_amp(ps.strange)
amp_heatmap(amp, tax_show = 30, group_by = "Quarry_or_Benchmark", tax_aggregate = "Genus", tax_class = "Proteobacteria", tax_add = "Order") + theme_bw() + theme(text = element_text(size=15), legend.position = "none") + theme(axis.text.x=element_text(angle=45,hjust=1))
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggtree_2.0.4 naturalsort_0.1.3
## [3] glmnet_4.1-3 Matrix_1.4-0
## [5] philr_1.12.0 ggVennDiagram_1.2.0
## [7] rgeos_0.5-9 sp_1.4-6
## [9] sf_1.0-5 rnaturalearthdata_0.1.0
## [11] rnaturalearth_0.1.0 picante_1.8.2
## [13] nlme_3.1-155 ape_5.6-1
## [15] vegan_2.5-7 lattice_0.20-45
## [17] permute_0.9-5 ampvis2_2.6.4
## [19] DESeq2_1.26.0 SummarizedExperiment_1.16.1
## [21] DelayedArray_0.12.3 BiocParallel_1.20.1
## [23] matrixStats_0.61.0 Biobase_2.46.0
## [25] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
## [27] ggforce_0.3.3 ggrepel_0.9.1
## [29] ggpubr_0.4.0 forcats_0.5.1
## [31] stringr_1.4.0 dplyr_1.0.7
## [33] purrr_0.3.4 readr_2.1.1
## [35] tidyr_1.1.4 tibble_3.1.6
## [37] ggplot2_3.3.5 tidyverse_1.3.1
## [39] phyloseq_1.30.0 Biostrings_2.54.0
## [41] XVector_0.26.0 IRanges_2.20.2
## [43] S4Vectors_0.24.4 BiocGenerics_0.32.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 tidyselect_1.1.1 RSQLite_2.2.9
## [4] AnnotationDbi_1.48.0 htmlwidgets_1.5.4 grid_3.6.3
## [7] munsell_0.5.0 units_0.7-2 codetools_0.2-18
## [10] withr_2.4.3 colorspace_2.0-2 highr_0.9
## [13] knitr_1.37 rstudioapi_0.13 wk_0.6.0
## [16] ggsignif_0.6.3 labeling_0.4.2 GenomeInfoDbData_1.2.2
## [19] polyclip_1.10-0 bit64_4.0.5 farver_2.1.0
## [22] rhdf5_2.30.1 treeio_1.10.0 coda_0.19-4
## [25] vctrs_0.3.8 generics_0.1.1 xfun_0.29
## [28] R6_2.5.1 doParallel_1.0.16 RVenn_1.1.0
## [31] locfit_1.5-9.4 bitops_1.0-7 cachem_1.0.6
## [34] assertthat_0.2.1 scales_1.1.1 vroom_1.5.7
## [37] nnet_7.3-16 gtable_0.3.0 phangorn_2.7.1
## [40] rlang_0.4.12 genefilter_1.68.0 splines_3.6.3
## [43] rgdal_1.5-28 rstatix_0.7.0 lazyeval_0.2.2
## [46] broom_0.7.11 checkmate_2.0.0 BiocManager_1.30.16
## [49] s2_1.0.7 yaml_2.2.1 reshape2_1.4.4
## [52] abind_1.4-5 modelr_0.1.8 backports_1.4.1
## [55] Hmisc_4.6-0 tools_3.6.3 statnet.common_4.5.0
## [58] ellipsis_0.3.2 jquerylib_0.1.4 biomformat_1.14.0
## [61] RColorBrewer_1.1-2 proxy_0.4-26 ggnet_0.1.0
## [64] Rcpp_1.0.8 plyr_1.8.6 base64enc_0.1-3
## [67] zlibbioc_1.32.0 classInt_0.4-3 RCurl_1.98-1.5
## [70] rpart_4.1-15 cowplot_1.1.1 haven_2.4.3
## [73] cluster_2.1.2 fs_1.5.2 magrittr_2.0.1
## [76] data.table_1.14.2 reprex_2.0.1 rnaturalearthhires_0.2.0
## [79] hms_1.1.1 patchwork_1.1.1 evaluate_0.14
## [82] xtable_1.8-4 XML_3.99-0.3 jpeg_0.1-9
## [85] readxl_1.3.1 shape_1.4.6 gridExtra_2.3
## [88] compiler_3.6.3 KernSmooth_2.23-20 crayon_1.4.2
## [91] htmltools_0.5.2 mgcv_1.8-38 tzdb_0.2.0
## [94] Formula_1.2-4 geneplotter_1.64.0 lubridate_1.8.0
## [97] DBI_1.1.2 tweenr_1.0.2 dbplyr_2.1.1
## [100] MASS_7.3-54 ade4_1.7-18 car_3.0-12
## [103] cli_3.1.0 quadprog_1.5-8 igraph_1.2.11
## [106] pkgconfig_2.0.3 usedist_0.4.0.9000 rvcheck_0.1.8
## [109] foreign_0.8-76 plotly_4.10.0 xml2_1.3.3
## [112] foreach_1.5.1 annotate_1.64.0 bslib_0.3.1
## [115] multtest_2.42.0 rvest_1.0.2 yulab.utils_0.0.4
## [118] digest_0.6.29 fastmatch_1.1-3 rmarkdown_2.11
## [121] cellranger_1.1.0 tidytree_0.3.7 htmlTable_2.4.0
## [124] lifecycle_1.0.1 jsonlite_1.7.3 Rhdf5lib_1.8.0
## [127] carData_3.0-5 network_1.17.1 viridisLite_0.4.0
## [130] fansi_1.0.2 pillar_1.6.4 fastmap_1.1.0
## [133] httr_1.4.2 survival_3.2-13 glue_1.6.0
## [136] png_0.1-7 iterators_1.0.13 bit_4.0.4
## [139] class_7.3-19 stringi_1.7.6 sass_0.4.0
## [142] blob_1.2.2 latticeExtra_0.6-29 memoise_2.0.1
## [145] e1071_1.7-9