require(Biostrings)
require(phyloseq)
require(tidyverse)
require(ggpubr)
setwd("~/storage/nal/")

Preprocessing:

Basic dada2 pipeline:
truncLen=c(185,185)
trimLeft=20
maxEE=c(6,8)
pool=FALSE
silva_nr_v138
delete bad taxa by decontam +
annosighted +
chloroplasts/mitho +
low reads/richness +
reads lenght check

tree from SEPP-QIIME2-plagin

sed -i ‘1s/^/#OTU_ID/’ otu_table.txt biom convert -i otu_table.txt -o otu_table.biom –to-hdf5 qiime tools import
–input-path rep_seq.fasta
–output-path rep_seq.qza
–type ‘FeatureData[Sequence]’ qiime fragment-insertion sepp
–i-representative-sequences rep_seq.qza
–i-reference-database ~/storage/somebases/sepp-refs-silva-128.qza
–o-tree insertion-tree.qza
–o-placements insertion-placements.qza
–p-threads 20
qiime tools import
–input-path otu_table.biom
–type ‘FeatureTable[Frequency]’
–input-format BIOMV210Format
–output-path feature-table.qza qiime fragment-insertion filter-features
–i-table feature-table.qza
–i-tree insertion-tree.qza
–o-filtered-table filtered_table.qza
–o-removed-table removed_table.qza   unzip -p filtered_table.qza > filtered_table.biom
biom convert -i filtered_table.biom -o filtered_table.txt
–to-tsv
sed -i ‘1d’ filtered_table.txt
unzip -p insertion-tree.qza /data/ > tree.nwk

I also did the PLSDA, but the result was a nightmare to interpret.
Decontam package also give me the irrevalent result.
And there were considerably more beta-plots and heatmaps in first version of this data analysis.

The quantitative data from DeSeq2 do not exactly match the data in the article supplement because of the different filtering across samples. This does not lead to a principal change in the ASVs composition and it did not result in any changes in the article itself, but plots look match better. The ILR analysis (philR) gives more unstable results, as a consequence, this analysis is discussed in the context of the taxonomic level of changing taxa in the article. With this approach, the results are constant with different strategies of filtering minors/samples with low representation.

0.1 Import RDS data. Read mapping file.

Removing nonidentified on phylum level phylotypes, mithohondria/chloroplasts.

track <- read_tsv("track.tsv")
ps <- read_rds("ps.rds")
ps <- merge_phyloseq(ps, refseq(readDNAStringSet("rep_seq.fasta")))

d <- density(width(ps@refseq))
plot(d)

pop_taxa <- function(physeq, badTaxa){
  allTaxa = taxa_names(physeq)
  myTaxa <- allTaxa[!(allTaxa %in% badTaxa)]
  return(prune_taxa(myTaxa, physeq))
}

delete_mit_chl_eu <- function(ps){
  badTaxa <- taxa_names(subset_taxa(ps, Order=="Chloroplast"))
  ps <- pop_taxa(ps, badTaxa)
  badTaxa <- taxa_names(subset_taxa(ps, Family=="Mitochondria"))
  ps <- pop_taxa(ps, badTaxa)
  badTaxa <- taxa_names(subset_taxa(ps, Kingdom == "Eukaryota"))
  ps <- pop_taxa(ps, badTaxa)
  badTaxa <- taxa_names(subset_taxa(ps, is.na(Phylum)))
  ps <- pop_taxa(ps, badTaxa)
  return(ps)
}

ps.f <- delete_mit_chl_eu(ps)

d <- density(width(ps.f@refseq))
plot(d)

adding full metadata

map <- read_tsv("mapping_nalchic_new.tsv", 
                col_types = cols(
                  pH = col_character()
                )
              )

map <- column_to_rownames(map, "Sample")

site_column <- ps.f@sam_data %>% 
  as.data.frame() %>% 
  pull('Site')

map <- map %>% 
  mutate(Type = str_replace(Type, " /", "/")) %>% 
  mutate(`Corg,_%` = str_replace(`Corg,_%`, ",", ".")) %>%
  mutate(`NO3_mg/kg` = str_replace(`NO3_mg/kg` , "," , ".")) %>% 
  mutate(pH = str_replace(pH, ",", ".")) %>% 
  mutate(final_concentration = str_replace(final_concentration, ",", ".")) %>% 
  mutate(pH = as.numeric(pH)) %>% 
  mutate(final_concentration = as.numeric(final_concentration)) %>% 
  mutate(`NO3_mg/kg` = as.numeric(`NO3_mg/kg`)) %>% 
  mutate(`Corg,_%` = as.numeric(`Corg,_%`)) %>%
  mutate(`Day_sampling` = as.factor(`Day_sampling`)) %>%
  mutate(
  Quarry_or_Benchmark = case_when(
    grepl('Quarry', What) ~ 'Quarry',
    grepl('Benchmark', What) ~ 'Benchmark'
  ) 
  ) %>%
      mutate(
    Region = case_when(
      grepl('meadow_humus', Soil) ~ 'Urvan',
      grepl('Carbonate_chernozem', Soil) ~ 'Progress',
      grepl('Forest_gray', Soil) ~ 'Gerpegesh',
    )
  ) %>% 
  relocate(Quarry_or_Benchmark, .before = Insolation) %>%
  mutate(Repeat = site_column) %>%
  relocate(Repeat, .after = Site)

map <- map %>%  mutate_if(sapply(map, is.character), as.factor)

map_m <- map
colnames(map_m)
##  [1] "Site"                "Repeat"              "Soil"               
##  [4] "Type"                "Colour"              "What"               
##  [7] "Quarry_or_Benchmark" "Insolation"          "Feature"            
## [10] "Temperature"         "Humidity"            "Height"             
## [13] "Longitude"           "Latitude"            "pH"                 
## [16] "C,_%"                "Corg,_%"             "P_mg/kg"            
## [19] "K_mg/kg"             "NH4_mg/kg"           "NO3_mg/kg"          
## [22] "Weight_g"            "SL1_mkl"             "SX_mkl"             
## [25] "EB_mkl"              "Date_isolation"      "final_concentration"
## [28] "Day_sampling"        "Region"
colnames(map_m)[16] <- "C"
colnames(map_m)[17] <- "C_org"
colnames(map_m)[18] <- "P"
colnames(map_m)[19] <- "K"
colnames(map_m)[20] <- "NH4"
colnames(map_m)[21] <- "NO3"

colnames(map)
##  [1] "Site"                "Repeat"              "Soil"               
##  [4] "Type"                "Colour"              "What"               
##  [7] "Quarry_or_Benchmark" "Insolation"          "Feature"            
## [10] "Temperature"         "Humidity"            "Height"             
## [13] "Longitude"           "Latitude"            "pH"                 
## [16] "C,_%"                "Corg,_%"             "P_mg/kg"            
## [19] "K_mg/kg"             "NH4_mg/kg"           "NO3_mg/kg"          
## [22] "Weight_g"            "SL1_mkl"             "SX_mkl"             
## [25] "EB_mkl"              "Date_isolation"      "final_concentration"
## [28] "Day_sampling"        "Region"

the temp feature added - the feature of normalized temperature by days

ps.f <- prune_samples(!sample_data(ps)$Soil %in% c("Forest_gray"), ps.f)
ps.f  <- prune_taxa(taxa_sums(ps.f) > 0, ps.f)
sample_data(ps.f) <- map_m

ps.nc <- prune_samples(sample_data(ps.f)$Day_sampling %in% c("1"), ps.f)
map3 <- data.frame(ps.nc@sam_data) %>%  mutate(temp = scale(Temperature))
ps.nc <- merge_phyloseq(ps.nc, sample_data(map3))

ps.nc1 <- prune_samples(sample_data(ps.f)$Day_sampling %in% c("2", "3"), ps.f)
map3 <- data.frame(ps.nc1@sam_data) %>%  mutate(temp = scale(Temperature))
ps.nc1 <- merge_phyloseq(ps.nc1, sample_data(map3))

ps.f <- merge_phyloseq(ps.nc, ps.nc1)

Richness/depth correlation./ Removing oultlier samples.

plot_rich_reads_samlenames_lm <- function(physeq){
  rish <- estimate_richness(physeq, measures = "Observed")
  reads.sum <- as.data.frame(sample_sums(physeq))
  reads.summary <- cbind(rish, reads.sum)
  colnames(reads.summary) <- c("otus","reads")
  reads.summary["Repeat"] <-unlist(purrr::map(stringr::str_split(rownames(physeq@sam_data), "N_", 2), function(x) x[[2]]))
  reads.summary["Site"] <- physeq@sam_data$Site
  library(ggrepel)
  require(ggforce)
  p1 <- ggplot(data=reads.summary) + 
    geom_point(aes(y=otus, x=log2(reads), color=Site),size=3) + 
    geom_text_repel(aes(y=otus, x=log2(reads), label=paste0(Site, "_", Repeat))) + 
    theme_bw() +
    geom_smooth(aes(y=otus, x=log2(reads), fill=Site, color=Site),method=lm, se=FALSE, ymin = 1) + 
    scale_x_continuous(sec.axis = sec_axis(sec.axis ~ 2**.)) 
  # geom_mark_ellipse(aes(y = otus, x=reads, group = Repeats, label = Repeats, color = Repeats), label.fontsize = 10, label.buffer = unit(2, "mm"), label.minwidth = unit(5, "mm"),con.cap = unit(0.1, "mm"))

  return(p1)
}

p.rich1 <- plot_rich_reads_samlenames_lm(ps.f)

ps.f2 <- subset_samples(ps.f, sample_names(ps.f) != 'N_95')
ps.f2 <- subset_samples(ps.f2, sample_names(ps.f2) != 'N_96')
ps.f2 <- subset_samples(ps.f2, sample_names(ps.f2) != 'N_89')
ps.f2 <- subset_samples(ps.f2, sample_names(ps.f2) != 'N_31')
# ps.f2 <- subset_samples(ps.f2, sample_names(ps.f2) != 'N_70')
ps.f2 <- subset_samples(ps.f2, Site != 'N8-2')
ps.f2 <- subset_samples(ps.f2, Site != 'N8-3')


p.rich <- plot_rich_reads_samlenames_lm(ps.f)
p.rich2 <- plot_rich_reads_samlenames_lm(ps.f2)

ggarrange(p.rich, p.rich2)

ps.f2  <- prune_taxa(taxa_sums(ps.f2) > 0, ps.f2)
ps.f <- ps.f2

0.2 vst dataset

Variance stabilization by DESeq2 package.

ps_vst <- function(ps, factor){
  
  require(DESeq2)
  require(phyloseq)
  
  diagdds = phyloseq_to_deseq2(ps, as.formula(paste( "~", factor)))              
  diagdds = estimateSizeFactors(diagdds, type="poscounts")
  diagdds = estimateDispersions(diagdds, fitType = "local") 
  pst <- varianceStabilizingTransformation(diagdds)
  pst.dimmed <- t(as.matrix(assay(pst))) 
  pst.dimmed[pst.dimmed < 0.0] <- 0.0
  ps.varstab <- ps
  otu_table(ps.varstab) <- otu_table(pst.dimmed, taxa_are_rows = FALSE) 
  return(ps.varstab)
}

ps.varstab <- ps_vst(ps.f, "Repeat")

0.2.1 Beta - NMDS

beta_custom_norm_NMDS_elli <- function(ps, seed = 7888, normtype="vst", Color="What", Group="Repeat"){
  require(phyloseq)
  require(ggplot2)
  require(ggpubr)
  require(DESeq2)
  library(ggforce)
  
  #normalisation. unifrac - rarefaction; wunifrac,bray - varstab
  #variant with only bray
  
  if (exists("ps.varstab") == FALSE){
    diagdds = phyloseq_to_deseq2(ps, ~ Site)                  
    diagdds = estimateSizeFactors(diagdds, type="poscounts")
    diagdds = estimateDispersions(diagdds, fitType = "local") 
    if (normtype =="vst")
      pst <- varianceStabilizingTransformation(diagdds)
    if (normtype =="log") 
      pst <- rlogTransformation(diagdds)
    
    pst.dimmed <- t(as.matrix(assay(pst))) 
    pst.dimmed[pst.dimmed < 0.0] <- 0.0
    ps.varstab <- ps
    otu_table(ps.varstab) <- otu_table(pst.dimmed, taxa_are_rows = FALSE) 
  }

  ordination.b <- ordinate(ps.varstab, "NMDS", "bray")
  p  <-  plot_ordination(ps.varstab,
                         ordination.b,
                         type="sample",
                         color = Color,
                         # title="NMDS - Bray-Curtis",
                         title=NULL,
                         axes = c(1,2) ) + 
    theme_bw() + 
    theme(text = element_text(size = 10)) + 
    geom_point(size = 3) +
    geom_mark_ellipse(aes_string(group = Group, label = Group),
                      label.fontsize = 10,
                      label.buffer = unit(2, "mm"),
                      label.minwidth = unit(5, "mm"),
                      con.cap = unit(0.1, "mm"),
                      con.colour='gray') +
    theme(legend.position = "none") 

  return(p)
}

p.beta <- beta_custom_norm_NMDS_elli(ps.f, Group="Site", Color="Site")
p.beta

0.2.2 Heatmaps ampvis2

Heatmaps with relative abundance by ampvis2 package

phyloseq_to_amp <- function(ps){
    require(ampvis2)
    require(tibble)
    require(phyloseq)
    colnames(ps@tax_table@.Data) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
    OTU1 = as(otu_table(ps), "matrix")
    OTU1 <- t(OTU1)
    OTUdf = as.data.frame(OTU1)
    taxa.ps <- as(tax_table(ps), "matrix")
    taxa.df = as.data.frame(taxa.ps)
    my_otu_table <- merge(OTUdf, taxa.df, by=0)
    my_otu_table <- column_to_rownames(my_otu_table, var="Row.names")

    my_metadata <- as_tibble(sample_data(ps), rownames=NA)
    my_metadata <- rownames_to_column(my_metadata,var = "SampleID")
    my_tree <- phy_tree(ps)
    amp.ps <- amp_load(otutable = my_otu_table, metadata = my_metadata, tree = my_tree)
    return(amp.ps)
}


amp <- phyloseq_to_amp(ps.f)
amp
## ampvis2 object with 4 elements. 
## Summary of OTU table:
##      Samples         OTUs  Total#Reads    Min#Reads    Max#Reads Median#Reads 
##           83        10960      1759579         7397        36229        20959 
##    Avg#Reads 
##     21199.75 
## 
## Assigned taxonomy:
##       Kingdom        Phylum         Class         Order        Family 
##   10960(100%) 10950(99.91%) 10607(96.78%)   9568(87.3%)  7678(70.05%) 
##         Genus       Species 
##  4388(40.04%)    232(2.12%) 
## 
## Metadata variables: 31 
##  SampleID, Site, Repeat, Soil, Type, Colour, What, Quarry_or_Benchmark, Insolation, Feature, Temperature, Humidity, Height, Longitude, Latitude, pH, C, C_org, P, K, NH4, NO3, Weight_g, SL1_mkl, SX_mkl, EB_mkl, Date_isolation, final_concentration, Day_sampling, Region, temp
p.heat.phyla <- amp_heatmap(amp,
            tax_show = 15,
            group_by = "Site",
            tax_aggregate = "Phylum",
            tax_class = "Proteobacteria") +
  theme_bw() +
  theme(text = element_text(size=15),
        legend.position = "none", 
        axis.text.x=element_text(angle=45,hjust=1))


p.heat.phyla

0.2.3 Split ps

Spltting phyloseq object by factors

ps.nc <- prune_samples(sample_data(ps.f)$Region %in% c("Urvan"), ps.f)
ps.nc  <- prune_taxa(taxa_sums(ps.nc) > 0, ps.nc)

ps.cc <- prune_samples(sample_data(ps.f)$Region %in% c("Progress"), ps.f)
ps.cc  <- prune_taxa(taxa_sums(ps.cc) > 0, ps.cc)

ps.cc2 <- prune_samples(sample_data(ps.f)$Site %in% c("N1", "N3", "N5", "N6", "N7"), ps.f)
ps.cc2  <- prune_taxa(taxa_sums(ps.cc2) > 0, ps.cc2)

ps.n13 <- prune_samples(sample_data(ps.f)$Site %in% c("N1", "N3"), ps.f)
ps.n13  <- prune_taxa(taxa_sums(ps.n13) > 0, ps.n13)

ps.cc2 <- prune_samples(sample_data(ps.f)$Site %in% c("N2", "N4", ""), ps.f)
ps.cc2  <- prune_taxa(taxa_sums(ps.cc2) > 0, ps.cc2)

0.3 Beta stat

PERMANOVA - Urvan

library(vegan)

physeq <- ps.nc
dist <- phyloseq::distance(physeq, "bray")
metadata <- as(sample_data(physeq@sam_data), "data.frame")
ad <- adonis2(dist ~ Quarry_or_Benchmark, data = metadata)

ad
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist ~ Quarry_or_Benchmark, data = metadata)
##                     Df SumOfSqs      R2     F Pr(>F)    
## Quarry_or_Benchmark  1   3.0915 0.26572 13.39  0.001 ***
## Residual            37   8.5429 0.73428                 
## Total               38  11.6344 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

PERMANOVA - Progress

library(vegan)
physeq <- ps.cc
dist <- phyloseq::distance(physeq, "bray")
metadata <- as(sample_data(physeq@sam_data), "data.frame")
ad <- adonis2(dist ~ Quarry_or_Benchmark, data = metadata)

ad
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist ~ Quarry_or_Benchmark, data = metadata)
##                     Df SumOfSqs      R2      F Pr(>F)    
## Quarry_or_Benchmark  1   0.7377 0.10962 5.1706  0.001 ***
## Residual            42   5.9920 0.89038                  
## Total               43   6.7297 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

More Progress - UniFrac

dist <- phyloseq::distance(ps.cc2, "unifrac")
metadata <- as(sample_data(ps.cc2@sam_data), "data.frame")
adonis( dist ~ Soil, data = metadata)
## 
## Call:
## adonis(formula = dist ~ Soil, data = metadata) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## Soil       1    0.7148 0.71485  5.1936 0.22392  0.001 ***
## Residuals 18    2.4775 0.13764         0.77608           
## Total     19    3.1924                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Distances caclulation from article https://doi.org/10.1080/00029890.2003.11919989
Bray-Curtis distance
Urvan

dist <- phyloseq::distance(ps.nc, "bray")
names.nc.b <- subset(ps.nc@sam_data, ps.nc@sam_data$Quarry_or_Benchmark %in% "Benchmark") %>%
  rownames()
names.nc.q <- subset(ps.nc@sam_data, ps.nc@sam_data$Quarry_or_Benchmark %in% "Quarry") %>%
  rownames()

usedist::dist_between_centroids(dist, names.nc.b, names.nc.q, squared = TRUE)
## [1] 0.3721286

Distances caclulation from article https://doi.org/10.1080/00029890.2003.11919989
Bray-Curtis distance
Progress

dist <- phyloseq::distance(ps.cc2, "bray")
names.cc.m <- subset(ps.cc2@sam_data, ps.cc2@sam_data$Soil %in% "meadow_humus-cryptogley") %>%
  rownames()
names.cc.c <- subset(ps.cc2@sam_data, ps.cc2@sam_data$Soil %in% "Carbonate_chernozem") %>%
  rownames()

usedist::dist_between_centroids(dist, names.cc.m, names.cc.c, squared = TRUE)
## [1] 0.3233268

Progress - Permanova
UniFrac

dist <- phyloseq::distance(ps.cc, "unifrac")
metadata <- as(sample_data(ps.cc@sam_data), "data.frame")
adonis( dist ~ Quarry_or_Benchmark, data = metadata)
## 
## Call:
## adonis(formula = dist ~ Quarry_or_Benchmark, data = metadata) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##                     Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## Quarry_or_Benchmark  1    0.4995 0.49948  3.1287 0.06933  0.001 ***
## Residuals           42    6.7051 0.15965         0.93067           
## Total               43    7.2046                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

0.4 Heterogenity

Boxplot

dist <- phyloseq::distance(ps.f, method = "bray")
groups <- ps.f@sam_data$Region
mod <- vegan::betadisper(dist, groups)
mod.box <- boxplot(mod)

Tukey post-hoc

TukeyHSD(mod)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = distances ~ group, data = df)
## 
## $group
##                     diff        lwr        upr p adj
## Progress-Urvan -0.158795 -0.1817297 -0.1358603     0

0.4.1 Alpha plot

library(ggpubr)

alpha.custom <- function(ps.f, arrga = "PD"){
  
  require(picante)
  
  pd <- pd(ps.f@otu_table@.Data, ps.f@phy_tree)
  
  pd1 <- estimate_richness(ps.f, measures=c("Observed", "InvSimpson", "Shannon"))
  pd <- cbind(pd, ps.f@sam_data, pd1 )
  
  bmi <- levels(as.factor(pd$Quarry_or_Benchmark))
  bmi.pairs <- combn(seq_along(bmi), 2, simplify = FALSE, FUN = function(i)bmi[i])
  p1 <- ggviolin(pd, x = "Quarry_or_Benchmark", y = arrga,
                 add = "boxplot", fill = "Quarry_or_Benchmark") + stat_compare_means(comparisons = bmi.pairs, method = "wilcox.test", size=3) +
    scale_x_discrete(limits = c("Quarry", "Benchmark")) +
    theme(axis.title.x = element_blank(),
          legend.title = element_blank(),
          legend.position = "none",
          axis.text.x = element_text(angle = 45, hjust = 1),
          text = element_text(size=9))
  return(p1)
  return(p1)
}

alpha.custom2 <- function(ps.f, arrga = "PD"){
  
  require(picante)
  
  pd <- pd(ps.f@otu_table@.Data, ps.f@phy_tree)
  
  pd1 <- estimate_richness(ps.f, measures=c("Observed", "InvSimpson", "Shannon"))
  pd <- cbind(pd, ps.f@sam_data, pd1 )
  
  bmi <- levels(as.factor(pd$Quarry_or_Benchmark))
  bmi.pairs <- combn(seq_along(bmi), 2, simplify = FALSE, FUN = function(i)bmi[i])
  p1 <- ggviolin(pd, x = "Quarry_or_Benchmark", y = arrga,
                 add = "boxplot", fill = "Quarry_or_Benchmark") +
    stat_compare_means(comparisons = bmi.pairs, method = "wilcox.test", size=3) +
    scale_x_discrete(limits = c("Quarry", "Benchmark")) +
    theme(axis.title.x = element_blank(),
          legend.title = element_blank(),
          legend.position = "none",
          axis.text.x = element_text(angle = 45, hjust = 1),
          text = element_text(size=9))
  return(p1)
}

p.alpha.Cdt.oo <- alpha.custom(ps.cc, arrga = "Observed")
p.alpha.Cdt.sh <- alpha.custom(ps.cc, arrga = "PD")
p.alpha.Cdt.is <- alpha.custom(ps.cc, arrga = "Shannon")
p.alpha.Cdt.pd <- alpha.custom(ps.cc, arrga = "InvSimpson")

p.alpha.Cdt.oo2 <- alpha.custom2(ps.nc, arrga = "Observed")
p.alpha.Cdt.sh2 <- alpha.custom2(ps.nc, arrga = "PD")
p.alpha.Cdt.is2 <- alpha.custom2(ps.nc, arrga = "Shannon")
p.alpha.Cdt.pd2 <- alpha.custom2(ps.nc, arrga = "InvSimpson")

p1 <- ggarrange(p.alpha.Cdt.oo, p.alpha.Cdt.sh,p.alpha.Cdt.is, p.alpha.Cdt.pd , 
                p.alpha.Cdt.oo2, p.alpha.Cdt.sh2, p.alpha.Cdt.is2, p.alpha.Cdt.pd2,
                ncol = 4 ,label.x = 0.3, nrow = 2, font.label = c(size = 10), labels = c("Progress", "", "", "", "Urvan")) 
p1

0.4.2 Correlation between NO3 and the some major ASVs

ps.varstab.merged <- phyloseq::merge_samples(ps.varstab, "Repeat")
ps.no3 <- subset_taxa(ps.varstab.merged, taxa_names(ps.varstab) %in% c("Seq70", "Seq71", "Seq161", "Seq170", "Seq171"))
otus.no3 <- as.data.frame(ps.no3@otu_table)
otus.no3$NO3 <- ps.no3@sam_data$NO3
cor.test(otus.no3$Seq170, otus.no3$NO3, method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  otus.no3$Seq170 and otus.no3$NO3
## S = 260.22, p-value = 0.000003078
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8310254

0.4.3 DESeq2

des_w_simper <- function(ps, factor){
  require(DESeq2)
  require(vegan)
  require(tibble)
  require(phyloseq)
  
  diagdds = phyloseq_to_deseq2(ps, as.formula(paste( "~", factor)))                  
  diagdds = estimateSizeFactors(diagdds, type="poscounts")
  diagdds = estimateDispersions(diagdds, fitType = "local") 
  diagdds = DESeq(diagdds)
  samp <-sample_data(ps)
  aggdata <- t(aggregate.data.frame(as.data.frame(as.data.frame(t(diagdds@assays@data$mu))), by=list(samp[[factor]]), median))
  colnames(aggdata) <- aggdata[1,]
  aggdata <- aggdata[-1,]
  res = results(diagdds)
  res.df <- as.data.frame(res)
  nice <- cbind(res.df, as.data.frame(ps@tax_table@.Data[rownames(res.df),]), as.data.frame(aggdata)[rownames(res.df),])
} 
des.nc <- des_w_simper(ps.nc, "Quarry_or_Benchmark")
des.cc <- des_w_simper(ps.cc, "Quarry_or_Benchmark")
des.so <- des_w_simper(ps.f, "Region")


des.nc.filt <- des.nc %>% 
  filter(baseMean > 10) %>% 
  filter(padj < 0.05) %>% 
  arrange(log2FoldChange)
des.cc.filt <- des.cc %>% 
  filter(baseMean > 10) %>% 
  filter(padj < 0.05) %>% 
  arrange(log2FoldChange)
des.so.filt <- des.so %>% 
  filter(baseMean > 10) %>% 
  filter(padj < 0.05) %>% 
  arrange(log2FoldChange)

f1 <- ggplot(des.nc.filt, aes(x=log2FoldChange, y=baseMean)) +
  geom_vline(xintercept = 0, size = 0.75, color = "red", alpha=0.3) +
  labs(title="Urvan, Benchmark/Quarry") +
  geom_point() +
  theme_bw() +
  theme(text = element_text(size=8))
  

f2 <- ggplot(des.cc.filt, aes(x=log2FoldChange, y=baseMean)) +
  geom_vline(xintercept = 0, size = 0.75, color = "red", alpha=0.3) +
  labs(title="Progress, Benchmark/Quarry") +
  geom_point() +
  theme_bw() +
  theme(text = element_text(size=8))

f3 <- ggplot(des.so.filt, aes(x=log2FoldChange, y=baseMean)) +
  geom_vline(xintercept = 0, size = 0.75, color = "red", alpha=0.3) +
  labs(title="Urvan/Progress") +
  geom_point() +
  theme_bw() +
  theme(text = element_text(size=8))


p.des <- ggarrange(f1,f2,f3,nrow = 1)
ggarrange(f1,f2,f3,nrow = 1)

0.5 CCA

0.5.1 For sites

library(phyloseq)             
library(vegan)
library(ggrepel)

ps.merged <- phyloseq::merge_samples(ps.f, "Repeat")
physeq <- ps.merged

veganifyOTU <- function(physeq){
  require(phyloseq)
  if(taxa_are_rows(physeq)){physeq <- t(physeq)}
  return(as(otu_table(physeq), "matrix"))
}

otus.ps.vegan <- veganifyOTU(physeq)
metadata <- as(sample_data(physeq), "data.frame") 
cca <- vegan::cca(otus.ps.vegan ~  pH + C_org + C +  K + NH4 + NO3 + temp + P, data=metadata)

kate.ggcca.sites <- function(vare.cca){

require(tidyverse)
biplot <- as.data.frame(vare.cca$CCA$biplot)
wa <- as.data.frame(vare.cca$CCA$wa)

biplot <- rownames_to_column(biplot, "Label") %>% 
  add_column(Score = rep("biplot", length(rownames(biplot))))
wa <- rownames_to_column(wa, "Label") %>% 
  add_column(Score = rep("sites", length(rownames(wa))))
fdat_amazing <- rbind(biplot, wa) 
fdat_amazing <- fdat_amazing %>%
  mutate(label_size = ifelse(Score == "biplot", 4.5, 3))
 
p <- ggplot(fdat_amazing %>% filter(Score %in% c("sites","biplot"))) + 
  # geom_abline(intercept=seq(-100, 100, 25), slope=1, colour="grey", alpha=0.5) +
  # geom_abline(intercept=seq(-100, 100, 25), slope=-1, colour="grey", alpha=0.5) +
  geom_vline(xintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
  geom_hline(yintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
  geom_point(data = fdat_amazing %>% dplyr::filter(Score == "sites"), mapping = aes(x=CCA1, y=CCA2, colour = factor(Score))) + 
  geom_segment(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), aes(x = 0, xend = CCA1, y = 0, yend = CCA2),
               size = 0.6, alpha=0.8, color = "red",arrow = arrow(angle = 3)) +
  geom_text_repel(aes(x=CCA1, y=CCA2, label= Label),size=fdat_amazing$label_size) + 
  xlab(paste0(round(cca$CCA$eig[1], 3)*100, "% CCA1")) +
  ylab(paste0(round(cca$CCA$eig[2], 3)*100, "% CCA2")) +
  grids(linetype = "dashed") +
  theme(legend.position = "none", panel.background = element_rect(fill = "white", colour = "grey50"))
  return(p)
}

cca.sites <- kate.ggcca.sites(cca)
cca.sites

0.5.1.1 Stats

anova(cca, parallel = 10)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data = metadata)
##          Df ChiSquare     F Pr(>F)   
## Model     8    2.3090 1.309  0.004 **
## Residual 12    2.6458                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca, by="terms", parallel = 10)
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data = metadata)
##          Df ChiSquare      F Pr(>F)   
## pH        1   0.38347 1.7392  0.007 **
## C_org     1   0.38090 1.7275  0.010 **
## C         1   0.28553 1.2950  0.098 . 
## K         1   0.28565 1.2955  0.087 . 
## NH4       1   0.36507 1.6557  0.006 **
## NO3       1   0.13327 0.6044  0.904   
## temp      1   0.30609 1.3882  0.029 * 
## P         1   0.16901 0.7665  0.849   
## Residual 12   2.64585                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca, by="mar", parallel = 10)
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data = metadata)
##          Df ChiSquare      F Pr(>F)  
## pH        1   0.17681 0.8019  0.833  
## C_org     1   0.22899 1.0385  0.447  
## C         1   0.16358 0.7419  0.885  
## K         1   0.18311 0.8305  0.752  
## NH4       1   0.26038 1.1809  0.225  
## NO3       1   0.17298 0.7845  0.797  
## temp      1   0.30376 1.3777  0.026 *
## P         1   0.16901 0.7665  0.837  
## Residual 12   2.64585                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif.cca(cca)
##       pH    C_org        C        K      NH4      NO3     temp        P 
## 4.650906 2.856380 3.301995 2.502405 3.336293 3.941545 1.517910 2.196035

0.5.2 Try another CCA models

Reduce model

cca_w <- vegan::cca(otus.ps.vegan ~  pH + C_org + C + NH4 + NO3 + temp, data=metadata)

kate.ggcca.sites(cca_w)

0.5.2.1 Stats

anova(cca_w, parallel = 10)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + NH4 + NO3 + temp, data = metadata)
##          Df ChiSquare      F Pr(>F)   
## Model     6    1.9110 1.4649  0.002 **
## Residual 14    3.0439                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca_w, by="terms", parallel = 10)
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + NH4 + NO3 + temp, data = metadata)
##          Df ChiSquare      F Pr(>F)   
## pH        1   0.38347 1.7638  0.002 **
## C_org     1   0.38090 1.7519  0.006 **
## C         1   0.28553 1.3133  0.066 . 
## NH4       1   0.37965 1.7462  0.004 **
## NO3       1   0.17677 0.8130  0.724   
## temp      1   0.30466 1.4013  0.023 * 
## Residual 14   3.04386                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca_w, by="mar", parallel = 10)
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + NH4 + NO3 + temp, data = metadata)
##          Df ChiSquare      F Pr(>F)   
## pH        1   0.33436 1.5378  0.010 **
## C_org     1   0.27531 1.2663  0.111   
## C         1   0.22773 1.0474  0.362   
## NH4       1   0.37050 1.7041  0.005 **
## NO3       1   0.22242 1.0230  0.466   
## temp      1   0.30466 1.4013  0.034 * 
## Residual 14   3.04386                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif.cca(cca_w)
##       pH    C_org        C      NH4      NO3     temp 
## 2.342850 2.070081 2.528777 2.173317 3.195453 1.513385

0.5.3 CCA for taxons - family level

ps.ff <-  phyloseq::filter_taxa(ps.f, function(x) sum(x > 3) > (0.1*length(x)), TRUE)
ps.varstab <- ps_vst(ps.ff, "Repeat")
ps.varstab.merged <- phyloseq::merge_samples(ps.varstab, "Repeat")
ps.varstab.merged.family <- tax_glom(ps.varstab.merged, taxrank="Family")

physeq <- ps.varstab.merged.family

veganifyOTU <- function(physeq){
  require(phyloseq)
  if(taxa_are_rows(physeq)){physeq <- t(physeq)}
  return(as(otu_table(physeq), "matrix"))
}

otus.ps.vegan <- veganifyOTU(physeq)
metadata <- as(sample_data(physeq), "data.frame") 
cca_w_varstab_family <- vegan::cca(otus.ps.vegan ~  temp + pH + C_org + NH4,  data=metadata)

kate.ggcca.species <- function(vare.cca, physeq){

  biplot <- as.data.frame(vare.cca$CCA$biplot)
  
  wa <- as.data.frame(vare.cca$CCA$v) %>% 
    rownames_to_column("ID")
  seq <- wa
  f <- as.data.frame(physeq@tax_table@.Data) %>% 
    dplyr::select("Family") %>% 
    rownames_to_column("ID")
  
  wa <- full_join(wa, f) %>% 
    dplyr::select(-ID) %>% 
    dplyr::rename(Label = Family ) %>% 
    mutate(Score = "sites")
  
  biplot <- rownames_to_column(biplot, "Label") %>% 
    add_column(Score = rep("biplot", length(rownames(biplot))))
  fdat_amazing <- rbind(biplot, wa) %>% 
    mutate(Label = as.factor(Label))
  
  p <- ggplot(fdat_amazing %>% filter(Score %in% c("sites","biplot"))) + 
    geom_point(data = fdat_amazing %>% dplyr::filter(Score == "sites"), mapping = aes(x=CCA1, y=CCA2, colour = factor(Score))) + 
    geom_segment(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), aes(x = 0, xend = CCA1, y = 0, yend = CCA2), alpha=0.8, color = "red",arrow = arrow(angle = 3)) +
    geom_text_repel(aes(x=CCA1, y=CCA2, label= Label),size=4) + 
    theme(legend.position = "none", panel.background = element_rect(fill = "white", colour = "grey50"))
    # geom_mark_ellipse(aes(group = Site, label = Site), label.fontsize = 10, label.buffer = unit(2, "mm"), label.minwidth = unit(5, "mm"),con.cap = unit(0.1, "mm"))
    return(p)
}

kate.ggcca.species(cca_w_varstab_family, physeq) 

0.5.4 ASV level

ps.ff <-  phyloseq::filter_taxa(ps.f, function(x) sum(x > 3) > (0.1*length(x)), TRUE)
ps.varstab <- ps_vst(ps.ff, "Repeat")
ps.varstab.merged <- phyloseq::merge_samples(ps.varstab, "Repeat")
physeq <- ps.varstab.merged

veganifyOTU <- function(physeq){
  require(phyloseq)
  if(taxa_are_rows(physeq)){physeq <- t(physeq)}
  return(as(otu_table(physeq), "matrix"))
}


otus.ps.vegan <- veganifyOTU(physeq)
metadata <- as(sample_data(physeq), "data.frame") 

# different vars of fichas
# cca_w_varstab_asv <- vegan::cca(otus.ps.vegan ~  Humidity + pH + C_org + C +  K + NH4 + NO3 + temp,  data=metadata)
# cca_w_varstab_asv <- vegan::cca(otus.ps.vegan ~  pH + C_org + NH4 + temp,  data=metadata)

cca_w_varstab_asv <- vegan::cca(otus.ps.vegan ~  pH + C_org + C +  K + NH4 + NO3 + temp + P,  data=metadata)

wa <- as.data.frame(cca_w_varstab_asv$CCA$v) %>% 
  rownames_to_column("ID")

taxa.pruned <- as.data.frame(ps.varstab@tax_table@.Data) %>% 
  rownames_to_column("ID")

taxa.pruned <- taxa.pruned %>%  
  mutate_all(as.character)

# old.tips <- tree$tip.label
# matches <- regmatches(tree$tip.label, gregexpr("[[:digit:]]+", tree$tip.label))
# taxa.pruned$number <- as.numeric(unlist(matches))
# Shitty taxa formating part

taxa.pruned$taxa <- ifelse(is.na(taxa.pruned$Genus),
                            ifelse(is.na(taxa.pruned$Family),
                            ifelse(is.na(taxa.pruned$Order), 
                            ifelse(is.na(taxa.pruned$Class), taxa.pruned$Phylum, taxa.pruned$Class) , taxa.pruned$Order), taxa.pruned$Family), taxa.pruned$Genus)
taxa.pruned[taxa.pruned == "Burkholderia-Caballeronia-Paraburkholderia"] <- "Burkholderia"
taxa.pruned[taxa.pruned == "Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium"] <- "Pararhizobium"

taxa.pruned$taxa2 <- ifelse(is.na(taxa.pruned$Species),
                            with(taxa.pruned, paste0(taxa)),
                            with(taxa.pruned, paste0(taxa, " ", Species )))
taxa.pruned$phylum <- ifelse(taxa.pruned$Phylum == "Proteobacteria", with(taxa.pruned, paste0(taxa.pruned$Class)), with(taxa.pruned, paste0(taxa.pruned$Phylum)))
taxa.pruned$Label <- paste0(taxa.pruned$ID, "_", taxa.pruned$taxa2)

wa <- full_join(wa, taxa.pruned, by="ID") %>% 
  mutate(Score = "sites")

#For the top 50 most abundant taxa, skip if use des res
sw <- summarize_all(as.data.frame(ps.varstab@otu_table), sum)
head_asv <- as_tibble(t(sw), rownames = "ID") %>% 
  arrange(desc(V1)) %>% top_n(n = 100) %>% 
  pull(ID)

wa <- filter(wa, ID %in% head_asv)
# maybe only different by des? Picture will be nice.
# wa <- filter(wa, ID %in% row.des)
# wa <- filter(wa, ID %in% row.des.so)

biplot <- as.data.frame(cca_w_varstab_asv$CCA$biplot)
biplot <- rownames_to_column(biplot, "Label") %>% 
  add_column(Score = rep("biplot", length(rownames(biplot))))

fdat_amazing <- plyr::rbind.fill(biplot, wa) %>% 
  mutate(Label = as.factor(Label))
fdat_amazing <- fdat_amazing %>%
  mutate(label_size = ifelse(Score == "biplot", 4.5, 3))

p.cca.species <- ggplot(fdat_amazing %>% filter(Score %in% c("sites","biplot"))) + 
  geom_point(data = fdat_amazing %>% dplyr::filter(Score == "sites"), mapping = aes(x=CCA1, y=CCA2)) + 
  geom_segment(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), aes(x = 0, xend = CCA1, y = 0, yend = CCA2), alpha=0.8, color = "red", arrow = arrow(angle = 3)) +
  geom_text_repel(aes(x=CCA1, y=CCA2, label= Label, colour = phylum), size=fdat_amazing$label_size) + 
  xlab(paste0(round(cca$CCA$eig[1], 3)*100, "% CCA1")) +
  ylab(paste0(round(cca$CCA$eig[2], 3)*100, "% CCA2")) +
  grids(linetype = "dashed") +
  geom_vline(xintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
  geom_hline(yintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
  theme(legend.position = "top", panel.background = element_rect(fill = "white", colour = "grey50"))


p.cca.species

0.5.4.1 Stats

anova(cca_w_varstab_asv, parallel = 10)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data = metadata)
##          Df ChiSquare      F Pr(>F)    
## Model     8  0.146738 2.6301  0.001 ***
## Residual 12  0.083686                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca_w_varstab_asv, by="terms", parallel = 10)
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data = metadata)
##          Df ChiSquare      F Pr(>F)    
## pH        1  0.031458 4.5109  0.001 ***
## C_org     1  0.024206 3.4709  0.002 ** 
## C         1  0.022726 3.2588  0.002 ** 
## K         1  0.021816 3.1282  0.003 ** 
## NH4       1  0.019506 2.7970  0.002 ** 
## NO3       1  0.007373 1.0573  0.369    
## temp      1  0.013866 1.9883  0.038 *  
## P         1  0.005786 0.8297  0.630    
## Residual 12  0.083686                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca_w_varstab_asv, by="mar", parallel = 10)
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = otus.ps.vegan ~ pH + C_org + C + K + NH4 + NO3 + temp + P, data = metadata)
##          Df ChiSquare      F Pr(>F)  
## pH        1  0.006969 0.9993  0.449  
## C_org     1  0.012179 1.7463  0.056 .
## C         1  0.006591 0.9451  0.462  
## K         1  0.013264 1.9020  0.036 *
## NH4       1  0.012674 1.8174  0.044 *
## NO3       1  0.007950 1.1399  0.305  
## temp      1  0.013969 2.0031  0.046 *
## P         1  0.005786 0.8297  0.600  
## Residual 12  0.083686                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif.cca(cca_w_varstab_asv)
##       pH    C_org        C        K      NH4      NO3     temp        P 
## 4.293374 2.760598 3.263601 2.449316 3.376127 3.765782 1.543511 2.181794

0.5.5 The last CCA plot

wa <- as.data.frame(cca_w_varstab_asv$CCA$v) %>% 
  rownames_to_column("ID")

taxa.pruned <- as.data.frame(ps.varstab@tax_table@.Data) %>% 
  rownames_to_column("ID")
taxa.pruned <- taxa.pruned %>%  mutate_all(as.character)
taxa.pruned$phylum_pro <- ifelse(taxa.pruned$Phylum == "Proteobacteria", with(taxa.pruned, paste0(taxa.pruned$Class)), with(taxa.pruned, paste0(taxa.pruned$Phylum)))

wa <- full_join(wa, taxa.pruned, by="ID") %>% 
  mutate(Score = "sites") %>% 
  dplyr::rename(Label = ID)

biplot <- as.data.frame(cca_w_varstab_asv$CCA$biplot)
biplot <- rownames_to_column(biplot, "Label") %>% 
  add_column(Score = rep("biplot", length(rownames(biplot))))


fdat_amazing <- plyr::rbind.fill(biplot, wa) %>% 
  mutate(Label = as.factor(Label))

p.cca.too.much <- ggplot(fdat_amazing %>% filter(Score %in% c("sites","biplot"))) + 
  geom_point(data = fdat_amazing %>% dplyr::filter(Score == "sites"), mapping = aes(x=CCA1, y=CCA2, color = phylum_pro)) + 
  geom_segment(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), aes(x = 0, xend = CCA1, y = 0, yend = CCA2), alpha=0.8, color = "red",arrow = arrow(angle = 3)) +
  geom_label(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), mapping = aes(x=CCA1, y=CCA2, label=Label)) +
  theme(legend.position = "top", panel.background = element_rect(fill = "white", colour = "grey50")) 
  # geom_mark_ellipse(aes(group = Site, label = Site), label.fontsize = 10, label.buffer = unit(2, "mm"), label.minwidth = unit(5, "mm"),con.cap = unit(0.1, "mm"))
  
p.cca.too.much

0.5.6 Plot of geographic map

library(rnaturalearth)
library(rnaturalearthdata)
library(tidyverse)
library(sf)
library(rgeos)
library(ggrepel)

world <- ne_countries(scale = "large", returnclass = "sf")
spdf_france_geounit <- ne_states(geounit = 'russia', returnclass = "sf")
probdata = tibble(site = c("Quarry on Chernozem", "Quarry on Gleyic"),
                  longitude = c(43.51137, 43.77252), 
                  latitude =  c(43.82965, 43.35022), 
                  xlabel = c(40, 47), 
                  ylabel = c(44.3, 43))

 ggplot(data = world) + 
  geom_sf() +
  geom_sf(data = spdf_france_geounit, fill="green",  alpha = 0.1 ) +
  # geom_sf(data = lakes ) +
  coord_sf(xlim = c(35, 50), ylim = c(40, 50), expand = FALSE) +
  geom_point(data = probdata, aes(x = longitude, y = latitude), size = 3, 
  shape = 20, colour = "red") +
  geom_text(data = probdata, aes(x = xlabel, y = ylabel, label = site), size = 5, colour="red") + 
  theme_bw() 

0.6 Venn

0.6.1 Custom venns by ggVenn package

library(ggVennDiagram)


get_list_from_ps <- function(physeq, amaz_fac) {
  my_keys <- levels(sample_data(physeq)[[amaz_fac]])
  mylist <- vector(mode="list", length=length(my_keys))
  names(mylist) <- my_keys
  for (i in levels(sample_data(physeq)[[amaz_fac]])) { 
    x <- prune_samples(sample_data(physeq)[[amaz_fac]] %in% i, physeq)
    x <- prune_taxa(taxa_sums(x) > 0, x)
    mylist[[i]] <- taxa_names(x)
  }
  return(mylist)
}

physeq <- ps.f

ggVennDiagram(get_list_from_ps(physeq, "What")) + 
  ggplot2::scale_fill_viridis_c(option = "D", begin=0.2)

0.6.2 Ampvis2 venns

amp.cc <- phyloseq_to_amp(ps.cc)

p.venn.cc <- amp_venn(amp.cc,
         group_by = "Quarry_or_Benchmark",
         normalise = TRUE,
         cut_a = 0.000001,
         cut_f=0.000001,
            )

p.venn.cc

0.7 philR

0.7.1 Progress

library(philr)
library(glmnet)

GP <- ps.cc
GP.f <-  phyloseq::filter_taxa(GP, function(x) sum(x > 3) > (0.1*length(x)), TRUE)
GP <- GP.f
phy_tree(GP) <- makeNodeLabel(phy_tree(GP), method="number", prefix='n')
GP <- transform_sample_counts(GP, function(x) x+1)

otu.table <- otu_table(GP)
tree <- phy_tree(GP)
metadata <- sample_data(GP)
tax <- tax_table(GP)
# is.binary(tree)
# is.rooted(tree)

gp.philr <- philr(otu.table, tree, 
                  part.weights='enorm.x.gm.counts', 
                  ilr.weights='blw.sqrt')

gp.dist <- dist(gp.philr, method="euclidean")
glmmod <- glmnet(gp.philr, sample_data(GP)$Quarry_or_Benchmark , alpha=1, family="binomial")

cv.res <- cv.glmnet(gp.philr, as.double(sample_data(GP)$Quarry_or_Benchmark), nfolds = 3, family = "binomial", type.measure = "mse")

top.coords <- as.matrix(coefficients(glmmod, cv.res$lambda.min))
top.coords <- rownames(top.coords)[which(top.coords != 0)]
top.coords <- top.coords[2:length(top.coords)]
tc.names <- sapply(top.coords, function(x) name.balance(tree, tax, x))
tc.names
##                                                 n35 
##                    "Genus_Massilia/Species_timonae" 
##                                                n113 
##                "Species_aerolata/Genus_Skermanella" 
##                                                n221 
## "Family_Gemmatimonadaceae/Family_Gemmatimonadaceae" 
##                                                n367 
##                       "Order_Blfdi19/Genus_P3OB-42" 
##                                                n549 
##           "Genus_Lechevalieria/Genus_Lechevalieria" 
##                                                n583 
##                 "Order_Bacillales/Order_Bacillales" 
##                                               n1043 
## "Family_Gemmatimonadaceae/Family_Gemmatimonadaceae" 
##                                               n1136 
##   "Family_Chitinophagaceae/Family_Chitinophagaceae" 
##                                               n1150 
##   "Family_Chitinophagaceae/Family_Chitinophagaceae"
gp.philr.long <- convert_to_long(gp.philr, get_variable(GP, 'Quarry_or_Benchmark')) %>%
  filter(coord %in% top.coords)

# gp.philr.long$taxa <- paste0("n",str_sub(gp.philr.long$coord, 2)) %>% 
#   as.factor() %>% 
#   recode(!!!tc.names)

balance_taxa <- levels(gp.philr.long$taxa)

library(naturalsort)

ggplot(gp.philr.long, aes(x=labels, y=value)) +
  geom_boxplot(fill='lightgrey') +
  facet_grid(.~coord, scales='free_x') +
  xlab('Quarry or Benchmark') + ylab('Balance Value') +
  theme_bw() 

library(ggtree)

plot_philr <- function(){
  
  tc.nn <- name.to.nn(tree, top.coords)
  tc.colors <- scales::viridis_pal()(length(top.coords))
  p <- ggtree(tree, layout='fan')
  top.seq <- seq(1, length(top.coords))
  
  for (i in top.seq){
    p <- p + geom_balance(node=tc.nn[i], fill=tc.colors[i], alpha=0.6)
  }
  
  for (i in top.coords){
    p <- annotate_balance(tree, i, p=p,
                          labels = c(paste0(i, "+"),
                                     paste0(i, "-")),
                   offset.text=0.15,
                   bar=FALSE)
  }
  
  p
  
}   

plot_philr()

votes <- philr::name.balance(tree, tax, 'n583', return.votes = c('up', 'down'))
message("up.votes")
sort(votes[[c('up.votes', 'Order')]])
## Bacillales 
##          2
message("down.votes")
sort(votes[[c('down.votes', 'Order')]])
## Bacillales 
##          1

0.7.2 Urvan

library(philr)
library(glmnet)

GP <- ps.nc
GP.f <-  phyloseq::filter_taxa(GP, function(x) sum(x > 3) > (0.1*length(x)), TRUE)
GP <- GP.f
phy_tree(GP) <- makeNodeLabel(phy_tree(GP), method="number", prefix='n')
GP <- transform_sample_counts(GP, function(x) x+1)

otu.table <- otu_table(GP)
tree <- phy_tree(GP)
metadata <- sample_data(GP)
tax <- tax_table(GP)
# is.binary(tree)
# is.rooted(tree)

gp.philr <- philr(otu.table, tree, 
                  part.weights='enorm.x.gm.counts', 
                  ilr.weights='blw.sqrt')

gp.dist <- dist(gp.philr, method="euclidean")
glmmod <- glmnet(gp.philr, sample_data(GP)$Quarry_or_Benchmark , alpha=1, family="binomial")

cv.res <- cv.glmnet(gp.philr, as.double(sample_data(GP)$Quarry_or_Benchmark), nfolds = 3, family = "binomial", type.measure = "mse")


top.coords <- as.matrix(coefficients(glmmod, cv.res$lambda.min))
top.coords <- rownames(top.coords)[which(top.coords != 0)]
top.coords <- top.coords[2:length(top.coords)]
tc.names <- sapply(top.coords, function(x) name.balance(tree, tax, x))
tc.names
##                                                    n7 
## "Family_Nitrososphaeraceae/Family_Nitrososphaeraceae" 
##                                                   n23 
## "Family_Nitrososphaeraceae/Family_Nitrososphaeraceae" 
##                                                  n115 
##               "Genus_Sphingomonas/Genus_Sphingomonas" 
##                                                  n334 
##                   "Genus_Nitrospira/Genus_Nitrospira" 
##                                                  n358 
##          "Order_Gaiellales/Order_Solirubrobacterales" 
##                                                  n386 
##                   "Order_Gaiellales/Order_Gaiellales" 
##                                                  n387 
##                      "Order_Gaiellales/Genus_Gaiella" 
##                                                  n612 
##            "Family_Microbacteriaceae/Genus_Agromyces" 
##                                                  n742 
##                        "Genus_Bacillus/Class_Bacilli" 
##                                                  n787 
##                   "Kingdom_Bacteria/Kingdom_Bacteria" 
##                                                  n931 
##         "Order_Rokubacteriales/Order_Rokubacteriales" 
##                                                 n1040 
##             "Class_Planctomycetes/Genus_Pir4_lineage" 
##                                                 n1125 
##       "Phylum_Acidobacteriota/Phylum_Acidobacteriota"
gp.philr.long <- convert_to_long(gp.philr, get_variable(GP, 'Quarry_or_Benchmark')) %>%
  filter(coord %in% top.coords)

# gp.philr.long$taxa <- paste0("n",str_sub(gp.philr.long$coord, 2)) %>% 
#   as.factor() %>% 
#   recode(!!!tc.names)

balance_taxa <- levels(gp.philr.long$taxa)

library(naturalsort)
ggplot(gp.philr.long, aes(x=labels, y=value)) +
  geom_boxplot(fill='lightgrey') +
  facet_grid(.~coord, scales='free_x') +
  xlab('Quarry or Benchmark') + ylab('Balance Value') +
  theme_bw() 

plot_philr()

Some trick to understand what going on - philr output is not easy walk.

votes <- name.balance(tree, tax, 'n358', return.votes = c('up', 'down'))
message("up.votes")
sort(votes[[c('up.votes', 'Order')]])
## Gaiellales 
##         33
message("down.votes")
sort(votes[[c('down.votes', 'Order')]])
## Solirubrobacterales 
##                  58

n358 - splitting inside Actinomycetotas order Thermoleophilia. Solirubrobacterales more quarry specific than Gaiellales.
Visualisations of abundance inside Thermoleophilia order.

ps.merged <- phyloseq::merge_samples(GP, "Quarry_or_Benchmark")
ps.merged@sam_data$Quarry_or_Benchmark <- as.factor(c("Benchmark", "Quarry"))

colorspalette <- c("#fde725", "#35b779")

part_tree_ph <- function(order){
  ps.strange <- prune_taxa(
    rownames(filter(
      as.data.frame(ps.merged@tax_table@.Data),Class==order)),
    ps.merged)
  p <- plot_tree(ps.strange,
            ladderize="FALSE",
            color="Quarry_or_Benchmark", 
            label.tips="Genus",
            size="abundance",
            sizebase=2,
            plot.margin=0.5,
            base.spacing=0.03) + 
    scale_color_manual(values=c(colorspalette))
  return(p)
}

part_tree_ph("Thermoleophilia")

ps.strange <- prune_taxa(
    rownames(filter(
      as.data.frame(GP@tax_table@.Data),Class=="Thermoleophilia")),
    ps.merged)

amp <- phyloseq_to_amp(ps.strange)
amp_heatmap(amp, tax_show = 30, group_by = "Quarry_or_Benchmark", tax_aggregate = "Genus", tax_class = "Proteobacteria", tax_add = "Order") + theme_bw() + theme(text = element_text(size=15), legend.position = "none") + theme(axis.text.x=element_text(angle=45,hjust=1))

0.8 Package versions

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggtree_2.0.4                naturalsort_0.1.3          
##  [3] glmnet_4.1-3                Matrix_1.4-0               
##  [5] philr_1.12.0                ggVennDiagram_1.2.0        
##  [7] rgeos_0.5-9                 sp_1.4-6                   
##  [9] sf_1.0-5                    rnaturalearthdata_0.1.0    
## [11] rnaturalearth_0.1.0         picante_1.8.2              
## [13] nlme_3.1-155                ape_5.6-1                  
## [15] vegan_2.5-7                 lattice_0.20-45            
## [17] permute_0.9-5               ampvis2_2.6.4              
## [19] DESeq2_1.26.0               SummarizedExperiment_1.16.1
## [21] DelayedArray_0.12.3         BiocParallel_1.20.1        
## [23] matrixStats_0.61.0          Biobase_2.46.0             
## [25] GenomicRanges_1.38.0        GenomeInfoDb_1.22.1        
## [27] ggforce_0.3.3               ggrepel_0.9.1              
## [29] ggpubr_0.4.0                forcats_0.5.1              
## [31] stringr_1.4.0               dplyr_1.0.7                
## [33] purrr_0.3.4                 readr_2.1.1                
## [35] tidyr_1.1.4                 tibble_3.1.6               
## [37] ggplot2_3.3.5               tidyverse_1.3.1            
## [39] phyloseq_1.30.0             Biostrings_2.54.0          
## [41] XVector_0.26.0              IRanges_2.20.2             
## [43] S4Vectors_0.24.4            BiocGenerics_0.32.0        
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2               tidyselect_1.1.1         RSQLite_2.2.9           
##   [4] AnnotationDbi_1.48.0     htmlwidgets_1.5.4        grid_3.6.3              
##   [7] munsell_0.5.0            units_0.7-2              codetools_0.2-18        
##  [10] withr_2.4.3              colorspace_2.0-2         highr_0.9               
##  [13] knitr_1.37               rstudioapi_0.13          wk_0.6.0                
##  [16] ggsignif_0.6.3           labeling_0.4.2           GenomeInfoDbData_1.2.2  
##  [19] polyclip_1.10-0          bit64_4.0.5              farver_2.1.0            
##  [22] rhdf5_2.30.1             treeio_1.10.0            coda_0.19-4             
##  [25] vctrs_0.3.8              generics_0.1.1           xfun_0.29               
##  [28] R6_2.5.1                 doParallel_1.0.16        RVenn_1.1.0             
##  [31] locfit_1.5-9.4           bitops_1.0-7             cachem_1.0.6            
##  [34] assertthat_0.2.1         scales_1.1.1             vroom_1.5.7             
##  [37] nnet_7.3-16              gtable_0.3.0             phangorn_2.7.1          
##  [40] rlang_0.4.12             genefilter_1.68.0        splines_3.6.3           
##  [43] rgdal_1.5-28             rstatix_0.7.0            lazyeval_0.2.2          
##  [46] broom_0.7.11             checkmate_2.0.0          BiocManager_1.30.16     
##  [49] s2_1.0.7                 yaml_2.2.1               reshape2_1.4.4          
##  [52] abind_1.4-5              modelr_0.1.8             backports_1.4.1         
##  [55] Hmisc_4.6-0              tools_3.6.3              statnet.common_4.5.0    
##  [58] ellipsis_0.3.2           jquerylib_0.1.4          biomformat_1.14.0       
##  [61] RColorBrewer_1.1-2       proxy_0.4-26             ggnet_0.1.0             
##  [64] Rcpp_1.0.8               plyr_1.8.6               base64enc_0.1-3         
##  [67] zlibbioc_1.32.0          classInt_0.4-3           RCurl_1.98-1.5          
##  [70] rpart_4.1-15             cowplot_1.1.1            haven_2.4.3             
##  [73] cluster_2.1.2            fs_1.5.2                 magrittr_2.0.1          
##  [76] data.table_1.14.2        reprex_2.0.1             rnaturalearthhires_0.2.0
##  [79] hms_1.1.1                patchwork_1.1.1          evaluate_0.14           
##  [82] xtable_1.8-4             XML_3.99-0.3             jpeg_0.1-9              
##  [85] readxl_1.3.1             shape_1.4.6              gridExtra_2.3           
##  [88] compiler_3.6.3           KernSmooth_2.23-20       crayon_1.4.2            
##  [91] htmltools_0.5.2          mgcv_1.8-38              tzdb_0.2.0              
##  [94] Formula_1.2-4            geneplotter_1.64.0       lubridate_1.8.0         
##  [97] DBI_1.1.2                tweenr_1.0.2             dbplyr_2.1.1            
## [100] MASS_7.3-54              ade4_1.7-18              car_3.0-12              
## [103] cli_3.1.0                quadprog_1.5-8           igraph_1.2.11           
## [106] pkgconfig_2.0.3          usedist_0.4.0.9000       rvcheck_0.1.8           
## [109] foreign_0.8-76           plotly_4.10.0            xml2_1.3.3              
## [112] foreach_1.5.1            annotate_1.64.0          bslib_0.3.1             
## [115] multtest_2.42.0          rvest_1.0.2              yulab.utils_0.0.4       
## [118] digest_0.6.29            fastmatch_1.1-3          rmarkdown_2.11          
## [121] cellranger_1.1.0         tidytree_0.3.7           htmlTable_2.4.0         
## [124] lifecycle_1.0.1          jsonlite_1.7.3           Rhdf5lib_1.8.0          
## [127] carData_3.0-5            network_1.17.1           viridisLite_0.4.0       
## [130] fansi_1.0.2              pillar_1.6.4             fastmap_1.1.0           
## [133] httr_1.4.2               survival_3.2-13          glue_1.6.0              
## [136] png_0.1-7                iterators_1.0.13         bit_4.0.4               
## [139] class_7.3-19             stringi_1.7.6            sass_0.4.0              
## [142] blob_1.2.2               latticeExtra_0.6-29      memoise_2.0.1           
## [145] e1071_1.7-9