0.1 libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(phyloseq)
library(ampvis2)

0.1.1 import

Сырые данные - выход из DADA2 пайплайна.
В той же папке - статистика dada2

Импорт уже готовых RDS объектов

setwd("~/storage/pulkovo/d2")


ps.ff <- readRDS("ps.ff")
ancom <- readRDS("ancom")
ancom_ff <- readRDS("ancom_ff")

0.1.2 functions

Функции используемые для дальнейших визуализаций.

plot_rich_reads_samlenames_lm <- function(physeq, group = "Site", label = "Repeat"){
  rish <- estimate_richness(physeq, measures = "Observed")
  reads.sum <- as.data.frame(sample_sums(physeq))
  reads.summary <- cbind(rish, reads.sum)
  colnames(reads.summary) <- c("otus","reads")
  reads.summary["Repeat"] <-unlist(purrr::map(stringr::str_split(rownames(physeq@sam_data), "\\.", 2), function(x) x[[2]]))
  reads.summary["Site"] <- physeq@sam_data[[group]]
  library(ggrepel)
  require(ggforce)
  p1 <- ggplot(data=reads.summary) + 
    geom_point(aes(y=otus, x=log2(reads), color=Site),size=3) + 
    geom_text_repel(aes(y=otus, x=log2(reads), label=paste0(Repeat))) + 
    theme_bw() +
    geom_smooth(aes(y=otus, x=log2(reads), fill=Site, color=Site),method=lm, se=FALSE, ymin = 1) + 
    scale_x_continuous(sec.axis = sec_axis(sec.axis ~ 2**.)) 

  return(p1)
}

beta_custom_norm_NMDS_elli_w <- function(ps, seed = 7888, normtype="vst", Color="What", Group="Repeat"){
  require(phyloseq)
  require(ggplot2)
  require(ggpubr)
  library(ggforce)
  
  
  ordination.b <- ordinate(ps, "NMDS", "bray")
  mds <- as.data.frame(ordination.b$points)
  p  <-  plot_ordination(ps,
                         ordination.b,
                         type="sample",
                         color = Color,
                         title="NMDS - Bray-Curtis",
                         # title=NULL,
                         axes = c(1,2) ) + 
    theme_bw() + 
    theme(text = element_text(size = 10)) + 
    geom_point(size = 3) +
    annotate("text",
    x=min(mds$MDS1) + abs(min(mds$MDS1))/7,
    y=max(mds$MDS2),
    label=paste0("Stress -- ", round(ordination.b$stress, 3))) +
    geom_mark_ellipse(aes_string(group = Group, label = Group),
                      label.fontsize = 10,
                      label.buffer = unit(2, "mm"),
                      label.minwidth = unit(5, "mm"),
                      con.cap = unit(0.1, "mm"),
                      con.colour='gray') +
    theme(legend.position = "none") +
    ggplot2::scale_fill_viridis_c(option = "H")
  
  return(p)
}

plot_alpha_w_toc <- function(ps, group, metric) {
  
  require(phyloseq)
  require(ggplot2)
  
  ps_a <- prune_taxa(taxa_sums(ps) > 0, ps)
  
  er <- estimate_richness(ps_a)
  df_er <- cbind(ps_a@sam_data, er)
  df_er <- df_er %>% select(c(group, metric))
  stat.test <- aov(as.formula(paste0(metric, "~", group)), data = df_er) %>%
    rstatix::tukey_hsd()
  y <-  seq(max(er[[metric]]), length=length(stat.test$p.adj), by=max(er[[metric]]/20))

  plot_richness(ps_a, x=group, measures=metric) + 
    geom_boxplot() +
    geom_point(size=1.2, alpha=0.3) +
    ggpubr::stat_pvalue_manual(
      stat.test, 
      label = "p.adj.signif", 
      y.position = y) +
    theme_light() + 
    scale_color_brewer(palette="Dark2") +
    theme(axis.text.x = element_text(angle = 90),
          axis.title.x=element_blank()) +
    labs(y=paste(metric, "index")) 
}

phyloseq_to_ampvis2 <- function(physeq) {
  #check object for class
  if(!any(class(physeq) %in% "phyloseq"))
    stop("physeq object must be of class \"phyloseq\"", call. = FALSE)
  
  #ampvis2 requires taxonomy and abundance table, phyloseq checks for the latter
  if(is.null(physeq@tax_table))
    stop("No taxonomy found in the phyloseq object and is required for ampvis2", call. = FALSE)
  
  #OTUs must be in rows, not columns
  if(phyloseq::taxa_are_rows(physeq))
    abund <- as.data.frame(phyloseq::otu_table(physeq)@.Data)
  else
    abund <- as.data.frame(t(phyloseq::otu_table(physeq)@.Data))
  
  #tax_table is assumed to have OTUs in rows too
  tax <- phyloseq::tax_table(physeq)@.Data
  
  #merge by rownames (OTUs)
  otutable <- merge(
    abund,
    tax,
    by = 0,
    all.x = TRUE,
    all.y = FALSE,
    sort = FALSE
  )
  colnames(otutable)[1] <- "OTU"
  
  #extract sample_data (metadata)
  if(!is.null(physeq@sam_data)) {
    metadata <- data.frame(
      phyloseq::sample_data(physeq),
      row.names = phyloseq::sample_names(physeq), 
      stringsAsFactors = FALSE, 
      check.names = FALSE
    )
    
    #check if any columns match exactly with rownames
    #if none matched assume row names are sample identifiers
    samplesCol <- unlist(lapply(metadata, function(x) {
      identical(x, rownames(metadata))}))
    
    if(any(samplesCol)) {
      #error if a column matched and it's not the first
      if(!samplesCol[[1]])
        stop("Sample ID's must be in the first column in the sample metadata, please reorder", call. = FALSE)
    } else {
      #assume rownames are sample identifiers, merge at the end with name "SampleID"
      if(any(colnames(metadata) %in% "SampleID"))
        stop("A column in the sample metadata is already named \"SampleID\" but does not seem to contain sample ID's", call. = FALSE)
      metadata$SampleID <- rownames(metadata)
      
      #reorder columns so SampleID is the first
      metadata <- metadata[, c(which(colnames(metadata) %in% "SampleID"), 1:(ncol(metadata)-1L)), drop = FALSE]
    }
  } else
    metadata <- NULL
  
  #extract phylogenetic tree, assumed to be of class "phylo"
  if(!is.null(physeq@phy_tree)) {
    tree <- phyloseq::phy_tree(physeq)
  } else
    tree <- NULL
  
  #extract OTU DNA sequences, assumed to be of class "XStringSet"
  if(!is.null(physeq@refseq)) {
    #convert XStringSet to DNAbin using a temporary file (easiest)
    fastaTempFile <- tempfile(pattern = "ampvis2_", fileext = ".fa")
    Biostrings::writeXStringSet(physeq@refseq, filepath = fastaTempFile)
  } else
    fastaTempFile <- NULL
  
  #load as normally with amp_load
  ampvis2::amp_load(
    otutable = otutable,
    metadata = metadata,
    tree = tree,
    fasta = fastaTempFile
  )
}

filter_chrmitnas <- function(physeq){
  ps_object <- physeq 
  ps_object <- subset_taxa(ps_object, Phylum != "NA")
  
  # ps_object@tax_table[is.na(ps_object@tax_table)] <- TRUEps@tax_table@.Data
  ps_object <- subset_taxa(ps_object,
                           !(Family  == "Mitochondria" |
                               Class   == "Chloroplast" |
                               Order   == "Chloroplast"))
  # ps_object@tax_table <- dplyr::na_if(ps_object@tax_table, TRUE)
  ps.f <- ps_object
  
  out_old <- capture.output(physeq)[4] %>% 
    str_extract(regex("([1-9]).*(?= by)"))
  out_new <- capture.output(ps.f)[4] %>% 
    str_extract(regex("([1-9]).*(?= by)"))
  message(paste0("old = ", out_old))
  message(paste0("filtered = ", out_new))
  
  return(ps.f)
  
}

merge_samples_mean <- function(physeq, group){
  group_sums <- as.matrix(table(sample_data(physeq)[ ,group]))[,1]
  merged <- merge_samples(physeq, group)
  x <- as.matrix(otu_table(merged))
  if(taxa_are_rows(merged)){ x<-t(x) }
  out <- t(x/group_sums)
  out <- otu_table(out, taxa_are_rows = TRUE)
  otu_table(merged) <- out
  return(merged)
}

lsf.str()
## beta_custom_norm_NMDS_elli_w : function (ps, seed = 7888, normtype = "vst", Color = "What", Group = "Repeat")  
## filter_chrmitnas : function (physeq)  
## merge_samples_mean : function (physeq, group)  
## phyloseq_to_ampvis2 : function (physeq)  
## plot_alpha_w_toc : function (ps, group, metric)  
## plot_rich_reads_samlenames_lm : function (physeq, group = "Site", label = "Repeat")

Фильтрация образцов с низкой представленностью.
filter_chrmitnas - чистит датасет он NA на уровне фил, хлоропласты и митохондрии.

ps.f <- filter_chrmitnas(ps)

plot_rich_reads_samlenames_lm(ps.f, group = "Site", label = "Sample")  
## Warning: ggrepel: 37 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ps.ff <- prune_samples(!sample_names(ps.f) %in% paste0("Andronov.SEQ130.",c(34, 5, 15)), ps.f)
ps.ff  <- prune_taxa(taxa_sums(ps.ff) > 0, ps.ff)


plot_rich_reads_samlenames_lm(ps.ff, group = "Site", label = "Repeat") 
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

0.2 Beta

beta_custom_norm_NMDS_elli_w(ps.ff, Color = "Site", Group = "Site") 
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.040883 
## Run 1 stress 0.04088289 
## ... New best solution
## ... Procrustes: rmse 0.00005635666  max resid 0.0003171908 
## ... Similar to previous best
## Run 2 stress 0.0408829 
## ... Procrustes: rmse 0.0000564756  max resid 0.0003347198 
## ... Similar to previous best
## Run 3 stress 0.04088298 
## ... Procrustes: rmse 0.00004553785  max resid 0.0002334572 
## ... Similar to previous best
## Run 4 stress 0.04088293 
## ... Procrustes: rmse 0.00002476076  max resid 0.0001333129 
## ... Similar to previous best
## Run 5 stress 0.04088292 
## ... Procrustes: rmse 0.00001734785  max resid 0.00009972248 
## ... Similar to previous best
## Run 6 stress 0.04088298 
## ... Procrustes: rmse 0.00004594101  max resid 0.0002502187 
## ... Similar to previous best
## Run 7 stress 0.04088289 
## ... Procrustes: rmse 0.00005431418  max resid 0.0003273973 
## ... Similar to previous best
## Run 8 stress 0.0408829 
## ... Procrustes: rmse 0.00001471045  max resid 0.0000883607 
## ... Similar to previous best
## Run 9 stress 0.04088289 
## ... Procrustes: rmse 0.000005584187  max resid 0.00002531028 
## ... Similar to previous best
## Run 10 stress 0.04088294 
## ... Procrustes: rmse 0.00003328422  max resid 0.000185079 
## ... Similar to previous best
## Run 11 stress 0.04088294 
## ... Procrustes: rmse 0.00003144656  max resid 0.0001756437 
## ... Similar to previous best
## Run 12 stress 0.04088312 
## ... Procrustes: rmse 0.0000934482  max resid 0.0005028334 
## ... Similar to previous best
## Run 13 stress 0.040883 
## ... Procrustes: rmse 0.00005659016  max resid 0.000306771 
## ... Similar to previous best
## Run 14 stress 0.04088308 
## ... Procrustes: rmse 0.00008217108  max resid 0.0004411722 
## ... Similar to previous best
## Run 15 stress 0.04088298 
## ... Procrustes: rmse 0.00004980549  max resid 0.000275673 
## ... Similar to previous best
## Run 16 stress 0.04088293 
## ... Procrustes: rmse 0.00002461955  max resid 0.0001399969 
## ... Similar to previous best
## Run 17 stress 0.04088306 
## ... Procrustes: rmse 0.00007503539  max resid 0.0004052732 
## ... Similar to previous best
## Run 18 stress 0.1919038 
## Run 19 stress 0.0408831 
## ... Procrustes: rmse 0.00008681952  max resid 0.0004698693 
## ... Similar to previous best
## Run 20 stress 0.04088288 
## ... New best solution
## ... Procrustes: rmse 0.00001858335  max resid 0.0001015945 
## ... Similar to previous best
## *** Best solution repeated 1 times

То же, но с рарефакшеном.

ps.r <- rarefy_even_depth(ps.ff,rngseed = 1221)
beta_custom_norm_NMDS_elli_w(ps.r, Color = "Site", Group = "Site") 
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.03281294 
## Run 1 stress 0.03281294 
## ... New best solution
## ... Procrustes: rmse 0.00003300971  max resid 0.0001301202 
## ... Similar to previous best
## Run 2 stress 0.03281293 
## ... New best solution
## ... Procrustes: rmse 0.000001794148  max resid 0.00000636754 
## ... Similar to previous best
## Run 3 stress 0.03281305 
## ... Procrustes: rmse 0.00006464719  max resid 0.0002555404 
## ... Similar to previous best
## Run 4 stress 0.03281293 
## ... New best solution
## ... Procrustes: rmse 0.000003331408  max resid 0.00001223961 
## ... Similar to previous best
## Run 5 stress 0.03281296 
## ... Procrustes: rmse 0.00002995194  max resid 0.0001188467 
## ... Similar to previous best
## Run 6 stress 0.03281301 
## ... Procrustes: rmse 0.00005133915  max resid 0.0002033569 
## ... Similar to previous best
## Run 7 stress 0.032813 
## ... Procrustes: rmse 0.00005186784  max resid 0.0002055802 
## ... Similar to previous best
## Run 8 stress 0.03281301 
## ... Procrustes: rmse 0.00005504832  max resid 0.0002181154 
## ... Similar to previous best
## Run 9 stress 0.03281294 
## ... Procrustes: rmse 0.00001107934  max resid 0.00004250849 
## ... Similar to previous best
## Run 10 stress 0.03281294 
## ... Procrustes: rmse 0.00000897434  max resid 0.00002738668 
## ... Similar to previous best
## Run 11 stress 0.03281301 
## ... Procrustes: rmse 0.00004782254  max resid 0.0001885485 
## ... Similar to previous best
## Run 12 stress 0.03281304 
## ... Procrustes: rmse 0.00006446968  max resid 0.0002554987 
## ... Similar to previous best
## Run 13 stress 0.03281293 
## ... New best solution
## ... Procrustes: rmse 0.00001661198  max resid 0.00006464267 
## ... Similar to previous best
## Run 14 stress 0.03281305 
## ... Procrustes: rmse 0.00008591437  max resid 0.0003396772 
## ... Similar to previous best
## Run 15 stress 0.03281303 
## ... Procrustes: rmse 0.00007177757  max resid 0.000282788 
## ... Similar to previous best
## Run 16 stress 0.03281304 
## ... Procrustes: rmse 0.0000820854  max resid 0.0003245479 
## ... Similar to previous best
## Run 17 stress 0.03281295 
## ... Procrustes: rmse 0.00003182524  max resid 0.0001257578 
## ... Similar to previous best
## Run 18 stress 0.03281302 
## ... Procrustes: rmse 0.00007563761  max resid 0.0002991288 
## ... Similar to previous best
## Run 19 stress 0.03281297 
## ... Procrustes: rmse 0.00004755598  max resid 0.0001879429 
## ... Similar to previous best
## Run 20 stress 0.03281294 
## ... Procrustes: rmse 0.00002613964  max resid 0.0001031611 
## ... Similar to previous best
## *** Best solution repeated 8 times

0.3 alpha

без разрежения

p1 <- plot_alpha_w_toc(ps = ps.ff, group = "Site", metric = "Observed")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(group)
## 
##   # Now:
##   data %>% select(all_of(group))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(metric)
## 
##   # Now:
##   data %>% select(all_of(metric))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
p2 <- plot_alpha_w_toc(ps = ps.ff, group = "Site", metric = "Shannon")
p3 <- plot_alpha_w_toc(ps = ps.ff, group = "Site", metric = "InvSimpson")
ggarrange(p1, p2, p3, nrow = 1)

с разряжением

p1 <- plot_alpha_w_toc(ps = ps.r, group = "Site", metric = "Observed")
p2 <- plot_alpha_w_toc(ps = ps.r, group = "Site", metric = "Shannon")
p3 <- plot_alpha_w_toc(ps = ps.r, group = "Site", metric = "InvSimpson")
ggarrange(p1, p2, p3, nrow = 1)

0.4 amp simple

мажорные филотипы - относительная представленность

amp <- phyloseq_to_ampvis2(ps.ff)

amp
## ampvis2 object with 4 elements. 
## Summary of OTU table:
## [1]       51    10696   884451     8346    30720    16831 17342.18
## 
## Assigned taxonomy:
##       Kingdom        Phylum         Class         Order        Family 
##   10696(100%)   10696(100%)   10696(100%)   10696(100%) 10629(99.37%) 
##         Genus       Species 
##  6124(57.26%)    291(2.72%) 
## 
## Metadata variables: 10 
##  SampleID, Na_water, K_water, Cl, pH_salt, pH_water, C_total, Electro, Repeat, Site
amp_heatmap(amp, tax_show = 60, 
      group_by = "Site", 
      tax_aggregate = "OTU",
      tax_add = "Genus", 
      round =  2, 
      normalise=TRUE, 
      showRemainingTaxa = FALSE)
## Warning: Transformation introduced infinite values in discrete y-axis

уровень фил

названия фил не исправлены на

Oren, Aharon, and George M. Garrity. “Valid Publication of the Names of Forty-Two Phyla of Prokaryotes.” International Journal of Systematic and Evolutionary Microbiology 71, no. 10 (October 20, 2021). https://doi.org/10.1099/ijsem.0.005056. 

можно их исправить например при помощи чего-то похожего на код ниже:

taxa <- read_tsv("taxa.txt") %>% 
  column_to_rownames("...1") %>% 
  mutate(Phylum = str_replace_all(Phylum, c("Firmicutes" = "Bacillota",
                                            "Chloroflexi" = "Chloroflexota",
                                            "Proteobacteria" = "Pseudomonadota",
                                            "Desulfobacterota" = "Thermodesulfobacteriota",
                                            "Crenarchaeota" = "Thermoproteota",
                                            "Cyanobacteria" = "Cyanobacteriota"))) %>% 
  as.matrix() 
amp_heatmap(amp,
            tax_show = 12,
            group_by = "Site",
            tax_aggregate = "Phylum",
            tax_add = "Kingdom",
            normalise=TRUE,
            showRemainingTaxa = TRUE)

0.4.1 Soil chemestry and metadata

PERMANOVA - как химия влияет на микробиом

ps.merged <- merge_samples(ps.ff, "Repeat")
## Warning in asMethod(object): NAs introduced by coercion

## Warning in asMethod(object): NAs introduced by coercion
d_meta <- ps.merged@sam_data %>% 
  data.frame() %>% 
  select(c("Na.water", "K.water", "Cl", "pH.salt", "C.total", "Electro")) %>% 
  scale() %>% 
  as.data.frame()

d_otu <- phyloseq::distance(ps.merged, "bray")

dec <- sort(c("Na.water", "K.water", "Cl", "pH.salt", "C.total", "Electro"), decreasing = TRUE )
sort <- sort(c("Na.water", "K.water", "Cl", "pH.salt", "C.total", "Electro"), decreasing = FALSE )

f_dec <- as.formula(paste0("d_otu ~", noquote(paste(dec, collapse=" + "))))
f_sort <- as.formula(paste0("d_otu ~", noquote(paste(sort, collapse=" + "))))


list(
  order_forward = vegan::adonis2(f_dec, data = d_meta),
  order_backward = vegan::adonis2(f_sort, data = d_meta)
)
## $order_forward
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## vegan::adonis2(formula = f_dec, data = d_meta)
##          Df SumOfSqs      R2      F Pr(>F)    
## pH.salt   1  0.77383 0.46702 9.8349  0.001 ***
## Na.water  1  0.34096 0.20578 4.3334  0.022 *  
## K.water   1  0.13181 0.07955 1.6752  0.271    
## Electro   1  0.07642 0.04612 0.9713  0.495    
## Cl        1  0.11950 0.07212 1.5187  0.259    
## C.total   1  0.05705 0.03443 0.7251  0.616    
## Residual  2  0.15736 0.09497                  
## Total     8  1.65694 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $order_backward
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## vegan::adonis2(formula = f_sort, data = d_meta)
##          Df SumOfSqs      R2      F Pr(>F)    
## C.total   1  0.69264 0.41802 8.8030  0.001 ***
## Cl        1  0.31875 0.19237 4.0511  0.020 *  
## Electro   1  0.13994 0.08446 1.7785  0.202    
## K.water   1  0.16173 0.09761 2.0554  0.151    
## Na.water  1  0.07581 0.04575 0.9634  0.500    
## pH.salt   1  0.11072 0.06682 1.4072  0.293    
## Residual  2  0.15736 0.09497                  
## Total     8  1.65694 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

химия - электропроводность(мера засоленности)
сипользован Z-score!

ps.ff@sam_data %>% 
  data.frame() %>%
  # rownames_to_column("shit") %>% 
  # column_to_rownames("Name") %>% 
  select(which(sapply(., is.numeric)), Site) %>% 
  group_by(Site) %>% 
  summarise(across(everything(), mean),
            .groups = 'drop') %>% 
  column_to_rownames("Site") %>% 
  scale() %>% 
  heatmaply::ggheatmap()
## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust vegan

0.5 CCA

cca по сэмплам

library(ggrepel)

ps.merged <- phyloseq::merge_samples(ps.ff, "Repeat")
physeq <- ps.ff


veganifyOTU <- function(physeq){
  require(phyloseq)
  if(taxa_are_rows(physeq)){physeq <- t(physeq)}
  return(as(otu_table(physeq), "matrix"))
}

otus.ps.vegan <- veganifyOTU(physeq)
rownames(otus.ps.vegan) <- ps.ff@sam_data$Repeat
metadata <- physeq@sam_data %>% 
  data.frame() %>% 
  select(c("Na.water", "K.water", "Cl", "pH.salt", "C.total", "Electro"))

dec <- sort(c("Na.water", "K.water", "Cl", "pH.salt", "C.total", "Electro"), decreasing = TRUE )
sort <- sort(c("Na.water", "K.water", "Cl", "pH.salt", "C.total", "Electro"), decreasing = FALSE )

f_dec <- as.formula(paste0("otus.ps.vegan ~", noquote(paste(dec, collapse=" + "))))
f_sort <- as.formula(paste0("otus.ps.vegan ~", noquote(paste(sort, collapse=" + "))))


cca_dec <- vegan::cca(f_dec, data=metadata)
cca_sort <- vegan::cca(f_sort, data=metadata)

kate.ggcca.sites <- function(vare.cca){

library(tidyverse)
biplot <- as.data.frame(vare.cca$CCA$biplot)
wa <- as.data.frame(vare.cca$CCA$wa)

biplot <- rownames_to_column(biplot, "Label") %>% 
  add_column(Score = rep("biplot", length(rownames(biplot))))
wa <- rownames_to_column(wa, "Label") %>% 
  add_column(Score = rep("sites", length(rownames(wa))))
fdat_amazing <- rbind(biplot, wa) 
fdat_amazing <- fdat_amazing %>%
  mutate(label_size = ifelse(Score == "biplot", 4.5, 3))
 
p <- ggplot(fdat_amazing %>% filter(Score %in% c("sites","biplot"))) + 
  # geom_abline(intercept=seq(-100, 100, 25), slope=1, colour="grey", alpha=0.5) +
  # geom_abline(intercept=seq(-100, 100, 25), slope=-1, colour="grey", alpha=0.5) +
  geom_vline(xintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
  geom_hline(yintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
  geom_point(data = fdat_amazing %>% dplyr::filter(Score == "sites"), mapping = aes(x=CCA1, y=CCA2, colour = factor(Score))) + 
  geom_segment(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), aes(x = 0, xend = CCA1, y = 0, yend = CCA2),
               size = 0.6, alpha=0.8, color = "red",arrow = arrow(angle = 3)) +
  geom_text_repel(aes(x=CCA1, y=CCA2, label= Label),size=fdat_amazing$label_size) + 
  xlab(paste0(round(vare.cca$CCA$eig[1], 3)*100, "% CCA1")) +
  ylab(paste0(round(vare.cca$CCA$eig[2], 3)*100, "% CCA2")) +
  grids(linetype = "dashed") +
  theme(legend.position = "none", panel.background = element_rect(fill = "white", colour = "grey50"))
  return(p)
}

cca.sites <- kate.ggcca.sites(cca_dec)
cca.sites

тест на мультиколлинеарность

vegan::vif.cca(cca_dec)
##    pH.salt   Na.water    K.water    Electro         Cl    C.total 
##  12.164455  27.848814   1.678713 191.952968 215.325139  30.290554

ссa по филотипам - отрисованы только 100 мажорных филотипов - цвет - к какой филе относятся

library(ggrepel)

ps.fff <-  phyloseq::filter_taxa(ps.ff, function(x) sum(x > 3) > (0.1*length(x)), TRUE)
# ps.varstab <- ps_vst(ps.fff, "type")
ps.varstab <- ps.ff
# ps.varstab.merged <- phyloseq::merge_samples(ps.varstab, "Repeat")
# ps.varstab.merged.family <- tax_glom(ps.fff, taxrank="Family")

physeq <- ps.fff
ps.varstab <- physeq
veganifyOTU <- function(physeq){
  require(phyloseq)
  if(taxa_are_rows(physeq)){physeq <- t(physeq)}
  return(as(otu_table(physeq), "matrix"))
}

otus.ps.vegan <- veganifyOTU(physeq)
metadata <- as(sample_data(physeq), "data.frame") 
cca_w_varstab_asv <- vegan::cca(f_dec,  data=metadata)

wa <- as.data.frame(cca_w_varstab_asv$CCA$v) %>% 
  rownames_to_column("ID")

taxa.pruned <- as.data.frame(ps.varstab@tax_table@.Data) %>% 
  rownames_to_column("ID")

taxa.pruned <- taxa.pruned %>%  
  mutate_all(as.character)

# old.tips <- tree$tip.label
# matches <- regmatches(tree$tip.label, gregexpr("[[:digit:]]+", tree$tip.label))
# taxa.pruned$number <- as.numeric(unlist(matches))
# Shitty taxa formating part

taxa.pruned$taxa <- ifelse(is.na(taxa.pruned$Genus),
                            ifelse(is.na(taxa.pruned$Family),
                            ifelse(is.na(taxa.pruned$Order), 
                            ifelse(is.na(taxa.pruned$Class), taxa.pruned$Phylum, taxa.pruned$Class) , taxa.pruned$Order), taxa.pruned$Family), taxa.pruned$Genus)
taxa.pruned[taxa.pruned == "Burkholderia-Caballeronia-Paraburkholderia"] <- "Burkholderia"
taxa.pruned[taxa.pruned == "Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium"] <- "Pararhizobium"

taxa.pruned$taxa2 <- ifelse(is.na(taxa.pruned$Species),
                            with(taxa.pruned, paste0(taxa)),
                            with(taxa.pruned, paste0(taxa, " ", Species )))
taxa.pruned$phylum <- ifelse(taxa.pruned$Phylum == "Proteobacteria", with(taxa.pruned, paste0(taxa.pruned$Class)), with(taxa.pruned, paste0(taxa.pruned$Phylum)))
taxa.pruned$Label <- paste0(taxa.pruned$ID, "_", taxa.pruned$taxa2)

wa <- full_join(wa, taxa.pruned, by="ID") %>% 
  mutate(Score = "sites")

#For the top 50 most abundant taxa, skip if use des res
head_asv <- ps.varstab@otu_table@.Data %>% 
  data.frame %>% 
  colSums() %>% 
  sort(decreasing = TRUE) %>% 
  head(n=100) %>% 
  names()
  

wa <- filter(wa, ID %in% head_asv)
# maybe only different by des? Picture will be nice.
# wa <- filter(wa, ID %in% row.des)
# wa <- filter(wa, ID %in% row.des.so)

biplot <- as.data.frame(cca_w_varstab_asv$CCA$biplot)
biplot <- rownames_to_column(biplot, "Label") %>% 
  add_column(Score = rep("biplot", length(rownames(biplot))))

fdat_amazing <- plyr::rbind.fill(biplot, wa) %>% 
  
  mutate(Label = as.factor(Label))
fdat_amazing <- fdat_amazing %>%
  mutate(label_size = ifelse(Score == "biplot", 4.5, 3))

p.cca.species <- ggplot(fdat_amazing %>% filter(Score %in% c("sites","biplot"))) + 
  geom_point(data = fdat_amazing %>% dplyr::filter(Score == "sites"), mapping = aes(x=CCA1, y=CCA2)) + 
  geom_segment(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), aes(x = 0, xend = CCA1, y = 0, yend = CCA2), alpha=0.8, color = "red", arrow = arrow(angle = 3)) +
  geom_text_repel(aes(x=CCA1, y=CCA2, label= Label, colour = phylum), size=fdat_amazing$label_size) + 
  xlab(paste0(round(cca_w_varstab_asv$CCA$eig[1], 3)*100, "% CCA1")) +
  ylab(paste0(round(cca_w_varstab_asv$CCA$eig[2], 3)*100, "% CCA2")) +
  grids(linetype = "dashed") +
  geom_vline(xintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
  geom_hline(yintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
  theme(legend.position = "top", panel.background = element_rect(fill = "white", colour = "grey50"))


p.cca.species
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

0.6 DA - ANCOM-BC

Первоначальная версия - сильная фильтрация   Только филотипы распределенные более-менее везде + фильтрация внутри анкома на количество ридов

ps.inall <- phyloseq::filter_taxa(ps.ff, function(x) sum(x > 10) > (0.1*length(x)), TRUE)

amp.inall <- phyloseq_to_ampvis2(ps.inall)
amp_heatmap(amp.inall,
            tax_show = 40,
            group_by = "Site",
            tax_aggregate = "OTU",
            tax_add = "Genus",
            normalise=TRUE,
            showRemainingTaxa = TRUE)
## Warning: Transformation introduced infinite values in discrete y-axis

# ancom_da_glob <- ANCOMBC::ancombc(phyloseq = ps.inall,
#                          formula = "Site",
#                          p_adj_method = "fdr",
#                          lib_cut = 1000,
#                          group = "Site",
#                          struc_zero = TRUE,
#                          neg_lb = TRUE,
#                          tol = 1e-5,
#                          max_iter = 100,
#                          conserve = TRUE,
#                          alpha = 0.05,
#                          global = TRUE)

set.seed(123)
ancom <-  ANCOMBC::ancombc2(data = ps.inall, 
  tax_level = NULL,
  fix_formula = "Site",
  rand_formula = NULL,
  p_adj_method = "fdr",
  pseudo = 0, 
  pseudo_sens = TRUE,
  prv_cut = 0.10,
  lib_cut = 1000,
  s0_perc = 0.05,
  group = "Site",
  struc_zero = TRUE,
  neg_lb = TRUE,
  alpha = 0.05,
  n_cl = 15, 
  verbose = TRUE,
  global = TRUE, 
  pairwise = TRUE, 
  dunnet = TRUE, 
  trend = FALSE,
  iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
  em_control = list(tol = 1e-5, max_iter = 100),
  lme_control = lme4::lmerControl(),
  mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
  trend_control = list(contrast = list(matrix(c(1, 0, -1, 1),
    nrow = 2,
    byrow = TRUE),
    matrix(c(-1, 0, 1, -1),
    nrow = 2,
    byrow = TRUE)),
   node = list(2, 2),
   solver = "ECOS",
  B = 100)
  )

Более мягкая фильтрация.

set.seed(5882)
ancom_ff <-  ANCOMBC::ancombc2(data = ps.ff, 
  tax_level = NULL,
  fix_formula = "Site",
  rand_formula = NULL,
  p_adj_method = "fdr",
  pseudo = 0, 
  pseudo_sens = TRUE,
  prv_cut = 0.10,
  s0_perc = 0.05,
  group = "Site",
  struc_zero = TRUE,
  neg_lb = TRUE,
  alpha = 0.05,
  n_cl = 20, 
  verbose = TRUE,
  global = TRUE, 
  pairwise = TRUE, 
  dunnet = TRUE, 
  trend = FALSE,
  iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
  em_control = list(tol = 1e-5, max_iter = 100),
  lme_control = lme4::lmerControl(),
  mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
  trend_control = list(contrast = list(matrix(c(1, 0, -1, 1),
    nrow = 2,
    byrow = TRUE),
    matrix(c(-1, 0, 1, -1),
    nrow = 2,
    byrow = TRUE)),
   node = list(2, 2),
   solver = "ECOS",
  B = 100)
  )

# saveRDS(ancom_ff, "ancom_ff")

Нормализация библиотек. Процедура описана в

http://www.bioconductor.org/packages/release/bioc/vignettes/ANCOMBC/inst/doc/ANCOMBC.html

раздел “Bias-corrected abundances”

Статья с теорией(математика там вроде в сапплементе):

Lin, H., Peddada, S.D. Analysis of compositions of microbiomes with bias correction. Nat Commun 11, 3514 (2020). https://doi.org/10.1038/s41467-020-17041-7
ps.an - phyloseq объект со скорректированными прочтениями + log

samp_frac <-  ancom_ff$samp_frac

# Replace NA with 0
samp_frac[is.na(samp_frac)] <-  0 
# Add pesudo-count (1) to avoid taking the log of 0
log_obs_abn = log(ancom_ff$feature_table + 1)
# Adjust the log observed abundances
log_corr_abn = t(t(log_obs_abn) - samp_frac)
# Show the first 6 sample


ps.an <- prune_taxa(taxa_names(ps.ff) %in% rownames(log_corr_abn), ps.ff)
otu_table(ps.an) <- otu_table(t(log_corr_abn), taxa_are_rows = FALSE)

log_corr_abn %>%
  DT::datatable(caption = "Bias-corrected log observed abundances")

0.6.1 Vulcano

Сравнение - относительно Ашана.
Читать lfc_Sitehigh_road так:
log-fold change между sample Ашан и high_road

sum <- log_corr_abn %>% 
  as.data.frame() %>% 
  rowSums() 

ps.an.merged <- merge_samples(ps.an, "Site")
## Warning in asMethod(object): NAs introduced by coercion

## Warning in asMethod(object): NAs introduced by coercion
mean.d <- ps.an.merged@otu_table %>% 
  t() %>% 
  data.frame() %>% 
  mutate(
    lfc_Sitehigh_road = (Ashan + high_road)/2,
    lfc_SiteMiddle = (Ashan + Middle)/2,
    lfc_SiteMiddle_Sitehigh_road = (Middle + high_road)/2
  ) %>% 
rownames_to_column("taxon") %>%  
  select(c("taxon","lfc_Sitehigh_road", "lfc_SiteMiddle", "lfc_SiteMiddle_Sitehigh_road")) %>% 
  pivot_longer(!taxon, names_to = "pair", values_to = "abundance")

sum.d <- data.frame(taxon = names(sum), sum = sum, row.names = NULL)

lfc <- ancom_ff$res_pair %>% 
  select(c("taxon", "lfc_Sitehigh_road", "lfc_SiteMiddle", "lfc_SiteMiddle_Sitehigh_road"))

data_for_plot <- ancom_ff$res_pair %>% 
  select(c("taxon","lfc_Sitehigh_road", "lfc_SiteMiddle", "lfc_SiteMiddle_Sitehigh_road")) %>% 
  pivot_longer(c("lfc_Sitehigh_road", "lfc_SiteMiddle", "lfc_SiteMiddle_Sitehigh_road"),
               names_to = "pair",
               values_to = "lfc") %>% 
  left_join(mean.d, by=c("taxon", "pair"))

data_for_plot %>% 
  ggplot(aes(x=abundance, y=lfc, label=taxon, color=pair)) + 
  geom_point(alpha = 0.4) +
  # geom_smooth(aes(y=value, x=mean, fill=pair, color=pair),
              # method=lm, 
              # method.args = list(family = "quasipoisson"), 
              # se=TRUE) 
  coord_flip() +
  theme_bw() +
  scale_color_brewer(palette="Dark2") +
  facet_wrap(~pair) +
  theme(legend.position="bottom") 

0.6.2 treemap

размер плитки - не abundance(обязательно реализую, но не сейчас), а количество достоверно меняющихся филотипов
должно быть больше альфапротеобактерий(?) по литературе
или у нас влияние другого фактора, а не засоленности или проблема анализа/визуализации

taxa_family <- ps.ff@tax_table %>% 
  as.data.frame() %>% 
  # select(-c("Genus", "Species")) %>% 
  rownames_to_column("ID") %>% 
  # select(-ID) %>% 
  distinct()

ancom_ff$res %>% 
  filter(diff_Sitehigh_road == TRUE) %>% 
  select(c("taxon", "lfc_Sitehigh_road")) %>% 
  dplyr::rename("ID" = "taxon") %>% 
  left_join(taxa_family, by="ID") %>% 
  mutate(`more in` = ifelse(lfc_Sitehigh_road > 0, "Highroad", "Ashan") %>% as.factor()) %>% 
  group_by(Genus, `more in`, Phylum ) %>% 
  summarise(area = n()) %>% 
  mutate(Genus = str_replace_na(Genus, replacement = ' ')) %>% 
  # DT::datatable() %>% 
  ggplot(aes(area = area, 
             fill = `more in`, 
             subgroup = Phylum, 
             label = Genus)) +
  treemapify::geom_treemap() +
  treemapify::geom_treemap_subgroup_border(color = "white") +
  treemapify::geom_treemap_subgroup_text(place = "centre", 
                                         grow = T, 
                                         alpha = 0.5, 
                                         colour = "black", 
                                         fontface = "italic", 
                                         min.size = 0) +
  treemapify::geom_treemap_text(colour = "white", 
                                place = "topleft", 
                                grow = T, 
                                reflow = T, 
                                layout = 'squarified') +
  scale_fill_brewer(palette="Dark2") +
  theme(legend.position="bottom") 
## `summarise()` has grouped output by 'Genus', 'more in'. You can override using
## the `.groups` argument.

0.7 DA analysis from Andronov

сравнены только Ашан и high_road

# т.к. phyloseq не даёт брать средние от ридов пришлось использовать кастомную функцию(украдено)
ps.mean <- merge_samples_mean(ps.ff, "Site")
## Warning in asMethod(object): NAs introduced by coercion

## Warning in asMethod(object): NAs introduced by coercion
fn <- function(x, y) (x/y)

otus_filtered <- ps.mean@otu_table@.Data %>% 
    data.frame() %>% 
    select(-Middle) %>% 
    filter_all(all_vars(. > 0))

dist_a <- otus_filtered %>% 
  select(c("Ashan")) %>% 
  proxy::dist(method = fn)
## Registered S3 methods overwritten by 'proxy':
##   method               from    
##   print.registry_field registry
##   print.registry_entry registry
dist_b <- otus_filtered %>% 
  select(c("high_road")) %>% 
  proxy::dist(method = fn)

res <- as.matrix(log(dist_a / dist_b)) %>% 
  as.data.frame() %>%
  select(where(is.numeric)) %>%
  summarise(across(everything(), mean)) %>% 
  pivot_longer(cols = everything()) %>% 
  arrange(value)

ggplot(res, aes(x=value)) + 
  geom_density() +
  labs(title = "денсити-плот значений статистики метода ЕЕ") +
  theme_bw()

0.7.1 volcano

похоже у метода будут сильные проблемы с FDR(?)
статистики метода ЕЕ ~ lfc

res_mod <- res %>% 
  dplyr::rename("taxon" = "name", "lfc" = "value") 

ps.merged.repeat <- merge_samples(ps.ff, "Site")
## Warning in asMethod(object): NAs introduced by coercion

## Warning in asMethod(object): NAs introduced by coercion
ps.merged.repeat@otu_table %>% 
  t() %>% 
  data.frame() %>% 
  select(-Middle) %>% 
  rownames_to_column("taxon") %>% 
  left_join(res_mod, by=c("taxon")) %>% 
  mutate(`more in` = ifelse(lfc > 0, "Ashan", "high_road") %>% as.factor()) %>% 
  pivot_longer(!c(taxon, `more in`, lfc), names_to = "Site", values_to = "abundance") %>% 
  mutate(abundance = log2(abundance)) %>% 
  mutate(Site = as.factor(Site)) %>% 
  ggplot(aes(y=lfc, x=abundance)) + 
  geom_point(alpha = 0.5) +
  # geom_smooth(aes(y=value, x=mean, fill=pair, color=pair),
              # method=lm, 
              # method.args = list(family = "quasipoisson"), 
              # se=TRUE) +
  coord_flip() +
  theme_bw()
## Warning: Removed 10454 rows containing missing values (`geom_point()`).

0.7.2 treemap

то же по данным ЕЕ
в качестве порога(вместо p-value) я взял слева и справа по 2.5 перцентилю
FDR тоже нет, я его не вводил, но p-value у нас нет

taxa_family <- ps.ff@tax_table %>% 
  as.data.frame() %>% 
  # select(-c("Genus", "Species")) %>% 
  rownames_to_column("ID") %>% 
  # select(-ID) %>% 
  distinct()

# ps.merged.mean@otu_table@.Data %>% 
  # DT::datatable()

# res %>%
#   DT::datatable()

quantile(res$value, c(0.025, .975))
##      2.5%     97.5% 
## -2.446014  2.818329
percentiles <- quantile(res$value, c(0.025, .975))

res %>% 
  filter( value > percentiles[[2]] | value < percentiles[[1]] ) %>% 
  dplyr::rename("ID" = "name") %>% 
  left_join(taxa_family, by="ID") %>% 
  mutate(`more in` = ifelse(value > 0, "Highroad", "Ashan") %>% as.factor()) %>% 
   group_by(Genus, `more in`, Phylum ) %>% 
  summarise(area = n()) %>% 
  mutate(Genus = str_replace_na(Genus, replacement = ' ')) %>% 
  ggplot(aes(area = area, 
             fill = `more in`, 
             subgroup = Phylum, 
             label = Genus)) +
  treemapify::geom_treemap() +
  treemapify::geom_treemap_subgroup_border(color = "white") +
  treemapify::geom_treemap_subgroup_text(place = "centre", 
                                         grow = T, 
                                         alpha = 0.5, 
                                         colour = "black", 
                                         fontface = "italic", 
                                         min.size = 0) +
  treemapify::geom_treemap_text(colour = "white", 
                                place = "topleft", 
                                grow = T, 
                                reflow = T, 
                                layout = 'squarified') +
  scale_fill_brewer(palette="Dark2") +
  theme(legend.position="bottom")
## `summarise()` has grouped output by 'Genus', 'more in'. You can override using
## the `.groups` argument.

может быть дело в том, что нет фильтрации данных?
добавлена стандартная фильтрация для композиционных анализов, выкинуто то, что встречается только в отдельных образцах(было 10696 - стало 888)
*это ps.filt

ps.filt  <- phyloseq::filter_taxa(ps.ff, function(x) sum(x > 10) > (0.1*length(x)), TRUE)

# т.к. phyloseq не даёт брать средние от ридов пришлось использовать кастомную функцию(украдено)
ps.mean <- merge_samples_mean(ps.filt, "Site")
## Warning in asMethod(object): NAs introduced by coercion

## Warning in asMethod(object): NAs introduced by coercion
fn <- function(x, y) (x/y)

otus_filtered <- ps.mean@otu_table@.Data %>% 
    data.frame() %>% 
    select(-Middle) %>% 
    filter_all(all_vars(. > 0))

dist_a <- otus_filtered %>% 
  select(c("Ashan")) %>% 
  proxy::dist(method = fn)
  
dist_b <- otus_filtered %>% 
  select(c("high_road")) %>% 
  proxy::dist(method = fn)

res <- as.matrix(log(dist_a / dist_b)) %>% 
  as.data.frame() %>%
  select(where(is.numeric)) %>%
  summarise(across(everything(), mean)) %>% 
  pivot_longer(cols = everything()) %>% 
  arrange(value)

ggplot(res, aes(x=value)) + 
  geom_density() +
  labs(title = "денсити-плот значений статистики метода ЕЕ") +
  theme_bw()

возможно стоит добавить стабилизацию дисперсии? Или я неправильно работаю с алгоритмом.

res_mod <- res %>% 
  dplyr::rename("taxon" = "name",
                "lfc" = "value") 

ps.merged.repeat <- merge_samples(ps.filt, "Site")
## Warning in asMethod(object): NAs introduced by coercion

## Warning in asMethod(object): NAs introduced by coercion
ps.merged.repeat@otu_table %>% 
  t() %>% 
  data.frame() %>% 
  select(-Middle) %>% 
  rownames_to_column("taxon") %>% 
  left_join(res_mod, by=c("taxon")) %>% 
  mutate(`more in` = ifelse(lfc > 0, "Ashan", "high_road") %>% as.factor()) %>% 
  pivot_longer(!c(taxon, `more in`, lfc), names_to = "Site", values_to = "abundance") %>% 
  mutate(abundance = log2(abundance)) %>% 
  mutate(Site = as.factor(Site)) %>% 
  ggplot(aes(y=lfc, x=abundance)) + 
  geom_point(alpha = 0.5) +
  # geom_smooth(aes(y=value, x=mean, fill=pair, color=pair),
              # method=lm, 
              # method.args = list(family = "quasipoisson"), 
              # se=TRUE) +
  coord_flip() +
  theme_bw()
## Warning: Removed 286 rows containing missing values (`geom_point()`).

0.8 Andronov concept analysis pipeline

0.8.1 1 square

среднее относительная представленность в сайтах по тому растет или нет
пояснительная картинка внизу
Довольно сильно зафильтровано, т.к. иначе сильно перекошено т.к. средние не работают на таких данных

ps.rel  <-  phyloseq::transform_sample_counts(ps.filt, function(x) x / sum(x) * 100)
# ps.merged.mean <- merge_samples_mean(ps.filt, "Site")
ps.merged.mean <- merge_samples_mean(ps.rel, "Site")
## Warning in asMethod(object): NAs introduced by coercion

## Warning in asMethod(object): NAs introduced by coercion
ps.merged.mean@otu_table %>% 
  data.frame() %>% 
  select(-Middle) %>% 
  mutate(`more in` = ifelse(Ashan > high_road, "Ashan", "high_road") %>% as.factor()) %>% 
  pivot_longer(!`more in`, names_to = "Site", values_to = "mean") %>% 
  group_by(`more in`, Site) %>% 
  summarise(mean_read_count = round(mean(mean), 3))
## `summarise()` has grouped output by 'more in'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups:   more in [2]
##   `more in` Site      mean_read_count
##   <fct>     <chr>               <dbl>
## 1 Ashan     Ashan               0.213
## 2 Ashan     high_road           0.027
## 3 high_road Ashan               0.032
## 4 high_road high_road           0.181
ps.merged.mean@otu_table %>% 
  data.frame() %>% 
  select(-Middle) %>% 
  mutate(`more in` = ifelse(Ashan > high_road, "Ashan", "high_road") %>% as.factor()) %>% 
  pivot_longer(!`more in`, names_to = "Site", values_to = "mean") %>% 
  mutate(Label = paste0("increased_in_", `more in`, " -- ", "ASVs_from_", Site) %>% as.factor()) %>% 
  ggplot(aes(x=mean)) + 
  geom_density() +
  theme_bw() +
  facet_wrap(~Label)

То же, но с данными ancom-bc, изменяющиеся филотипы разделены на достоверно/недостоверно меняющиеся.

otu_corr <- log_corr_abn %>% 
  t() %>% 
  data.frame() %>% 
  mutate(Site = ps.ff@sam_data$Site) %>% 
  group_by(Site) %>% 
  summarise(across(everything(), mean), .groups = 'drop') %>% 
  pivot_longer(!Site, names_to = "taxon", values_to = "abundance")  
  # rename(site = Site)

otu_corr %>% 
  left_join(ancom_ff$res_pair , by=c("taxon")) %>% 
  filter(Site %in% c("Ashan", "high_road")) %>% 
  # filter(taxon %in% c("Seq1", "Seq2")) %>% 
  select(c("Site", "taxon", "lfc_Sitehigh_road", "diff_Sitehigh_road", "abundance")) %>% 
  mutate(increasing = ifelse(lfc_Sitehigh_road > 0, TRUE, FALSE)) %>% 
   ggplot(aes(x=increasing, y=abundance)) +
    # geom_text(aes(label = taxon)) +
    geom_boxplot(outlier.size = 0, outlier.alpha = 0) +
    ggbeeswarm::geom_quasirandom(aes(color=diff_Sitehigh_road),size=0.75, alpha=0.7,dodge.width = 0.75) +
    theme_light() + 
    scale_color_brewer(palette="Dark2") +
    labs(title = "Mean", x = "lfc positive", y = "corrected log abundance", color = "Достоверность\n изменения филотипа") +
    theme(axis.text.x = element_text(angle = 90)) +
  # ggpubr::stat_pvalue_manual(
  # stat_test,  label = "{p}", 
  # tip.length = 0.005, hide.ns = TRUE
  # ) +
  facet_wrap(~Site)

0.8.2 2 log rel

То же, но в виде регрессии

ps.merged.mean@otu_table %>% 
  data.frame() %>% 
  select(-Middle) %>% 
  mutate(lfc = log2(Ashan/high_road)) %>% 
  mutate(`more in` = ifelse(Ashan > high_road, "Ashan", "high_road") %>% as.factor()) %>% 
  pivot_longer(!c(`more in`, lfc) , names_to = "Site", values_to = "merged_mean") %>% 
  mutate(merged_mean = log2(merged_mean)) %>% 
  mutate(Site = as.factor(Site)) %>% 
  ggplot(aes(x=merged_mean, 
             y=lfc, 
             label=Site, 
             color=Site)) + 
  geom_point() +
  geom_smooth(aes(y=lfc, 
                  x=merged_mean, 
                  fill=Site, 
                  color=Site),
              method=lm, 
              se=TRUE) +
  theme_bw() +
  facet_wrap(~Site)
## `geom_smooth()` using formula = 'y ~ x'

то же, но с данными ancomBC(менее строгая фильтрацией)
lm общая, разделение по группам не влияет

dt <- otu_corr %>% 
  left_join(lfc, by=c("taxon")) %>% 
  filter(Site %in% c("Ashan", "high_road")) 

l <- lm(lfc_Sitehigh_road ~ abundance*Site, data=dt) 
sl <- summary(l)
p <- sl$coefficients["abundance",4]

lfc <- ancom_ff$res_pair %>% 
  select(c("taxon", "lfc_Sitehigh_road", "lfc_SiteMiddle", "lfc_SiteMiddle_Sitehigh_road"))

otu_corr %>% 
  left_join(lfc, by=c("taxon")) %>% 
  filter(Site %in% c("Ashan", "high_road")) %>% 
  select(-c("lfc_SiteMiddle", "lfc_SiteMiddle_Sitehigh_road")) %>% 
  ggplot(aes(x=abundance, 
             y=lfc_Sitehigh_road, 
             label=Site, 
             color=Site)) + 
  geom_point()  +
  geom_smooth(aes(y=lfc_Sitehigh_road, x=abundance, fill=Site, color=Site),
              method=lm, 
              se=TRUE) +
  theme_bw() +
    labs(
  title = "Ашан -- Шоссе",
  subtitle = paste0( "lm:\n",
        " R2adjasted -- " , round(sl$adj.r.squared, 3),
        "\n abundance p-value ", format.pval(p)
      )
) +
  facet_wrap(~Site)
## `geom_smooth()` using formula = 'y ~ x'

0.9 Packages info

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.6.0    ggforce_0.4.1   ggrepel_0.9.3   ampvis2_2.7.28 
##  [5] phyloseq_1.42.0 lubridate_1.9.2 forcats_1.0.0   stringr_1.5.0  
##  [9] dplyr_1.1.0     purrr_1.0.1     readr_2.1.4     tidyr_1.3.0    
## [13] tibble_3.1.8    ggplot2_3.4.1   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##   [1] ggbeeswarm_0.7.1       colorspace_2.1-0       ggsignif_0.6.4        
##   [4] ellipsis_0.3.2         XVector_0.38.0         proxy_0.4-27          
##   [7] rstudioapi_0.14        farver_2.1.1           ggfittext_0.9.1       
##  [10] DT_0.27                bit64_4.0.5            fansi_1.0.4           
##  [13] codetools_0.2-19       splines_4.2.0          cachem_1.0.7          
##  [16] knitr_1.42             polyclip_1.10-4        ade4_1.7-22           
##  [19] jsonlite_1.8.4         broom_1.0.3            cluster_2.1.4         
##  [22] compiler_4.2.0         httr_1.4.5             backports_1.4.1       
##  [25] assertthat_0.2.1       Matrix_1.5-3           fastmap_1.1.1         
##  [28] lazyeval_0.2.2         treemapify_2.5.5       cli_3.6.0             
##  [31] tweenr_2.0.2           htmltools_0.5.4        tools_4.2.0           
##  [34] igraph_1.4.1           gtable_0.3.1           glue_1.6.2            
##  [37] GenomeInfoDbData_1.2.9 reshape2_1.4.4         Rcpp_1.0.10           
##  [40] carData_3.0-5          Biobase_2.58.0         jquerylib_0.1.4       
##  [43] vctrs_0.5.2            Biostrings_2.66.0      rhdf5filters_1.10.0   
##  [46] multtest_2.54.0        ape_5.7                nlme_3.1-162          
##  [49] crosstalk_1.2.0        iterators_1.0.14       xfun_0.37             
##  [52] timechange_0.2.0       lifecycle_1.0.3        dendextend_1.16.0     
##  [55] rstatix_0.7.2          ca_0.71.1              zlibbioc_1.44.0       
##  [58] MASS_7.3-58.2          scales_1.2.1           TSP_1.2-2             
##  [61] vroom_1.6.1            hms_1.1.2              parallel_4.2.0        
##  [64] biomformat_1.26.0      rhdf5_2.42.0           RColorBrewer_1.1-3    
##  [67] yaml_2.3.7             gridExtra_2.3          heatmaply_1.4.2       
##  [70] sass_0.4.5             stringi_1.7.12         highr_0.10            
##  [73] S4Vectors_0.36.2       foreach_1.5.2          permute_0.9-7         
##  [76] seriation_1.4.1        BiocGenerics_0.44.0    GenomeInfoDb_1.34.9   
##  [79] rlang_1.0.6            pkgconfig_2.0.3        systemfonts_1.0.4     
##  [82] bitops_1.0-7           evaluate_0.20          lattice_0.20-45       
##  [85] Rhdf5lib_1.20.0        htmlwidgets_1.6.1      labeling_0.4.2        
##  [88] cowplot_1.1.1          bit_4.0.5              tidyselect_1.2.0      
##  [91] plyr_1.8.8             magrittr_2.0.3         R6_2.5.1              
##  [94] IRanges_2.32.0         generics_0.1.3         pillar_1.8.1          
##  [97] withr_2.5.0            mgcv_1.8-42            survival_3.5-3        
## [100] abind_1.4-5            RCurl_1.98-1.10        crayon_1.5.2          
## [103] car_3.1-1              utf8_1.2.3             plotly_4.10.1         
## [106] tzdb_0.3.0             rmarkdown_2.20         viridis_0.6.2         
## [109] grid_4.2.0             data.table_1.14.8      vegan_2.6-4           
## [112] webshot_0.5.4          digest_0.6.31          stats4_4.2.0          
## [115] munsell_0.5.0          beeswarm_0.4.0         registry_0.5-1        
## [118] viridisLite_0.4.1      vipor_0.4.5            bslib_0.4.2           
## [121] egg_0.4.5