AncomBC straw example

Импорт phyloseq объекта и библиотек Датасет - хроносерия разложения соломы - от фактор Day 10ти уровней

library(phyloseq)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(ggpubr)
library(ampvis2)
library(ANCOMBC)
library(heatmaply)

## Loading required package: plotly

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

## Loading required package: viridis

## Loading required package: viridisLite

## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan

## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust vegan

## 
## ======================
## Welcome to heatmaply version 1.3.0
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags: 
##   https://stackoverflow.com/questions/tagged/heatmaply
## ======================

ps.f <- readRDS("ps.f")

Удаляем сэмпл который портит ординацию(‘straw-16s-D07-1’)

plot_rich_reads_samlenames_lm <- function(physeq){
  rish <- estimate_richness(physeq, measures = "Observed")
  reads.sum <- as.data.frame(sample_sums(physeq))
  reads.summary <- cbind(rish, reads.sum)
  colnames(reads.summary) <- c("otus","reads")
  reads.summary["Repeat"] <-unlist(purrr::map(stringr::str_split(rownames(physeq@sam_data), "straw-16s-", 2), function(x) x[[2]]))
  reads.summary["Site"] <- physeq@sam_data$Day
  library(ggrepel)
  require(ggforce)
  p1 <- ggplot(data=reads.summary) + 
    geom_point(aes(y=otus, x=log2(reads), color=Site),size=3) + 
    geom_text_repel(aes(y=otus, x=log2(reads), label=paste0(Repeat))) + 
    theme_bw() +
    geom_smooth(aes(y=otus, x=log2(reads), fill=Site, color=Site),method=lm, se=FALSE, ymin = 1) + 
    scale_x_continuous(sec.axis = sec_axis(sec.axis ~ 2**.)) 
  # geom_mark_ellipse(aes(y = otus, x=reads, group = Repeats, label = Repeats, color = Repeats), label.fontsize = 10, label.buffer = unit(2, "mm"), label.minwidth = unit(5, "mm"),con.cap = unit(0.1, "mm"))
  return(p1)
}

p.rich1 <- plot_rich_reads_samlenames_lm(ps.f)
ps.f2 <- subset_samples(ps.f, sample_names(ps.f) != 'straw-16s-D07-1')

p.rich2 <- plot_rich_reads_samlenames_lm(ps.f2)

ggarrange(p.rich1, p.rich2)

## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Просто ординация, которая показывает что в общем датасет интересный и удобный для дальнейшей обработки, т.к. фактор влияющий на датасет один(время, а других факторов у нас и нет)

beta_custom_norm_NMDS_elli_w <- function(ps, seed = 7888, normtype="vst", Color="What", Group="Repeat"){
  require(phyloseq)
  require(ggplot2)
  require(ggpubr)
  require(DESeq2)
  library(ggforce)

  ordination.b <- ordinate(ps, "NMDS", "bray")
  mds <- as.data.frame(ordination.b$points)
  p  <-  plot_ordination(ps,
                         ordination.b,
                         type="sample",
                         color = Color,
                         # title="NMDS - Bray-Curtis",
                         title=NULL,
                         axes = c(1,2) ) + 
    theme_bw() + 
    theme(text = element_text(size = 10)) + 
    geom_point(size = 3) +
    annotate("text",
             x=min(mds$MDS1) + abs(min(mds$MDS1))/10,
             y=max(mds$MDS2),
             label=paste0("Stress -- ", round(ordination.b$stress, 3))) +
    geom_mark_ellipse(aes_string(group = Group, label = Group),
                      label.fontsize = 10,
                      label.buffer = unit(2, "mm"),
                      label.minwidth = unit(5, "mm"),
                      con.cap = unit(0.1, "mm"),
                      con.colour='gray') +
    theme(legend.position = "none") 

  return(p)
}

p.beta.w <- beta_custom_norm_NMDS_elli_w(ps.f2, Group="Day", Color="Day")
p.beta.w

Просто хитмепина и датафрейм на её основе с 60-ти топовыми по относительной представленностями филотипами

phyloseq_to_amp <- function(ps){
    require(ampvis2)
    require(tibble)
    require(phyloseq)
    colnames(ps@tax_table@.Data) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
    OTU1 = as(otu_table(ps), "matrix")
    OTU1 <- t(OTU1)
    OTUdf = as.data.frame(OTU1)
    taxa.ps <- as(tax_table(ps), "matrix")
    taxa.df = as.data.frame(taxa.ps)
    my_otu_table <- merge(OTUdf, taxa.df, by=0)
    my_otu_table <- column_to_rownames(my_otu_table, var="Row.names")

    my_metadata <- as_tibble(sample_data(ps), rownames=NA)
    my_metadata <- rownames_to_column(my_metadata,var = "SampleID")
    my_tree <- phy_tree(ps)
    amp.ps <- amp_load(otutable = my_otu_table, metadata = my_metadata, tree = my_tree)
    return(amp.ps)
}

amp <- phyloseq_to_amp(ps.f2)

## Warning: Could not find a column named OTU/ASV in otutable, using rownames as
## OTU ID's

ad <- amp_heatmap(amp, tax_show = 60, group_by = "Day", tax_aggregate = "OTU", textmap=TRUE)
ad_g <- amp_heatmap(amp, tax_show = 60, group_by = "Day", tax_aggregate = "OTU", tax_add = "Genus", textmap=TRUE)

r.ad_g <- rownames(ad_g)
r.ad <- rownames(ad)

amp_heatmap(amp, tax_show = 40, group_by = "Day", tax_aggregate = "OTU",tax_add = "Genus")

## Warning: Transformation introduced infinite values in discrete y-axis

Фигачим amcomBC

Ссылки: статья на AncomBC - https://www.nature.com/articles/s41467-020-17041-7.pdf cтатья на Ancom II где объясняется про ебучие нули - https://www.frontiersin.org/articles/10.3389/fmicb.2017.02114/full видео где объясняется как работает Ancom - https://youtu.be/A6o2nOnDsJU мануал на AncomBC от автора - https://bioconductor.org/packages/devel/bioc/vignettes/ANCOMBC/inst/doc/ANCOMBC.html

А, ну и естественно повторностей должно быть как минимум 5

ancom_da_glob <- ancombc(phyloseq = ps.f2,
                         formula = "Day",
                         p_adj_method = "fdr",
                         zero_cut = 0.90,
                         lib_cut = 1000,
                         group = "Day",
                         struc_zero = TRUE,
                         neg_lb = TRUE,
                         tol = 1e-5,
                         max_iter = 100,
                         conserve = TRUE,
                         alpha = 0.05,
                         global = TRUE)

## Warning: Small sample size detected for the following group(s): 
## D01 D03 D07 D08 D12 D13 D15
## ANCOM-BC results would be unstable when the sample size is < 5 per group

Кто меняется(по умолчанию относительно Day01) во всех днях:

*если хочется посмотреть относительно другого фактора то решение вот сдесь https://github.com/FrederickHuangLin/ANCOMBC/issues/64

ancom_da_glob$res_global %>% 
  dplyr::filter(diff_abn == TRUE) %>% 
  head(10)

##               W         p_val         q_val diff_abn
## Seq342 141.3182  1.102417e-25  1.653625e-25     TRUE
## Seq196 426.6669  0.000000e+00  0.000000e+00     TRUE
## Seq276 141.4114  0.000000e+00  0.000000e+00     TRUE
## Seq419 648.6746 1.481384e-133 6.422217e-133     TRUE
## Seq98  265.5449  0.000000e+00  0.000000e+00     TRUE
## Seq117 448.1055  0.000000e+00  0.000000e+00     TRUE
## Seq79  558.4962  0.000000e+00  0.000000e+00     TRUE
## Seq122 537.5903  0.000000e+00  0.000000e+00     TRUE
## Seq179 523.1776 1.248275e-106 3.516132e-106     TRUE
## Seq145 484.1610  0.000000e+00  0.000000e+00     TRUE

Ординация с нормализацией по AncomBC. Неясно как к этому относиться, не вижу смысла в рутинном использовании.

beta_custom_norm_NMDS_elli_customord <- function(ps, seed = 7888, Color="What", Group="Repeat", customord){
  require(phyloseq)
  require(ggplot2)
  require(ggpubr)
  require(DESeq2)
  library(ggforce)
  library(vegan)
  
  ancom.bc <- customord$samp_frac
  y.ancom.bc <- t(t(log(customord$feature_table + 1)) - ancom.bc)
  y.ancom.bc <- y.ancom.bc+abs(min(y.ancom.bc, na.rm = T))
  dist.ancom.bc <- vegdist(t(y.ancom.bc), method="bray", na.rm = T)
  
  ordination.b <- metaMDS(dist.ancom.bc,
          distance = "bray",
          k = 3,
          maxit = 999, 
          trymax = 500,
          wascores = TRUE)
  
  mds <- as.data.frame(ordination.b$points)
  
  p  <-  plot_ordination(ps,
                         ordination.b,
                         type="sample",
                         color = Color,
                         # title="NMDS - Bray-Curtis",
                         title=NULL,
                         axes = c(1,2) ) + 
    theme_bw() + 
    theme(text = element_text(size = 10)) + 
    geom_point(size = 3) +
    annotate("text",
             x=min(mds$MDS1) + abs(min(mds$MDS1))/8,
             y=max(mds$MDS2),
             label=paste0("Stress -- ", round(ordination.b$stress, 3))) +
    geom_mark_ellipse(aes_string(group = Group, label = Group),
                      label.fontsize = 10,
                      label.buffer = unit(2, "mm"),
                      label.minwidth = unit(5, "mm"),
                      con.cap = unit(0.1, "mm"),
                      con.colour='gray') +
    theme(legend.position = "none") 

  return(p)
}

p.beta.div <- beta_custom_norm_NMDS_elli_customord(ps.f2, Group="Day", Color="Day", customord=ancom_da_glob)
p.beta.div

heatmap по тому, как меняется относительноая представленность таксонов относительно первого дня по ancomBC, взяты топ60 таксонов(см выше) Все что можно группируется по WGCNA

dat <- ancom_da_glob$res$beta[r.ad, ]
rownames(dat) <- r.ad_g
heatmaply(dat, hclust_method="mcquitty", width = 2000, height = 1000 )

То же, но +/- - достоверно/недостоверно отличаются

dat.p <- ancom_da_glob$res$diff_abn[r.ad, ]
dat.p[dat.p == TRUE] <- 1
rownames(dat.p) <- r.ad_g
heatmaply(dat.p, hclust_method="mcquitty", hide_colorbar = TRUE)

То же, но p-value c поправкой в том случае если вас уже тошнит от интерактивных хитмапов или если вы работаете с картофелины без графики.

col <- rev(cm.colors(256))
heatmap(as.matrix(ancom_da_glob$res$q_val[r.ad, ]), Colv = NA, col = col)

AncomBC straw example

GrGladkov

asZt7ph/56/1985