import os

%load_ext rpy2.ipython 
# import vamb
# %config IPCompleter.greedy=True
%config Completer.use_jedi = True
# %load_ext rpy2.ipython


import pandas as pd
import numpy as np


%ls ../

Ant_1/         align_log.tsv  my_interproscan/  see_some_results/  trees/
Bact_comm_4x/  ant_1/         ncbi/             server1_arc/       waypipe/
CheckM/        ant_1_tree/    notebooks/        somebases/
al_R/          metagenome/    plates/           temp/


%cd ~/storage/trees/nal/
!ls -htl | head

/mnt/storage/trees/nal
total 130M
-rw-rw-r-- 1 gladkov2 gladkov2 499K Apr 26 09:17 iqtee_chi_al_d_2.tsv
-rw-r--r-- 1 gladkov2 gladkov2 1.2M Apr 23 14:39 taxa.txt
-rw-rw-r-- 1 gladkov2 gladkov2 3.3M Apr 23 14:39 otu.txt
-rw-rw-r-- 1 gladkov2 gladkov2 3.8M Apr 23 14:39 rep.fasta
-rw-rw-r-- 1 gladkov2 gladkov2 3.8M Apr 23 14:24 ref.fasta
-rw-r--r-- 1 gladkov2 gladkov2 1.6M Apr 23 13:00 ps_f.rds
-rw-rw-r-- 1 gladkov2 gladkov2 658K Apr 19 14:41 al_2_drna.log
-rw-rw-r-- 1 gladkov2 gladkov2 658K Apr 19 14:41 al_2_1.log
-rw-rw-r-- 1 gladkov2 gladkov2  14M Apr 19 14:41 al_2_drna.fasta


%%R

a <- list.files()


!mafft --auto --thread 50 --inputorder "rep_seq.fasta" > "al_1.fasta"

nthread = 50
nthreadpair = 50
nthreadtb = 16
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00


Making a distance matrix ..
 16101 / 16159 (thread   45)
done.

Constructing a UPGMA tree (efffree=0) ... 
 16150 / 16159
done.

Progressive alignment 1/2... 
STEP  14601 / 16158 (thread   15) h
Reallocating..done. *alloclen = 1662
STEP  16001 / 16158 (thread   15) h
Reallocating..done. *alloclen = 2702
STEP  16101 / 16158 (thread    1)
Reallocating..done. *alloclen = 3724

done.

Making a distance matrix from msa.. 
 16100 / 16159 (thread   15)
done.

Constructing a UPGMA tree (efffree=1) ... 
 16150 / 16159
done.

Progressive alignment 2/2... 
STEP  15101 / 16158 (thread    1) h
Reallocating..done. *alloclen = 1641
STEP  16101 / 16158 (thread    8) h
Reallocating..done. *alloclen = 2677

Reallocating..done. *alloclen = 3688

done.

disttbfast (nuc) Version 7.475
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
16 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous versions.
To disable this change, add the --leavegappyregion option.


!ls -htl

total 61M
-rw-rw-r-- 1 gladkov2 gladkov2  20M Apr  2 14:06 al_d_2.fasta
-rw-rw-r-- 1 gladkov2 gladkov2  37M Apr  2 11:57 al_1.fasta
-rw-r--r-- 1 gladkov2 gladkov2 796K Nov 20 12:01 ps.rds
-rw-r--r-- 1 gladkov2 gladkov2 4.1M Nov 19 15:42 rep_seq.fasta
-rw-r--r-- 1 gladkov2 gladkov2 5.5K Nov 19 15:42 track.tsv


!head -n 50 al_1.fasta

>Seq1
------------------------------------------acgtaggtg---------
---------------------gc-------------aagcg-------------------
------------------------------------------------------------
-------------------ttgtc--cg------------------------------ga
at-------------------------------------tat----t-----gg-g-cg-
t-------------aa-ag-----------------------------------------
--------cg-cgc--gc-a-------------------------gg-cgg-tc-c-t-t
---------------------------t-------------------------------a
----------a---------------------------------------------gtc-
--------------t-g-a-t-------------------------------------gt
-------------------------------------g----------------------
---------------------------------------------aa-------------
-----------a-----------------------g--c-cc-------a-c-g------
-gc-tca------------------------acc-----------gtg------------
--gaggg---------tca-t---------------------------------------
-----------------------------t-g----g--------aaa------------
---------------------------------------------------------ct-
g-g------------g--gg-a--c------------t----t--ga-gtgc-a-gg---
-------a-ga-g---------------------------a-a-----------------
---------------------ga----------------------gtg-gaa--------
------ttc---ca------------------------------cgtg----------t-
a---------------------gcg----------gt------------gaaat------
-------gc-g-t-----------------------------------------------
-ag---------aga-------t-g-tg-----------------------g-agga---
----------------------a-cac------------------------c--------
-----a---------------g-t----------------------ggc-gaag-gcg--
--------------actc-t-tt---------------------------g---------
----gc-c----------------------------------------------------
----------------------------------------------------------t-
------------------------g-----------------------------------
------------------------------t-----------------------------
--------------------------------------a---------------------
---------------------------------------------a--------------
----------------c-----------------------t-g----------acgct--
------g-a-g-g-c-----------g-c-------------------------------
-------------------g-----------aaag-------------------------
---cgt---------------------------------------gggg-a---------
---------------------------------------gcaaacagg------------
-------------------------------------------
>Seq2
------------------------------------------acgtaggtg---------
---------------------gc-------------aagcg-------------------
------------------------------------------------------------
-------------------ttgtc--cg------------------------------ga
at-------------------------------------tat----t-----gg-g-cg-
t-------------aa-ag-----------------------------------------
--------cg-cgc--gc-a-------------------------gg-cgg-tc-c-t-t
---------------------------t-------------------------------a
----------a---------------------------------------------gtc-


%%R

library(DECIPHER)
dna <- readDNAStringSet("rep_seq.fasta")
head(dna)
alignment <- AlignSeqs(DNAStringSet(dna), anchor=NA,verbose=FALSE, processors = 50)
writeXStringSet(alignment, file="al_d_2.fasta")


!head -n 50 al_d_2.fasta

>Seq1
----------------ACGTAG----------GTG------GCAA------GCGTT-------GTCCGGAATT----A--
--TT---G-----G-----G----CG----------TAA---AGCG------CG----CG-----CA-------GGC---
---GG------TC----C------------T--T------------------------------------------T---
--AAG------------------------------------------------TC--T-----G-----A----T-----
---------------G------TGA--------------AAG----C----CCAC------GG--------C-----TCA
ACC--G--T-----G-G---------A----G---------GGT-----CA---T-------T-G---GAA--------A
CT-----G--G-G--G------G--A---C--T--------T----G---A--------G----TGC---A-----GGA-
----GAGAA------GA----GTG------G------------AA-----TT----CCA----------------C----
-GTG----T-----A---G-------CGG----T------GAAAT-------GCG---T-------AG------AGAT--
--GT-----------GGAGGA------ACA------CCA---G---T-------G------GC-----GAA-------GG
CG----AC-----TCT-----TTG------G---C--CT-----------------------------------------
---------------------------G-------------------------------------------TA-------
-------------------------------------------------------A----------CT------------
---------G-----AC--------G-----CTGA-----G-----G---------C--G---C------GAAAGCG---
-T---------------------------------------G------GGG-----AGC----A------AACAGG----
-------------------
>Seq2
----------------ACGTAG----------GTG------GCAA------GCGTT-------GTCCGGAATT----A--
--TT---G-----G-----G----CG----------TAA---AGCG------CG----CG-----CA-------GGC---
---GG------TC----C------------T--T------------------------------------------T---
--AAG------------------------------------------------TC--T-----G-----A----T-----
---------------G------TGA--------------AAG----C----CCAC------GG--------C-----TCA
ACC--G--T-----G-G---------A----G---------GGT-----CA---T-------T-G---GAA--------A
CT-----G--G-G--G------G--A---C--T--------T----G---A--------G----TAC---A-----GAA-
----GAGAA------GA----GTG------G------------AA-----TT----CCA----------------C----
-GTG----T-----A---G-------CGG----T------GAAAT-------GCG---T-------AG------AGAT--
--GT-----------GGAGGA------ACA------CCA---G---T-------G------GC-----GAA-------GG
CG----AC-----TCT-----TTG------G---T--CT-----------------------------------------
---------------------------G-------------------------------------------TA-------
-------------------------------------------------------A----------CT------------
---------G-----AC--------G-----CTGA-----G-----G---------C--G---C------GAAAGCG---
-T---------------------------------------G------GGG-----AGC----A------AACAGG----
-------------------
>Seq3
----------------ACAGAG----------GAT------GCAA------GCGTT-------ATCCGGAATG----A--
--TT---G-----G-----G----CG----------TAA---AGCG------TC----TG-----TA-------GGT---
---GG------CT----T------------T--T------------------------------------------C---
--AAG------------------------------------------------TC--C-----G-----C----C-----
---------------G------TCA--------------AAT----C----CCAG------GG--------C-----TCA
ACC--C--T-----G-G---------A----C---------AGG-----CG---G-------T-G---GAA--------A
CT-----A--C-C--A------A--G---C--T--------G----G---A--------G----TAC---G-----GTA-
----GGGGC------AG----AGG------G------------AA-----TT----TCC----------------G----
-GTG----G-----A---G-------CGG----T------GAAAT-------GCG---T-------TG------AGAT--
--CG-----------GAAAGA------ACA------CCA---A---C-------G------GC-----GAA-------AG
CA----CT-----CTG-----CTG------G---G--CC-----------------------------------------
---------------------------G-------------------------------------------AC-------
-------------------------------------------------------A----------CT------------
---------G-----AC--------A-----CTGA-----G-----A---------G--A---C------GAAAGCT---
-A---------------------------------------G------GGG-----AGC----A------AATGGG----


!timeout 5 iqtree -s al_1.fasta --prefix al_1_1


!grep chi al_1_1.log

****  TOTAL      89.23%  3988 sequences failed composition chi2 test (p-value<5%; df=3)


!rm iqtee_chi_al1_1.tsv


!grep Seq al_1_1.log | sed 's/^ *//g' | sed 's/ \+ /\t/g' > iqtee_chi_al1_1.tsv


df = pd.read_csv("iqtee_chi_al1_1.tsv", delimiter='\t', header = None)
df


df.columns = ['somenumber', 'Id', 'Gap_Ambiguity', 'Composition', 'p-value']
# df_filt = df.query('Composition =="passed"')
df_filt = df[['Id', 'Composition']]
df_filt


# df.Id.to_csv(".txt", sep='\t', index=False, header=False)


df_taxa = pd.read_csv("taxa.txt", delimiter='\t')
df_taxa = df_taxa.rename(columns={"Unnamed: 0":"Id"})
df_mix = df_taxa.merge(df_filt, on='Id')
df_mix['Res'] = np.where(((df_mix['Composition']=="failed") |
                            (df_mix['Order']=="Chloroplast") |
                            (df_mix['Family']=="Mitochondria")) ,False, True)
df_mix


len(df_filt.query('Composition == "failed"'))

3988


len(df_mix.query('Res == True'))

11909


len(df_taxa)

16159


df_taxa['Phylum'].isna().sum()
# df_taxa.groupby(['Phylum']).Id.count()

1460


# df_taxa.groupby(['Kingdom']).Id.count()
df_taxa['Kingdom'].isna().sum()

235


%%R

list.files()

UsageError: Cell magic `%%R` not found.


import seaborn as sns
import matplotlib.pyplot as plt
from Bio import SeqIO
import numpy as np
import pandas as pd
import math


path = "rep_seq.fasta"

sizes = [len(rec) for rec in SeqIO.parse(path, 'fasta')]

/home/gladkov2/anaconda3/envs/tree/lib/python3.9/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)

<AxesSubplot:ylabel='Density'>


df_otus = pd.read_csv("otu_table.txt", delimiter='\t')
abundance = df_otus.sum(axis=1).tolist()
Id = df_filt.Id.tolist()
Kin = df_mix.Kingdom.tolist()
df_ill = pd.DataFrame({'len':sizes, 'abu':[math.log(x, 2) for x in abundance] ,'kin':Kin})


df_ill.loc[(df_ill['abu'] > 10000) & (df_ill['len'] < 200)]


df_mix.loc[(df_mix['Id'] == 'Seq4') | (df_mix['Id'] == 'Seq8') ]


df_ill.loc[(df_ill['abu'] > 10000) & (df_ill['len'] < 200)]


df_mix['len'] = sizes
df_mix['abu'] = [math.log(x, 2) for x in abundance]
df_mix


plt.figure(figsize=(16, 10))
sns.scatterplot(
    data=df_mix,
    x='len',
    y='abu',
    hue='Composition'
)

<AxesSubplot:xlabel='len', ylabel='abu'>


plt.figure(figsize=(16, 10))
sns.scatterplot(
    data=df_mix,
    x='len',
    y='abu',
    hue='Kingdom'
)

<AxesSubplot:xlabel='len', ylabel='abu'>


df_mix['Organella'] = np.where(((df_mix['Order']=="Chloroplast") |
                            (df_mix['Family']=="Mitochondria")) ,True, False)
plt.figure(figsize=(16, 10))
sns.scatterplot(
    data=df_mix,
    x='len',
    y='abu',
    hue='Organella'
)

<AxesSubplot:xlabel='len', ylabel='abu'>


df_right = df_mix.query('Organella==False & Kingdom!="Eukaryota"')

plt.figure(figsize=(16, 10))
sns.scatterplot(
    data=df_right,
    x='len',
    y='abu',
    hue='Composition'
)

<AxesSubplot:xlabel='len', ylabel='abu'>


plt.figure(figsize=(9, 15))
sns.boxplot(data=df_right, orient="h", y='Phylum', x='len')`

<AxesSubplot:xlabel='len', ylabel='Phylum'>


plt.figure(figsize=(16, 10))
sns.scatterplot(
    data=df_right.query('Phylum == "Gemmatimonadota"'),
    x='len',
    y='abu'
)

<AxesSubplot:xlabel='len', ylabel='abu'>

!ls

al_1.fasta    iqtee_chi_al1_1.tsv  ps.rds	  test
al_1_1.log    otu_table.biom	   rep_seq.fasta  track.tsv
al_d_2.fasta  otu_table.txt	   taxa.txt


%%R

library(phyloseq)
library(Biostrings)

ps <- read_rds("ps.rds")
ps <- merge_phyloseq(ps, refseq(readDNAStringSet("rep_seq.fasta")))

pop_taxa <- function(physeq, badTaxa){
  allTaxa = taxa_names(physeq)
  myTaxa <- allTaxa[!(allTaxa %in% badTaxa)]
  return(prune_taxa(myTaxa, physeq))
}

delete_mit_chl_eu <- function(ps){
  badTaxa <- taxa_names(subset_taxa(ps, Order=="Chloroplast"))
  ps <- pop_taxa(ps, badTaxa)
  badTaxa <- taxa_names(subset_taxa(ps, Family=="Mitochondria"))
  ps <- pop_taxa(ps, badTaxa)
  badTaxa <- taxa_names(subset_taxa(ps, Kingdom == "Eukaryota"))
  ps <- pop_taxa(ps, badTaxa)
  badTaxa <- taxa_names(subset_taxa(ps, is.na(Phylum)))
  ps <- pop_taxa(ps, badTaxa)
  return(ps)
}

saveRDS(ps.f, file = "ps_f.rds")
ps.f <- delete_mit_chl_eu(ps)
writeXStringSet(ps.f@refseq, file="ref_filt.fasta")


!mafft --auto --thread 50 --inputorder "ref_filt.fasta" > "al_2.fasta"

nthread = 50
nthreadpair = 50
nthreadtb = 16
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00


Making a distance matrix ..
 14801 / 14828 (thread   40)
done.

Constructing a UPGMA tree (efffree=0) ... 
 14820 / 14828
done.

Progressive alignment 1/2... 
STEP  14301 / 14827 (thread    5) h
Reallocating..done. *alloclen = 1677
STEP  14801 / 14827 (thread   11) h
done.

Making a distance matrix from msa.. 
 14800 / 14828 (thread   13)
done.

Constructing a UPGMA tree (efffree=1) ... 
 14820 / 14828
done.

Progressive alignment 2/2... 
STEP  14301 / 14827 (thread    6) h
Reallocating..done. *alloclen = 1642
STEP  14801 / 14827 (thread   14) h
done.

disttbfast (nuc) Version 7.475
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
16 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous versions.
To disable this change, add the --leavegappyregion option.


!timeout 5 iqtree -s al_2.fasta --prefix al_2_1

IQ-TREE multicore version 2.1.2 COVID-edition for Linux 64-bit built Mar 30 2021
Developed by Bui Quang Minh, James Barbetti, Nguyen Lam Tung,
Olga Chernomor, Heiko Schmidt, Dominik Schrempf, Michael Woodhams.

Host:    8753872069ef (AVX512, FMA3, 2708 GB RAM)
Command: iqtree -s al_2.fasta --prefix al_2_1
Seed:    647323 (Using SPRNG - Scalable Parallel Random Number Generator)
Time:    Mon Apr 19 13:50:24 2021
Kernel:  AVX+FMA - 1 threads (208 CPU cores detected)

HINT: Use -nt option to specify number of threads because your CPU has 208 cores!
HINT: -nt AUTO will automatically determine the best number of threads to use.

Reading alignment file al_2.fasta ... Fasta format detected
Alignment most likely contains DNA/RNA sequences
Alignment has 14828 sequences with 1278 columns, 1137 distinct patterns
801 parsimony-informative, 221 singleton sites, 256 constant sites
          Gap/Ambiguity  Composition  p-value
   1  Seq16007   84.74%    failed      0.08%
   2  Seq13966   80.67%    failed      0.00%
   3  Seq15747   81.77%    failed      0.00%
   4  Seq15400   85.92%    failed      0.24%
   5  Seq10913   78.79%    failed      0.01%
   6  Seq14445   83.33%    failed      0.03%
   7  Seq16096   81.53%    failed      0.00%
   8  Seq15482   86.38%    failed      0.01%
   9  Seq16057   86.31%    failed      0.02%
  10  Seq16145   86.78%    failed      0.22%
  11  Seq15178   84.66%    failed      0.02%
  12  Seq15307   84.51%    failed      0.00%
  13  Seq16111   85.37%    failed      0.03%
  14  Seq15593   80.13%    passed     83.57%
  15  Seq15681   80.20%    passed     44.51%
  16  Seq10274   80.20%    passed     51.77%
  17  Seq13971   86.38%    failed      1.02%
  18  Seq13135   80.20%    passed     69.90%
  19  Seq13143   80.20%    passed     71.67%
  И т.д.
 
14825  Seq2805    80.28%    passed     73.42%
14826  Seq4034    80.28%    passed     64.30%
14827  Seq7258    80.28%    passed     71.48%
14828  Seq15967   80.28%    passed     54.77%
WARNING: 14828 sequences contain more than 50% gaps/ambiguity
****  TOTAL      80.29%  1350 sequences failed composition chi2 test (p-value<5%; df=3)


!timeout 5 iqtree -s al_2_drna.fasta --prefix al_2_drna

IQ-TREE multicore version 2.1.2 COVID-edition for Linux 64-bit built Mar 30 2021
Developed by Bui Quang Minh, James Barbetti, Nguyen Lam Tung,
Olga Chernomor, Heiko Schmidt, Dominik Schrempf, Michael Woodhams.

Host:    8753872069ef (AVX512, FMA3, 2708 GB RAM)
Command: iqtree -s al_2_drna.fasta --prefix al_2_drna
Seed:    574271 (Using SPRNG - Scalable Parallel Random Number Generator)
Time:    Mon Apr 19 14:41:13 2021
Kernel:  AVX+FMA - 1 threads (208 CPU cores detected)

HINT: Use -nt option to specify number of threads because your CPU has 208 cores!
HINT: -nt AUTO will automatically determine the best number of threads to use.

Reading alignment file al_2_drna.fasta ... Fasta format detected
Alignment most likely contains DNA/RNA sequences
Alignment has 14828 sequences with 913 columns, 777 distinct patterns
490 parsimony-informative, 139 singleton sites, 284 constant sites
          Gap/Ambiguity  Composition  p-value
   1  Seq16007   78.64%    failed      0.03%
...
   14827  Seq7258    72.40%    passed     84.13%
14828  Seq15967   72.40%    passed     74.84%
WARNING: 14828 sequences contain more than 50% gaps/ambiguity
****  TOTAL      72.41%  1241 sequences failed composition chi2 test (p-value<5%; df=3)


!grep chi al_2_1.log

****  TOTAL      80.29%  1350 sequences failed composition chi2 test (p-value<5%; df=3)


!grep chi al_1_1.log

****  TOTAL      89.23%  3988 sequences failed composition chi2 test (p-value<5%; df=3)


!grep chi al_2_d.log

****  TOTAL      71.02%  1259 sequences failed composition chi2 test (p-value<5%; df=3)


!grep chi al_2_drna.log

****  TOTAL      72.41%  1241 sequences failed composition chi2 test (p-value<5%; df=3)


%%R

library(DECIPHER)
dna <- readDNAStringSet("ref_filt.fasta")
head(dna)
alignment <- AlignSeqs(RNAStringSet(dna), anchor=NA,verbose=FALSE, processors = 50)
alignment <- DNAStringSet(alignment)
writeXStringSet(alignment, file="al_2_drna.fasta")


!timeout 5 iqtree -s al_2_d.fasta --prefix al_2_d

IQ-TREE multicore version 2.1.2 COVID-edition for Linux 64-bit built Mar 30 2021
Developed by Bui Quang Minh, James Barbetti, Nguyen Lam Tung,
Olga Chernomor, Heiko Schmidt, Dominik Schrempf, Michael Woodhams.

Host:    8753872069ef (AVX512, FMA3, 2708 GB RAM)
Command: iqtree -s al_2_d.fasta --prefix al_2_d
Seed:    36355 (Using SPRNG - Scalable Parallel Random Number Generator)
Time:    Mon Apr 19 14:10:39 2021
Kernel:  AVX+FMA - 1 threads (208 CPU cores detected)

HINT: Use -nt option to specify number of threads because your CPU has 208 cores!
HINT: -nt AUTO will automatically determine the best number of threads to use.

Reading alignment file al_2_d.fasta ... Fasta format detected
Alignment most likely contains DNA/RNA sequences
Alignment has 14828 sequences with 869 columns, 775 distinct patterns
525 parsimony-informative, 137 singleton sites, 207 constant sites
          Gap/Ambiguity  Composition  p-value
   1  Seq16007   77.56%    failed      0.02%
  ...
14826  Seq4034    71.00%    passed     80.76%
14827  Seq7258    71.00%    passed     85.11%
14828  Seq15967   71.00%    passed     76.78%
WARNING: 14828 sequences contain more than 50% gaps/ambiguity
****  TOTAL      71.02%  1259 sequences failed composition chi2 test (p-value<5%; df=3)


%%R

library(DECIPHER)
dna <- readDNAStringSet("ref_filt.fasta")
head(dna)
alignment <- AlignSeqs(DNAStringSet(dna), anchor=NA,verbose=FALSE, processors = 50)
writeXStringSet(alignment, file="al_2_d.fasta")


!esl-alistat al_2_drna.fasta

Alignment number:    1
Format:              aligned FASTA
Number of sequences: 14828
Alignment length:    913
Total # residues:    3734491
Smallest:            165
Largest:             318
Average length:      251.9
Average identity:    72%
//


!esl-alistat al_1.fasta

Alignment number:    1
Format:              aligned FASTA
Number of sequences: 16159
Alignment length:    2323
Total # residues:    4041636
Smallest:            165
Largest:             318
Average length:      250.1
Average identity:    66%
//


!esl-alistat al_d_2.fasta

Alignment number:    1
Format:              aligned FASTA
Number of sequences: 16159
Alignment length:    1219
Total # residues:    4041636
Smallest:            165
Largest:             318
Average length:      250.1
Average identity:    68%
//


!esl-alistat al_2.fasta

Alignment number:    1
Format:              aligned FASTA
Number of sequences: 14828
Alignment length:    1278
Total # residues:    3734491
Smallest:            165
Largest:             318
Average length:      251.9
Average identity:    71%
//


!esl-alistat al_2_d.fasta

Alignment number:    1
Format:              aligned FASTA
Number of sequences: 14828
Alignment length:    869
Total # residues:    3734491
Smallest:            165
Largest:             318
Average length:      251.9
Average identity:    72%
//


%%R

library(phyloseq)
library(tidyverse)
library(DECIPHER)



ps.f <- read_rds("ps_f.rds")
writeXStringSet(ps.f@refseq, file="rep.fasta")
write.table(t(ps.f@otu_table@.Data), file = "otu.txt", sep= "\t", col.names = NA, quote=FALSE)
write.table(ps.f@tax_table@.Data, file = "taxa.txt", sep= "\t", col.names = NA, quote=FALSE)


!grep Seq al_2_d.log | sed 's/^ *//g' | sed 's/ \+ /\t/g' > iqtee_chi_al_d_2.tsv

df = pd.read_csv("iqtee_chi_al_d_2.tsv", delimiter='\t', header = None)
df.columns = ['somenumber', 'Id', 'Gap_Ambiguity', 'Composition', 'p-value']
df_filt = df[['Id', 'Composition']]

df_otus = pd.read_csv("otu.txt", delimiter='\t')
abundance = df_otus.sum(axis=1).tolist()

df_taxa = pd.read_csv("taxa.txt", delimiter='\t')
df_taxa = df_taxa.rename(columns={"Unnamed: 0":"Id"})

path = "rep.fasta"
sizes = [len(rec) for rec in SeqIO.parse(path, 'fasta')]

df_mix = df_taxa.merge(df_filt, on='Id')
df_mix['len'] = sizes
df_mix['abu'] = [math.log(x, 2) for x in abundance]

df_mix


df_mix

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-b513c6e03045> in <module>
----> 1 df_mix

NameError: name 'df_mix' is not defined


plt.figure(figsize=(16, 10))
sns.scatterplot(
    data=df_mix,
    x='len',
    y='abu',
    hue='Composition'
)

<AxesSubplot:xlabel='len', ylabel='abu'>


pd.set_option('display.max_rows', None)
df_mix.query('(Composition == "failed") & (Phylum)').sort_values('len', ascending=False).groupby("Order").size().sort_values(ascending=False)

Order
Gemmatimonadales                       132
Chloroflexales                          74
Chitinophagales                         73
Kallotenuales                           69
Thermomicrobiales                       65
Gemmatales                              62
Flavobacteriales                        40
Isosphaerales                           37
Sphingobacteriales                      28
Cytophagales                            17
Bacteroidales                           14
Ktedonobacterales                       12
Entomoplasmatales                       11
Planctomycetales                         8
Tepidisphaerales                         6
Peptostreptococcales-Tissierellales      5
Rokubacteriales                          5
SBR1031                                  5
Saccharimonadales                        5
Acetobacterales                          5
Longimicrobiales                         4
Rhodospirillales                         2
Rickettsiales                            2
Coriobacteriales                         2
Campylobacterales                        2
Bacteroidetes_VC2.1_Bac22                2
Nitrospirales                            1
Opitutales                               1
11-24                                    1
Mycoplasmatales                          1
Holosporales                             1
Absconditabacteriales_(SR1)              1
Elev-1554                                1
DS-100                                   1
Cyanobacteriales                         1
Candidatus_Yanofskybacteria              1
Candidatus_Woesebacteria                 1
Candidatus_Daviesbacteria                1
Ardenticatenales                         1
Actinomycetales                          1
Tistrellales                             1
dtype: int64


df_mix.query('Family == "Isosphaeraceae"').groupby("Composition").size()

Composition
failed    37
passed    33
dtype: int64


df_mix.groupby("Kingdom").size().sort_values(ascending=False)

Kingdom
Bacteria    14459
Archaea       134
dtype: int64


%%bash

tail -n 10 ~/storage/trees/Vasya/same_leng_aln.fa.log | telegram-send --stdin
# telegram-send $text
FILE=/etc/resolv.confx-special/nautilus-clipboard
copy
file:///home/crabron/server2/trees/BI4/its_al_maf.fasta.treefile
if test -f "$FILE"; then
    echo "$FILE exists."
fi


%%R

library(DECIPHER)
dna <- readDNAStringSet("ref.fasta")
head(dna)
alignment <- AlignSeqs(DNAStringSet(dna), anchor=NA,verbose=FALSE, processors = 50)
writeXStringSet(alignment, file="al_3_d.fasta")


%%R

library(DECIPHER)
dna <- readDNAStringSet("ref.fasta")
head(dna)
alignment <- AlignSeqs(RNAStringSet(dna), anchor=NA,verbose=FALSE, processors = 50)
writeXStringSet(alignment, file="al_3_drna.fasta")


!mafft --auto --thread 50 --inputorder "ref.fasta" > "al_3.fasta"

nthread = 50
nthreadpair = 50
nthreadtb = 16
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00


Making a distance matrix ..
 14801 / 14828 (thread    4)
done.

Constructing a UPGMA tree (efffree=0) ... 
 14820 / 14828
done.

Progressive alignment 1/2... 
STEP  14301 / 14827 (thread    8) h
Reallocating..done. *alloclen = 1677
STEP  14801 / 14827 (thread    1) h
done.

Making a distance matrix from msa.. 
 14800 / 14828 (thread    0)
done.

Constructing a UPGMA tree (efffree=1) ... 
 14820 / 14828
done.

Progressive alignment 2/2... 
STEP  14301 / 14827 (thread    3) h
Reallocating..done. *alloclen = 1642
STEP  14801 / 14827 (thread   11) h
done.

disttbfast (nuc) Version 7.475
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
16 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous versions.
To disable this change, add the --leavegappyregion option.


%ll -hlt | head -n 4

total 176M
-rw-rw-r-- 1 gladkov2  19M Apr 26 14:45 al_3.fasta
-rw-rw-r-- 1 gladkov2  14M Apr 26 14:44 al_3_drna.fasta
-rw-rw-r-- 1 gladkov2  13M Apr 26 14:30 al_3_d.fasta


!esl-alistat al_3_d.fasta

Alignment number:    1
Format:              aligned FASTA
Number of sequences: 14828
Alignment length:    877
Total # residues:    3734491
Smallest:            165
Largest:             318
Average length:      251.9
Average identity:    72%
//


!esl-alistat al_3_drna.fasta

Alignment number:    1
Format:              aligned FASTA
Number of sequences: 14828
Alignment length:    915
Total # residues:    3734491
Smallest:            165
Largest:             318
Average length:      251.9
Average identity:    72%
//


!esl-alistat al_3.fasta

Alignment number:    1
Format:              aligned FASTA
Number of sequences: 14828
Alignment length:    1278
Total # residues:    3734491
Smallest:            165
Largest:             318
Average length:      251.9
Average identity:    71%
//


!timeout 5 iqtree -s al_3_d.fasta --prefix al_3_d  --quiet 
!grep chi al_3_d.log

****  TOTAL      71.28%  1254 sequences failed composition chi2 test (p-value<5%; df=3)


!timeout 5 iqtree -s al_3_drna.fasta --prefix al_3_drna --quiet 
!grep chi al_3_drna.log

****  TOTAL      72.47%  1241 sequences failed composition chi2 test (p-value<5%; df=3)


!timeout 5 iqtree -s  al_3.fasta --prefix al_3 --quiet 
!grep chi al_3.log

****  TOTAL      80.29%  1350 sequences failed composition chi2 test (p-value<5%; df=3)


!grep Seq al_3_d.log | sed 's/^ *//g' | sed 's/ \+ /\t/g' > iqtee_chi_al_d_3.tsv

df = pd.read_csv("iqtee_chi_al_d_2.tsv", delimiter='\t', header = None)
df.columns = ['somenumber', 'Id', 'Gap_Ambiguity', 'Composition', 'p-value']
df_filt = df[['Id', 'Composition']]

df_otus = pd.read_csv("otu.txt", delimiter='\t')
abundance = df_otus.sum(axis=1).tolist()

df_taxa = pd.read_csv("taxa.txt", delimiter='\t')
df_taxa = df_taxa.rename(columns={"Unnamed: 0":"Id"})

path = "rep.fasta"
sizes = [len(rec) for rec in SeqIO.parse(path, 'fasta')]

df_mix = df_taxa.merge(df_filt, on='Id')
df_mix['len'] = sizes
df_mix['abu'] = [math.log(x, 2) for x in abundance]

df_mix


plt.figure(figsize=(16, 10))
sns.scatterplot(
    data=df_mix,
    x='len',
    y='abu',
    hue='Composition'
)

<AxesSubplot:xlabel='len', ylabel='abu'>


df_mix.query('Composition=="failed"').count()

Id             838
Kingdom        838
Phylum         838
Class          770
Order          702
Family         668
Genus          286
Species         11
Composition    838
len            838
abu            838
dtype: int64


!iqtree -s al_3_d.fasta -nt 30 -B 1000

IQ-TREE multicore version 2.1.2 COVID-edition for Linux 64-bit built Mar 30 2021
Developed by Bui Quang Minh, James Barbetti, Nguyen Lam Tung,
Olga Chernomor, Heiko Schmidt, Dominik Schrempf, Michael Woodhams.

Host:    8753872069ef (AVX512, FMA3, 2708 GB RAM)
Command: iqtree -s al_3_d.fasta -nt 30 -B 1000
Seed:    24462 (Using SPRNG - Scalable Parallel Random Number Generator)
Time:    Mon Apr 26 14:59:58 2021
Kernel:  AVX+FMA - 30 threads (208 CPU cores detected)

Reading alignment file al_3_d.fasta ... Fasta format detected
Alignment most likely contains DNA/RNA sequences
Alignment has 14828 sequences with 877 columns, 779 distinct patterns
528 parsimony-informative, 135 singleton sites, 214 constant sites
          Gap/Ambiguity  Composition  p-value
   1  Seq16007   77.77%    failed      0.03%
   2  Seq13966   71.84%    failed      0.00%
...
14825  Seq2805    71.27%    passed     90.40%
14826  Seq4034    71.27%    passed     80.53%
14827  Seq7258    71.27%    passed     84.94%
14828  Seq15967   71.27%    passed     76.44%
WARNING: 14828 sequences contain more than 50% gaps/ambiguity
****  TOTAL      71.28%  1254 sequences failed composition chi2 test (p-value<5%; df=3)

NOTE: Restoring information from model checkpoint file al_3_d.fasta.model.gz

CHECKPOINT: Initial tree restored
WARNING: Number of threads seems too high for short alignments. Use -T AUTO to determine best number of threads.
Perform fast likelihood tree search using GTR+I+G model...
Estimate model parameters (epsilon = 5.000)
Perform nearest neighbor interchange...
Estimate model parameters (epsilon = 1.000)
1. Initial log-likelihood: -558634.099
2. Current log-likelihood: -558622.053
3. Current log-likelihood: -558620.097
Optimal log-likelihood: -558619.863
Rate parameters:  A-C: 0.86330  A-G: 2.40308  A-T: 1.49810  C-G: 0.63581  C-T: 4.45988  G-T: 1.00000
Base frequencies:  A: 0.253  C: 0.200  G: 0.351  T: 0.196
Proportion of invariable sites: 0.000
Gamma shape alpha: 0.643
Parameters optimization took 3 rounds (171.491 sec)
Time for fast ML tree search: 1182.339 seconds

NOTE: ModelFinder requires 3772 MB RAM!
ModelFinder will test up to 286 DNA models (sample size: 877) ...
 No. Model         -LnL         df  AIC          AICc         BIC
WARNING: Number of threads seems too high for short alignments. Use -T AUTO to determine best number of threads.
  1  GTR+F         629676.290   29661 1318674.580  1760927838.580 1460350.554
WARNING: Number of threads seems too high for short alignments. Use -T AUTO to determine best number of threads.
  2  GTR+F+I       629675.889   29662 1318675.777  1761046487.777 1460356.528
WARNING: Number of threads seems too high for short alignments. Use -T AUTO to determine best number of threads.
  3  GTR+F+G4      558619.434   29662 1176562.868  1760904374.868 1318243.619
WARNING: Number of threads seems too high for short alignments. Use -T AUTO to determine best number of threads.
  4  GTR+F+I+G4    558619.419   29663 1176564.837  1761023028.837 1318250.364
WARNING: Number of threads seems too high for short alignments. Use -T AUTO to determine best number of threads.
  5  GTR+F+R2      574770.565   29663 1208867.131  1761055331.131 1350552.657
WARNING: Number of threads seems too high for short alignments. Use -T AUTO to determine best number of threads.
  6  GTR+F+R3      562988.577   29665 1185307.153  1761269087.153 1327002.233
WARNING: Number of threads seems too high for short alignments. Use -T AUTO to determine best number of threads.
^C

	0	1	2	3	4
0	1	Seq1	89.15%	passed	16.22%
1	2	Seq2	89.15%	passed	22.21%
2	3	Seq3	89.15%	passed	22.49%
3	4	Seq4	92.81%	failed	1.06%
4	5	Seq5	89.15%	passed	25.56%
...	...	...	...	...	...
16154	16155	Seq16155	86.31%	passed	37.86%
16155	16156	Seq16156	86.31%	passed	27.84%
16156	16157	Seq16157	88.81%	failed	3.59%
16157	16158	Seq16158	90.74%	passed	6.29%
16158	16159	Seq16159	89.15%	passed	14.00%

	Id	Kingdom	Phylum	Class	Order	Family	Genus	Species	Composition	Res
0	Seq1	Bacteria	Firmicutes	Bacilli	Bacillales	Planococcaceae	NaN	longiquaesitum	passed	True
1	Seq2	Bacteria	Firmicutes	Bacilli	Bacillales	NaN	NaN	NaN	passed	True
2	Seq3	Bacteria	Cyanobacteria	Cyanobacteriia	Chloroplast	NaN	NaN	NaN	passed	False
3	Seq4	Eukaryota	NaN	NaN	NaN	NaN	NaN	NaN	failed	False
4	Seq5	Archaea	Crenarchaeota	Nitrososphaeria	Nitrososphaerales	Nitrososphaeraceae	NaN	NaN	passed	True
...	...	...	...	...	...	...	...	...	...	...
16154	Seq16155	Bacteria	Planctomycetota	Planctomycetes	Gemmatales	Gemmataceae	Telmatocola	NaN	passed	True
16155	Seq16156	Bacteria	Chloroflexi	Ktedonobacteria	C0119	NaN	NaN	NaN	passed	True
16156	Seq16157	Eukaryota	NaN	NaN	NaN	NaN	NaN	NaN	failed	False
16157	Seq16158	Eukaryota	NaN	NaN	NaN	NaN	NaN	NaN	passed	True
16158	Seq16159	Bacteria	Actinobacteriota	Actinobacteria	Micromonosporales	Micromonosporaceae	NaN	NaN	passed	True

	Id	Kingdom	Phylum	Class	Order	Family	Genus	Species	Composition	Res	len	abu
0	Seq1	Bacteria	Firmicutes	Bacilli	Bacillales	Planococcaceae	NaN	longiquaesitum	passed	True	252	15.350732
1	Seq2	Bacteria	Firmicutes	Bacilli	Bacillales	NaN	NaN	NaN	passed	True	252	14.813380
2	Seq3	Bacteria	Cyanobacteria	Cyanobacteriia	Chloroplast	NaN	NaN	NaN	passed	False	252	14.800445
3	Seq4	Eukaryota	NaN	NaN	NaN	NaN	NaN	NaN	failed	False	167	14.478959
4	Seq5	Archaea	Crenarchaeota	Nitrososphaeria	Nitrososphaerales	Nitrososphaeraceae	NaN	NaN	passed	True	252	14.365092
...	...	...	...	...	...	...	...	...	...	...	...	...
16154	Seq16155	Bacteria	Planctomycetota	Planctomycetes	Gemmatales	Gemmataceae	Telmatocola	NaN	passed	True	318	0.000000
16155	Seq16156	Bacteria	Chloroflexi	Ktedonobacteria	C0119	NaN	NaN	NaN	passed	True	318	0.000000
16156	Seq16157	Eukaryota	NaN	NaN	NaN	NaN	NaN	NaN	failed	False	260	0.000000
16157	Seq16158	Eukaryota	NaN	NaN	NaN	NaN	NaN	NaN	passed	True	215	0.000000
16158	Seq16159	Bacteria	Actinobacteriota	Actinobacteria	Micromonosporales	Micromonosporaceae	NaN	NaN	passed	True	252	0.000000

	Id	Kingdom	Phylum	Class	Order	Family	Genus	Species	Composition	len	abu
0	Seq16007	NaN	NaN	NaN	NaN	NaN	NaN	NaN	failed	195	1.000000
1	Seq13966	NaN	NaN	NaN	NaN	NaN	NaN	NaN	failed	247	2.321928
2	Seq15747	NaN	NaN	NaN	NaN	NaN	NaN	NaN	failed	233	1.000000
3	Seq15400	NaN	NaN	NaN	NaN	NaN	NaN	NaN	failed	180	1.000000
4	Seq10913	NaN	NaN	NaN	NaN	NaN	NaN	NaN	failed	271	3.321928
...	...	...	...	...	...	...	...	...	...	...	...
14823	Seq926	Bacteria	Proteobacteria	Gammaproteobacteria	Aeromonadales	Aeromonadaceae	Aeromonas	NaN	passed	252	8.839204
14824	Seq2805	Bacteria	Proteobacteria	Gammaproteobacteria	Alteromonadales	Shewanellaceae	Shewanella	NaN	passed	252	6.714246
14825	Seq4034	Bacteria	Proteobacteria	Gammaproteobacteria	Vibrionales	Vibrionaceae	Photobacterium	NaN	passed	252	5.954196
14826	Seq7258	Bacteria	Proteobacteria	Gammaproteobacteria	Vibrionales	Vibrionaceae	Vibrio	NaN	passed	252	4.584963
14827	Seq15967	Bacteria	NaN	NaN	NaN	NaN	NaN	NaN	passed	252	1.000000

	Id	Kingdom	Phylum	Class	Order	Family	Genus	Species	Composition	len	abu
0	Seq15681	Archaea	Halobacterota	Methanocellia	Methanocellales	Methanocellaceae	Rice_Cluster_I	NaN	passed	253	1.000000
1	Seq10274	Archaea	Halobacterota	Methanosarcinia	Methanosarciniales	Methanosarcinaceae	Methanosarcina	NaN	passed	253	3.584963
2	Seq13135	Archaea	Halobacterota	Methanosarcinia	Methanosarciniales	Methanosarcinaceae	Methanosarcina	NaN	passed	253	2.584963
3	Seq13143	Archaea	Halobacterota	Methanosarcinia	Methanosarciniales	Methanosarcinaceae	Methanosarcina	NaN	passed	253	2.584963
4	Seq9591	Archaea	Halobacterota	Methanosarcinia	Methanosarciniales	Methanosarcinaceae	Methanosarcina	NaN	passed	253	3.807355
5	Seq2962	Archaea	Halobacterota	Methanosarcinia	Methanosarciniales	Methanosarcinaceae	Methanosarcina	NaN	passed	253	6.614710
6	Seq10000	Archaea	Thermoplasmatota	Thermoplasmatota_cl	Thermoplasmatota_or	Thermoplasmatota_fa	NaN	NaN	passed	252	3.700440
7	Seq14321	Archaea	Thermoplasmatota	NaN	NaN	NaN	NaN	NaN	passed	252	2.000000
8	Seq11634	Archaea	Thermoplasmatota	Thermoplasmatota_cl	Thermoplasmatota_or	Thermoplasmatota_fa	NaN	NaN	passed	252	3.169925
9	Seq7773	Archaea	Thermoplasmatota	NaN	NaN	NaN	NaN	NaN	passed	252	4.392317
10	Seq4330	Archaea	Thermoplasmatota	Thermoplasmatota_cl	Thermoplasmatota_or	Thermoplasmatota_fa	NaN	NaN	passed	252	5.781360
11	Seq13711	Archaea	Thermoplasmatota	NaN	NaN	NaN	NaN	NaN	passed	252	2.321928
12	Seq3863	Archaea	Thermoplasmatota	NaN	NaN	NaN	NaN	NaN	passed	252	6.044394
13	Seq5763	Archaea	Thermoplasmatota	Thermoplasmata	NaN	NaN	NaN	NaN	passed	252	5.129283
14	Seq6292	Archaea	Thermoplasmatota	Thermoplasmata	NaN	NaN	NaN	NaN	passed	252	4.906891
15	Seq12917	Archaea	Thermoplasmatota	Thermoplasmata	NaN	NaN	NaN	NaN	passed	252	2.584963
16	Seq7772	Archaea	Thermoplasmatota	Thermoplasmata	NaN	NaN	NaN	NaN	passed	252	4.392317
17	Seq9470	Archaea	Thermoplasmatota	Thermoplasmata	NaN	NaN	NaN	NaN	passed	252	3.807355
18	Seq11854	Archaea	Thermoplasmatota	Thermoplasmata	NaN	NaN	NaN	NaN	passed	252	3.000000
19	Seq618	Archaea	Thermoplasmatota	NaN	NaN	NaN	NaN	NaN	passed	252	9.419960
20	Seq12120	Archaea	Thermoplasmatota	NaN	NaN	NaN	NaN	NaN	passed	252	3.000000
21	Seq4022	Archaea	Thermoplasmatota	NaN	NaN	NaN	NaN	NaN	passed	252	5.954196
22	Seq2958	Archaea	Thermoplasmatota	NaN	NaN	NaN	NaN	NaN	passed	252	6.614710
23	Seq14422	Archaea	Thermoplasmatota	NaN	NaN	NaN	NaN	NaN	passed	252	2.000000
24	Seq4755	Archaea	Thermoplasmatota	Thermoplasmata	NaN	NaN	NaN	NaN	passed	252	5.584963
25	Seq1854	Archaea	Thermoplasmatota	NaN	NaN	NaN	NaN	NaN	passed	252	7.539159
26	Seq8469	Archaea	Thermoplasmatota	Thermoplasmata	NaN	NaN	NaN	NaN	passed	252	4.169925
27	Seq9328	Archaea	Thermoplasmatota	Thermoplasmata	NaN	NaN	NaN	NaN	passed	252	3.906891
28	Seq10169	Archaea	Thermoplasmatota	Thermoplasmata	NaN	NaN	NaN	NaN	passed	252	3.584963
29	Seq6412	Archaea	Thermoplasmatota	Thermoplasmata	NaN	NaN	NaN	NaN	passed	252	4.857981
30	Seq11377	Archaea	Thermoplasmatota	Thermoplasmata	NaN	NaN	NaN	NaN	passed	252	3.169925
31	Seq10042	Archaea	Thermoplasmatota	NaN	NaN	NaN	NaN	NaN	passed	252	3.700440
32	Seq14494	Archaea	Thermoplasmatota	NaN	NaN	NaN	NaN	NaN	passed	252	2.000000
33	Seq10751	Archaea	Euryarchaeota	Methanobacteria	Methanobacteriales	Methanobacteriaceae	Methanobrevibacter	NaN	passed	253	3.459432
34	Seq8222	Archaea	Euryarchaeota	Methanobacteria	Methanobacteriales	Methanobacteriaceae	Methanobrevibacter	NaN	passed	253	4.247928
35	Seq14374	Archaea	Euryarchaeota	Methanobacteria	Methanobacteriales	Methanobacteriaceae	Methanobrevibacter	NaN	passed	253	2.000000
36	Seq12637	Archaea	Euryarchaeota	Methanobacteria	Methanobacteriales	Methanobacteriaceae	Methanosphaera	NaN	passed	253	2.807355
37	Seq12020	Archaea	Euryarchaeota	Methanobacteria	Methanobacteriales	Methanobacteriaceae	Methanobacterium	NaN	passed	253	3.000000
38	Seq14004	Archaea	Euryarchaeota	Methanobacteria	Methanobacteriales	Methanobacteriaceae	Methanobacterium	NaN	passed	253	2.000000
13976	Seq7258	Bacteria	Proteobacteria	Gammaproteobacteria	Vibrionales	Vibrionaceae	Vibrio	NaN	passed	252	4.584963

	len	abu	id
3	167	22835	Seq4
7	168	18599	Seq8

Фильтр:¶