Related article: Daisley et al. (2024). isolateR: an R package for generating microbial libraries from Sanger sequencing data. In draft

Related GitHub: https://github.com/bdaisley/isolateR

The R code provided below was used to generate the following figures:

  • Figure 5B
  • Figure 5C

See here for the in silico PCR script used to extract full length 16S rRNA, V3-V6 region 16S rRNA, and cpn60 UT marker genes from IMGG genomes:

1. Load required packages

library(knitr)
library(isolateR)
library(DECIPHER)
library(readr)
library(dplyr)
library(stringr)
library(ggplot2)
library(ggvenn)
library(reshape2)
library(stringr)
library(httr)

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
##  [1] LC_CTYPE=en_CA.UTF-8       LC_NUMERIC=C
##  [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_CA.UTF-8
##  [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_CA.UTF-8
##  [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils
##  [8] datasets  methods   base
##
## other attached packages:
##  [1] httr_1.4.3          reshape2_1.4.4      ggvenn_0.1.10
##  [4] ggplot2_3.5.0       stringr_1.4.0       readr_2.1.2
##  [7] DECIPHER_2.16.1     RSQLite_2.2.14      isolateR_0.0.0.9003
## [10] reactablefmtr_2.0.0 reactable_0.4.3     dplyr_1.0.9
## [13] Biostrings_2.64.0   GenomeInfoDb_1.24.2 XVector_0.36.0
## [16] IRanges_2.30.0      S4Vectors_0.34.0    BiocGenerics_0.42.0
## [19] knitr_1.39
##
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-3       ggtree_3.4.0           sangeranalyseR_1.6.1
##   [4] seqinr_4.2-23          ellipsis_0.3.2         ggdendro_0.1.23
##   [7] aplot_0.1.6            rstudioapi_0.13        DT_0.26
##  [10] bit64_4.0.5            fansi_1.0.3            R.methodsS3_1.8.1
##  [13] cachem_1.0.6           ade4_1.7-19            jsonlite_1.8.8
##  [16] R.oo_1.24.0            shinydashboard_0.7.2   msa_1.30.1
##  [19] shiny_1.7.1            LPSN_0.14.1            BiocManager_1.30.18
##  [22] rentrez_1.2.3          sangerseqR_1.32.0      compiler_4.2.0
##  [25] Matrix_1.4-1           assertthat_0.2.1       fastmap_1.1.0
##  [28] lazyeval_0.2.2         cli_3.6.2              later_1.3.0
##  [31] htmltools_0.5.2        tools_4.2.0            gtable_0.3.0
##  [34] glue_1.6.2             GenomeInfoDbData_1.2.8 Rcpp_1.0.8.3
##  [37] jquerylib_0.1.4        excelR_0.4.0           vctrs_0.6.5
##  [40] ape_5.6-2              nlme_3.1-157           crosstalk_1.2.0
##  [43] xfun_0.39              openxlsx_4.2.5         mime_0.12
##  [46] lifecycle_1.0.4        shinycssloaders_1.0.0  xmlconvert_0.1.2
##  [49] XML_3.99-0.16.1        zlibbioc_1.42.0        MASS_7.3-57
##  [52] scales_1.3.0           hms_1.1.1              promises_1.2.0.1
##  [55] yaml_2.3.5             curl_4.3.2             memoise_2.0.1
##  [58] dataui_0.0.1           gridExtra_2.3          ggfun_0.0.6
##  [61] pander_0.6.5           yulab.utils_0.0.4      sass_0.4.1
##  [64] stringi_1.7.6          tidytree_0.3.9         zip_2.2.0
##  [67] rlang_1.1.3            pkgconfig_2.0.3        bitops_1.0-7
##  [70] evaluate_0.23          lattice_0.20-45        purrr_1.0.2
##  [73] treeio_1.12.0          patchwork_1.1.1        htmlwidgets_1.5.4
##  [76] bit_4.0.4              tidyselect_1.2.0       plyr_1.8.7
##  [79] logger_0.2.2           magrittr_2.0.3         R6_2.5.1
##  [82] generics_0.1.2         DBI_1.1.2              withr_2.5.0
##  [85] pillar_1.7.0           svMisc_1.2.3           RCurl_1.98-1.6
##  [88] tibble_3.1.7           crayon_1.5.1           shinyWidgets_0.7.6
##  [91] utf8_1.2.2             plotly_4.10.0          tzdb_0.3.0
##  [94] rmarkdown_2.14         data.table_1.14.2      blob_1.2.3
##  [97] digest_0.6.34          xtable_1.8-4           tidyr_1.2.0
## [100] httpuv_1.6.5           gridGraphics_0.5-1     R.utils_2.11.0
## [103] munsell_0.5.0          viridisLite_0.4.0      ggplotify_0.1.0
## [106] bslib_0.3.1            shinyjs_2.1.0

2. Inspect genome datasets

This section assumes the marker genes have already been extracted from genomes via in silico PCR as described here:

#Load IMGG Study Supplementary Table S7
dir.create("imgg")
download.file("https://static-content.springer.com/esm/art%3A10.1038%2Fs41564-022-01270-1/MediaObjects/41564_2022_1270_MOESM4_ESM.xlsx", "imgg/41564_2022_1270_MOESM4_ESM.xlsx", method= "libcurl", mode='wb', headers=NULL)

mongo.meta <- readxl::read_xlsx("imgg/41564_2022_1270_MOESM4_ESM.xlsx", sheet="Table S7", skip=1) %>%
  mutate(subject_id = stringr::str_split_fixed(IMGG_ID, "\\.", 2)[,1]) %>%
  mutate(subject_id = gsub("^[A-Z]", "s", subject_id)) %>%
  group_by(subject_id) %>%
  mutate(per_subject_counts=n()) %>%
  ungroup()
## New names:
## • `Completeness` -> `Completeness...8`
## • `Contamination` -> `Contamination...9`
## • `N_contigs` -> `N_contigs...10`
## • `N50` -> `N50...11`
## • `GC_content` -> `GC_content...14`
## • `N_contigs` -> `N_contigs...28`
## • `N50` -> `N50...29`
## • `GC_content` -> `GC_content...30`
## • `Completeness` -> `Completeness...31`
## • `Contamination` -> `Contamination...32`
full.pcr.dir <- "/mnt/md0/isolateR_meta_analysis/Rmarkdown/02_IMGG_marker_genes_pcr"
pcr.summary.raw <- readr::read_tsv(paste(full.pcr.dir, "/summary_info.txt", sep=""), show_col_types = FALSE)
print(colSums(pcr.summary.raw[,2:4] > 0))
##    primers_16S_FULL    primers_16S_V3V6 primers_cpn60_degen
##                6705                6726                6540
#Print venn diagram of overlap of marker genes across the genomes
ggvenn::ggvenn(list(FULL_16S = (pcr.summary.raw %>% filter(primers_16S_FULL > 0))$name,
                    V3V6_16S = (pcr.summary.raw %>% filter(primers_16S_V3V6 > 0 ))$name,
                    cpn60_UT = (pcr.summary.raw %>% filter(primers_cpn60_degen > 0 ))$name),
               fill_color = c("#868686FF", "#0073C2FF", "#EFC000FF"),
               stroke_size = 0.5,
               set_name_size = 5)

#Plot copy number of marker genes per genome
pcr.summary.mer <- pcr.summary.raw %>% dplyr::rename("IMGG_ID" = "name") %>%
  dplyr::left_join(., mongo.meta, by="IMGG_ID") %>%
  mutate(Domain=stringr::str_split_fixed(Lineage, ";", 8)[,1]) %>%
  mutate_at(vars(Domain), funs(gsub("d__", "", .))) %>%
  filter(Domain=="Bacteria") %>%
  mutate(Phylum=stringr::str_split_fixed(Lineage, ";", 8)[,2]) %>%
  mutate_at(vars(Phylum), funs(gsub("p__", "", .))) %>%
  filter(Phylum!="") %>%
  group_by(Phylum) %>%
  mutate(phylum_counts=n()) %>%
  ungroup() %>%
  reshape2::melt(., id.vars=c("IMGG_ID", "Phylum", "phylum_counts")) %>%
  filter(grepl("primers_16S_FULL|primers_16S_V3V6|primers_cpn60_degen", variable)) %>% droplevels() %>%
  mutate_at(vars(value, phylum_counts), as.numeric)

ggplot(subset(pcr.summary.mer, phylum_counts > 3),
       aes(x = value,  y=Phylum, fill=Phylum)) +
  ggridges::geom_density_ridges(scale = 0.9,
                                linewidth = 0.3) +
  scale_y_discrete(limits=rev) +
  facet_grid(~variable) +
  xlab("Copy number per genome") +
  theme_bw() + theme(legend.position = "none")
## Picking joint bandwidth of 0.404
## Picking joint bandwidth of 0.35
## Picking joint bandwidth of 0.092

#Filter only the genomes that have all marker genes present
pcr.summary.filt <- readr::read_tsv(paste(full.pcr.dir, "/summary_info.txt", sep=""), show_col_types = FALSE) %>%
  filter(primers_16S_FULL >0 & primers_16S_V3V6 >0 & primers_cpn60_degen > 0)

print(colSums(pcr.summary.filt[,2:4] > 0))
##    primers_16S_FULL    primers_16S_V3V6 primers_cpn60_degen
##                6533                6533                6533

3. isoLIB cutoff benchmark

Run isoLIB at different cutoffs for each of the marker genes, per-subject for each of the N=60 individuals and then the pooled dataset together.

These steps will take a very long time to run on a standard laptop/desktop computer. Recommended to run on a HPC with 48+ cores/physical processing units. Alternatively, skip to Step 3.4 and download the RDS files to continue.

3.1 Set cutoffs

group_cutoff.list <- c(1,0.999,0.998,0.997,0.996,0.995,0.994,0.993,0.992,0.991,0.990,
                       0.989,0.988,0.987,0.986,0.985,0.984,0.983,0.982,0.981,0.980,
                       0.979,0.978,0.977,0.976,0.975,0.974,0.973,0.972,0.971,0.970,
                       0.969,0.968,0.967,0.966,0.965,0.964,0.963,0.962,0.961,0.960,
                       0.959,0.958,0.957,0.956,0.955,0.954,0.953,0.952,0.951,0.950,
                       0.940,0.930,0.920,0.910,0.900,0.890,0.880,0.870,0.860,0.850)

3.2 16S - Full length

#Make index list for pulling marker gene sequences
pcr.file.names <- paste("_", colnames(pcr.summary.filt[,2:4]), "_seqs.fasta", sep="")

#Load in sequences
seq.list.u <- parallel::mclapply(1:length(pcr.summary.filt$name), mc.cores=16, function(i) {
  seq.list1 <- Biostrings::readBStringSet(paste(full.pcr.dir, "/seqs/", pcr.summary.filt$name[i], "_primers_16S_FULL_seqs.fasta", sep=""))
  if(length(seq.list1) > 1){
    seq.list2 <- DECIPHER::AlignSeqs(DNAStringSet(seq.list1))
    seq.list3 <- BStringSet(DECIPHER::ConsensusSequence(seq.list2, ignoreNonBases=TRUE))
    names(seq.list3) <- names(seq.list1)[1]
    seq.list3
  } else {
    seq.list3 <- seq.list1
  }
})

seq.list <- do.call(c, seq.list.u)

#Make holders to allow input for isoLIB transformation of data
date.formatted <- gsub("-", "_", Sys.Date())
seq.list.df <- cbind(input=rep("path",length(names(seq.list))),
                     warning=rep("xNA", length(names(seq.list))),
                     date=rep(date.formatted, length(names(seq.list))),
                     filename=names(seq.list),
                     seqs_raw=rep("xNA", length(names(seq.list))),
                     phred_raw=rep("xNA", length(names(seq.list))),
                     Ns_raw=rep("xNA", length(names(seq.list))),
                     length_raw=rep("xNA", length(names(seq.list))),
                     phred_spark_raw=rep("xNA", length(names(seq.list))),
                     seqs_trim=paste(seq.list),
                     phred_trim=rep(0, length(names(seq.list))),
                     Ns_trim=rep(0, length(names(seq.list))),
                     length_trim=rep(0, length(names(seq.list))),
                     closest_match=rep("xNA", length(names(seq.list))),
                     NCBI_acc=rep("xNA", length(names(seq.list))),
                     ID=rep(0, length(names(seq.list))),
                     rank_phylum=rep("xNA", length(names(seq.list))),
                     rank_class=rep("xNA", length(names(seq.list))),
                     rank_order=rep("xNA", length(names(seq.list))),
                     rank_family=rep("xNA", length(names(seq.list))),
                     rank_genus=rep("xNA", length(names(seq.list))),
                     rank_species=rep("xNA", length(names(seq.list))),
                     phylum_threshold=rep(0, length(names(seq.list))),
                     class_threshold=rep(0, length(names(seq.list))),
                     order_threshold=rep(0, length(names(seq.list))),
                     family_threshold=rep(0, length(names(seq.list))),
                     genus_threshold=rep(0, length(names(seq.list))),
                     species_threshold=rep(0, length(names(seq.list)))) %>% as.data.frame() %>%
  mutate(filename=gsub("_16S_copy[0-9]+", "", filename)) %>%
  mutate(length_trim= nchar(seqs_trim))

collector.list1 <- list()
for(i in 1:length(unique(mongo.meta$subject_id))){
  sample.list <- (mongo.meta %>% arrange(desc(per_subject_counts)) %>% filter(subject_id %in% unique(subject_id)[i]))$IMGG_ID
  seq.list.df.filt <- seq.list.df %>% filter(filename %in% sample.list)
  seq.list.df.filt <- seq.list.df.filt[1:10,]
  write.csv(seq.list.df.filt, paste("seqs.csv", sep=""), row.names = FALSE)
  #----------------------------------------------------------------------------------isoLIB function
  for(ii in 1:length(group_cutoff.list)){
    isoLIB.S4 <- isoLIB(input=file.path(getwd(), "seqs.csv"), export_html=FALSE, group_cutoff=group_cutoff.list[ii])
    isoLIB.df <- S4_to_dataframe(isoLIB.S4) %>%
      mutate(cutoff= as.numeric(paste(group_cutoff.list[ii]))) %>%
      mutate(group_counts=length(unique(sequence_group))) %>%
      mutate(group_props=group_counts/n()) %>%
      mutate(result=paste("result_", i, sep=""))
    collector.list1[[paste("result_", i, "_cutoff_", ii, sep="")]] <- isoLIB.df
  }
}
final.list.individuals <- dplyr::bind_rows(c(collector.list1))
saveRDS(final.list.individuals, "individual_list_16S_FULL")


#---For all
collector.list2 <- list()
for(ii in 1:length(group_cutoff.list)){
  seq.list.df.filt <- seq.list.df[1:10,]
  write.csv(seq.list.df.filt, paste("seqs.csv", sep=""), row.names = FALSE)
  isoLIB.S4 <- isoLIB(input=file.path(getwd(), "seqs.csv"), export_html=FALSE, group_cutoff=group_cutoff.list[ii])
  isoLIB.df <- S4_to_dataframe(isoLIB.S4) %>%
    mutate(cutoff= as.numeric(paste(group_cutoff.list[ii]))) %>%
    mutate(group_counts=length(unique(sequence_group))) %>%
    mutate(group_props=group_counts/n()) %>%
    mutate(result=paste("result_", "all", sep=""))

  collector.list2[[paste("result_all", "_cutoff_", ii, sep="")]] <- isoLIB.df

}
final.list.pooled <- dplyr::bind_rows(c(collector.list2))
saveRDS(final.list.pooled, "all_list_16S_FULL.RDS")

final.list.individuals <- readRDS("individual_list_16S_FULL.RDS")
isoLIB.S4.mongo <- readRDS("all_list_16S_FULL.RDS")
res.list.df.mongo.all <- bind_rows(isoLIB.S4.mongo) %>%
  group_by(cutoff) %>% mutate(group_counts= length(unique(sequence_group))) %>%
  mutate(group_props=group_counts/n()) %>% ungroup() %>% mutate(result=paste("result_", "all", sep=""))

final.list.df.16SFULL <- rbind(bind_rows(final.list.individuals),
                               res.list.df.mongo.all) %>% dplyr::rename("IMGG_ID" = "filename") %>%
  dplyr::left_join(., mongo.meta, by="IMGG_ID") %>% arrange(desc(representative)) %>% mutate(lineage=Lineage)

#----------Count number of genomes captured per cutoff
final.list.df.16SFULL.counts <- final.list.df.16SFULL %>%
  mutate(main_groups = ifelse(result=="result_all", "Full dataset", "Per individual")) %>%
  group_by(cutoff, sequence_group, result) %>% mutate(species_rep = first(lineage)) %>%
  ungroup() %>%
  group_by(cutoff, result) %>% mutate(species_counts=length(unique(species_rep))) %>% mutate(species_props=species_counts/length(unique(lineage))) %>%
  ungroup() %>% group_by(result) %>% distinct(cutoff, .keep_all=TRUE) %>% ungroup()

saveRDS(final.list.df.16SFULL.counts, "isoLIB_results_16S_full_length.RDS")

3.2 16S - V3V6 region

#Load in sequences
seq.list.u <- parallel::mclapply(1:length(pcr.summary.filt$name), mc.cores=16, function(i) {
  seq.list1 <- Biostrings::readBStringSet(paste(full.pcr.dir, "/seqs/", pcr.summary.filt$name[i], "_primers_16S_V3V6_seqs.fasta", sep=""))
  if(length(seq.list1) > 1){
    seq.list2 <- DECIPHER::AlignSeqs(DNAStringSet(seq.list1))
    seq.list3 <- BStringSet(DECIPHER::ConsensusSequence(seq.list2, ignoreNonBases=TRUE))
    names(seq.list3) <- names(seq.list1)[1]
    seq.list3
  } else {
    seq.list3 <- seq.list1
  }
})

seq.list <- do.call(c, seq.list.u)

#Make holders to allow input for isoLIB transformation of data
date.formatted <- gsub("-", "_", Sys.Date())
seq.list.df <- cbind(input=rep("path",length(names(seq.list))),
                     warning=rep("xNA", length(names(seq.list))),
                     date=rep(date.formatted, length(names(seq.list))),
                     filename=names(seq.list),
                     seqs_raw=rep("xNA", length(names(seq.list))),
                     phred_raw=rep("xNA", length(names(seq.list))),
                     Ns_raw=rep("xNA", length(names(seq.list))),
                     length_raw=rep("xNA", length(names(seq.list))),
                     phred_spark_raw=rep("xNA", length(names(seq.list))),
                     seqs_trim=paste(seq.list),
                     phred_trim=rep(0, length(names(seq.list))),
                     Ns_trim=rep(0, length(names(seq.list))),
                     length_trim=rep(0, length(names(seq.list))),
                     closest_match=rep("xNA", length(names(seq.list))),
                     NCBI_acc=rep("xNA", length(names(seq.list))),
                     ID=rep(0, length(names(seq.list))),
                     rank_phylum=rep("xNA", length(names(seq.list))),
                     rank_class=rep("xNA", length(names(seq.list))),
                     rank_order=rep("xNA", length(names(seq.list))),
                     rank_family=rep("xNA", length(names(seq.list))),
                     rank_genus=rep("xNA", length(names(seq.list))),
                     rank_species=rep("xNA", length(names(seq.list))),
                     phylum_threshold=rep(0, length(names(seq.list))),
                     class_threshold=rep(0, length(names(seq.list))),
                     order_threshold=rep(0, length(names(seq.list))),
                     family_threshold=rep(0, length(names(seq.list))),
                     genus_threshold=rep(0, length(names(seq.list))),
                     species_threshold=rep(0, length(names(seq.list)))) %>% as.data.frame() %>%
  mutate(filename=gsub("_16S_copy[0-9]+", "", filename)) %>%
  mutate(length_trim= nchar(seqs_trim))

collector.list1 <- list()
for(i in 1:length(unique(mongo.meta$subject_id))){
  sample.list <- (mongo.meta %>% arrange(desc(per_subject_counts)) %>% filter(subject_id %in% unique(subject_id)[i]))$IMGG_ID
  seq.list.df.filt <- seq.list.df %>% filter(filename %in% sample.list)
  seq.list.df.filt <- seq.list.df.filt[1:10,]
  write.csv(seq.list.df.filt, paste("seqs.csv", sep=""), row.names = FALSE)
  #----------------------------------------------------------------------------------isoLIB function
  for(ii in 1:length(group_cutoff.list)){
    isoLIB.S4 <- isoLIB(input=file.path(getwd(), "seqs.csv"), export_html=FALSE, group_cutoff=group_cutoff.list[ii])
    isoLIB.df <- S4_to_dataframe(isoLIB.S4) %>%
      mutate(cutoff= as.numeric(paste(group_cutoff.list[ii]))) %>%
      mutate(group_counts=length(unique(sequence_group))) %>%
      mutate(group_props=group_counts/n()) %>%
      mutate(result=paste("result_", i, sep=""))
    collector.list1[[paste("result_", i, "_cutoff_", ii, sep="")]] <- isoLIB.df
  }
}
final.list.individuals <- dplyr::bind_rows(c(collector.list1))
saveRDS(final.list.individuals, "individual_list_16S_V3V6.RDS")


#---For all
collector.list2 <- list()
for(ii in 1:length(group_cutoff.list)){
  seq.list.df.filt <- seq.list.df[1:10,]
  write.csv(seq.list.df.filt, paste("seqs.csv", sep=""), row.names = FALSE)
  isoLIB.S4 <- isoLIB(input=file.path(getwd(), "seqs.csv"), export_html=FALSE, group_cutoff=group_cutoff.list[ii])
  isoLIB.df <- S4_to_dataframe(isoLIB.S4) %>%
    mutate(cutoff= as.numeric(paste(group_cutoff.list[ii]))) %>%
    mutate(group_counts=length(unique(sequence_group))) %>%
    mutate(group_props=group_counts/n()) %>%
    mutate(result=paste("result_", "all", sep=""))

  collector.list2[[paste("result_all", "_cutoff_", ii, sep="")]] <- isoLIB.df

}
final.list.pooled <- dplyr::bind_rows(c(collector.list2))
saveRDS(final.list.pooled, "all_list_16S_V3V6.RDS")

final.list.individuals <- readRDS("individual_list_16S_V3V6.RDS")
isoLIB.S4.mongo <- readRDS("all_list_16S_V3V6.RDS")
res.list.df.mongo.all <- bind_rows(isoLIB.S4.mongo) %>%
  group_by(cutoff) %>% mutate(group_counts= length(unique(sequence_group))) %>% #ungroup() #%>% filter(cutoff==0.956)
  mutate(group_props=group_counts/n()) %>% ungroup() %>% mutate(result=paste("result_", "all", sep="")) #%>% distinct(cutoff, .keep_all=TRUE)

final.list.df.16SV3V6 <- rbind(bind_rows(final.list.individuals),
                               res.list.df.mongo.all) %>% dplyr::rename("IMGG_ID" = "filename") %>%
  dplyr::left_join(., mongo.meta, by="IMGG_ID") %>% arrange(desc(representative)) %>% mutate(lineage=Lineage)


#----------Count number of genomes captured per cutoff
final.list.df.16SV3V6.counts <- final.list.df.16SV3V6 %>%
  mutate(main_groups = ifelse(result=="result_all", "Full dataset", "Per individual")) %>%
  group_by(cutoff, sequence_group, result) %>% mutate(species_rep = first(lineage)) %>%
  ungroup() %>%
  group_by(cutoff, result) %>% mutate(species_counts=length(unique(species_rep))) %>% mutate(species_props=species_counts/length(unique(lineage))) %>%
  ungroup() %>% group_by(result) %>% distinct(cutoff, .keep_all=TRUE) %>% ungroup()

saveRDS(final.list.df.16SV3V6.counts, "isoLIB_results_16S_V3V6.RDS")

3.3 cpn60 UT

#Load in sequences
seq.list.u <- parallel::mclapply(1:length(pcr.summary.filt$name), mc.cores=16, function(i) {
  seq.list1 <- Biostrings::readBStringSet(paste(full.pcr.dir, "/seqs/", pcr.summary.filt$name[i], "_primers_cpn60_degen_seqs.fasta", sep=""))
  if(length(seq.list1) > 1){
    seq.list2 <- DECIPHER::AlignSeqs(DNAStringSet(seq.list1))
    seq.list3 <- BStringSet(DECIPHER::ConsensusSequence(seq.list2, ignoreNonBases=TRUE))
    names(seq.list3) <- names(seq.list1)[1]
    seq.list3
  } else {
    seq.list3 <- seq.list1
  }
})

seq.list <- do.call(c, seq.list.u)

#Make holders to allow input for isoLIB transformation of data
date.formatted <- gsub("-", "_", Sys.Date())
seq.list.df <- cbind(input=rep("path",length(names(seq.list))),
                     warning=rep("xNA", length(names(seq.list))),
                     date=rep(date.formatted, length(names(seq.list))),
                     filename=names(seq.list),
                     seqs_raw=rep("xNA", length(names(seq.list))),
                     phred_raw=rep("xNA", length(names(seq.list))),
                     Ns_raw=rep("xNA", length(names(seq.list))),
                     length_raw=rep("xNA", length(names(seq.list))),
                     phred_spark_raw=rep("xNA", length(names(seq.list))),
                     seqs_trim=paste(seq.list),
                     phred_trim=rep(0, length(names(seq.list))),
                     Ns_trim=rep(0, length(names(seq.list))),
                     length_trim=rep(0, length(names(seq.list))),
                     closest_match=rep("xNA", length(names(seq.list))),
                     NCBI_acc=rep("xNA", length(names(seq.list))),
                     ID=rep(0, length(names(seq.list))),
                     rank_phylum=rep("xNA", length(names(seq.list))),
                     rank_class=rep("xNA", length(names(seq.list))),
                     rank_order=rep("xNA", length(names(seq.list))),
                     rank_family=rep("xNA", length(names(seq.list))),
                     rank_genus=rep("xNA", length(names(seq.list))),
                     rank_species=rep("xNA", length(names(seq.list))),
                     phylum_threshold=rep(0, length(names(seq.list))),
                     class_threshold=rep(0, length(names(seq.list))),
                     order_threshold=rep(0, length(names(seq.list))),
                     family_threshold=rep(0, length(names(seq.list))),
                     genus_threshold=rep(0, length(names(seq.list))),
                     species_threshold=rep(0, length(names(seq.list)))) %>% as.data.frame() %>%
  mutate(filename=gsub("_cpn60_copy[0-9]+", "", filename)) %>%
  mutate(length_trim= nchar(seqs_trim))

collector.list1 <- list()
for(i in 1:length(unique(mongo.meta$subject_id))){
  sample.list <- (mongo.meta %>% arrange(desc(per_subject_counts)) %>% filter(subject_id %in% unique(subject_id)[i]))$IMGG_ID
  seq.list.df.filt <- seq.list.df %>% filter(filename %in% sample.list)
  seq.list.df.filt <- seq.list.df.filt[1:10,]
  write.csv(seq.list.df.filt, paste("seqs.csv", sep=""), row.names = FALSE)
  #----------------------------------------------------------------------------------isoLIB function
  for(ii in 1:length(group_cutoff.list)){
    isoLIB.S4 <- isoLIB(input=file.path(getwd(), "seqs.csv"), export_html=FALSE, group_cutoff=group_cutoff.list[ii])
    isoLIB.df <- S4_to_dataframe(isoLIB.S4) %>%
      mutate(cutoff= as.numeric(paste(group_cutoff.list[ii]))) %>%
      mutate(group_counts=length(unique(sequence_group))) %>%
      mutate(group_props=group_counts/n()) %>%
      mutate(result=paste("result_", i, sep=""))
    collector.list1[[paste("result_", i, "_cutoff_", ii, sep="")]] <- isoLIB.df
  }
}
final.list.individuals <- dplyr::bind_rows(c(collector.list1))
saveRDS(final.list.individuals, "individual_list_cpn60.RDS")


#---For all
collector.list2 <- list()
for(ii in 1:length(group_cutoff.list)){
  seq.list.df.filt <- seq.list.df[1:10,]
  write.csv(seq.list.df.filt, paste("seqs.csv", sep=""), row.names = FALSE)
  isoLIB.S4 <- isoLIB(input=file.path(getwd(), "seqs.csv"), export_html=FALSE, group_cutoff=group_cutoff.list[ii])
  isoLIB.df <- S4_to_dataframe(isoLIB.S4) %>%
    mutate(cutoff= as.numeric(paste(group_cutoff.list[ii]))) %>%
    mutate(group_counts=length(unique(sequence_group))) %>%
    mutate(group_props=group_counts/n()) %>%
    mutate(result=paste("result_", "all", sep=""))

  collector.list2[[paste("result_all", "_cutoff_", ii, sep="")]] <- isoLIB.df

}
final.list.pooled <- dplyr::bind_rows(c(collector.list2))
saveRDS(final.list.pooled, "all_list_cpn60.RDS")


final.list.individuals <- readRDS("individual_list_cpn60.RDS")
isoLIB.S4.mongo <- readRDS("all_list_cpn60.RDS")
res.list.df.mongo.all <- bind_rows(isoLIB.S4.mongo) %>%
  group_by(cutoff) %>% mutate(group_counts= length(unique(sequence_group))) %>% #ungroup() #%>% filter(cutoff==0.956)
  mutate(group_props=group_counts/n()) %>% ungroup() %>% mutate(result=paste("result_", "all", sep="")) #%>% distinct(cutoff, .keep_all=TRUE)

final.list.df.cpn60 <- rbind(bind_rows(final.list.individuals),
                             res.list.df.mongo.all) %>% dplyr::rename("IMGG_ID" = "filename") %>%
  dplyr::left_join(., mongo.meta, by="IMGG_ID") %>% arrange(desc(representative)) %>% mutate(lineage=Lineage)


#----------Count number of genomes captured per cutoff
final.list.df.cpn60.counts <- final.list.df.cpn60 %>%
  mutate(main_groups = ifelse(result=="result_all", "Full dataset", "Per individual")) %>%
  group_by(cutoff, sequence_group, result) %>% mutate(species_rep = first(lineage)) %>%
  ungroup() %>%
  group_by(cutoff, result) %>% mutate(species_counts=length(unique(species_rep))) %>% mutate(species_props=species_counts/length(unique(lineage))) %>%
  ungroup() %>% group_by(result) %>% distinct(cutoff, .keep_all=TRUE) %>% ungroup()

saveRDS(final.list.df.cpn60.counts, "isoLIB_results_cpn60_UT.RDS")


3.4 Generate Figure 5B

final.list.df.16SFULL.counts <- readRDS(url("https://github.com/bdaisley/isolateR/raw/main/benchmark/isoLIB_results_16S_full_length.RDS"))
final.list.df.16SV3V6.counts <- readRDS(url("https://github.com/bdaisley/isolateR/raw/main/benchmark/isoLIB_results_16S_V3V6.RDS"))
final.list.df.cpn60.counts <- readRDS(url("https://github.com/bdaisley/isolateR/raw/main/benchmark/isoLIB_results_cpn60_UT.RDS"))


#----------Graph results ---------------------------- "group_props" and "species_props"
g1 <- ggplot(final.list.df.16SFULL.counts,
             aes(x=cutoff,
                 y=group_props*100,
                 groups=result,
                 fill=main_groups,
                 colour=main_groups)) +
  geom_line(linewidth=0.6,
            alpha=0.6) +
  scale_y_continuous(limits=c(0,100)) +
  scale_x_continuous(limits=c(0.85,1)) +
  scale_fill_manual(values=c("Full dataset"="blue",
                             "Per individual" = "grey30")) +
  scale_color_manual(values=c("Full dataset"="blue",
                              "Per individual" = "grey30")) +
  xlab("isoLIB 'group_cutoff' setting") +
  ylab("% of unique genomes captured") +
  theme(panel.border = element_rect(color ="black",
                                    fill = NA,
                                    linewidth = 0.5,
                                    linetype = 1),
        axis.text.x=element_text(angle=0,
                                 hjust=0.5,
                                 size=8),
        axis.text.y=element_text(size=8),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8),
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        legend.box.background = element_rect(colour = "black",
                                             linewidth=0.5),
        legend.title=element_blank(),
        legend.position = "none",
        panel.grid.minor = element_blank())

#---------------------
g2 <- ggplot(final.list.df.16SFULL.counts,
             aes(x=cutoff,
                 y=species_props*100,
                 groups=result,
                 fill=main_groups,
                 colour=main_groups)) +
  geom_line(linewidth=0.6,
            alpha=0.6) +
  scale_y_continuous(limits=c(0,100)) +
  scale_x_continuous(limits=c(0.85,1)) +
  scale_fill_manual(values=c("Full dataset"="blue",
                             "Per individual" = "grey30")) +
  scale_color_manual(values=c("Full dataset"="blue",
                              "Per individual" = "grey30")) +
  xlab("isoLIB 'group_cutoff' setting") +
  ylab("% species captured") +
  theme(panel.border = element_rect(color ="black",
                                    fill = NA,
                                    linewidth = 0.5,
                                    linetype = 1),
        axis.text.x=element_text(angle=0,
                                 hjust=0.5,
                                 size=8),
        axis.text.y=element_text(size=8),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8),
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        legend.box.background = element_rect(colour = "black",
                                             linewidth=0.5),
        legend.title=element_blank(),
        legend.position = "none",
        panel.grid.minor = element_blank())


g3 <- ggplot(final.list.df.16SV3V6.counts,
             aes(x=cutoff, y=group_props*100,
                 groups=result,
                 fill=main_groups,
                 colour=main_groups)) +
  geom_line(linewidth=0.6, alpha=0.6) +
  scale_y_continuous(limits=c(0,100)) +
  scale_x_continuous(limits=c(0.85,1)) +
  scale_fill_manual(values=c("Full dataset"="blue",
                             "Per individual" = "grey30")) +
  scale_color_manual(values=c("Full dataset"="blue",
                              "Per individual" = "grey30")) +
  xlab("isoLIB 'group_cutoff' setting") +
  ylab("% of unique genomes captured") +
  theme(panel.border = element_rect(color ="black",
                                    fill = NA,
                                    linewidth = 0.5,
                                    linetype = 1),
        axis.text.x=element_text(angle=0,
                                 hjust=0.5,
                                 size=8),
        axis.text.y=element_text(size=8),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8),
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        legend.box.background = element_rect(colour = "black",
                                             linewidth=0.5),
        legend.title=element_blank(),
        legend.position = "none",
        panel.grid.minor = element_blank())

#---------------------
g4 <- ggplot(final.list.df.16SV3V6.counts,
             aes(x=cutoff,
                 y=species_props*100,
                 groups=result,
                 fill=main_groups,
                 colour=main_groups)) +
  geom_line(linewidth=0.6,
            alpha=0.6) +
  scale_y_continuous(limits=c(0,100)) +
  scale_x_continuous(limits=c(0.85,1)) +
  scale_fill_manual(values=c("Full dataset"="blue",
                             "Per individual" = "grey30")) +
  scale_color_manual(values=c("Full dataset"="blue",
                              "Per individual" = "grey30")) +
  xlab("isoLIB 'group_cutoff' setting") +
  ylab("% species captured") +
  theme(panel.border = element_rect(color ="black",
                                    fill = NA,
                                    linewidth = 0.5,
                                    linetype = 1),
        axis.text.x=element_text(angle=0,
                                 hjust=0.5,
                                 size=8),
        axis.text.y=element_text(size=8),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8),
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        legend.box.background = element_rect(colour = "black",
                                             linewidth=0.5),
        legend.title=element_blank(),
        legend.position = "none",
        panel.grid.minor = element_blank())

g5 <- ggplot(final.list.df.cpn60.counts,
             aes(x=cutoff,
                 y=group_props*100,
                 groups=result,
                 fill=main_groups,
                 colour=main_groups)) +
  geom_line(linewidth=0.6,
            alpha=0.6) +
  scale_y_continuous(limits=c(0,100)) +
  scale_x_continuous(limits=c(0.85,1)) +
  scale_fill_manual(values=c("Full dataset"="blue",
                             "Per individual" = "grey30")) +
  scale_color_manual(values=c("Full dataset"="blue",
                              "Per individual" = "grey30")) +
  xlab("isoLIB 'group_cutoff' setting") +
  ylab("% of unique genomes captured") +
  theme(panel.border = element_rect(color ="black",
                                    fill = NA,
                                    linewidth = 0.5,
                                    linetype = 1),
        axis.text.x=element_text(angle=0,
                                 hjust=0.5,
                                 size=8),
        axis.text.y=element_text(size=8),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8),
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        legend.box.background = element_rect(colour = "black",
                                             linewidth=0.5),
        legend.title=element_blank(),
        legend.position = "none",
        panel.grid.minor = element_blank())

#---------------------
g6 <- ggplot(final.list.df.cpn60.counts,
             aes(x=cutoff,
                 y=species_props*100,
                 groups=result,
                 fill=main_groups,
                 colour=main_groups)) +
  geom_line(linewidth=0.6,
            alpha=0.6) +
  scale_y_continuous(limits=c(0,100)) +
  scale_x_continuous(limits=c(0.85,1)) +
  scale_fill_manual(values=c("Full dataset"="blue",
                             "Per individual" = "grey30")) +
  scale_color_manual(values=c("Full dataset"="blue",
                              "Per individual" = "grey30")) +
  xlab("isoLIB 'group_cutoff' setting") +
  ylab("% species captured") +
  theme(panel.border = element_rect(color ="black",
                                    fill = NA,
                                    linewidth = 0.5,
                                    linetype = 1),
        axis.text.x=element_text(angle=0,
                                 hjust=0.5,
                                 size=8),
        axis.text.y=element_text(size=8),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8),
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        legend.box.background = element_rect(colour = "black",
                                             linewidth=0.5),
        legend.title=element_blank(),
        legend.position = "none",
        panel.grid.minor = element_blank())


dd
cowplot::plot_grid(g1,g3,g5,
                   g2,g4,g6,
                   ncol=3)

4. Meta-analysis

4.1 Inner Mongolian dataset

Download files from supplementary of Jin et al. 2023, Nature Microbiology (https://doi.org/10.1038/s41564-022-01270-1)


#dir.create("imgg")
download.file("https://static-content.springer.com/esm/art%3A10.1038%2Fs41564-022-01270-1/MediaObjects/41564_2022_1270_MOESM4_ESM.xlsx", "imgg/41564_2022_1270_MOESM4_ESM.xlsx", method= "libcurl", mode='wb', headers=NULL)

#Load Supplementary Table S7
mongo.meta <- readxl::read_xlsx("imgg/41564_2022_1270_MOESM4_ESM.xlsx", sheet="Table S7", skip=1) %>%
  mutate(subject_id = stringr::str_split_fixed(IMGG_ID, "\\.", 2)[,1]) %>%
  mutate(subject_id = gsub("^[A-Z]", "s", subject_id)) %>%
  group_by(subject_id) %>%
  mutate(per_subject_counts=n()) %>%
  ungroup()
## New names:
## • `Completeness` -> `Completeness...8`
## • `Contamination` -> `Contamination...9`
## • `N_contigs` -> `N_contigs...10`
## • `N50` -> `N50...11`
## • `GC_content` -> `GC_content...14`
## • `N_contigs` -> `N_contigs...28`
## • `N50` -> `N50...29`
## • `GC_content` -> `GC_content...30`
## • `Completeness` -> `Completeness...31`
## • `Contamination` -> `Contamination...32`
#Print head to inspect columns of data available
print(mongo.meta)
## # A tibble: 12,391 × 44
##    IMGG_ID     Cluster_ID Rep_MAG_ID  CMAG  HQ    MIMAG_HQ CheckM_lineage
##    <chr>       <chr>      <chr>       <chr> <chr> <chr>    <chr>
##  1 C59.CMAG_1  C140       C59.CMAG_1  N     Y     Y        p__Actinobacteria
##  2 A102.CMAG_1 C135       C46.CMAG_1  N     Y     Y        f__Bifidobacteriaceae
##  3 C58.CMAG_2  C135       C46.CMAG_1  N     Y     Y        f__Bifidobacteriaceae
##  4 C127.CMAG_1 C548       C38.CMAG_7  N     Y     Y        p__Firmicutes
##  5 B49.CMAG_6  C138       C130.CMAG_5 N     Y     Y        p__Actinobacteria
##  6 C55.CMAG_3  C138       C130.CMAG_5 N     Y     Y        p__Actinobacteria
##  7 B15.CMAG_4  C138       C130.CMAG_5 N     Y     Y        p__Actinobacteria
##  8 C97.CMAG_3  C138       C130.CMAG_5 N     Y     Y        p__Actinobacteria
##  9 C40.CMAG_4  C138       C130.CMAG_5 N     Y     Y        p__Actinobacteria
## 10 B34.CMAG_3  C138       C130.CMAG_5 N     Y     Y        p__Actinobacteria
## # … with 12,381 more rows, and 37 more variables: Completeness...8 <dbl>,
## #   Contamination...9 <dbl>, N_contigs...10 <dbl>, N50...11 <dbl>,
## #   Longest_contig <dbl>, Genome_size <dbl>, GC_content...14 <dbl>,
## #   Coding_density <dbl>, Predicted_genes <dbl>, N_16S_rRNA <dbl>,
## #   N_23S_rRNA <dbl>, N_5S_rRNA <dbl>, N_tRNA <dbl>, UHGG_ID <chr>, ANI <chr>,
## #   N_aligned_fragment <chr>, N_fragment <chr>, Study_set <chr>,
## #   Genome_type <chr>, Length <chr>, N_contigs...28 <chr>, N50...29 <chr>, …
#Calculate number of genomes per species
mongo.meta.counts <- mongo.meta %>%
  mutate(result=paste("result_", "all", sep="")) %>%
  group_by(Lineage) %>%
  mutate(strains_per_species=n()) %>%
  ungroup() %>%
  group_by(Lineage, subject_id) %>%
  mutate(strains_per_species_per_subject=n()) %>%
  ungroup() %>%
  group_by(Lineage) %>%
  mutate(mean_strains_per_species_per_subject=mean(strains_per_species_per_subject)) %>%
  ungroup()

mongo.meta.counts.g <- rbind(mongo.meta.counts %>%
                               distinct(Lineage,.keep_all=TRUE) %>%
                               mutate(variable="01_full", value=strains_per_species),
                             mongo.meta.counts %>%
                               distinct(Lineage,.keep_all=TRUE) %>%
                               mutate(variable="02_per_subject", value=mean_strains_per_species_per_subject))

#Print list of subject IDs
print(unique(mongo.meta.counts$subject_id))
##  [1] "s59"  "s102" "s58"  "s127" "s49"  "s55"  "s15"  "s97"  "s40"  "s34"
## [11] "s54"  "s50"  "s1"   "s124" "s41"  "s125" "s134" "s137" "s112" "s11"
## [21] "s104" "s142" "s36"  "s62"  "s38"  "s162" "s82"  "s171" "s91"  "s46"
## [31] "s77"  "s65"  "s103" "s10"  "s43"  "s72"  "s130" "s165" "s111" "s37"
## [41] "s163" "s83"  "s31"  "s19"  "s152" "s64"  "s95"  "s129" "s126" "s135"
## [51] "s6"   "s149" "s57"  "s74"  "s108" "s89"  "s151" "s73"  "s94"  "s93"

4.2 Hadza dataset

Download files from supplementary of Carter et al. 2023, Cell (https://doi.org/10.1016/j.cell.2023.05.046)

#dir.create("hadza")
#httr::GET("https://www.cell.com/cms/10.1016/j.cell.2023.05.046/attachment/b21a482b-edf0-4049-bd78-38a9bdc171da/mmc1.csv", user_agent("Mozilla/5.0"), #write_disk(file.path(getwd(), "hadza/mmc1.csv"), overwrite=TRUE))
#httr::GET("https://www.cell.com/cms/10.1016/j.cell.2023.05.046/attachment/2fbb050a-bd92-4d0f-8fa5-652c1e95da12/mmc2.zip", user_agent("Mozilla/5.0"), write_disk(file.path(getwd(), "hadza/mmc2.zip")))
#unzip("hadza/mmc2.zip", exdir="hadza")
#Convert .xlsb to .csv
#system2("soffice --convert-to csv hadza/SupplementalTable_S2.xlsb --outdir hadza")

#Load Supplementary Table S1
hadza.meta.subjects <- read.csv("hadza/mmc1.csv")
index.subject <- setNames(hadza.meta.subjects$Subject_ID, hadza.meta.subjects$Sample_Name)
index.camp <- setNames(hadza.meta.subjects$Bush_camp, hadza.meta.subjects$Sample_Name)

#Load Supplementary Table S2
hadza.meta <- readr::read_csv("hadza/SupplementalTable_S2.csv", show_col_types = FALSE)
## New names:
## • `` -> `...1`
#Combine subject identifiers and MAG metadata together > Calculate number of genomes per species
hadza.meta.counts <- hadza.meta %>%
  filter(grepl("Hadza", Population)) %>%
  mutate(Subject_ID = dplyr::recode(!!!index.subject, Sample_Name)) %>%
  mutate(Bush_camp = dplyr::recode(!!!index.camp, Sample_Name)) %>%
  group_by(Species_cluster) %>%
  mutate(strains_per_species=n()) %>%
  ungroup() %>%
  group_by(Species_cluster, Subject_ID) %>%
  mutate(strains_per_species_per_subject=n()) %>%
  ungroup() %>%
  group_by(Species_cluster) %>%
  mutate(mean_strains_per_species_per_subject = mean(strains_per_species_per_subject)) %>%
  ungroup() %>%
  group_by(Species_cluster, Bush_camp) %>%
  mutate(strains_per_species_per_camp=n()) %>% ungroup() %>%
  group_by(Species_cluster) %>%
  mutate(mean_strains_per_species_per_camp = mean(strains_per_species_per_camp)) %>%
  ungroup() %>%
  group_by(Subject_ID) %>%
  mutate(per_subject_counts=n_distinct(genome)) %>%
  mutate(per_subject_species_counts=n_distinct(Species_cluster)) %>%
  ungroup() %>%
  group_by(Bush_camp) %>%
  mutate(per_camp_counts=n_distinct(genome)) %>%
  mutate(per_camp_species_counts=n_distinct(Species_cluster)) %>%
  ungroup() %>%
  select(genome, Species_cluster ,Population, Subject_ID, Bush_camp, matches("per")) %>%
  arrange(Species_cluster)

#Organize data for graphing
hadza.meta.counts.g <- rbind(hadza.meta.counts %>%
                               distinct(Species_cluster,.keep_all=TRUE) %>%
                               mutate(variable="03_full", value=strains_per_species),
                             hadza.meta.counts %>%
                               distinct(Species_cluster,.keep_all=TRUE) %>%
                               mutate(variable="04_per_bush", value=mean_strains_per_species_per_camp),
                             hadza.meta.counts %>%
                               distinct(Species_cluster,.keep_all=TRUE) %>%
                               mutate(variable="05_per_subject", value=mean_strains_per_species_per_subject))

print(hadza.meta.counts.g)
## # A tibble: 7,311 × 16
##    genome       Species_cluster Population Subject_ID Bush_camp strains_per_spe…
##    <chr>        <chr>           <chr>      <chr>      <chr>                <int>
##  1 REFINED_MET… 1002_0          Hadza      INDIV_445  GIDERU                   2
##  2 REFINED_MET… 1003_0          Hadza      INDIV_157  HUKAMAKO                 1
##  3 REFINED_MET… 1016_0          Hadza (in… INDIV_130  KIPAMBA                  2
##  4 REFINED_MET… 1020_0          Hadza      INDIV_84   SENGELI                 34
##  5 REFINED_MET… 1021_0          Hadza (in… INDIV_349  SENGELI                  5
##  6 REFINED_MET… 1022_0          Hadza (in… INDIV_62   SENGELI                  7
##  7 REFINED_MET… 1023_0          Hadza      INDIV_162  HUKAMAKO                 1
##  8 REFINED_MET… 1028_1          Hadza      INDIV_88   SENGELI                 29
##  9 REFINED_MET… 1028_2          Hadza      INDIV_94   HUKAMAKO                12
## 10 REFINED_MET… 1029_1          Hadza      INDIV_275  HUKAMAKO                28
## # … with 7,301 more rows, and 10 more variables:
## #   strains_per_species_per_subject <int>,
## #   mean_strains_per_species_per_subject <dbl>,
## #   strains_per_species_per_camp <int>,
## #   mean_strains_per_species_per_camp <dbl>, per_subject_counts <int>,
## #   per_subject_species_counts <int>, per_camp_counts <int>,
## #   per_camp_species_counts <int>, variable <chr>, value <dbl>

4.3 Generate Figure 5C

combined.g <- rbind(hadza.meta.counts.g %>% select(variable,value) %>% mutate(dataset="Hadza"),
                    mongo.meta.counts.g %>% select(variable,value) %>% mutate(dataset="IMGG"))

g1e <- ggplot(combined.g, aes(x=variable, y=value)) +
  geom_point(position=position_jitter(width=0.4),
             size=0.3,
             colour="grey40",
             alpha=0.3) +
  stat_summary(geom='crossbar',
               fun="mean",
               width=0.5,
               color='blue',
               size=0.3) +
  stat_summary(geom='errorbar',
               width=0.3,
               size=0.5,
               color="blue",
               fun.data ="mean_cl_boot") +
  stat_summary(aes(label=round(..y..,2)),
               fun.y=mean,
               geom="text",
               vjust=-8,
               size=6,
               color="blue") +
  ylab("Number of unique genomes per species") +
  geom_vline(xintercept=c(2.5)) +
  scale_x_discrete(labels=c("Full dataset",
                            "Per subject",
                            'Full dataset',
                            'Per bush camp',
                            "Per subject"))+
  annotate(geom="text",
           hjust="left",
           label=paste("IMGG dataset",
                       "\nN = ", length(unique(mongo.meta.counts$Cluster_ID)), " species",
                       "\nN = ", length(unique(mongo.meta.counts$IMGG_ID)), " unique genomes",
                       "\nN = ", length(unique(mongo.meta.counts$subject_id)), " adults",
                       sep=""), x=0.7, y=250, size=3) +
  annotate(geom="text",
           hjust="left",
           label=paste("Hadza dataset",
                       "\nN = ", length(unique(hadza.meta.counts$Species_cluster)), " species",
                       "\nN = ", length(unique(hadza.meta.counts$genome)), " unique genomes",
                       "\nN = ", length(unique(hadza.meta.counts$Subject_ID)), " adults",
                        sep=""), x=3.0, y=250, size=3) +
  theme(panel.border = element_rect(color ="black",
                                    fill = NA,
                                    linewidth = 0.5,
                                    linetype = 1),
        axis.text.x=element_text(angle=0,
                                 hjust=0.5,
                                 size=10),
        axis.text.y=element_text(size=10),
        axis.title.x = element_blank(),
        axis.title.y = element_text(size=12),
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  theme(legend.position = "none")

g1e + geom_text(data=ggplot_build(g1e)$data[[3]],
                aes(x = group,
                    y = ymin,
                    label = round(ymax-ymin,2)),
                color = "blue",
                vjust = -7,
                size=4)

4.4 Optional: Compare UHGG


Unified Human Gastrointestinal Genome (UHGG) catalouge.

From Almeida et al. Nature Biotechnology; https://doi.org/10.1038/s41587-020-0603-3

Here we are using V2.0.2 from EBI ftp server: https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0.2

#dir.create("uhgg")
#download.file("https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0.2/genomes-all_metadata.tsv", "uhgg/genomes-all_metadata.tsv" , extra = options(timeout=1500))

UHGG.meta <- readr::read_tsv("uhgg/genomes-all_metadata.tsv", col_names = TRUE, show_col_types = FALSE)

#Count number of distinct genomes per species group
UHGG.meta.counts <-  UHGG.meta %>%
  group_by(Species_rep) %>%
  mutate(value= n_distinct(Genome)) %>%
  ungroup()

#Organize data for graphing
UHGG.meta.counts.g <- UHGG.meta.counts %>%
  distinct(Species_rep, .keep_all=TRUE) %>%
  mutate(variable="06_UGHH")

combined.g <- rbind(hadza.meta.counts.g %>%
                      select(variable,value) %>%
                      mutate(dataset="Hadza"),
                    mongo.meta.counts.g %>%
                      select(variable,value) %>%
                      mutate(dataset="IMGG"),
                    UHGG.meta.counts.g %>%
                      select(variable,value) %>%
                      mutate(dataset="UGHH"))


g1e.uhgg <- ggplot(combined.g,
              aes(x=variable, y=value)) +
  geom_point(position=position_jitter(width=0.4),
             size=0.3, colour="grey40",
             alpha=0.3) +
  stat_summary(geom='crossbar',
               fun="mean",
               width=0.5,
               color='blue',
               size=0.3) +
  stat_summary(geom='errorbar',
               width=0.3,
               size=0.5,
               color="blue",
               fun.data ="mean_cl_boot") +
  stat_summary(aes(label=round(..y..,2)),
               fun.y=mean,
               geom="text",
               vjust=-8,
               size=6,
               color="blue") +
  ylab("Number of unique genomes per species") +
  geom_vline(xintercept=c(2.5, 5.5)) +
  scale_x_discrete(labels=c('Full dataset',
                            "Per subject",
                            'Full dataset',
                            'Per bush camp',
                            'Per subject',
                            'Full dataset'))+
  annotate(geom="text",
           hjust="left",
           label=paste("IMGG dataset",
                       "\nN = ", length(unique(mongo.meta.counts$Cluster_ID)), " species",
                       "\nN = ", length(unique(mongo.meta.counts$IMGG_ID)), " unique genomes",
                       "\nN = ", length(unique(mongo.meta.counts$subject_id)), " adults",
                        sep=""), x=0.7, y=5000, size=3) +
  annotate(geom="text",
           hjust="left",
           label=paste("Hadza dataset",
                       "\nN = ", length(unique(hadza.meta.counts$Species_cluster)), " species",
                       "\nN = ", length(unique(hadza.meta.counts$genome)), " unique genomes",
                       "\nN = ", length(unique(hadza.meta.counts$Subject_ID)), " adults",
                       sep=""), x=3.0, y=5000, size=3) +
  annotate(geom="text",
           hjust="left",
           label=paste("UHGG dataset",
                      "\nN = ",
                      length(unique(UHGG.meta.counts$Species_rep)),
                      " species",
                       "\nN = ", length(unique(UHGG.meta.counts$Genome)), " unique genomes",
                       "\nN = ", length(unique(UHGG.meta.counts$Sample_accession)), " biosamples",
                       sep=""), x=5.6, y=5000, size=3) +
  theme(panel.border = element_rect(color ="black",
                                    fill = NA,
                                    linewidth = 0.5,
                                    linetype = 1),
        axis.text.x=element_text(angle=0,
                                 hjust=0.5,
                                 size=10),
        axis.text.y=element_text(size=10),
        axis.title.x = element_blank(),
        axis.title.y = element_text(size=12),
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  theme(legend.position = "none")


g1e.uhgg + geom_text(data=ggplot_build(g1e)$data[[3]], aes(x = group, y = ymin, label = round(ymax-ymin,2)), color = "blue", vjust = -7, size=4)