Related article: Daisley et al. (2024). isolateR: an R package for generating microbial libraries from Sanger sequencing data.
In draft
Related GitHub: https://github.com/bdaisley/isolateR
The R code provided below was used to generate the following figures:
See here for the in silico PCR script used to extract full length 16S rRNA, V3-V6 region 16S rRNA, and cpn60 UT marker genes from IMGG genomes:
library(knitr)
library(isolateR)
library(DECIPHER)
library(readr)
library(dplyr)
library(stringr)
library(ggplot2)
library(ggvenn)
library(reshape2)
library(stringr)
library(httr)
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_CA.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_CA.UTF-8
## [5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
## [7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] httr_1.4.3 reshape2_1.4.4 ggvenn_0.1.10
## [4] ggplot2_3.5.0 stringr_1.4.0 readr_2.1.2
## [7] DECIPHER_2.16.1 RSQLite_2.2.14 isolateR_0.0.0.9003
## [10] reactablefmtr_2.0.0 reactable_0.4.3 dplyr_1.0.9
## [13] Biostrings_2.64.0 GenomeInfoDb_1.24.2 XVector_0.36.0
## [16] IRanges_2.30.0 S4Vectors_0.34.0 BiocGenerics_0.42.0
## [19] knitr_1.39
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-3 ggtree_3.4.0 sangeranalyseR_1.6.1
## [4] seqinr_4.2-23 ellipsis_0.3.2 ggdendro_0.1.23
## [7] aplot_0.1.6 rstudioapi_0.13 DT_0.26
## [10] bit64_4.0.5 fansi_1.0.3 R.methodsS3_1.8.1
## [13] cachem_1.0.6 ade4_1.7-19 jsonlite_1.8.8
## [16] R.oo_1.24.0 shinydashboard_0.7.2 msa_1.30.1
## [19] shiny_1.7.1 LPSN_0.14.1 BiocManager_1.30.18
## [22] rentrez_1.2.3 sangerseqR_1.32.0 compiler_4.2.0
## [25] Matrix_1.4-1 assertthat_0.2.1 fastmap_1.1.0
## [28] lazyeval_0.2.2 cli_3.6.2 later_1.3.0
## [31] htmltools_0.5.2 tools_4.2.0 gtable_0.3.0
## [34] glue_1.6.2 GenomeInfoDbData_1.2.8 Rcpp_1.0.8.3
## [37] jquerylib_0.1.4 excelR_0.4.0 vctrs_0.6.5
## [40] ape_5.6-2 nlme_3.1-157 crosstalk_1.2.0
## [43] xfun_0.39 openxlsx_4.2.5 mime_0.12
## [46] lifecycle_1.0.4 shinycssloaders_1.0.0 xmlconvert_0.1.2
## [49] XML_3.99-0.16.1 zlibbioc_1.42.0 MASS_7.3-57
## [52] scales_1.3.0 hms_1.1.1 promises_1.2.0.1
## [55] yaml_2.3.5 curl_4.3.2 memoise_2.0.1
## [58] dataui_0.0.1 gridExtra_2.3 ggfun_0.0.6
## [61] pander_0.6.5 yulab.utils_0.0.4 sass_0.4.1
## [64] stringi_1.7.6 tidytree_0.3.9 zip_2.2.0
## [67] rlang_1.1.3 pkgconfig_2.0.3 bitops_1.0-7
## [70] evaluate_0.23 lattice_0.20-45 purrr_1.0.2
## [73] treeio_1.12.0 patchwork_1.1.1 htmlwidgets_1.5.4
## [76] bit_4.0.4 tidyselect_1.2.0 plyr_1.8.7
## [79] logger_0.2.2 magrittr_2.0.3 R6_2.5.1
## [82] generics_0.1.2 DBI_1.1.2 withr_2.5.0
## [85] pillar_1.7.0 svMisc_1.2.3 RCurl_1.98-1.6
## [88] tibble_3.1.7 crayon_1.5.1 shinyWidgets_0.7.6
## [91] utf8_1.2.2 plotly_4.10.0 tzdb_0.3.0
## [94] rmarkdown_2.14 data.table_1.14.2 blob_1.2.3
## [97] digest_0.6.34 xtable_1.8-4 tidyr_1.2.0
## [100] httpuv_1.6.5 gridGraphics_0.5-1 R.utils_2.11.0
## [103] munsell_0.5.0 viridisLite_0.4.0 ggplotify_0.1.0
## [106] bslib_0.3.1 shinyjs_2.1.0
This section assumes the marker genes have already been extracted from genomes via in silico PCR as described here:
#Load IMGG Study Supplementary Table S7
dir.create("imgg")
download.file("https://static-content.springer.com/esm/art%3A10.1038%2Fs41564-022-01270-1/MediaObjects/41564_2022_1270_MOESM4_ESM.xlsx", "imgg/41564_2022_1270_MOESM4_ESM.xlsx", method= "libcurl", mode='wb', headers=NULL)
mongo.meta <- readxl::read_xlsx("imgg/41564_2022_1270_MOESM4_ESM.xlsx", sheet="Table S7", skip=1) %>%
mutate(subject_id = stringr::str_split_fixed(IMGG_ID, "\\.", 2)[,1]) %>%
mutate(subject_id = gsub("^[A-Z]", "s", subject_id)) %>%
group_by(subject_id) %>%
mutate(per_subject_counts=n()) %>%
ungroup()
## New names:
## • `Completeness` -> `Completeness...8`
## • `Contamination` -> `Contamination...9`
## • `N_contigs` -> `N_contigs...10`
## • `N50` -> `N50...11`
## • `GC_content` -> `GC_content...14`
## • `N_contigs` -> `N_contigs...28`
## • `N50` -> `N50...29`
## • `GC_content` -> `GC_content...30`
## • `Completeness` -> `Completeness...31`
## • `Contamination` -> `Contamination...32`
full.pcr.dir <- "/mnt/md0/isolateR_meta_analysis/Rmarkdown/02_IMGG_marker_genes_pcr"
pcr.summary.raw <- readr::read_tsv(paste(full.pcr.dir, "/summary_info.txt", sep=""), show_col_types = FALSE)
print(colSums(pcr.summary.raw[,2:4] > 0))
## primers_16S_FULL primers_16S_V3V6 primers_cpn60_degen
## 6705 6726 6540
#Print venn diagram of overlap of marker genes across the genomes
ggvenn::ggvenn(list(FULL_16S = (pcr.summary.raw %>% filter(primers_16S_FULL > 0))$name,
V3V6_16S = (pcr.summary.raw %>% filter(primers_16S_V3V6 > 0 ))$name,
cpn60_UT = (pcr.summary.raw %>% filter(primers_cpn60_degen > 0 ))$name),
fill_color = c("#868686FF", "#0073C2FF", "#EFC000FF"),
stroke_size = 0.5,
set_name_size = 5)
#Plot copy number of marker genes per genome
pcr.summary.mer <- pcr.summary.raw %>% dplyr::rename("IMGG_ID" = "name") %>%
dplyr::left_join(., mongo.meta, by="IMGG_ID") %>%
mutate(Domain=stringr::str_split_fixed(Lineage, ";", 8)[,1]) %>%
mutate_at(vars(Domain), funs(gsub("d__", "", .))) %>%
filter(Domain=="Bacteria") %>%
mutate(Phylum=stringr::str_split_fixed(Lineage, ";", 8)[,2]) %>%
mutate_at(vars(Phylum), funs(gsub("p__", "", .))) %>%
filter(Phylum!="") %>%
group_by(Phylum) %>%
mutate(phylum_counts=n()) %>%
ungroup() %>%
reshape2::melt(., id.vars=c("IMGG_ID", "Phylum", "phylum_counts")) %>%
filter(grepl("primers_16S_FULL|primers_16S_V3V6|primers_cpn60_degen", variable)) %>% droplevels() %>%
mutate_at(vars(value, phylum_counts), as.numeric)
ggplot(subset(pcr.summary.mer, phylum_counts > 3),
aes(x = value, y=Phylum, fill=Phylum)) +
ggridges::geom_density_ridges(scale = 0.9,
linewidth = 0.3) +
scale_y_discrete(limits=rev) +
facet_grid(~variable) +
xlab("Copy number per genome") +
theme_bw() + theme(legend.position = "none")
## Picking joint bandwidth of 0.404
## Picking joint bandwidth of 0.35
## Picking joint bandwidth of 0.092
#Filter only the genomes that have all marker genes present
pcr.summary.filt <- readr::read_tsv(paste(full.pcr.dir, "/summary_info.txt", sep=""), show_col_types = FALSE) %>%
filter(primers_16S_FULL >0 & primers_16S_V3V6 >0 & primers_cpn60_degen > 0)
print(colSums(pcr.summary.filt[,2:4] > 0))
## primers_16S_FULL primers_16S_V3V6 primers_cpn60_degen
## 6533 6533 6533
Run isoLIB at different cutoffs for each of the marker genes, per-subject for each of the N=60 individuals and then the pooled dataset together.
These steps will take a very long time to run on a standard laptop/desktop computer. Recommended to run on a HPC with 48+ cores/physical processing units. Alternatively, skip to Step 3.4 and download the RDS files to continue.
group_cutoff.list <- c(1,0.999,0.998,0.997,0.996,0.995,0.994,0.993,0.992,0.991,0.990,
0.989,0.988,0.987,0.986,0.985,0.984,0.983,0.982,0.981,0.980,
0.979,0.978,0.977,0.976,0.975,0.974,0.973,0.972,0.971,0.970,
0.969,0.968,0.967,0.966,0.965,0.964,0.963,0.962,0.961,0.960,
0.959,0.958,0.957,0.956,0.955,0.954,0.953,0.952,0.951,0.950,
0.940,0.930,0.920,0.910,0.900,0.890,0.880,0.870,0.860,0.850)
#Make index list for pulling marker gene sequences
pcr.file.names <- paste("_", colnames(pcr.summary.filt[,2:4]), "_seqs.fasta", sep="")
#Load in sequences
seq.list.u <- parallel::mclapply(1:length(pcr.summary.filt$name), mc.cores=16, function(i) {
seq.list1 <- Biostrings::readBStringSet(paste(full.pcr.dir, "/seqs/", pcr.summary.filt$name[i], "_primers_16S_FULL_seqs.fasta", sep=""))
if(length(seq.list1) > 1){
seq.list2 <- DECIPHER::AlignSeqs(DNAStringSet(seq.list1))
seq.list3 <- BStringSet(DECIPHER::ConsensusSequence(seq.list2, ignoreNonBases=TRUE))
names(seq.list3) <- names(seq.list1)[1]
seq.list3
} else {
seq.list3 <- seq.list1
}
})
seq.list <- do.call(c, seq.list.u)
#Make holders to allow input for isoLIB transformation of data
date.formatted <- gsub("-", "_", Sys.Date())
seq.list.df <- cbind(input=rep("path",length(names(seq.list))),
warning=rep("xNA", length(names(seq.list))),
date=rep(date.formatted, length(names(seq.list))),
filename=names(seq.list),
seqs_raw=rep("xNA", length(names(seq.list))),
phred_raw=rep("xNA", length(names(seq.list))),
Ns_raw=rep("xNA", length(names(seq.list))),
length_raw=rep("xNA", length(names(seq.list))),
phred_spark_raw=rep("xNA", length(names(seq.list))),
seqs_trim=paste(seq.list),
phred_trim=rep(0, length(names(seq.list))),
Ns_trim=rep(0, length(names(seq.list))),
length_trim=rep(0, length(names(seq.list))),
closest_match=rep("xNA", length(names(seq.list))),
NCBI_acc=rep("xNA", length(names(seq.list))),
ID=rep(0, length(names(seq.list))),
rank_phylum=rep("xNA", length(names(seq.list))),
rank_class=rep("xNA", length(names(seq.list))),
rank_order=rep("xNA", length(names(seq.list))),
rank_family=rep("xNA", length(names(seq.list))),
rank_genus=rep("xNA", length(names(seq.list))),
rank_species=rep("xNA", length(names(seq.list))),
phylum_threshold=rep(0, length(names(seq.list))),
class_threshold=rep(0, length(names(seq.list))),
order_threshold=rep(0, length(names(seq.list))),
family_threshold=rep(0, length(names(seq.list))),
genus_threshold=rep(0, length(names(seq.list))),
species_threshold=rep(0, length(names(seq.list)))) %>% as.data.frame() %>%
mutate(filename=gsub("_16S_copy[0-9]+", "", filename)) %>%
mutate(length_trim= nchar(seqs_trim))
collector.list1 <- list()
for(i in 1:length(unique(mongo.meta$subject_id))){
sample.list <- (mongo.meta %>% arrange(desc(per_subject_counts)) %>% filter(subject_id %in% unique(subject_id)[i]))$IMGG_ID
seq.list.df.filt <- seq.list.df %>% filter(filename %in% sample.list)
seq.list.df.filt <- seq.list.df.filt[1:10,]
write.csv(seq.list.df.filt, paste("seqs.csv", sep=""), row.names = FALSE)
#----------------------------------------------------------------------------------isoLIB function
for(ii in 1:length(group_cutoff.list)){
isoLIB.S4 <- isoLIB(input=file.path(getwd(), "seqs.csv"), export_html=FALSE, group_cutoff=group_cutoff.list[ii])
isoLIB.df <- S4_to_dataframe(isoLIB.S4) %>%
mutate(cutoff= as.numeric(paste(group_cutoff.list[ii]))) %>%
mutate(group_counts=length(unique(sequence_group))) %>%
mutate(group_props=group_counts/n()) %>%
mutate(result=paste("result_", i, sep=""))
collector.list1[[paste("result_", i, "_cutoff_", ii, sep="")]] <- isoLIB.df
}
}
final.list.individuals <- dplyr::bind_rows(c(collector.list1))
saveRDS(final.list.individuals, "individual_list_16S_FULL")
#---For all
collector.list2 <- list()
for(ii in 1:length(group_cutoff.list)){
seq.list.df.filt <- seq.list.df[1:10,]
write.csv(seq.list.df.filt, paste("seqs.csv", sep=""), row.names = FALSE)
isoLIB.S4 <- isoLIB(input=file.path(getwd(), "seqs.csv"), export_html=FALSE, group_cutoff=group_cutoff.list[ii])
isoLIB.df <- S4_to_dataframe(isoLIB.S4) %>%
mutate(cutoff= as.numeric(paste(group_cutoff.list[ii]))) %>%
mutate(group_counts=length(unique(sequence_group))) %>%
mutate(group_props=group_counts/n()) %>%
mutate(result=paste("result_", "all", sep=""))
collector.list2[[paste("result_all", "_cutoff_", ii, sep="")]] <- isoLIB.df
}
final.list.pooled <- dplyr::bind_rows(c(collector.list2))
saveRDS(final.list.pooled, "all_list_16S_FULL.RDS")
final.list.individuals <- readRDS("individual_list_16S_FULL.RDS")
isoLIB.S4.mongo <- readRDS("all_list_16S_FULL.RDS")
res.list.df.mongo.all <- bind_rows(isoLIB.S4.mongo) %>%
group_by(cutoff) %>% mutate(group_counts= length(unique(sequence_group))) %>%
mutate(group_props=group_counts/n()) %>% ungroup() %>% mutate(result=paste("result_", "all", sep=""))
final.list.df.16SFULL <- rbind(bind_rows(final.list.individuals),
res.list.df.mongo.all) %>% dplyr::rename("IMGG_ID" = "filename") %>%
dplyr::left_join(., mongo.meta, by="IMGG_ID") %>% arrange(desc(representative)) %>% mutate(lineage=Lineage)
#----------Count number of genomes captured per cutoff
final.list.df.16SFULL.counts <- final.list.df.16SFULL %>%
mutate(main_groups = ifelse(result=="result_all", "Full dataset", "Per individual")) %>%
group_by(cutoff, sequence_group, result) %>% mutate(species_rep = first(lineage)) %>%
ungroup() %>%
group_by(cutoff, result) %>% mutate(species_counts=length(unique(species_rep))) %>% mutate(species_props=species_counts/length(unique(lineage))) %>%
ungroup() %>% group_by(result) %>% distinct(cutoff, .keep_all=TRUE) %>% ungroup()
saveRDS(final.list.df.16SFULL.counts, "isoLIB_results_16S_full_length.RDS")
#Load in sequences
seq.list.u <- parallel::mclapply(1:length(pcr.summary.filt$name), mc.cores=16, function(i) {
seq.list1 <- Biostrings::readBStringSet(paste(full.pcr.dir, "/seqs/", pcr.summary.filt$name[i], "_primers_16S_V3V6_seqs.fasta", sep=""))
if(length(seq.list1) > 1){
seq.list2 <- DECIPHER::AlignSeqs(DNAStringSet(seq.list1))
seq.list3 <- BStringSet(DECIPHER::ConsensusSequence(seq.list2, ignoreNonBases=TRUE))
names(seq.list3) <- names(seq.list1)[1]
seq.list3
} else {
seq.list3 <- seq.list1
}
})
seq.list <- do.call(c, seq.list.u)
#Make holders to allow input for isoLIB transformation of data
date.formatted <- gsub("-", "_", Sys.Date())
seq.list.df <- cbind(input=rep("path",length(names(seq.list))),
warning=rep("xNA", length(names(seq.list))),
date=rep(date.formatted, length(names(seq.list))),
filename=names(seq.list),
seqs_raw=rep("xNA", length(names(seq.list))),
phred_raw=rep("xNA", length(names(seq.list))),
Ns_raw=rep("xNA", length(names(seq.list))),
length_raw=rep("xNA", length(names(seq.list))),
phred_spark_raw=rep("xNA", length(names(seq.list))),
seqs_trim=paste(seq.list),
phred_trim=rep(0, length(names(seq.list))),
Ns_trim=rep(0, length(names(seq.list))),
length_trim=rep(0, length(names(seq.list))),
closest_match=rep("xNA", length(names(seq.list))),
NCBI_acc=rep("xNA", length(names(seq.list))),
ID=rep(0, length(names(seq.list))),
rank_phylum=rep("xNA", length(names(seq.list))),
rank_class=rep("xNA", length(names(seq.list))),
rank_order=rep("xNA", length(names(seq.list))),
rank_family=rep("xNA", length(names(seq.list))),
rank_genus=rep("xNA", length(names(seq.list))),
rank_species=rep("xNA", length(names(seq.list))),
phylum_threshold=rep(0, length(names(seq.list))),
class_threshold=rep(0, length(names(seq.list))),
order_threshold=rep(0, length(names(seq.list))),
family_threshold=rep(0, length(names(seq.list))),
genus_threshold=rep(0, length(names(seq.list))),
species_threshold=rep(0, length(names(seq.list)))) %>% as.data.frame() %>%
mutate(filename=gsub("_16S_copy[0-9]+", "", filename)) %>%
mutate(length_trim= nchar(seqs_trim))
collector.list1 <- list()
for(i in 1:length(unique(mongo.meta$subject_id))){
sample.list <- (mongo.meta %>% arrange(desc(per_subject_counts)) %>% filter(subject_id %in% unique(subject_id)[i]))$IMGG_ID
seq.list.df.filt <- seq.list.df %>% filter(filename %in% sample.list)
seq.list.df.filt <- seq.list.df.filt[1:10,]
write.csv(seq.list.df.filt, paste("seqs.csv", sep=""), row.names = FALSE)
#----------------------------------------------------------------------------------isoLIB function
for(ii in 1:length(group_cutoff.list)){
isoLIB.S4 <- isoLIB(input=file.path(getwd(), "seqs.csv"), export_html=FALSE, group_cutoff=group_cutoff.list[ii])
isoLIB.df <- S4_to_dataframe(isoLIB.S4) %>%
mutate(cutoff= as.numeric(paste(group_cutoff.list[ii]))) %>%
mutate(group_counts=length(unique(sequence_group))) %>%
mutate(group_props=group_counts/n()) %>%
mutate(result=paste("result_", i, sep=""))
collector.list1[[paste("result_", i, "_cutoff_", ii, sep="")]] <- isoLIB.df
}
}
final.list.individuals <- dplyr::bind_rows(c(collector.list1))
saveRDS(final.list.individuals, "individual_list_16S_V3V6.RDS")
#---For all
collector.list2 <- list()
for(ii in 1:length(group_cutoff.list)){
seq.list.df.filt <- seq.list.df[1:10,]
write.csv(seq.list.df.filt, paste("seqs.csv", sep=""), row.names = FALSE)
isoLIB.S4 <- isoLIB(input=file.path(getwd(), "seqs.csv"), export_html=FALSE, group_cutoff=group_cutoff.list[ii])
isoLIB.df <- S4_to_dataframe(isoLIB.S4) %>%
mutate(cutoff= as.numeric(paste(group_cutoff.list[ii]))) %>%
mutate(group_counts=length(unique(sequence_group))) %>%
mutate(group_props=group_counts/n()) %>%
mutate(result=paste("result_", "all", sep=""))
collector.list2[[paste("result_all", "_cutoff_", ii, sep="")]] <- isoLIB.df
}
final.list.pooled <- dplyr::bind_rows(c(collector.list2))
saveRDS(final.list.pooled, "all_list_16S_V3V6.RDS")
final.list.individuals <- readRDS("individual_list_16S_V3V6.RDS")
isoLIB.S4.mongo <- readRDS("all_list_16S_V3V6.RDS")
res.list.df.mongo.all <- bind_rows(isoLIB.S4.mongo) %>%
group_by(cutoff) %>% mutate(group_counts= length(unique(sequence_group))) %>% #ungroup() #%>% filter(cutoff==0.956)
mutate(group_props=group_counts/n()) %>% ungroup() %>% mutate(result=paste("result_", "all", sep="")) #%>% distinct(cutoff, .keep_all=TRUE)
final.list.df.16SV3V6 <- rbind(bind_rows(final.list.individuals),
res.list.df.mongo.all) %>% dplyr::rename("IMGG_ID" = "filename") %>%
dplyr::left_join(., mongo.meta, by="IMGG_ID") %>% arrange(desc(representative)) %>% mutate(lineage=Lineage)
#----------Count number of genomes captured per cutoff
final.list.df.16SV3V6.counts <- final.list.df.16SV3V6 %>%
mutate(main_groups = ifelse(result=="result_all", "Full dataset", "Per individual")) %>%
group_by(cutoff, sequence_group, result) %>% mutate(species_rep = first(lineage)) %>%
ungroup() %>%
group_by(cutoff, result) %>% mutate(species_counts=length(unique(species_rep))) %>% mutate(species_props=species_counts/length(unique(lineage))) %>%
ungroup() %>% group_by(result) %>% distinct(cutoff, .keep_all=TRUE) %>% ungroup()
saveRDS(final.list.df.16SV3V6.counts, "isoLIB_results_16S_V3V6.RDS")
#Load in sequences
seq.list.u <- parallel::mclapply(1:length(pcr.summary.filt$name), mc.cores=16, function(i) {
seq.list1 <- Biostrings::readBStringSet(paste(full.pcr.dir, "/seqs/", pcr.summary.filt$name[i], "_primers_cpn60_degen_seqs.fasta", sep=""))
if(length(seq.list1) > 1){
seq.list2 <- DECIPHER::AlignSeqs(DNAStringSet(seq.list1))
seq.list3 <- BStringSet(DECIPHER::ConsensusSequence(seq.list2, ignoreNonBases=TRUE))
names(seq.list3) <- names(seq.list1)[1]
seq.list3
} else {
seq.list3 <- seq.list1
}
})
seq.list <- do.call(c, seq.list.u)
#Make holders to allow input for isoLIB transformation of data
date.formatted <- gsub("-", "_", Sys.Date())
seq.list.df <- cbind(input=rep("path",length(names(seq.list))),
warning=rep("xNA", length(names(seq.list))),
date=rep(date.formatted, length(names(seq.list))),
filename=names(seq.list),
seqs_raw=rep("xNA", length(names(seq.list))),
phred_raw=rep("xNA", length(names(seq.list))),
Ns_raw=rep("xNA", length(names(seq.list))),
length_raw=rep("xNA", length(names(seq.list))),
phred_spark_raw=rep("xNA", length(names(seq.list))),
seqs_trim=paste(seq.list),
phred_trim=rep(0, length(names(seq.list))),
Ns_trim=rep(0, length(names(seq.list))),
length_trim=rep(0, length(names(seq.list))),
closest_match=rep("xNA", length(names(seq.list))),
NCBI_acc=rep("xNA", length(names(seq.list))),
ID=rep(0, length(names(seq.list))),
rank_phylum=rep("xNA", length(names(seq.list))),
rank_class=rep("xNA", length(names(seq.list))),
rank_order=rep("xNA", length(names(seq.list))),
rank_family=rep("xNA", length(names(seq.list))),
rank_genus=rep("xNA", length(names(seq.list))),
rank_species=rep("xNA", length(names(seq.list))),
phylum_threshold=rep(0, length(names(seq.list))),
class_threshold=rep(0, length(names(seq.list))),
order_threshold=rep(0, length(names(seq.list))),
family_threshold=rep(0, length(names(seq.list))),
genus_threshold=rep(0, length(names(seq.list))),
species_threshold=rep(0, length(names(seq.list)))) %>% as.data.frame() %>%
mutate(filename=gsub("_cpn60_copy[0-9]+", "", filename)) %>%
mutate(length_trim= nchar(seqs_trim))
collector.list1 <- list()
for(i in 1:length(unique(mongo.meta$subject_id))){
sample.list <- (mongo.meta %>% arrange(desc(per_subject_counts)) %>% filter(subject_id %in% unique(subject_id)[i]))$IMGG_ID
seq.list.df.filt <- seq.list.df %>% filter(filename %in% sample.list)
seq.list.df.filt <- seq.list.df.filt[1:10,]
write.csv(seq.list.df.filt, paste("seqs.csv", sep=""), row.names = FALSE)
#----------------------------------------------------------------------------------isoLIB function
for(ii in 1:length(group_cutoff.list)){
isoLIB.S4 <- isoLIB(input=file.path(getwd(), "seqs.csv"), export_html=FALSE, group_cutoff=group_cutoff.list[ii])
isoLIB.df <- S4_to_dataframe(isoLIB.S4) %>%
mutate(cutoff= as.numeric(paste(group_cutoff.list[ii]))) %>%
mutate(group_counts=length(unique(sequence_group))) %>%
mutate(group_props=group_counts/n()) %>%
mutate(result=paste("result_", i, sep=""))
collector.list1[[paste("result_", i, "_cutoff_", ii, sep="")]] <- isoLIB.df
}
}
final.list.individuals <- dplyr::bind_rows(c(collector.list1))
saveRDS(final.list.individuals, "individual_list_cpn60.RDS")
#---For all
collector.list2 <- list()
for(ii in 1:length(group_cutoff.list)){
seq.list.df.filt <- seq.list.df[1:10,]
write.csv(seq.list.df.filt, paste("seqs.csv", sep=""), row.names = FALSE)
isoLIB.S4 <- isoLIB(input=file.path(getwd(), "seqs.csv"), export_html=FALSE, group_cutoff=group_cutoff.list[ii])
isoLIB.df <- S4_to_dataframe(isoLIB.S4) %>%
mutate(cutoff= as.numeric(paste(group_cutoff.list[ii]))) %>%
mutate(group_counts=length(unique(sequence_group))) %>%
mutate(group_props=group_counts/n()) %>%
mutate(result=paste("result_", "all", sep=""))
collector.list2[[paste("result_all", "_cutoff_", ii, sep="")]] <- isoLIB.df
}
final.list.pooled <- dplyr::bind_rows(c(collector.list2))
saveRDS(final.list.pooled, "all_list_cpn60.RDS")
final.list.individuals <- readRDS("individual_list_cpn60.RDS")
isoLIB.S4.mongo <- readRDS("all_list_cpn60.RDS")
res.list.df.mongo.all <- bind_rows(isoLIB.S4.mongo) %>%
group_by(cutoff) %>% mutate(group_counts= length(unique(sequence_group))) %>% #ungroup() #%>% filter(cutoff==0.956)
mutate(group_props=group_counts/n()) %>% ungroup() %>% mutate(result=paste("result_", "all", sep="")) #%>% distinct(cutoff, .keep_all=TRUE)
final.list.df.cpn60 <- rbind(bind_rows(final.list.individuals),
res.list.df.mongo.all) %>% dplyr::rename("IMGG_ID" = "filename") %>%
dplyr::left_join(., mongo.meta, by="IMGG_ID") %>% arrange(desc(representative)) %>% mutate(lineage=Lineage)
#----------Count number of genomes captured per cutoff
final.list.df.cpn60.counts <- final.list.df.cpn60 %>%
mutate(main_groups = ifelse(result=="result_all", "Full dataset", "Per individual")) %>%
group_by(cutoff, sequence_group, result) %>% mutate(species_rep = first(lineage)) %>%
ungroup() %>%
group_by(cutoff, result) %>% mutate(species_counts=length(unique(species_rep))) %>% mutate(species_props=species_counts/length(unique(lineage))) %>%
ungroup() %>% group_by(result) %>% distinct(cutoff, .keep_all=TRUE) %>% ungroup()
saveRDS(final.list.df.cpn60.counts, "isoLIB_results_cpn60_UT.RDS")
final.list.df.16SFULL.counts <- readRDS(url("https://github.com/bdaisley/isolateR/raw/main/benchmark/isoLIB_results_16S_full_length.RDS"))
final.list.df.16SV3V6.counts <- readRDS(url("https://github.com/bdaisley/isolateR/raw/main/benchmark/isoLIB_results_16S_V3V6.RDS"))
final.list.df.cpn60.counts <- readRDS(url("https://github.com/bdaisley/isolateR/raw/main/benchmark/isoLIB_results_cpn60_UT.RDS"))
#----------Graph results ---------------------------- "group_props" and "species_props"
g1 <- ggplot(final.list.df.16SFULL.counts,
aes(x=cutoff,
y=group_props*100,
groups=result,
fill=main_groups,
colour=main_groups)) +
geom_line(linewidth=0.6,
alpha=0.6) +
scale_y_continuous(limits=c(0,100)) +
scale_x_continuous(limits=c(0.85,1)) +
scale_fill_manual(values=c("Full dataset"="blue",
"Per individual" = "grey30")) +
scale_color_manual(values=c("Full dataset"="blue",
"Per individual" = "grey30")) +
xlab("isoLIB 'group_cutoff' setting") +
ylab("% of unique genomes captured") +
theme(panel.border = element_rect(color ="black",
fill = NA,
linewidth = 0.5,
linetype = 1),
axis.text.x=element_text(angle=0,
hjust=0.5,
size=8),
axis.text.y=element_text(size=8),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8),
panel.background = element_blank(),
panel.grid.major = element_blank(),
legend.box.background = element_rect(colour = "black",
linewidth=0.5),
legend.title=element_blank(),
legend.position = "none",
panel.grid.minor = element_blank())
#---------------------
g2 <- ggplot(final.list.df.16SFULL.counts,
aes(x=cutoff,
y=species_props*100,
groups=result,
fill=main_groups,
colour=main_groups)) +
geom_line(linewidth=0.6,
alpha=0.6) +
scale_y_continuous(limits=c(0,100)) +
scale_x_continuous(limits=c(0.85,1)) +
scale_fill_manual(values=c("Full dataset"="blue",
"Per individual" = "grey30")) +
scale_color_manual(values=c("Full dataset"="blue",
"Per individual" = "grey30")) +
xlab("isoLIB 'group_cutoff' setting") +
ylab("% species captured") +
theme(panel.border = element_rect(color ="black",
fill = NA,
linewidth = 0.5,
linetype = 1),
axis.text.x=element_text(angle=0,
hjust=0.5,
size=8),
axis.text.y=element_text(size=8),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8),
panel.background = element_blank(),
panel.grid.major = element_blank(),
legend.box.background = element_rect(colour = "black",
linewidth=0.5),
legend.title=element_blank(),
legend.position = "none",
panel.grid.minor = element_blank())
g3 <- ggplot(final.list.df.16SV3V6.counts,
aes(x=cutoff, y=group_props*100,
groups=result,
fill=main_groups,
colour=main_groups)) +
geom_line(linewidth=0.6, alpha=0.6) +
scale_y_continuous(limits=c(0,100)) +
scale_x_continuous(limits=c(0.85,1)) +
scale_fill_manual(values=c("Full dataset"="blue",
"Per individual" = "grey30")) +
scale_color_manual(values=c("Full dataset"="blue",
"Per individual" = "grey30")) +
xlab("isoLIB 'group_cutoff' setting") +
ylab("% of unique genomes captured") +
theme(panel.border = element_rect(color ="black",
fill = NA,
linewidth = 0.5,
linetype = 1),
axis.text.x=element_text(angle=0,
hjust=0.5,
size=8),
axis.text.y=element_text(size=8),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8),
panel.background = element_blank(),
panel.grid.major = element_blank(),
legend.box.background = element_rect(colour = "black",
linewidth=0.5),
legend.title=element_blank(),
legend.position = "none",
panel.grid.minor = element_blank())
#---------------------
g4 <- ggplot(final.list.df.16SV3V6.counts,
aes(x=cutoff,
y=species_props*100,
groups=result,
fill=main_groups,
colour=main_groups)) +
geom_line(linewidth=0.6,
alpha=0.6) +
scale_y_continuous(limits=c(0,100)) +
scale_x_continuous(limits=c(0.85,1)) +
scale_fill_manual(values=c("Full dataset"="blue",
"Per individual" = "grey30")) +
scale_color_manual(values=c("Full dataset"="blue",
"Per individual" = "grey30")) +
xlab("isoLIB 'group_cutoff' setting") +
ylab("% species captured") +
theme(panel.border = element_rect(color ="black",
fill = NA,
linewidth = 0.5,
linetype = 1),
axis.text.x=element_text(angle=0,
hjust=0.5,
size=8),
axis.text.y=element_text(size=8),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8),
panel.background = element_blank(),
panel.grid.major = element_blank(),
legend.box.background = element_rect(colour = "black",
linewidth=0.5),
legend.title=element_blank(),
legend.position = "none",
panel.grid.minor = element_blank())
g5 <- ggplot(final.list.df.cpn60.counts,
aes(x=cutoff,
y=group_props*100,
groups=result,
fill=main_groups,
colour=main_groups)) +
geom_line(linewidth=0.6,
alpha=0.6) +
scale_y_continuous(limits=c(0,100)) +
scale_x_continuous(limits=c(0.85,1)) +
scale_fill_manual(values=c("Full dataset"="blue",
"Per individual" = "grey30")) +
scale_color_manual(values=c("Full dataset"="blue",
"Per individual" = "grey30")) +
xlab("isoLIB 'group_cutoff' setting") +
ylab("% of unique genomes captured") +
theme(panel.border = element_rect(color ="black",
fill = NA,
linewidth = 0.5,
linetype = 1),
axis.text.x=element_text(angle=0,
hjust=0.5,
size=8),
axis.text.y=element_text(size=8),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8),
panel.background = element_blank(),
panel.grid.major = element_blank(),
legend.box.background = element_rect(colour = "black",
linewidth=0.5),
legend.title=element_blank(),
legend.position = "none",
panel.grid.minor = element_blank())
#---------------------
g6 <- ggplot(final.list.df.cpn60.counts,
aes(x=cutoff,
y=species_props*100,
groups=result,
fill=main_groups,
colour=main_groups)) +
geom_line(linewidth=0.6,
alpha=0.6) +
scale_y_continuous(limits=c(0,100)) +
scale_x_continuous(limits=c(0.85,1)) +
scale_fill_manual(values=c("Full dataset"="blue",
"Per individual" = "grey30")) +
scale_color_manual(values=c("Full dataset"="blue",
"Per individual" = "grey30")) +
xlab("isoLIB 'group_cutoff' setting") +
ylab("% species captured") +
theme(panel.border = element_rect(color ="black",
fill = NA,
linewidth = 0.5,
linetype = 1),
axis.text.x=element_text(angle=0,
hjust=0.5,
size=8),
axis.text.y=element_text(size=8),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8),
panel.background = element_blank(),
panel.grid.major = element_blank(),
legend.box.background = element_rect(colour = "black",
linewidth=0.5),
legend.title=element_blank(),
legend.position = "none",
panel.grid.minor = element_blank())
dd
cowplot::plot_grid(g1,g3,g5,
g2,g4,g6,
ncol=3)
Download files from supplementary of Jin et al. 2023, Nature Microbiology (https://doi.org/10.1038/s41564-022-01270-1)
#dir.create("imgg")
download.file("https://static-content.springer.com/esm/art%3A10.1038%2Fs41564-022-01270-1/MediaObjects/41564_2022_1270_MOESM4_ESM.xlsx", "imgg/41564_2022_1270_MOESM4_ESM.xlsx", method= "libcurl", mode='wb', headers=NULL)
#Load Supplementary Table S7
mongo.meta <- readxl::read_xlsx("imgg/41564_2022_1270_MOESM4_ESM.xlsx", sheet="Table S7", skip=1) %>%
mutate(subject_id = stringr::str_split_fixed(IMGG_ID, "\\.", 2)[,1]) %>%
mutate(subject_id = gsub("^[A-Z]", "s", subject_id)) %>%
group_by(subject_id) %>%
mutate(per_subject_counts=n()) %>%
ungroup()
## New names:
## • `Completeness` -> `Completeness...8`
## • `Contamination` -> `Contamination...9`
## • `N_contigs` -> `N_contigs...10`
## • `N50` -> `N50...11`
## • `GC_content` -> `GC_content...14`
## • `N_contigs` -> `N_contigs...28`
## • `N50` -> `N50...29`
## • `GC_content` -> `GC_content...30`
## • `Completeness` -> `Completeness...31`
## • `Contamination` -> `Contamination...32`
#Print head to inspect columns of data available
print(mongo.meta)
## # A tibble: 12,391 × 44
## IMGG_ID Cluster_ID Rep_MAG_ID CMAG HQ MIMAG_HQ CheckM_lineage
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 C59.CMAG_1 C140 C59.CMAG_1 N Y Y p__Actinobacteria
## 2 A102.CMAG_1 C135 C46.CMAG_1 N Y Y f__Bifidobacteriaceae
## 3 C58.CMAG_2 C135 C46.CMAG_1 N Y Y f__Bifidobacteriaceae
## 4 C127.CMAG_1 C548 C38.CMAG_7 N Y Y p__Firmicutes
## 5 B49.CMAG_6 C138 C130.CMAG_5 N Y Y p__Actinobacteria
## 6 C55.CMAG_3 C138 C130.CMAG_5 N Y Y p__Actinobacteria
## 7 B15.CMAG_4 C138 C130.CMAG_5 N Y Y p__Actinobacteria
## 8 C97.CMAG_3 C138 C130.CMAG_5 N Y Y p__Actinobacteria
## 9 C40.CMAG_4 C138 C130.CMAG_5 N Y Y p__Actinobacteria
## 10 B34.CMAG_3 C138 C130.CMAG_5 N Y Y p__Actinobacteria
## # … with 12,381 more rows, and 37 more variables: Completeness...8 <dbl>,
## # Contamination...9 <dbl>, N_contigs...10 <dbl>, N50...11 <dbl>,
## # Longest_contig <dbl>, Genome_size <dbl>, GC_content...14 <dbl>,
## # Coding_density <dbl>, Predicted_genes <dbl>, N_16S_rRNA <dbl>,
## # N_23S_rRNA <dbl>, N_5S_rRNA <dbl>, N_tRNA <dbl>, UHGG_ID <chr>, ANI <chr>,
## # N_aligned_fragment <chr>, N_fragment <chr>, Study_set <chr>,
## # Genome_type <chr>, Length <chr>, N_contigs...28 <chr>, N50...29 <chr>, …
#Calculate number of genomes per species
mongo.meta.counts <- mongo.meta %>%
mutate(result=paste("result_", "all", sep="")) %>%
group_by(Lineage) %>%
mutate(strains_per_species=n()) %>%
ungroup() %>%
group_by(Lineage, subject_id) %>%
mutate(strains_per_species_per_subject=n()) %>%
ungroup() %>%
group_by(Lineage) %>%
mutate(mean_strains_per_species_per_subject=mean(strains_per_species_per_subject)) %>%
ungroup()
mongo.meta.counts.g <- rbind(mongo.meta.counts %>%
distinct(Lineage,.keep_all=TRUE) %>%
mutate(variable="01_full", value=strains_per_species),
mongo.meta.counts %>%
distinct(Lineage,.keep_all=TRUE) %>%
mutate(variable="02_per_subject", value=mean_strains_per_species_per_subject))
#Print list of subject IDs
print(unique(mongo.meta.counts$subject_id))
## [1] "s59" "s102" "s58" "s127" "s49" "s55" "s15" "s97" "s40" "s34"
## [11] "s54" "s50" "s1" "s124" "s41" "s125" "s134" "s137" "s112" "s11"
## [21] "s104" "s142" "s36" "s62" "s38" "s162" "s82" "s171" "s91" "s46"
## [31] "s77" "s65" "s103" "s10" "s43" "s72" "s130" "s165" "s111" "s37"
## [41] "s163" "s83" "s31" "s19" "s152" "s64" "s95" "s129" "s126" "s135"
## [51] "s6" "s149" "s57" "s74" "s108" "s89" "s151" "s73" "s94" "s93"
Download files from supplementary of Carter et al. 2023, Cell (https://doi.org/10.1016/j.cell.2023.05.046)
#dir.create("hadza")
#httr::GET("https://www.cell.com/cms/10.1016/j.cell.2023.05.046/attachment/b21a482b-edf0-4049-bd78-38a9bdc171da/mmc1.csv", user_agent("Mozilla/5.0"), #write_disk(file.path(getwd(), "hadza/mmc1.csv"), overwrite=TRUE))
#httr::GET("https://www.cell.com/cms/10.1016/j.cell.2023.05.046/attachment/2fbb050a-bd92-4d0f-8fa5-652c1e95da12/mmc2.zip", user_agent("Mozilla/5.0"), write_disk(file.path(getwd(), "hadza/mmc2.zip")))
#unzip("hadza/mmc2.zip", exdir="hadza")
#Convert .xlsb to .csv
#system2("soffice --convert-to csv hadza/SupplementalTable_S2.xlsb --outdir hadza")
#Load Supplementary Table S1
hadza.meta.subjects <- read.csv("hadza/mmc1.csv")
index.subject <- setNames(hadza.meta.subjects$Subject_ID, hadza.meta.subjects$Sample_Name)
index.camp <- setNames(hadza.meta.subjects$Bush_camp, hadza.meta.subjects$Sample_Name)
#Load Supplementary Table S2
hadza.meta <- readr::read_csv("hadza/SupplementalTable_S2.csv", show_col_types = FALSE)
## New names:
## • `` -> `...1`
#Combine subject identifiers and MAG metadata together > Calculate number of genomes per species
hadza.meta.counts <- hadza.meta %>%
filter(grepl("Hadza", Population)) %>%
mutate(Subject_ID = dplyr::recode(!!!index.subject, Sample_Name)) %>%
mutate(Bush_camp = dplyr::recode(!!!index.camp, Sample_Name)) %>%
group_by(Species_cluster) %>%
mutate(strains_per_species=n()) %>%
ungroup() %>%
group_by(Species_cluster, Subject_ID) %>%
mutate(strains_per_species_per_subject=n()) %>%
ungroup() %>%
group_by(Species_cluster) %>%
mutate(mean_strains_per_species_per_subject = mean(strains_per_species_per_subject)) %>%
ungroup() %>%
group_by(Species_cluster, Bush_camp) %>%
mutate(strains_per_species_per_camp=n()) %>% ungroup() %>%
group_by(Species_cluster) %>%
mutate(mean_strains_per_species_per_camp = mean(strains_per_species_per_camp)) %>%
ungroup() %>%
group_by(Subject_ID) %>%
mutate(per_subject_counts=n_distinct(genome)) %>%
mutate(per_subject_species_counts=n_distinct(Species_cluster)) %>%
ungroup() %>%
group_by(Bush_camp) %>%
mutate(per_camp_counts=n_distinct(genome)) %>%
mutate(per_camp_species_counts=n_distinct(Species_cluster)) %>%
ungroup() %>%
select(genome, Species_cluster ,Population, Subject_ID, Bush_camp, matches("per")) %>%
arrange(Species_cluster)
#Organize data for graphing
hadza.meta.counts.g <- rbind(hadza.meta.counts %>%
distinct(Species_cluster,.keep_all=TRUE) %>%
mutate(variable="03_full", value=strains_per_species),
hadza.meta.counts %>%
distinct(Species_cluster,.keep_all=TRUE) %>%
mutate(variable="04_per_bush", value=mean_strains_per_species_per_camp),
hadza.meta.counts %>%
distinct(Species_cluster,.keep_all=TRUE) %>%
mutate(variable="05_per_subject", value=mean_strains_per_species_per_subject))
print(hadza.meta.counts.g)
## # A tibble: 7,311 × 16
## genome Species_cluster Population Subject_ID Bush_camp strains_per_spe…
## <chr> <chr> <chr> <chr> <chr> <int>
## 1 REFINED_MET… 1002_0 Hadza INDIV_445 GIDERU 2
## 2 REFINED_MET… 1003_0 Hadza INDIV_157 HUKAMAKO 1
## 3 REFINED_MET… 1016_0 Hadza (in… INDIV_130 KIPAMBA 2
## 4 REFINED_MET… 1020_0 Hadza INDIV_84 SENGELI 34
## 5 REFINED_MET… 1021_0 Hadza (in… INDIV_349 SENGELI 5
## 6 REFINED_MET… 1022_0 Hadza (in… INDIV_62 SENGELI 7
## 7 REFINED_MET… 1023_0 Hadza INDIV_162 HUKAMAKO 1
## 8 REFINED_MET… 1028_1 Hadza INDIV_88 SENGELI 29
## 9 REFINED_MET… 1028_2 Hadza INDIV_94 HUKAMAKO 12
## 10 REFINED_MET… 1029_1 Hadza INDIV_275 HUKAMAKO 28
## # … with 7,301 more rows, and 10 more variables:
## # strains_per_species_per_subject <int>,
## # mean_strains_per_species_per_subject <dbl>,
## # strains_per_species_per_camp <int>,
## # mean_strains_per_species_per_camp <dbl>, per_subject_counts <int>,
## # per_subject_species_counts <int>, per_camp_counts <int>,
## # per_camp_species_counts <int>, variable <chr>, value <dbl>
combined.g <- rbind(hadza.meta.counts.g %>% select(variable,value) %>% mutate(dataset="Hadza"),
mongo.meta.counts.g %>% select(variable,value) %>% mutate(dataset="IMGG"))
g1e <- ggplot(combined.g, aes(x=variable, y=value)) +
geom_point(position=position_jitter(width=0.4),
size=0.3,
colour="grey40",
alpha=0.3) +
stat_summary(geom='crossbar',
fun="mean",
width=0.5,
color='blue',
size=0.3) +
stat_summary(geom='errorbar',
width=0.3,
size=0.5,
color="blue",
fun.data ="mean_cl_boot") +
stat_summary(aes(label=round(..y..,2)),
fun.y=mean,
geom="text",
vjust=-8,
size=6,
color="blue") +
ylab("Number of unique genomes per species") +
geom_vline(xintercept=c(2.5)) +
scale_x_discrete(labels=c("Full dataset",
"Per subject",
'Full dataset',
'Per bush camp',
"Per subject"))+
annotate(geom="text",
hjust="left",
label=paste("IMGG dataset",
"\nN = ", length(unique(mongo.meta.counts$Cluster_ID)), " species",
"\nN = ", length(unique(mongo.meta.counts$IMGG_ID)), " unique genomes",
"\nN = ", length(unique(mongo.meta.counts$subject_id)), " adults",
sep=""), x=0.7, y=250, size=3) +
annotate(geom="text",
hjust="left",
label=paste("Hadza dataset",
"\nN = ", length(unique(hadza.meta.counts$Species_cluster)), " species",
"\nN = ", length(unique(hadza.meta.counts$genome)), " unique genomes",
"\nN = ", length(unique(hadza.meta.counts$Subject_ID)), " adults",
sep=""), x=3.0, y=250, size=3) +
theme(panel.border = element_rect(color ="black",
fill = NA,
linewidth = 0.5,
linetype = 1),
axis.text.x=element_text(angle=0,
hjust=0.5,
size=10),
axis.text.y=element_text(size=10),
axis.title.x = element_blank(),
axis.title.y = element_text(size=12),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
theme(legend.position = "none")
g1e + geom_text(data=ggplot_build(g1e)$data[[3]],
aes(x = group,
y = ymin,
label = round(ymax-ymin,2)),
color = "blue",
vjust = -7,
size=4)
Unified Human Gastrointestinal Genome (UHGG) catalouge.
From Almeida et al. Nature Biotechnology; https://doi.org/10.1038/s41587-020-0603-3
Here
we are using V2.0.2 from EBI ftp server: https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0.2
#dir.create("uhgg")
#download.file("https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0.2/genomes-all_metadata.tsv", "uhgg/genomes-all_metadata.tsv" , extra = options(timeout=1500))
UHGG.meta <- readr::read_tsv("uhgg/genomes-all_metadata.tsv", col_names = TRUE, show_col_types = FALSE)
#Count number of distinct genomes per species group
UHGG.meta.counts <- UHGG.meta %>%
group_by(Species_rep) %>%
mutate(value= n_distinct(Genome)) %>%
ungroup()
#Organize data for graphing
UHGG.meta.counts.g <- UHGG.meta.counts %>%
distinct(Species_rep, .keep_all=TRUE) %>%
mutate(variable="06_UGHH")
combined.g <- rbind(hadza.meta.counts.g %>%
select(variable,value) %>%
mutate(dataset="Hadza"),
mongo.meta.counts.g %>%
select(variable,value) %>%
mutate(dataset="IMGG"),
UHGG.meta.counts.g %>%
select(variable,value) %>%
mutate(dataset="UGHH"))
g1e.uhgg <- ggplot(combined.g,
aes(x=variable, y=value)) +
geom_point(position=position_jitter(width=0.4),
size=0.3, colour="grey40",
alpha=0.3) +
stat_summary(geom='crossbar',
fun="mean",
width=0.5,
color='blue',
size=0.3) +
stat_summary(geom='errorbar',
width=0.3,
size=0.5,
color="blue",
fun.data ="mean_cl_boot") +
stat_summary(aes(label=round(..y..,2)),
fun.y=mean,
geom="text",
vjust=-8,
size=6,
color="blue") +
ylab("Number of unique genomes per species") +
geom_vline(xintercept=c(2.5, 5.5)) +
scale_x_discrete(labels=c('Full dataset',
"Per subject",
'Full dataset',
'Per bush camp',
'Per subject',
'Full dataset'))+
annotate(geom="text",
hjust="left",
label=paste("IMGG dataset",
"\nN = ", length(unique(mongo.meta.counts$Cluster_ID)), " species",
"\nN = ", length(unique(mongo.meta.counts$IMGG_ID)), " unique genomes",
"\nN = ", length(unique(mongo.meta.counts$subject_id)), " adults",
sep=""), x=0.7, y=5000, size=3) +
annotate(geom="text",
hjust="left",
label=paste("Hadza dataset",
"\nN = ", length(unique(hadza.meta.counts$Species_cluster)), " species",
"\nN = ", length(unique(hadza.meta.counts$genome)), " unique genomes",
"\nN = ", length(unique(hadza.meta.counts$Subject_ID)), " adults",
sep=""), x=3.0, y=5000, size=3) +
annotate(geom="text",
hjust="left",
label=paste("UHGG dataset",
"\nN = ",
length(unique(UHGG.meta.counts$Species_rep)),
" species",
"\nN = ", length(unique(UHGG.meta.counts$Genome)), " unique genomes",
"\nN = ", length(unique(UHGG.meta.counts$Sample_accession)), " biosamples",
sep=""), x=5.6, y=5000, size=3) +
theme(panel.border = element_rect(color ="black",
fill = NA,
linewidth = 0.5,
linetype = 1),
axis.text.x=element_text(angle=0,
hjust=0.5,
size=10),
axis.text.y=element_text(size=10),
axis.title.x = element_blank(),
axis.title.y = element_text(size=12),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
theme(legend.position = "none")
g1e.uhgg + geom_text(data=ggplot_build(g1e)$data[[3]], aes(x = group, y = ymin, label = round(ymax-ymin,2)), color = "blue", vjust = -7, size=4)