Related article: Daisley et al. (2024). isolateR: an R package for generating microbial libraries from Sanger sequencing data. In draft
Related GitHub: https://github.com/bdaisley/isolateR
Description: This script performs in silico PCR steps and
extracts full length 16S rRNA, V3-V6 region 16S rRNA, and cpn60 UT genes
from the 6,729 genomes of interest.
Links:
Dependencies:
#!/bin/bash
# Author: Brendan Daisley, PhD - University of Guelph (bdaisley@uoguelph.ca)
# Date: April 2, 2024
# Related GitHub: https://github.com/bdaisley/isolateR
# Related article: Daisley et al. (2024). isolateR: an R package for generating microbial libraries from Sanger sequencing data. In draft
# Description: This is an isolateR benchmark script that performs in silico PCR
# to extract marker genes of interest from the 6729 IMGG catalouge genomes.
#Dependencies:
# - Ubuntu 20.04.2 LTS ‘focal’ or later OS https://ubuntu.com
# - pigz v2.4 (https://zlib.net/pigz)
# - usearch v11.0.667 (https://www.drive5.com/usearch)
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# Step 1: Setup directories
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
working_dir="/mnt/md0/isolateR_meta_analysis/Rmarkdown"
fna_dir=${working_dir}/01_IMGG_genomes_6729
pcr_dir=${working_dir}/02_IMGG_marker_genes_pcr
pcr_dir_seq_locations=${pcr_dir}/seq_locations_in_genome
pcr_dir_seqs=${pcr_dir}/seqs
mkdir $fna_dir
mkdir $pcr_dir
mkdir $pcr_dir_seq_locations
mkdir $pcr_dir_seqs
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# Step 2: Download the 6729 genome files
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
#These genome files were originally generated by Jin et al. 2023, Nature Microbiology (https://doi.org/10.1038/s41564-022-01270-1)
cd $working_dir
curl -L https://figshare.com/ndownloader/files/34918815 --output $fna_dir/6729_high_quality_MIMAG.tar.gz
pigz -d $fna_dir/6729_high_quality_MIMAG.tar.gz
tar -xf $fna_dir/6729_high_quality_MIMAG.tar $fna_dir
#Clean up
mv $fna_dir/06.3.1.MIMAG/* $fna_dir
rm $fna_dir/6729_high_quality_MIMAG.tar
rm -r $fna_dir/06.3.1.MIMAG
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# Step 3: Make primer DB
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
primer_set_1="primers_16S_FULL"
primer_set_2="primers_16S_V3V6"
primer_set_3="primers_cpn60_degen"
#Make 1) primers_16S_FULL
echo ">16S_pos0001 TNANNGNAGAGTTTGATCATGGCTCAG >16S_pos1543 NAAGGAGGTGATCCAGCCGCA" | sed 's/ /\n/g' > ${pcr_dir}/${primer_set_1}.fasta
#Make 2) primers_16S_V3V6
echo ">16S_V3_F TACGGRAGGCAGCAG >16S_V6_R ACRACACGAGCTGACGAC" | sed 's/ /\n/g' > ${pcr_dir}/${primer_set_2}.fasta
#Make 3) primers_cpn60_degen
echo ">h279f GANNNNGCNGGNGAYGGNACNACNAC >h280r YKNYKNTCNCCRAANCCNGGNGCYTT" | sed 's/ /\n/g' > ${pcr_dir}/${primer_set_3}.fasta
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# Step 4: Get refererence DBs for orienting marker gene sequences
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
cd $pcr_dir
curl -O https://ftp.ncbi.nlm.nih.gov/refseq/TargetedLoci/Bacteria/bacteria.16SrRNA.fna.gz
pigz -d ${pcr_dir}/bacteria.16SrRNA.fna.gz --force
usearch -makeudb_usearch ${pcr_dir}/bacteria.16SrRNA.fna -output ${pcr_dir}/reorient_16S.udb
rm ${pcr_dir}/bacteria.16SrRNA.fna
wget https://github.com/HillLabSask/cpn60-Classifier/releases/download/v11.1/cpn60-Classifier_v11.0_training.tar.gz -O cpn60-Classifier_v11.0_training.tar.gz --backups=0
pigz -d ${pcr_dir}/cpn60-Classifier_v11.0_training.tar.gz --force
tar -xvf ${pcr_dir}/cpn60-Classifier_v11.0_training.tar
usearch -makeudb_usearch ${pcr_dir}/cpn60-Classifier_v11.0_training/refseqs_v11.fasta -output ${pcr_dir}/reorient_cpn60.udb
rm ${pcr_dir}/cpn60-Classifier_v11.0_training.tar
rm -r ${pcr_dir}/cpn60-Classifier_v11.0_training
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# Step 5: Perform in-silico PCR
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
cd $fna_dir
echo "name $primer_set_1 $primer_set_2 $primer_set_3" | sed 's/ /\t/g' > ${pcr_dir}/summary_info.txt
#Set up for multithreading
foo () {
local fna=$1
base_name=$(echo $fna | awk -F'.fa' '{print $1}')
db1=$(echo $primer_set_1 | awk -F'_' '{print $2}')
db2=$(echo $primer_set_2 | awk -F'_' '{print $2}')
db3=$(echo $primer_set_3 | awk -F'_' '{print $2}')
usearch -search_pcr $fna -db ${pcr_dir}/${primer_set_1}.fasta -strand both -maxdiffs 5 -minamp 1400 -maxamp 1650 -pcrout ${pcr_dir_seq_locations}/${base_name}_hits.txt -ampout ${pcr_dir_seqs}/${base_name}_${primer_set_1}_seqs1.fasta
usearch -orient ${pcr_dir_seqs}/${base_name}_${primer_set_1}_seqs1.fasta -db ${pcr_dir}/reorient_${db1}.udb -fastaout ${pcr_dir_seqs}/${base_name}_${primer_set_1}_seqs2.fasta
usearch -fastx_relabel ${pcr_dir_seqs}/${base_name}_${primer_set_1}_seqs2.fasta -prefix ${base_name}_${db1}_copy -fastaout ${pcr_dir_seqs}/${base_name}_${primer_set_1}_seqs.fasta
usearch -search_pcr $fna -db ${pcr_dir}/${primer_set_2}.fasta -strand both -maxdiffs 3 -minamp 600 -maxamp 740 -pcrout ${pcr_dir_seq_locations}/${base_name}_hits.txt -ampout ${pcr_dir_seqs}/${base_name}_${primer_set_2}_seqs1.fasta
usearch -orient ${pcr_dir_seqs}/${base_name}_${primer_set_2}_seqs1.fasta -db ${pcr_dir}/reorient_${db2}.udb -fastaout ${pcr_dir_seqs}/${base_name}_${primer_set_2}_seqs2.fasta
usearch -fastx_relabel ${pcr_dir_seqs}/${base_name}_${primer_set_2}_seqs2.fasta -prefix ${base_name}_${db2}_copy -fastaout ${pcr_dir_seqs}/${base_name}_${primer_set_2}_seqs.fasta
usearch -search_pcr $fna -db ${pcr_dir}/${primer_set_3}.fasta -strand both -maxdiffs 5 -minamp 590 -maxamp 620 -pcrout ${pcr_dir_seq_locations}/${base_name}_hits.txt -ampout ${pcr_dir_seqs}/${base_name}_${primer_set_3}_seqs1.fasta
usearch -orient ${pcr_dir_seqs}/${base_name}_${primer_set_3}_seqs1.fasta -db ${pcr_dir}/reorient_${db3}.udb -fastaout ${pcr_dir_seqs}/${base_name}_${primer_set_3}_seqs2.fasta
usearch -fastx_relabel ${pcr_dir_seqs}/${base_name}_${primer_set_3}_seqs2.fasta -prefix ${base_name}_${db3}_copy -fastaout ${pcr_dir_seqs}/${base_name}_${primer_set_3}_seqs.fasta
seq_count_1=$(grep -o '>' ${pcr_dir_seqs}/${base_name}_${primer_set_1}_seqs.fasta | wc -l)
seq_count_2=$(grep -o '>' ${pcr_dir_seqs}/${base_name}_${primer_set_2}_seqs.fasta | wc -l)
seq_count_3=$(grep -o '>' ${pcr_dir_seqs}/${base_name}_${primer_set_3}_seqs.fasta | wc -l)
paste <(echo "$base_name") <(echo "$seq_count_1") <(echo "$seq_count_2") <(echo "$seq_count_3") --delimiters '\t' | cat >> ${pcr_dir}/summary_info.txt
rm ${pcr_dir_seqs}/${base_name}_${primer_set_1}_seqs1.fasta
rm ${pcr_dir_seqs}/${base_name}_${primer_set_1}_seqs2.fasta
rm ${pcr_dir_seqs}/${base_name}_${primer_set_2}_seqs1.fasta
rm ${pcr_dir_seqs}/${base_name}_${primer_set_2}_seqs2.fasta
rm ${pcr_dir_seqs}/${base_name}_${primer_set_3}_seqs1.fasta
rm ${pcr_dir_seqs}/${base_name}_${primer_set_3}_seqs2.fasta
}
for fna in *fa ;
do running=($(jobs -rp))
while [ ${#running[@]} -ge 4 ] ; do
sleep 1
running=($(jobs -rp))
done
foo "$fna" &
done