diff --git a/README.md b/README.md index 87c102c..102177a 100755 --- a/README.md +++ b/README.md @@ -1,52 +1,42 @@ -# sbx_virus_id +# sbx_cenote_taker -[![Tests](https://github.com/sunbeam-labs/sbx_virus_id/actions/workflows/tests.yml/badge.svg)](https://github.com/sunbeam-labs/sbx_virus_id/actions/workflows/tests.yml) -[![DockerHub](https://img.shields.io/docker/pulls/sunbeamlabs/sbx_virus_id)](https://hub.docker.com/repository/docker/sunbeamlabs/sbx_virus_id/) +[![Tests](https://github.com/sunbeam-labs/sbx_cenote_taker/actions/workflows/tests.yml/badge.svg)](https://github.com/sunbeam-labs/sbx_cenote_taker/actions/workflows/tests.yml) +[![DockerHub](https://img.shields.io/docker/pulls/sunbeamlabs/sbx_cenote_taker)](https://hub.docker.com/repository/docker/sunbeamlabs/sbx_cenote_taker/) ## Introduction -sbx_virus_id is a [sunbeam](https://github.com/sunbeam-labs/sunbeam) extension for identifying viruses in samples. This pipeline uses [MEGAHIT](https://github.com/voutcn/megahit) or [SPAdes](https://github.com/ablab/spades) for assembly of contigs and [Cenote-Taker2](https://github.com/mtisza1/Cenote-Taker2) or [Virsorter2](https://github.com/jiarong/VirSorter2) for viral identification. +sbx_cenote_taker is a [sunbeam](https://github.com/sunbeam-labs/sunbeam) extension for identifying viruses in samples with [Cenote-Taker3](https://github.com/mtisza1/Cenote-Taker3). This pipeline uses [MEGAHIT](https://github.com/voutcn/megahit) for assembly of contigs and then processes assemblies with Cenote-Taker3. -N.B. If using Megahit for assembly, this extension requires also having sbx_assembly installed. +N.B. This extension also requires sbx_assembly to be installed. 
### Installation ``` -sunbeam extend https://github.com/sunbeam-labs/sbx_virus_id.git +sunbeam extend https://github.com/sunbeam-labs/sbx_assembly.git +sunbeam extend https://github.com/sunbeam-labs/sbx_cenote_taker.git ``` -# Installing blast dbs +### Cenote-Taker database -Install blast db: +sbx_cenote_taker expects the Cenote-Taker3 reference database to be available locally. Download the database following the official instructions, for example: ``` -conda create -n blast -conda activate blast -conda install -c bioconda blast -mkdir refseq_select_prot/ -cd refseq_select_prot/ -perl `which update_blastdb.pl` --decompress refseq_select_prot +conda activate cenote-taker +get_ct3_dbs -o /path/to/ct3_db --hmm T --hallmark_tax T --refseq_tax T --mmseqs_cdd T --domain_list T --hhCDD T --hhPFAM T --hhPDB T ``` -Install viral blast db: +Update the `cenote_taker_db` entry in your Sunbeam configuration to point at the resulting directory. -``` -conda stuff from above ^^^ -mkdir viral_prot/ && cd viral_prot/ -wget https://ftp.ncbi.nlm.nih.gov/refseq/release/viral/viral.1.protein.faa.gz && gzip -d viral.1.protein.faa.gz -makeblastdb -in viral.1.protein.faa -parse_seqids -title "viral" -dbtype prot -``` +### Running -## Running - -Run with sunbeam on the target `all_virus_id`, +Run with sunbeam on the target `all_cenote_taker`: ``` -sunbeam run --profile /path/to/project/ all_virus_id +sunbeam run --profile /path/to/project/ all_cenote_taker ``` ### Options for config.yml @@ -54,16 +44,5 @@ sunbeam run --profile /path/to/project/ all_virus_id - blast_db: path to blast db (default: "") (NOTE: this should be the database file not just the directory it's in) - blastx_threads: number of threads for running blastx (default: 4) - bowtie2_build_threads: number of threads for running bowtie2-build (default: 4) - - cenote_taker2_db: path to cenote-taker2 db (default: "") (NOTE: this should be a directory) - - virsorter_db: path to virsorter2 db (default: "") (NOTE: this should be a 
directory) + - cenote_taker_db: path to Cenote-Taker3 db (default: "") (NOTE: this should be a directory) - include_phages: Whether to include phages in the output (default: False) - - use_spades: Whether to use SPAdes instead of MEGAHIT (default: False) - - use_virsorter: Whether to use Virsorter2 instead of Cenote-Taker2 (default: False) - -## Legacy Installation - -``` -git clone https://github.com/sunbeam-labs/sbx_virus_id.git extensions/sbx_virus_id -cd extensions/sbx_virus_id -cat config.yml >> /path/to/sunbeam_config.yml -``` diff --git a/VERSION b/VERSION index 8a9ecc2..6c6aa7c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.0.1 \ No newline at end of file +0.1.0 \ No newline at end of file diff --git a/config.yml b/config.yml index 193b1b9..4055dfd 100755 --- a/config.yml +++ b/config.yml @@ -1,10 +1,6 @@ -sbx_virus_id: +sbx_cenote_taker: blast_db: '' blastx_threads: 4 bowtie2_build_threads: 4 cenote_taker_db: '' - cenote_taker_extra_dbs: True # Download hhsuite databases (True, ~70GB) or only basic (False, ~3GB) - virsorter_db: '' include_phages: False - use_spades: False # Default: Megahit - use_virsorter: False # Default: Cenote-Taker2 \ No newline at end of file diff --git a/envs/cenote_taker_env.Dockerfile b/envs/cenote_taker_env.Dockerfile index 4154ef4..f6d050c 100644 --- a/envs/cenote_taker_env.Dockerfile +++ b/envs/cenote_taker_env.Dockerfile @@ -1,17 +1,17 @@ -FROM condaforge/mambaforge:latest - -# Setup -WORKDIR /home/sbx_virus_id_env - -COPY envs/cenote_taker_env.yml ./ - -# Install environment -RUN conda env create --file cenote_taker_env.yml --name cenote_taker - -ENV PATH="/opt/conda/envs/cenote_taker/bin/:${PATH}" - -# "Activate" the environment -SHELL ["conda", "run", "-n", "cenote_taker", "/bin/bash", "-c"] - -# Run -CMD "bash" \ No newline at end of file +FROM condaforge/mambaforge:latest + +# Setup +WORKDIR /home/sbx_cenote_taker_env + +COPY envs/cenote_taker_env.yml ./ + +# Install environment +RUN conda env create --file 
cenote_taker_env.yml --name cenote_taker + +ENV PATH="/opt/conda/envs/cenote_taker/bin/:${PATH}" + +# "Activate" the environment +SHELL ["conda", "run", "-n", "cenote_taker", "/bin/bash", "-c"] + +# Run +CMD "bash" diff --git a/envs/sbx_cenote_taker.Dockerfile b/envs/sbx_cenote_taker.Dockerfile new file mode 100644 index 0000000..82f90fc --- /dev/null +++ b/envs/sbx_cenote_taker.Dockerfile @@ -0,0 +1,17 @@ +FROM condaforge/mambaforge:latest + +# Setup +WORKDIR /home/sbx_cenote_taker_env + +COPY envs/sbx_cenote_taker.yml ./ + +# Install environment +RUN conda env create --file sbx_cenote_taker.yml --name sbx_cenote_taker + +ENV PATH="/opt/conda/envs/sbx_cenote_taker/bin/:${PATH}" + +# "Activate" the environment +SHELL ["conda", "run", "-n", "sbx_cenote_taker", "/bin/bash", "-c"] + +# Run +CMD "bash" diff --git a/envs/sbx_virus_id.linux-64.pin.txt b/envs/sbx_cenote_taker.linux-64.pin.txt similarity index 100% rename from envs/sbx_virus_id.linux-64.pin.txt rename to envs/sbx_cenote_taker.linux-64.pin.txt diff --git a/envs/sbx_virus_id.yml b/envs/sbx_cenote_taker.yml similarity index 71% rename from envs/sbx_virus_id.yml rename to envs/sbx_cenote_taker.yml index 57050ac..bffc5c0 100755 --- a/envs/sbx_virus_id.yml +++ b/envs/sbx_cenote_taker.yml @@ -1,4 +1,4 @@ -name: sbx_virus_id +name: sbx_cenote_taker channels: - conda-forge - bioconda @@ -7,4 +7,4 @@ dependencies: - blast - bowtie2 - samtools - - python>=3.10 \ No newline at end of file + - python>=3.10 diff --git a/envs/sbx_virus_id.Dockerfile b/envs/sbx_virus_id.Dockerfile deleted file mode 100644 index 06c4672..0000000 --- a/envs/sbx_virus_id.Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -FROM condaforge/mambaforge:latest - -# Setup -WORKDIR /home/sbx_virus_id_env - -COPY envs/sbx_virus_id.yml ./ - -# Install environment -RUN conda env create --file sbx_virus_id.yml --name sbx_virus_id - -ENV PATH="/opt/conda/envs/sbx_virus_id/bin/:${PATH}" - -# "Activate" the environment -SHELL ["conda", "run", "-n", "sbx_virus_id", 
"/bin/bash", "-c"] - -# Run -CMD "bash" \ No newline at end of file diff --git a/envs/spades_env.Dockerfile b/envs/spades_env.Dockerfile deleted file mode 100644 index 44fb8f0..0000000 --- a/envs/spades_env.Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -FROM condaforge/mambaforge:latest - -# Setup -WORKDIR /home/sbx_virus_id_env - -COPY envs/spades_env.yml ./ - -# Install environment -RUN conda env create --file spades_env.yml --name spades - -ENV PATH="/opt/conda/envs/spades/bin/:${PATH}" - -# "Activate" the environment -SHELL ["conda", "run", "-n", "spades", "/bin/bash", "-c"] - -# Run -CMD "bash" \ No newline at end of file diff --git a/envs/spades_env.linux-64.pin.txt b/envs/spades_env.linux-64.pin.txt deleted file mode 100644 index 79dc603..0000000 --- a/envs/spades_env.linux-64.pin.txt +++ /dev/null @@ -1,34 +0,0 @@ -# This file may be used to create an environment using: -# $ conda create --name --file -# platform: linux-64 -@EXPLICIT -https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 -https://conda.anaconda.org/conda-forge/noarch/_sysroot_linux-64_curr_repodata_hack-3-h69a702a_13.conda#f6ce7955b53ae1ca83144adb3be9c600 -https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2023.7.22-hbcca054_0.conda#a73ecd2988327ad4c8f2c331482917f2 -https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-h41732ed_0.conda#7aca3059a1729aa76c597603f10b0dd3 -https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-13.2.0-h7e041cc_2.conda#9172c297304f2a20134fc56c97fbe229 -https://conda.anaconda.org/conda-forge/noarch/tzdata-2023c-h71feb2d_0.conda#939e3e74d8be4dac89ce83b20de2492a -https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-3.10.0-h4a8ded7_13.tar.bz2#523bc836a954faf0cca94831971bb85a -https://conda.anaconda.org/conda-forge/linux-64/libgomp-13.2.0-h807b86a_2.conda#e2042154faafe61969556f28bade94b9 
-https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d -https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.17-h4a8ded7_13.tar.bz2#57e5a5191ffe999b9f4dfdbcd0ddcba4 -https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_2.conda#c28003b0be0494f9a7664389146716ff -https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h7f98852_4.tar.bz2#a1fd65c7ccbf10880423d82bca54eb54 -https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.5.0-hcb278e6_1.conda#6305a3dd2752c76335295da4e581f2fd -https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2#d645c6d2ac96843a2bfaccd2d62b3ac3 -https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7 -https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b -https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda#f36c115f1ee199da648e0597ec2047ad -https://conda.anaconda.org/conda-forge/linux-64/llvm-openmp-8.0.1-hc9558a2_0.tar.bz2#67590caab043d6d7ffc371f9cced7848 -https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4-hcb278e6_0.conda#681105bccc2a3f7f1a837d47d39c9179 -https://conda.anaconda.org/conda-forge/linux-64/openssl-3.1.3-hd590300_0.conda#7bb88ce04c8deb9f7d763ae04a1da72f -https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2#2161070d867d1b1204ea749c8eec4ef0 -https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.43.2-h2797004_0.conda#4b441a1ee22397d5a27dc1126b849edd -https://conda.anaconda.org/conda-forge/linux-64/openmp-8.0.1-0.tar.bz2#b35241079152e5cc891c99368395b2c6 -https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda#47d31b792659ce70f470b5c82fdfb7a4 -https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-h2797004_0.conda#513336054f884f95d9fd925748f41ef3 
-https://conda.anaconda.org/conda-forge/linux-64/python-3.12.0-hab00c5b_0_cpython.conda#7f97faab5bebcc2580f4f299285323da -https://conda.anaconda.org/conda-forge/noarch/setuptools-68.2.2-pyhd8ed1ab_0.conda#fc2166155db840c634a1291a5c35a709 -https://conda.anaconda.org/bioconda/linux-64/spades-3.15.5-h95f258a_1.tar.bz2#62ab35497479905dce6860525262104f -https://conda.anaconda.org/conda-forge/noarch/wheel-0.41.2-pyhd8ed1ab_0.conda#1ccd092478b3e0ee10d7a891adbf8a4f -https://conda.anaconda.org/conda-forge/noarch/pip-23.3-pyhd8ed1ab_0.conda#a06f102f59c8e3bb8b3e46e71c384709 diff --git a/envs/spades_env.yml b/envs/spades_env.yml deleted file mode 100755 index 2d7a018..0000000 --- a/envs/spades_env.yml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - bioconda -dependencies: - - spades -name: spades \ No newline at end of file diff --git a/envs/virsorter_env.Dockerfile b/envs/virsorter_env.Dockerfile deleted file mode 100644 index 8c08f56..0000000 --- a/envs/virsorter_env.Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -FROM condaforge/mambaforge:latest - -# Setup -WORKDIR /home/sbx_virus_id_env - -COPY envs/virsorter_env.yml ./ - -# Install environment -RUN conda env create --file virsorter_env.yml --name virsorter - -ENV PATH="/opt/conda/envs/virsorter/bin/:${PATH}" - -# "Activate" the environment -SHELL ["conda", "run", "-n", "virsorter", "/bin/bash", "-c"] - -# Run -CMD "bash" \ No newline at end of file diff --git a/envs/virsorter_env.linux-64.pin.txt b/envs/virsorter_env.linux-64.pin.txt deleted file mode 100644 index 6ad0f8c..0000000 --- a/envs/virsorter_env.linux-64.pin.txt +++ /dev/null @@ -1,145 +0,0 @@ -# This file may be used to create an environment using: -# $ conda create --name --file -# platform: linux-64 -@EXPLICIT -https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 -https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2023.7.22-hbcca054_0.conda#a73ecd2988327ad4c8f2c331482917f2 
-https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-h41732ed_0.conda#7aca3059a1729aa76c597603f10b0dd3 -https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-13.2.0-h7e041cc_2.conda#9172c297304f2a20134fc56c97fbe229 -https://conda.anaconda.org/conda-forge/noarch/pybind11-abi-4-hd8ed1ab_3.tar.bz2#878f923dd6acc8aeb47a75da6c4098be -https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.10-4_cp310.conda#26322ec5d7712c3ded99dd656142b8ce -https://conda.anaconda.org/conda-forge/noarch/tzdata-2023c-h71feb2d_0.conda#939e3e74d8be4dac89ce83b20de2492a -https://conda.anaconda.org/conda-forge/linux-64/libgomp-13.2.0-h807b86a_2.conda#e2042154faafe61969556f28bade94b9 -https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d -https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_2.conda#c28003b0be0494f9a7664389146716ff -https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h7f98852_4.tar.bz2#a1fd65c7ccbf10880423d82bca54eb54 -https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.20.1-hd590300_0.conda#6642e4faa4804be3a0e7edfefbd16595 -https://conda.anaconda.org/conda-forge/linux-64/fmt-10.1.1-h00ab1b0_0.conda#5c875bdc09118cd3fc2edd39842e4c35 -https://conda.anaconda.org/conda-forge/linux-64/gettext-0.21.1-h27087fc_0.tar.bz2#14947d8770185e5153fdd04d4673ed37 -https://conda.anaconda.org/conda-forge/linux-64/icu-73.2-h59595ed_0.conda#cc47e1facc155f91abd89b11e48e72ff -https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 -https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-h516909a_1.tar.bz2#6f8720dff19e17ce5d48cfe7f3d2f0a3 -https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.5.0-hcb278e6_1.conda#6305a3dd2752c76335295da4e581f2fd -https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2#d645c6d2ac96843a2bfaccd2d62b3ac3 
-https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-13.2.0-ha4646dd_2.conda#78fdab09d9138851dde2b5fe2a11019e -https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.17-h166bdaf_0.tar.bz2#b62b52da46c39ee2bc3c162ac7f1804d -https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7 -https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b -https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda#f36c115f1ee199da648e0597ec2047ad -https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.9.4-hcb278e6_0.conda#318b08df404f9c9be5712aaa5a6f0bb0 -https://conda.anaconda.org/conda-forge/linux-64/lzo-2.10-h516909a_1000.tar.bz2#bb14fcb13341b81d5eb386423b9d2bac -https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4-hcb278e6_0.conda#681105bccc2a3f7f1a837d47d39c9179 -https://conda.anaconda.org/conda-forge/linux-64/openssl-3.1.3-hd590300_0.conda#7bb88ce04c8deb9f7d763ae04a1da72f -https://conda.anaconda.org/conda-forge/linux-64/reproc-14.2.4.post0-hd590300_0.conda#9f067e96da541ba572d160704984208f -https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2#2161070d867d1b1204ea749c8eec4ef0 -https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h7f98852_2.tar.bz2#4cb3ad778ec2d5a7acbdf254eb1c42ae -https://conda.anaconda.org/conda-forge/linux-64/yaml-cpp-0.8.0-h59595ed_0.conda#965eaacd7c18eb8361fd12bb9e7a57d7 -https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2#4d331e44109e3f0e19b4cb8f9b82f3e1 -https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-13.2.0-h69a702a_2.conda#e75a75a6eaf6f318dae2631158c46575 -https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.52.0-h61bc06f_0.conda#613955a50485812985c059e7b269f42e -https://conda.anaconda.org/conda-forge/linux-64/libsolv-0.7.25-hfc55251_0.conda#17e88b01ca0601d5fd55bb72a9e352d9 
-https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.43.2-h2797004_0.conda#4b441a1ee22397d5a27dc1126b849edd -https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.0-h0841786_0.conda#1f5a58e686b13bcfde88b93f547d23fe -https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.11.5-h232c23b_1.conda#f3858448893839820d4bcfb14ad3ecdf -https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.40-hc3806b6_0.tar.bz2#69e2c796349cd9b273890bee0febfe1b -https://conda.anaconda.org/conda-forge/linux-64/perl-5.32.1-4_hd590300_perl5.conda#3e785bff761095eb7f8676f4694bd1b1 -https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda#47d31b792659ce70f470b5c82fdfb7a4 -https://conda.anaconda.org/conda-forge/linux-64/reproc-cpp-14.2.4.post0-h59595ed_0.conda#33402b9a26cdc1a5a8eeecbe5ce6f486 -https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-h2797004_0.conda#513336054f884f95d9fd925748f41ef3 -https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.5-hfc55251_0.conda#04b88013080254850d6c01ed54810589 -https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.2-h659d440_0.conda#cd95826dbd331ed1be26bdf401432844 -https://conda.anaconda.org/conda-forge/linux-64/libarchive-3.7.2-h039dbb9_0.conda#611d6c83d1130ea60c916531adfb11db -https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.24-pthreads_h413a1c8_0.conda#6e4ef6ca28655124dcde9bd500e44c32 -https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.43.2-h2c6b66d_0.conda#c37b95bcd6c6833dacfd5df0ae2f4303 -https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-19_linux64_openblas.conda#420f4e9be59d0dc9133a0f43f7bab3f3 -https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.4.0-hca28451_0.conda#1158ac1d2613b28685644931f11ee807 -https://conda.anaconda.org/conda-forge/linux-64/python-3.10.0-h543edf9_3_cpython.tar.bz2#67cdff58413ce9f034fb971188060313 -https://conda.anaconda.org/conda-forge/noarch/appdirs-1.4.4-pyh9f0ad1d_0.tar.bz2#5f095bc6454094e96f146491fd03633b 
-https://conda.anaconda.org/conda-forge/noarch/attrs-23.1.0-pyh71513ae_1.conda#3edfead7cedd1ab4400a6c588f3e75f8 -https://conda.anaconda.org/conda-forge/noarch/boltons-23.0.0-pyhd8ed1ab_0.conda#033eb25fffd222aceeca6d58cd953680 -https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py310hc6cd4ac_1.conda#1f95722c94f00b69af69a066c7433714 -https://conda.anaconda.org/conda-forge/noarch/certifi-2023.7.22-pyhd8ed1ab_0.conda#7f3dbc9179b4dde7da98dfb151d0ad22 -https://conda.anaconda.org/conda-forge/linux-64/chardet-5.2.0-py310hff52083_1.conda#a677136a83b823803d2f92045f885be2 -https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.3.0-pyhd8ed1ab_0.conda#fef8ef5f0a54546b9efee39468229917 -https://conda.anaconda.org/conda-forge/noarch/click-8.1.7-unix_pyh707e725_0.conda#f3ad426304898027fc619827ff428eca -https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_0.tar.bz2#3faab06a954c2a04039983f2c4a50d99 -https://conda.anaconda.org/conda-forge/noarch/configargparse-1.7-pyhd8ed1ab_0.conda#0d07dc29b1c1cc973f76b74beb44915f -https://conda.anaconda.org/conda-forge/linux-64/curl-8.4.0-hca28451_0.conda#2bcf7689cae931dd35d9a45626f49fce -https://conda.anaconda.org/conda-forge/linux-64/datrie-0.8.2-py310h2372a71_7.conda#e0a66d90c57bdf40cf659a4b0357a795 -https://conda.anaconda.org/conda-forge/linux-64/docutils-0.20.1-py310hff52083_2.conda#ac157d9b464d15fac78b13fcabc0f845 -https://conda.anaconda.org/conda-forge/noarch/idna-3.4-pyhd8ed1ab_0.tar.bz2#34272b248891bddccc64479f9a7fffed -https://conda.anaconda.org/conda-forge/linux-64/jsonpointer-2.4-py310hff52083_3.conda#08ec1463dbc5c806a32fc431874032ca -https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-19_linux64_openblas.conda#d12374af44575413fbbd4a217d46ea33 -https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-19_linux64_openblas.conda#9f100edf65436e3eabc2a51fc00b2c37 -https://conda.anaconda.org/conda-forge/linux-64/libmamba-1.5.1-had39da4_2.conda#c915e6866cde479b486f1f1fc1ca325b 
-https://conda.anaconda.org/conda-forge/linux-64/markupsafe-2.1.3-py310h2372a71_1.conda#b74e07a054c479e45a83a83fc5be713c -https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.0-pyhd8ed1ab_0.tar.bz2#f8dab71fdc13b1bf29a01248b156d268 -https://conda.anaconda.org/conda-forge/noarch/packaging-23.2-pyhd8ed1ab_0.conda#79002079284aa895f883c6b7f3f88fd6 -https://conda.anaconda.org/conda-forge/noarch/pkgutil-resolve-name-1.3.10-pyhd8ed1ab_1.conda#405678b942f2481cecdb3e010f4925d9 -https://conda.anaconda.org/conda-forge/noarch/pluggy-1.3.0-pyhd8ed1ab_0.conda#2390bd10bed1f3fdc7a537fb5a447d8d -https://conda.anaconda.org/conda-forge/linux-64/psutil-5.9.5-py310h2372a71_1.conda#cb25177acf28cc35cfa6c1ac1c679e22 -https://conda.anaconda.org/conda-forge/linux-64/pycosat-0.6.6-py310h2372a71_0.conda#0adaac9a86d59adae2bc86b3cdef2df1 -https://conda.anaconda.org/conda-forge/noarch/pycparser-2.21-pyhd8ed1ab_0.tar.bz2#076becd9e05608f8dc72757d5f3a91ff -https://conda.anaconda.org/conda-forge/noarch/pygments-2.16.1-pyhd8ed1ab_0.conda#40e5cb18165466773619e5c963f00a7b -https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.1.1-pyhd8ed1ab_0.conda#176f7d56f0cfe9008bdf1bccd7de02fb -https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha2e5f31_6.tar.bz2#2a7de29fb590ca14b5243c4c812c8025 -https://conda.anaconda.org/conda-forge/noarch/python-fastjsonschema-2.18.1-pyhd8ed1ab_0.conda#305141cff54af2f90e089d868fffce28 -https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.1-py310h2372a71_1.conda#bb010e368de4940771368bc3dc4c63e7 -https://conda.anaconda.org/conda-forge/noarch/ratelimiter-1.2.0-pyhd8ed1ab_1003.tar.bz2#432d4fa75ebc28bf4b337eeff0606cf4 -https://conda.anaconda.org/conda-forge/linux-64/rpds-py-0.10.6-py310hcb5633a_0.conda#43c12d8f7891a87378eb5339c49ef051 -https://conda.anaconda.org/conda-forge/linux-64/ruamel.yaml.clib-0.2.7-py310h2372a71_2.conda#7c9da9721ee545d57ad759f020172853 
-https://conda.anaconda.org/conda-forge/noarch/setuptools-68.2.2-pyhd8ed1ab_0.conda#fc2166155db840c634a1291a5c35a709 -https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2#e5f25f8dbc060e9a8d912e432202afc2 -https://conda.anaconda.org/conda-forge/noarch/smmap-3.0.5-pyh44b312d_0.tar.bz2#3a8dc70789709aa315325d5df06fb7e4 -https://conda.anaconda.org/conda-forge/noarch/text-unidecode-1.3-pyhd8ed1ab_1.conda#ba8aba332d8868897ce44ad74015a7fe -https://conda.anaconda.org/conda-forge/noarch/toolz-0.12.0-pyhd8ed1ab_0.tar.bz2#92facfec94bc02d6ccf42e7173831a36 -https://conda.anaconda.org/conda-forge/noarch/toposort-1.10-pyhd8ed1ab_0.conda#aeef653e20028f19a3c2cc70e166b509 -https://conda.anaconda.org/conda-forge/noarch/traitlets-5.11.2-pyhd8ed1ab_0.conda#bd3f90f7551e1cffb1f402880eb2cef1 -https://conda.anaconda.org/conda-forge/noarch/types-python-dateutil-2.8.19.14-pyhd8ed1ab_0.conda#4df15c51a543e806d439490b862be1c6 -https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.8.0-pyha770c72_0.conda#5b1be40a26d10a06f6d4f1f9e19fa0c7 -https://conda.anaconda.org/conda-forge/noarch/wheel-0.41.2-pyhd8ed1ab_0.conda#1ccd092478b3e0ee10d7a891adbf8a4f -https://conda.anaconda.org/conda-forge/linux-64/wrapt-1.15.0-py310h2372a71_1.conda#43e5d746d736ae6c71060ed923179d6d -https://conda.anaconda.org/conda-forge/noarch/zipp-3.17.0-pyhd8ed1ab_0.conda#2e4d6bc0b14e10f895fc6791a7d9b26a -https://conda.anaconda.org/conda-forge/noarch/amply-0.1.6-pyhd8ed1ab_0.conda#a45a9cc65a438f465845ebff49c6fbf9 -https://conda.anaconda.org/conda-forge/noarch/binaryornot-0.4.4-py_1.tar.bz2#a556fa60840fcb9dd739d186bfd252f7 -https://conda.anaconda.org/conda-forge/linux-64/cffi-1.16.0-py310h2fee648_0.conda#45846a970e71ac98fd327da5d40a0a2c -https://conda.anaconda.org/conda-forge/linux-64/git-2.42.0-pl5321h86e50cf_0.conda#96ad24c67e0056d171385859c43218a2 -https://conda.anaconda.org/conda-forge/noarch/gitdb-4.0.10-pyhd8ed1ab_0.conda#3706d2f3d7cb5dae600c833345a76132 
-https://conda.anaconda.org/conda-forge/noarch/importlib_resources-6.1.0-pyhd8ed1ab_0.conda#48b0d98e0c0ec810d3ccc2a0926c8c0e -https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.2-pyhd8ed1ab_1.tar.bz2#c8490ed5c70966d232fdd389d0dbed37 -https://conda.anaconda.org/conda-forge/noarch/jsonpatch-1.33-pyhd8ed1ab_0.conda#bfdb7c5c6ad1077c82a69a8642c87aff -https://conda.anaconda.org/conda-forge/linux-64/liblapacke-3.9.0-19_linux64_openblas.conda#685e99d3214f5ac9d1ec6b37983985a6 -https://conda.anaconda.org/conda-forge/linux-64/libmambapy-1.5.1-py310h39ff949_2.conda#d1fb9113b71380d7f3eca059ac6cd14b -https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-3.0.0-pyhd8ed1ab_0.conda#93a8e71256479c62074356ef6ebf501b -https://conda.anaconda.org/conda-forge/noarch/pip-23.3-pyhd8ed1ab_0.conda#a06f102f59c8e3bb8b3e46e71c384709 -https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.2-pyhd8ed1ab_0.tar.bz2#dd999d1cc9f79e67dbb855c8924c7984 -https://conda.anaconda.org/conda-forge/noarch/python-slugify-8.0.1-pyhd8ed1ab_2.conda#519897ff446e0dc056e12402e6785cd5 -https://conda.anaconda.org/conda-forge/noarch/referencing-0.30.2-pyhd8ed1ab_0.conda#a33161b983172ba6ef69d5fc850650cd -https://conda.anaconda.org/conda-forge/linux-64/ruamel.yaml-0.16.12-py310h5764c6d_3.tar.bz2#bbf52d5fb8daa88be7517abd50722942 -https://conda.anaconda.org/conda-forge/noarch/tqdm-4.66.1-pyhd8ed1ab_0.conda#03c97908b976498dcae97eb4e4f3149c -https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.8.0-hd8ed1ab_0.conda#384462e63262a527bda564fa2d9126c0 -https://conda.anaconda.org/conda-forge/noarch/urllib3-2.0.6-pyhd8ed1ab_0.conda#d5f8944ff9ab24a292511c83dce33dea -https://conda.anaconda.org/conda-forge/noarch/arrow-1.3.0-pyhd8ed1ab_0.conda#b77d8c2313158e6e461ca0efb1c2c508 -https://conda.anaconda.org/conda-forge/linux-64/coin-or-utils-2.11.9-hee58242_0.conda#0f5e6a4d88aac3fa5fcd2fa929862711 
-https://conda.anaconda.org/conda-forge/linux-64/conda-package-handling-1.9.0-py310h5764c6d_1.tar.bz2#1b23ed7479259e9bb83bc4cf4b964e88 -https://conda.anaconda.org/conda-forge/linux-64/cryptography-41.0.4-py310h75e40e8_0.conda#ad06c4db71ba0b6d153c66de88a41fdc -https://conda.anaconda.org/conda-forge/noarch/gitpython-3.1.37-pyhd8ed1ab_0.conda#8b94c329190fa6814f412adf2ab0f0a2 -https://conda.anaconda.org/conda-forge/noarch/jsonschema-specifications-2023.7.1-pyhd8ed1ab_0.conda#7c27ea1bdbe520bb830dcadd59f55cbf -https://conda.anaconda.org/conda-forge/noarch/platformdirs-3.11.0-pyhd8ed1ab_0.conda#8f567c0a74aa44cf732f15773b4083b0 -https://conda.anaconda.org/conda-forge/noarch/requests-2.31.0-pyhd8ed1ab_0.conda#a30144e4156cdbb236f99ebb49828f8b -https://conda.anaconda.org/conda-forge/noarch/rich-13.6.0-pyhd8ed1ab_0.conda#3ca4829f40710f581ca1d76bc907e99f -https://conda.anaconda.org/conda-forge/linux-64/coin-or-osi-0.108.8-ha2443b9_0.conda#7e4adb609b8bf87746d6fab1062348c8 -https://conda.anaconda.org/conda-forge/noarch/cookiecutter-2.4.0-pyhca7485f_0.conda#d51520e0dc4e4d6ef149c41be36541e8 -https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.19.1-pyhd8ed1ab_0.conda#78aff5d2af74e6537c1ca73017f01f4f -https://conda.anaconda.org/conda-forge/linux-64/jupyter_core-5.4.0-py310hff52083_0.conda#28cdf08d2d44db099a95a176f01f7120 -https://conda.anaconda.org/conda-forge/noarch/pyopenssl-23.2.0-pyhd8ed1ab_1.conda#34f7d568bf59d18e3fef8c405cbece21 -https://conda.anaconda.org/conda-forge/linux-64/coin-or-clp-1.17.8-h1ee7a9c_0.conda#a2d4935dc3955aa906c17136039db06c -https://conda.anaconda.org/conda-forge/linux-64/conda-23.7.4-py310hff52083_0.conda#27638d3af384a315d5f4ca8f056ac23b -https://conda.anaconda.org/conda-forge/noarch/nbformat-5.9.2-pyhd8ed1ab_0.conda#61ba076de6530d9301a0053b02f093d2 -https://conda.anaconda.org/conda-forge/linux-64/coin-or-cgl-0.60.7-h516709c_0.conda#9170a2b48868d5a340d7076979a775f8 
-https://conda.anaconda.org/conda-forge/linux-64/mamba-1.5.1-py310h51d5547_2.conda#974872f0c75edb645d51c619ea155807 -https://conda.anaconda.org/conda-forge/linux-64/coin-or-cbc-2.10.10-h9002f0b_0.conda#f004ee86906bc133df1775e4b3be00ce -https://conda.anaconda.org/conda-forge/noarch/coincbc-2.10.10-0_metapackage.conda#f1170e44b26962b3e8adade2dd0f3902 -https://conda.anaconda.org/conda-forge/linux-64/pulp-2.7.0-py310hff52083_1.conda#8382eec14f1c1429401549d23694b998 -https://conda.anaconda.org/bioconda/noarch/snakemake-minimal-5.26.0-py_0.tar.bz2#6ddc923aada5bbc448c9b3f14e05ee6a -https://conda.anaconda.org/bioconda/noarch/virsorter-2.2.4-pyhdfd78af_1.tar.bz2#b2ae5573e2d42548a165dbaa64d52890 diff --git a/envs/virsorter_env.yml b/envs/virsorter_env.yml deleted file mode 100755 index 0c06f18..0000000 --- a/envs/virsorter_env.yml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - virsorter =2 -name: virsorter \ No newline at end of file diff --git a/sbx_cenote_taker.smk b/sbx_cenote_taker.smk new file mode 100755 index 0000000..fa63ecd --- /dev/null +++ b/sbx_cenote_taker.smk @@ -0,0 +1,305 @@ +try: + SBX_CENOTE_TAKER_VERSION = get_ext_version("sbx_cenote_taker") +except (NameError, ValueError): + # For backwards compatibility with older versions of Sunbeam + SBX_CENOTE_TAKER_VERSION = "0.0.0" +VIRUS_FP = output_subdir(Cfg, "virus") + + +def get_extension_path() -> Path: + return Path(__file__).parent.resolve() + + +def cenote_output() -> Path: + return VIRUS_FP / "cenote_taker" / "{sample}.fasta" + + +rule all_cenote_taker: + input: + expand( + VIRUS_FP / "alignments" / "{sample}.gene_coverage.tsv", + sample=Samples.keys(), + ), + expand( + VIRUS_FP / "blastx" / "{sample}.btf", + sample=Samples.keys(), + ), + VIRUS_FP / "summary" / "all_align_summary.txt", + + +rule cenote_taker: + input: + contigs=ASSEMBLY_FP / "megahit" / "{sample}_asm" / "final.contigs.fa", + output: + contigs=VIRUS_FP / "cenote_taker" / "{sample}" / 
"final.contigs.fasta", + summary=VIRUS_FP + / "cenote_taker" + / "{sample}" + / "{sample}" + / "{sample}_CONTIG_SUMMARY.tsv", + benchmark: + BENCHMARK_FP / "cenote_taker_{sample}.tsv" + log: + LOG_FP / "cenote_taker_{sample}.log", + params: + out_dir=str(VIRUS_FP / "cenote_taker"), + sample="{sample}", + db_fp=Cfg["sbx_cenote_taker"]["cenote_taker_db"], + resources: + mem_mb=24000, + runtime=720, + conda: + "envs/cenote_taker_env.yml" + container: + f"docker://sunbeamlabs/sbx_cenote_taker:{SBX_CENOTE_TAKER_VERSION}-cenote-taker" + shell: + """ + SAMPLE={params.sample} + if [[ ${{#SAMPLE}} -lt 18 ]] && [[ {params.sample} =~ ^[a-zA-Z0-9_]+$ ]]; then + echo "Sample name format is valid" >> {log} + else + echo "Cenote-Taker requires a sample name that is less than 18 characters and contains only alphanumeric characters and underscores" >> {log} + exit 1 + fi + + if [ -s {input.contigs} ]; then + echo "Contigs file exists and is not empty" >> {log} + else + echo "Contigs file is empty" >> {log} + touch {output.contigs} {output.summary} + exit 0 + fi + + if [ ! -d {params.db_fp} ] || [ ! 
"$(ls -A {params.db_fp})" ]; then + echo "Cenote-Taker database path {params.db_fp} is missing or empty" >> {log} + exit 1 + fi + + cd {params.out_dir} + cenotetaker3 --contigs {input.contigs} -r {params.sample} -p T >> {log} 2>&1 + """ + + +rule filter_cenote_contigs: + input: + contigs=VIRUS_FP / "cenote_taker" / "{sample}" / "final.contigs.fasta", + summary=VIRUS_FP + / "cenote_taker" + / "{sample}" + / "{sample}" + / "{sample}_CONTIG_SUMMARY.tsv", + output: + VIRUS_FP / "cenote_taker" / "{sample}.fasta", + params: + include_phages=Cfg["sbx_cenote_taker"]["include_phages"], + script: + "scripts/filter_cenote_contigs.py" + + +rule build_virus_index: + input: + cenote_output(), + output: + str(cenote_output()) + ".1.bt2", # Don't use f-string, broken with python 3.12 + conda: + "envs/sbx_cenote_taker.yml" + container: + f"docker://sunbeamlabs/sbx_cenote_taker:{SBX_CENOTE_TAKER_VERSION}-sbx-cenote-taker" + threads: Cfg["sbx_cenote_taker"]["bowtie2_build_threads"] + shell: + "bowtie2-build --threads {threads} -f {input} {input}" + + +rule align_virus_reads: + input: + r1=QC_FP / "decontam" / "{sample}_1.fastq.gz", + r2=QC_FP / "decontam" / "{sample}_2.fastq.gz", + index=str(cenote_output()) + ".1.bt2", # Don't use f-string, broken with python 3.12 + output: + temp(VIRUS_FP / "alignments" / "{sample}.sam"), + params: + index=str(cenote_output()), + threads: 6 + conda: + "envs/sbx_cenote_taker.yml" + container: + f"docker://sunbeamlabs/sbx_cenote_taker:{SBX_CENOTE_TAKER_VERSION}-sbx-cenote-taker" + shell: + "bowtie2 -q --local -t --very-sensitive-local --threads {threads} --no-mixed --no-discordant -x {params.index} -1 {input.r1} -2 {input.r2} -S {output}" + + +rule process_virus_alignment: + input: + VIRUS_FP / "alignments" / "{sample}.sam", + output: + bam=temp(VIRUS_FP / "alignments" / "{sample}.bam"), + sorted=temp(VIRUS_FP / "alignments" / "{sample}.sorted.bam"), + bai=temp(VIRUS_FP / "alignments" / "{sample}.sorted.bam.bai"), + params: + 
target=str(cenote_output()), + conda: + "envs/sbx_cenote_taker.yml" + container: + f"docker://sunbeamlabs/sbx_cenote_taker:{SBX_CENOTE_TAKER_VERSION}-sbx-cenote-taker" + shell: + """ + samtools view -bT {params.target} {input} > {output.bam} + samtools sort -o {output.sorted} {output.bam} + samtools index {output.sorted} {output.bai} + """ + + +rule calculate_mapping_stats: + input: + bam=VIRUS_FP / "alignments" / "{sample}.sorted.bam", + idx=VIRUS_FP / "alignments" / "{sample}.sorted.bam.bai", + output: + VIRUS_FP / "alignments" / "{sample}.sorted.idxstats.tsv", + conda: + "envs/sbx_cenote_taker.yml" + container: + f"docker://sunbeamlabs/sbx_cenote_taker:{SBX_CENOTE_TAKER_VERSION}-sbx-cenote-taker" + shell: + """ + samtools idxstats {input.bam} > {output} + """ + + +rule virus_mpileup: + input: + bam=VIRUS_FP / "alignments" / "{sample}.sorted.bam", + idx=VIRUS_FP / "alignments" / "{sample}.sorted.bam.bai", + contigs=cenote_output(), + output: + VIRUS_FP / "alignments" / "{sample}.mpileup", + conda: + "envs/sbx_cenote_taker.yml" + container: + f"docker://sunbeamlabs/sbx_cenote_taker:{SBX_CENOTE_TAKER_VERSION}-sbx-cenote-taker" + shell: + """ + samtools mpileup -f {input.contigs} {input.bam} > {output} + """ + + +rule filter_virus_coverage: + input: + fa=cenote_output(), + idx=VIRUS_FP / "alignments" / "{sample}.sorted.idxstats.tsv", + output: + VIRUS_FP / "final_{sample}_contigs.fasta", + log: + LOG_FP / "filter_virus_coverage_{sample}.log", + script: + "scripts/filter_virus_coverage.py" + + +rule virus_blastx: + """Run blastx on untranslated genes against a target db and write to blast tabular format.""" + input: + VIRUS_FP / "final_{sample}_contigs.fasta", + output: + VIRUS_FP / "blastx" / "{sample}.btf", + benchmark: + BENCHMARK_FP / "run_virus_blastx_{sample}.tsv" + log: + LOG_FP / "run_virus_blastx_{sample}.log", + params: + blast_db=Cfg["sbx_cenote_taker"]["blast_db"], + threads: Cfg["sbx_cenote_taker"]["blastx_threads"] + resources: + mem_mb=24000, + 
runtime=720, + conda: + "envs/sbx_cenote_taker.yml" + container: + f"docker://sunbeamlabs/sbx_cenote_taker:{SBX_CENOTE_TAKER_VERSION}-sbx-cenote-taker" + shell: + """ + if [ -s {input} ]; then + export BLASTDB=$(dirname {params.blast_db}) + blastx \ + -query {input} \ + -db $(basename {params.blast_db}) \ + -outfmt "7 qacc sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle" \ + -num_threads {threads} \ + -evalue 0.05 \ + -max_target_seqs 100 \ + -out {output} \ + 2>&1 | tee {log} + else + echo "Caught empty query" >> {log} + touch {output} + fi + """ + + +rule calculate_coverage: + input: + bam=VIRUS_FP / "alignments" / "{sample}.sorted.bam", + idx=VIRUS_FP / "alignments" / "{sample}.sorted.bam.bai", + output: + VIRUS_FP / "alignments" / "{sample}.genomecoverage.txt", + params: + ext_fp=str(get_extension_path()), + conda: + "envs/sbx_cenote_taker.yml" + container: + f"docker://sunbeamlabs/sbx_cenote_taker:{SBX_CENOTE_TAKER_VERSION}-sbx-cenote-taker" + shell: + """ + samtools view -b {input.bam} | genomeCoverageBed -ibam stdin | grep -v 'genome'| perl {params.ext_fp}/scripts/coverage_counter.pl > {output} + """ + + +rule combine_coverage_stats: + input: + cov=VIRUS_FP / "alignments" / "{sample}.genomecoverage.txt", + stats=VIRUS_FP / "alignments" / "{sample}.sorted.idxstats.tsv", + output: + VIRUS_FP / "alignments" / "{sample}.align.summary.txt", + benchmark: + BENCHMARK_FP / "combine_coverage_stats_{sample}.tsv" + log: + LOG_FP / "combine_coverage_stats_{sample}.log", + params: + ext_fp=str(get_extension_path()), + conda: + "envs/r_env.yml" + container: + "docker://r-base:latest" + shell: + """ + Rscript {params.ext_fp}/scripts/combine_coverage_stats.R {input.cov} {input.stats} {output} 2>&1 | tee {log} + """ + + +rule virus_coverage_per_gene: + input: + mpileup=VIRUS_FP / "alignments" / "{sample}.mpileup", + btf=VIRUS_FP / "blastx" / "{sample}.btf", + output: + tsv=VIRUS_FP / "alignments" / "{sample}.gene_coverage.tsv", + params: + 
contigs=cenote_output(), + conda: + "envs/sbx_cenote_taker.yml" + container: + f"docker://sunbeamlabs/sbx_cenote_taker:{SBX_CENOTE_TAKER_VERSION}-sbx-cenote-taker" + script: + "scripts/virus_coverage_per_gene.py" + + +rule all_summary: + input: + expand( + VIRUS_FP / "alignments" / "{sample}.align.summary.txt", + sample=Samples.keys(), + ), + output: + VIRUS_FP / "summary" / "all_align_summary.txt", + shell: + """ + echo -e "Sample\tAlignTarget\tFractionCoverage\tTargetLength\tMappedReads" > {output} + cat {input} >> {output} + """ diff --git a/sbx_virus_id.smk b/sbx_virus_id.smk deleted file mode 100755 index 0d51461..0000000 --- a/sbx_virus_id.smk +++ /dev/null @@ -1,461 +0,0 @@ -# -*- mode: Snakemake -*- -# -# Rules for running Cenote-Taker2 and other tools in the viral id pipeline - -VIRUS_FP = Cfg["all"]["output_fp"] / "virus" - - -try: - BENCHMARK_FP -except NameError: - BENCHMARK_FP = Cfg["all"]["output_fp"] / "benchmarks" -try: - LOG_FP -except NameError: - LOG_FP = Cfg["all"]["output_fp"] / "logs" - - -def get_virus_ext_path() -> Path: - ext_path = Path(sunbeam_dir) / "extensions" / "sbx_virus_id" - if ext_path.exists(): - return ext_path - raise Error( - "Filepath for virus_id not found, are you sure it's installed under extensions/sbx_virus_id?" 
- ) - - -SBX_VIRUS_ID_VERSION = open(get_virus_ext_path() / "VERSION").read().strip() - - -def virus_sorter_input() -> Path: - if Cfg["sbx_virus_id"]["use_spades"]: - return ASSEMBLY_FP / "virus_id_spades" / "{sample}" / "scaffolds.fasta" - else: - return ASSEMBLY_FP / "megahit" / "{sample}_asm" / "final.contigs.fa" - - -def virus_sorter_output() -> Path: - if Cfg["sbx_virus_id"]["use_virsorter"]: - return VIRUS_FP / "virsorter" / "{sample}.fasta" - else: - return VIRUS_FP / "cenote_taker" / "{sample}.fasta" - - -rule all_virus_id: - input: - expand( - VIRUS_FP / "alignments" / "{sample}.gene_coverage.tsv", - sample=Samples.keys(), - ), - expand( - VIRUS_FP / "blastx" / "{sample}.btf", - sample=Samples.keys(), - ), - VIRUS_FP / "summary" / "all_align_summary.txt", - - -rule virus_id_spades_paired: - input: - r1=QC_FP / "decontam" / "{sample}_1.fastq.gz", - r2=QC_FP / "decontam" / "{sample}_2.fastq.gz", - output: - ASSEMBLY_FP / "virus_id_spades" / "{sample}" / "scaffolds.fasta", - benchmark: - BENCHMARK_FP / "virus_id_spades_paired_{sample}.tsv" - log: - LOG_FP / "virus_id_spades_paired_{sample}.log", - params: - out_fp=str(ASSEMBLY_FP / "virus_id_spades" / "{sample}"), - threads: 4 - conda: - "envs/spades_env.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-spades" - resources: - mem_mb=20000, - runtime=720, - shell: - """ - spades.py -1 {input.r1} -2 {input.r2} -t {threads} -o {params.out_fp} 2>&1 | tee {log} - """ - - -rule install_cenote_taker: - output: - VIRUS_FP / "cenote_taker" / ".installed", - benchmark: - BENCHMARK_FP / "install_cenote_taker.tsv" - log: - LOG_FP / "install_cenote_taker.log", - params: - db_fp=Cfg["sbx_virus_id"]["cenote_taker_db"], - extra_dbs=Cfg["sbx_virus_id"]["cenote_taker_extra_dbs"], - resources: - runtime=2400, - conda: - "envs/cenote_taker_env.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-cenote-taker" - shell: - """ - conda env config vars set 
CENOTE_DBS={params.db_fp} - - if [ -d {params.db_fp} ] && [ "$(ls -A {params.db_fp})" ]; then - echo "Cenote-Taker database already installed" >> {log} - touch {output} - exit 0 - fi - - if [[ {params.extra_dbs} == "True" ]]; then - echo "Installing Cenote-Taker database with hhsuite" >> {log} - get_ct3_dbs -o {params.db_fp} --hmm T --hallmark_tax T --refseq_tax T --mmseqs_cdd T --domain_list T --hhCDD T --hhPFAM T --hhPDB T >> {log} 2>&1 - else - echo "Installing Cenote-Taker database without hhsuite" >> {log} - get_ct3_dbs -o {params.db_fp} --hmm T --hallmark_tax T --refseq_tax T --mmseqs_cdd T --domain_list T >> {log} 2>&1 - fi - - touch {output} - """ - - -rule cenote_taker: - input: - contigs=virus_sorter_input(), - install=VIRUS_FP / "cenote_taker" / ".installed", - output: - VIRUS_FP / "cenote_taker" / "{sample}" / "final.contigs.fasta", - VIRUS_FP - / "cenote_taker" - / "{sample}" - / "{sample}" - / "{sample}_CONTIG_SUMMARY.tsv", - benchmark: - BENCHMARK_FP / "cenote_taker_{sample}.tsv" - log: - LOG_FP / "cenote_taker_{sample}.log", - params: - run_script=str(get_virus_ext_path() / "Cenote-Taker2" / "run_cenote-taker2.py"), - out_dir=str(VIRUS_FP / "cenote_taker"), - sample="{sample}", - db_fp=Cfg["sbx_virus_id"]["cenote_taker_db"], - resources: - mem_mb=24000, - runtime=720, - conda: - "envs/cenote_taker_env.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-cenote-taker" - shell: - """ - SAMPLE={params.sample} - if [[ ${{#SAMPLE}} -lt 18 ]] && [[ {params.sample} =~ ^[a-zA-Z0-9_]+$ ]]; then - echo "Sample name format is valid" >> {log} - else - echo "Cenote-Taker requires a sample name that is less than 18 characters and contains only alphanumeric characters and underscores" >> {log} - exit 1 - fi - - if [ -s {input.contigs} ]; then - echo "Contigs file exists and is not empty" >> {log} - else - echo "Contigs file is empty" >> {log} - exit 1 - fi - - cd {params.out_dir} - cenotetaker3 --contigs {input.contigs} -r {params.sample} 
-p T >> {log} 2>&1 - """ - - -rule install_virsorter: - output: - VIRUS_FP / "virsorter" / ".installed", - benchmark: - BENCHMARK_FP / "install_virsorter.tsv" - log: - LOG_FP / "install_virsorter.log", - params: - db_fp=Cfg["sbx_virus_id"]["virsorter_db"], - resources: - runtime=2400, - threads: 4 - conda: - "envs/virsorter_env.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-virsorter" - shell: - """ - # First check if directory exists and has files - if [ -d {params.db_fp} ] && [ "$(ls -A {params.db_fp})" ]; then - echo "VirSorter database already installed" - touch {output} - exit 0 - fi - - echo "Installing VirSorter database" - virsorter setup -d {params.db_fp} -j 4 - touch {output} - """ - - -rule virsorter: - input: - contigs=virus_sorter_input(), - install=VIRUS_FP / "virsorter" / ".installed", - output: - combined_viral=VIRUS_FP / "virsorter" / "{sample}" / "final-viral-combined.fa", - scores=VIRUS_FP / "virsorter" / "{sample}" / "final-viral-score.tsv", - boundaries=VIRUS_FP / "virsorter" / "{sample}" / "final-viral-boundary.tsv", - benchmark: - BENCHMARK_FP / "virsorter_{sample}.tsv" - log: - LOG_FP / "virsorter_{sample}.log", - params: - out_dir=str(VIRUS_FP / "virsorter" / "{sample}"), - db_fp=Cfg["sbx_virus_id"]["virsorter_db"], - resources: - mem_mb=24000, - runtime=720, - threads: 4 - conda: - "envs/virsorter_env.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-virsorter" - shell: - """ - virsorter run -w {params.out_dir} -i {input.contigs} --min-length 1000 -j {threads} --db-dir {params.db_fp} all - """ - - -rule filter_cenote_contigs: - input: - contigs=VIRUS_FP / "cenote_taker" / "{sample}" / "final.contigs.fasta", - summary=VIRUS_FP - / "cenote_taker" - / "{sample}" - / "{sample}" - / "{sample}_CONTIG_SUMMARY.tsv", - output: - VIRUS_FP / "cenote_taker" / "{sample}.fasta", - params: - include_phages=Cfg["sbx_virus_id"]["include_phages"], - script: - "scripts/filter_cenote_contigs.py" - - 
-rule filter_virsorter_contigs: - input: - contigs=VIRUS_FP / "virsorter" / "{sample}" / "final-viral-combined.fa", - output: - VIRUS_FP / "virsorter" / "{sample}.fasta", - script: - "scripts/filter_virsorter_contigs.py" - - -rule build_virus_index: - input: - virus_sorter_output(), - output: - str(virus_sorter_output()) + ".1.bt2", # Don't use f-string, broken with python 3.12 - conda: - "envs/sbx_virus_id.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-sbx-virus-id" - threads: Cfg["sbx_virus_id"]["bowtie2_build_threads"] - shell: - "bowtie2-build --threads {threads} -f {input} {input}" - - -rule align_virus_reads: - input: - r1=QC_FP / "decontam" / "{sample}_1.fastq.gz", - r2=QC_FP / "decontam" / "{sample}_2.fastq.gz", - index=str(virus_sorter_output()) + ".1.bt2", # Don't use f-string, broken with python 3.12 - output: - temp(VIRUS_FP / "alignments" / "{sample}.sam"), - params: - index=str(virus_sorter_output()), - threads: 6 - conda: - "envs/sbx_virus_id.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-sbx-virus-id" - shell: - "bowtie2 -q --local -t --very-sensitive-local --threads {threads} --no-mixed --no-discordant -x {params.index} -1 {input.r1} -2 {input.r2} -S {output}" - - -rule process_virus_alignment: - input: - VIRUS_FP / "alignments" / "{sample}.sam", - output: - bam=temp(VIRUS_FP / "alignments" / "{sample}.bam"), - sorted=temp(VIRUS_FP / "alignments" / "{sample}.sorted.bam"), - bai=temp(VIRUS_FP / "alignments" / "{sample}.sorted.bam.bai"), - params: - target=str(virus_sorter_output()), - conda: - "envs/sbx_virus_id.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-sbx-virus-id" - shell: - """ - samtools view -bT {params.target} {input} > {output.bam} - samtools sort -o {output.sorted} {output.bam} - samtools index {output.sorted} {output.bai} - """ - - -rule calculate_mapping_stats: - input: - bam=VIRUS_FP / "alignments" / "{sample}.sorted.bam", - idx=VIRUS_FP 
/ "alignments" / "{sample}.sorted.bam.bai", - output: - VIRUS_FP / "alignments" / "{sample}.sorted.idxstats.tsv", - conda: - "envs/sbx_virus_id.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-sbx-virus-id" - shell: - """ - samtools idxstats {input.bam} > {output} - """ - - -rule virus_mpileup: - input: - bam=VIRUS_FP / "alignments" / "{sample}.sorted.bam", - idx=VIRUS_FP / "alignments" / "{sample}.sorted.bam.bai", - contigs=virus_sorter_output(), - output: - VIRUS_FP / "alignments" / "{sample}.mpileup", - conda: - "envs/sbx_virus_id.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-sbx-virus-id" - shell: - """ - samtools mpileup -f {input.contigs} {input.bam} > {output} - """ - - -rule filter_virus_coverage: - input: - fa=virus_sorter_output(), - idx=VIRUS_FP / "alignments" / "{sample}.sorted.idxstats.tsv", - output: - VIRUS_FP / "final_{sample}_contigs.fasta", - log: - LOG_FP / "filter_virus_coverage_{sample}.log", - script: - "scripts/filter_virus_coverage.py" - - -rule virus_blastx: - """Run blastx on untranslated genes against a target db and write to blast tabular format.""" - input: - VIRUS_FP / "final_{sample}_contigs.fasta", - output: - VIRUS_FP / "blastx" / "{sample}.btf", - benchmark: - BENCHMARK_FP / "run_virus_blastx_{sample}.tsv" - log: - LOG_FP / "run_virus_blastx_{sample}.log", - params: - blast_db=Cfg["sbx_virus_id"]["blast_db"], - threads: Cfg["sbx_virus_id"]["blastx_threads"] - resources: - mem_mb=24000, - runtime=720, - conda: - "envs/sbx_virus_id.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-sbx-virus-id" - shell: - """ - if [ -s {input} ]; then - export BLASTDB=$(dirname {params.blast_db}) - blastx \ - -query {input} \ - -db $(basename {params.blast_db}) \ - -outfmt "7 qacc sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle" \ - -num_threads {threads} \ - -evalue 0.05 \ - -max_target_seqs 100 \ - -out {output} \ - 2>&1 | 
tee {log} - else - echo "Caught empty query" >> {log} - touch {output} - fi - """ - - -rule calculate_coverage: - input: - bam=VIRUS_FP / "alignments" / "{sample}.sorted.bam", - idx=VIRUS_FP / "alignments" / "{sample}.sorted.bam.bai", - output: - VIRUS_FP / "alignments" / "{sample}.genomecoverage.txt", - params: - ext_fp=str(get_virus_ext_path()), - conda: - "envs/sbx_virus_id.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-sbx-virus-id" - shell: - """ - samtools view -b {input.bam} | genomeCoverageBed -ibam stdin | grep -v 'genome'| perl {params.ext_fp}/scripts/coverage_counter.pl > {output} - """ - - -rule combine_coverage_stats: - input: - cov=VIRUS_FP / "alignments" / "{sample}.genomecoverage.txt", - stats=VIRUS_FP / "alignments" / "{sample}.sorted.idxstats.tsv", - output: - VIRUS_FP / "alignments" / "{sample}.align.summary.txt", - benchmark: - BENCHMARK_FP / "combine_coverage_stats_{sample}.tsv" - log: - LOG_FP / "combine_coverage_stats_{sample}.log", - params: - ext_fp=str(get_virus_ext_path()), - conda: - "envs/r_env.yml" - container: - "docker://r-base:latest" - shell: - """ - Rscript {params.ext_fp}/scripts/combine_coverage_stats.R {input.cov} {input.stats} {output} 2>&1 | tee {log} - """ - - -rule virus_coverage_per_gene: - input: - mpileup=VIRUS_FP / "alignments" / "{sample}.mpileup", - btf=VIRUS_FP / "blastx" / "{sample}.btf", - output: - tsv=VIRUS_FP / "alignments" / "{sample}.gene_coverage.tsv", - params: - contigs=virus_sorter_output(), - conda: - "envs/sbx_virus_id.yml" - container: - f"docker://sunbeamlabs/sbx_virus_id:{SBX_VIRUS_ID_VERSION}-sbx-virus-id" - script: - "scripts/virus_coverage_per_gene.py" - - -rule all_summary: - input: - expand( - VIRUS_FP / "alignments" / "{sample}.align.summary.txt", - sample=Samples.keys(), - ), - output: - VIRUS_FP / "summary" / "all_align_summary.txt", - shell: - """ - echo -e "Sample\tAlignTarget\tFractionCoverage\tTargetLength\tMappedReads" > {output} - cat {input} >> {output} - 
""" diff --git a/scripts/filter_cenote_contigs.py b/scripts/filter_cenote_contigs.py index 04d93fc..6e23c00 100755 --- a/scripts/filter_cenote_contigs.py +++ b/scripts/filter_cenote_contigs.py @@ -1,17 +1,50 @@ import csv -from sunbeamlib.parse import parse_fasta, write_fasta +import os +from typing import Generator, TextIO -with open(snakemake.input.summary) as f_summary, open( - snakemake.input.contigs -) as f_contigs, open(snakemake.output[0], "w") as f_out: +def parse_fasta(f: TextIO) -> Generator[tuple[str, str], None, None]: + header_str = "" + seq_str = "" + for line in f.readlines(): + line = line.strip() + if line.startswith(">"): + if header_str: + yield header_str, seq_str + header_str = line + seq_str = "" + else: + seq_str += line + if header_str: + yield header_str, seq_str + + +def write_fasta(record: list[str], f: TextIO) -> None: + f.write(f"{record[0]}\n") + f.write(f"{record[1]}\n") + + +summary = snakemake.input.summary # type: ignore +contigs = snakemake.input.contigs # type: ignore +output_fp = snakemake.output[0] # type: ignore +include_phages = snakemake.params["include_phages"] # type: ignore + +# Empty output if empty contigs +if os.path.getsize(contigs) == 0: + with open(output_fp, "w") as f_out: + pass + exit(0) + +with open(summary) as f_summary, open(contigs) as f_contigs, open( + output_fp, "w" +) as f_out: dr = csv.DictReader(f_summary, delimiter="\t") cd = {} phages = ["phage", "siphoviridae", "conjugative transposon"] for line in dr: if ( all([x not in line["ORGANISM_NAME"].lower() for x in phages]) - or snakemake.params["include_phages"] + or include_phages ) and int(line["NUM_HALLMARKS"]) > 0: cd[line["ORIGINAL_NAME"]] = 1 diff --git a/scripts/filter_virsorter_contigs.py b/scripts/filter_virsorter_contigs.py deleted file mode 100755 index 5bb2f17..0000000 --- a/scripts/filter_virsorter_contigs.py +++ /dev/null @@ -1,8 +0,0 @@ -from sunbeamlib.parse import parse_fasta, write_fasta - -# This does nothing but I'm leaving it in case 
we want to add custom filtering here later -with open(snakemake.input.contigs) as f_contigs, open( - snakemake.output[0], "w" -) as f_out: - for header_str, seq_str in parse_fasta(f_contigs): - write_fasta((header_str, seq_str), f_out) diff --git a/scripts/filter_virus_coverage.py b/scripts/filter_virus_coverage.py index 8509ee2..dda0cdb 100755 --- a/scripts/filter_virus_coverage.py +++ b/scripts/filter_virus_coverage.py @@ -1,9 +1,36 @@ import csv -from sunbeamlib.parse import parse_fasta, write_fasta +import os +from typing import Generator, TextIO + +def parse_fasta(f: TextIO) -> Generator[tuple[str, str], None, None]: + header_str = "" + seq_str = "" + for line in f.readlines(): + line = line.strip() + if line.startswith(">"): + if header_str: + yield header_str, seq_str + header_str = line + seq_str = "" + else: + seq_str += line + if header_str: + yield header_str, seq_str + + +def write_fasta(record: list[str], f: TextIO) -> None: + f.write(f"{record[0]}\n") + f.write(f"{record[1]}\n") + + +idx = snakemake.input.idx # type: ignore +fa = snakemake.input.fa # type: ignore +output_fp = snakemake.output[0] # type: ignore +log_fp = snakemake.log[0] # type: ignore contigs = {} -with open(snakemake.input.idx) as f_idx: +with open(idx) as f_idx: rd = csv.reader(f_idx, delimiter="\t", quotechar='"') for row in rd: if row[0] != "*": @@ -12,11 +39,11 @@ else: contigs[row[0]] = 0 -with open(snakemake.log[0], "w") as f_log: +with open(log_fp, "w") as f_log: f_log.write(f"Contigs: {contigs}") -with open(snakemake.input.fa) as f_fa, open(snakemake.output[0], "w") as f_out: +with open(fa) as f_fa, open(output_fp, "w") as f_out: for header, seq in parse_fasta(f_fa): contig_name = header.split(" ")[0] if contigs[contig_name]: - write_fasta((contig_name, seq), f_out) + write_fasta([contig_name, seq], f_out)