11#!/usr/bin/env python
22
3- __version__ = '1.1 '
3+ __version__ = '1.2 '
44
55import argparse
66import logging
@@ -30,12 +30,16 @@ def syscall(syscmd):
3030 sys .exit ('ERROR: failed syscall ' + syscmd )
3131
3232def translateSeq (nuclSeq ):
33+ '''use biopython to take in a nucleotide sequence (string) and return a protein sequence'''
3334 proteinSeq = Seq (nuclSeq , generic_dna ).translate (cds = False , to_stop = False , stop_symbol = '*' )
3435 return proteinSeq
3536
3637def internalSTOPcodon (candidate ):
3738 '''counts number of internal stop codons; requires sequence from database to be in frame'''
38- nucSeq = candidate [10 ]
39+ if '-' in candidate [10 ]:
40+ nucSeq = candidate [10 ].replace ('-' , '' )
41+ else :
42+ nucSeq = candidate [10 ]
3943 if int (int (candidate [8 ])% 3 ) == 1 :
4044 frameStart = int (0 )
4145 elif int (int (candidate [8 ])% 3 ) == 0 :
@@ -44,13 +48,12 @@ def internalSTOPcodon(candidate):
4448 frameStart = int (2 )
4549 if frameStart > 0 :
4650 nucSeq = nucSeq [frameStart :]
47- frameEnd = int (int (candidate [9 ])% 3 )
48- if frameEnd > 0 :
49- nucSeq = nucSeq [:- frameEnd ]
5051 if len (nucSeq )% 3 == 0 :
5152 protein = translateSeq (nucSeq )
53+ elif len (nucSeq )% 3 > 0 :
54+ protein = translateSeq (nucSeq [:- (len (nucSeq )% 3 )])
5255 else :
53- sys .exit ('ERROR: incorrect nucleotide length (%s ) after trimming' % len (nucSeq ))
56+ sys .exit ('ERROR: incorrect nucleotide length ({} ) after trimming' . format ( len (nucSeq ) ))
5457 numInternalSTOPcodons = len (re .findall (r'\*[ABCDEFGHIKLMNPQRSTVWYZ]' , str (protein )))
5558 return (protein , str (numInternalSTOPcodons ))
5659
@@ -74,22 +77,22 @@ def main(args=None):
7477 baseGenome = os .path .splitext (os .path .basename (genome ))[0 ]
7578 if not os .path .exists (outdir ):
7679 os .mkdir (outdir )
77- logging .basicConfig (filename = '%s /c-SSTAR_%s .log' % (outdir , baseGenome ), format = '%(asctime)s: %(levelname)s: %(message)s' , datefmt = '%d-%m-%Y %I:%M:%S %p' , level = logging .INFO )
78- logging .info ('user: %s' % pwd .getpwuid (os .getuid ()).pw_name )
79- logging .info ('release: %s' % os .uname ()[3 ])
80- logging .info ('shell env: %s' % pwd .getpwuid (os .getuid ()).pw_shell )
81- logging .info ('cwd: %s' % pwd .getpwuid (os .getuid ()).pw_dir )
82- logging .info ('python version: %s' % sys .version )
80+ logging .basicConfig (filename = '{} /c-SSTAR_{} .log' . format (outdir , baseGenome ), format = '%(asctime)s: %(levelname)s: %(message)s' , datefmt = '%d-%m-%Y %I:%M:%S %p' , level = logging .INFO )
81+ logging .info ('user: {}' . format ( pwd .getpwuid (os .getuid ()).pw_name ) )
82+ logging .info ('release: {}' . format ( os .uname ()[3 ]) )
83+ logging .info ('shell env: {}' . format ( pwd .getpwuid (os .getuid ()).pw_shell ) )
84+ logging .info ('cwd: {}' . format ( pwd .getpwuid (os .getuid ()).pw_dir ) )
85+ logging .info ('python version: {}' . format ( sys .version ) )
8386 logging .info (subprocess .check_output ('command -v blastn' , shell = True ).rstrip ())
8487 logging .info (subprocess .check_output ('blastn -version | tail -n1' , shell = True ).rstrip ())
8588 database = args .database
8689 similarity = args .similarity
87- syscall ('makeblastdb -in ' + genome + ' -out ' + os .path .join (outdir , baseGenome ) + ' -dbtype nucl' )
88- syscall ('blastn -task blastn -query {0} -db {out} -out {out}.blastn.tsv -evalue 1e-5 -max_target_seqs 1 -ungapped - perc_identity {1} -culling_limit 1 -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen sseq"' .format (database , similarity , out = os .path .join (outdir , baseGenome )))
89- syscall ( 'rm ' + os .path .join (outdir , baseGenome ) + '.nin' )
90- syscall ( 'rm ' + os .path .join (outdir , baseGenome ) + '.nsq' )
91- syscall ( 'rm ' + os .path .join (outdir , baseGenome ) + '.nhr' )
92- with open (os .path .join (outdir , baseGenome ) + '.blastn.tsv' ) as infile :
90+ syscall ('makeblastdb -in {} -out {} -dbtype nucl' . format ( genome , os .path .join (outdir , baseGenome )) )
91+ syscall ('blastn -task blastn -query {0} -db {out} -out {out}.blastn.tsv -evalue 1e-5 -max_target_seqs 1 -perc_identity {1} -culling_limit 1 -outfmt "6 qseqid sseqid pident length mismatch gaps qstart qend sstart send evalue bitscore qlen sseq"' .format (database , similarity , out = os .path .join (outdir , baseGenome )))
92+ os . remove ( os .path .join (outdir , baseGenome + '.nin' ) )
93+ os . remove ( os .path .join (outdir , baseGenome + '.nsq' ) )
94+ os . remove ( os .path .join (outdir , baseGenome + '.nhr' ) )
95+ with open (os .path .join (outdir , baseGenome + '.blastn.tsv' ) ) as infile :
9396 best = ['-1' ,'a' ,'a' ,'a' ,0 ,'-1' ,'a' ,0 ,'a' ,'a' ,'a' ]
9497 currentClusterNr = '-1'
9598 topHits = []
0 commit comments