Module DIGEST
[hide private]
[frames] | no frames]

Source Code for Module DIGEST

  1  #!/usr/bin/python 
  2  # -*- coding: iso-8859-1 -*- 
  3  import subprocess 
  4  import os 
  5  from DIGEST_functions import * 
  6  from math import * 
  7   
  8  __doc__=""" 
  9  Main script for DIGEST workflow CCRT version 
 10   
 11  @requires: U{python<https://www.python.org/downloads/>} (tested with 2.7.3) 
 12  @requires: U{hashlib<https://pypi.python.org/pypi/hashlib>} 
 13  @requires: U{BWA<http://sourceforge.net/projects/bio-bwa/files/>} 
 14  @requires: U{RayMeta<http://sourceforge.net/projects/denovoassembler/files/>} 
 15  @requires: U{Metagene<http://metagene.nig.ac.jp/metagene/download_mga.html>} 
 16  @requires: U{cd-hit<https://code.google.com/p/cdhit/downloads/list>} (tested with v4.5.8-2012-03-24) 
 17   
 18   
 19  @requires: DIGEST_functions.py (PYTHONPATH) 
 20  @requires: SAMparser.py 
 21  @requires: ExtractFastaFromSAM.py 
 22  @requires: extendTargets_allPossibility.py 
 23  @requires: U{hashlib<http://kirill-kryukov.com/study/tools/fasta-splitter/>}fasta-splitter.pl 
 24  @requires: extractORFsequences.py 
 25  @requires: removeIdenticalSeq.py 
 26  @requires: cd-hit-para-CCRT.py 
 27  @requires: DIGEST_clear.sh 
 28  @requires: DIGEST_check.sh 
 29   
 30   
 31  """ 
 32   
33 -def get_parser():
34 35 parser = argparse.ArgumentParser(description='Main script for DIGEST workflow') 36 37 parser.add_argument('-source', action="store", dest='DIGEST_HOME', 38 type=str, help='digest-ccrt path') 39 40 parser.add_argument('-R', action="store", dest='reference', 41 type=str, help='reference in FASTA format with its index files in the same folder') 42 43 parser.add_argument('-1', action="store", dest='pair1', 44 type=str, help='1st FASTQ file from a pair') 45 46 parser.add_argument('-2', action="store", dest='pair2', 47 type=str, help='2nd FASTQ file from a pair') 48 49 parser.add_argument('-o', action="store", dest='Prefix', 50 type=str, default='output', help='output prefix (default:output)') 51 52 parser.add_argument('-qual', action="store", dest='minMAPQ', 53 type=str, default='30', help='MAPQ min to keep alignment (default:30)') 54 55 parser.add_argument('-k', action="store", dest='kmerLength', 56 type=str, default='27', help='kmer length for Ray (default:27)') 57 58 parser.add_argument('-n', action="store", dest='limLength', 59 type=int, default=100, help='min length for partial ORF (default=100)') 60 61 parser.add_argument('-c', action="store", dest='clutserThreshold', 62 type=str, default='0.95', help='sequence identity threshold for clustering (default=0.95)') 63 64 parser.add_argument('-aS', action="store", dest='minAlig', 65 type=str, default='0.0', help='alignment coverage for the shorter sequence (default:0.0) if set to 0.9,the alignment must covers 90 pourcent of the sequence') 66 67 parser.add_argument('-A', action="store", dest='projid', 68 type=str, default=None, help='CCRT project/account name (default:None)') 69 70 parser.add_argument('-q', action="store", dest='Queue', 71 type=str, default='large', help='Job Priority (default:large)') 72 73 parser.add_argument('-t', action="store", dest='Processors', 74 type=int, default=16, help='Numbers of threads requested to run each job (default:16)') 75 76 parser.add_argument('-N', action="store", dest='Nodes', 77 type=int, default=10, help='maximum number of nodes to use together (default:1)') 78 79 #parser.add_argument('-m', action="store", dest='maxMemory', 80 # type=str, default='6000', help='max available memory in Mbyte for cd-hit(default=6000)') 81 82 parser.add_argument('--loop', dest='loop', action='store_true', help='DIGEST loop (default=False)', default=False) 83 84 85 86 return parser
87 88
89 -def main():
90 91 92 ##################### gets arguments ##################### 93 parser=get_parser() 94 95 if len(sys.argv)==1: 96 parser.print_help() 97 sys.exit(1) 98 99 Arguments=parser.parse_args() 100 101 if('/' in Arguments.Prefix): 102 liste = Arguments.Prefix.split("/") 103 dirName = "/".join(liste[0:len(liste)-1]) 104 chdir(dirName) 105 Arguments.Prefix = "".join(liste[len(liste)-1:len(liste)]) 106 107 108 ##################### check input files ##################### 109 110 print "Check input files..." 111 if not exist(Arguments.reference): 112 print "reference FASTA file not found!" 113 sys.exit(0) 114 if not exist(Arguments.reference + ".bwt"): 115 print "index BWA " + Arguments.reference + ".bwt not found!" 116 sys.exit(0) 117 if not exist(Arguments.pair1): 118 print "1st FASTQ file not found!" 119 sys.exit(0) 120 if not exist(Arguments.pair2): 121 print "2nd FASTQ file not found!" 122 sys.exit(0) 123 print "\tOK!" 124 125 126 #DIGEST_HOME 127 DIGEST_HOME = Arguments.DIGEST_HOME 128 if(DIGEST_HOME[len(DIGEST_HOME)-1]=="/"): 129 DIGEST_HOME = DIGEST_HOME[0:len(DIGEST_HOME)-1] 130 DIGEST_HOME = DIGEST_HOME + "/src/main/scripts" 131 132 133 134 135 ##################### alignment bwa ##################### 136 137 ####### bwa alignment 138 139 bwalignmentFile = open("bwalignment.sh", "wb") 140 141 #header 142 bwalignmentFile.write("#!/bin/bash\n#MSUB -n 2 \n") 143 bwalignmentFile.write("#MSUB -c " + str(Arguments.Processors) + " \n") 144 bwalignmentFile.write("#MSUB -q " + Arguments.Queue + " \n") 145 bwalignmentFile.write("#MSUB -o bwalignment.o \n") 146 bwalignmentFile.write("#MSUB -e bwalignment.e \n") 147 if(Arguments.projid): 148 bwalignmentFile.write("#MSUB -A " + Arguments.projid + " \n") 149 bwalignmentFile.write("set -x \n") 150 151 #load module 152 bwalignmentFile.write("module load extenv/fg\n") 153 bwalignmentFile.write("module load bwa\n") 154 155 #commands 156 bwalignmentFile.write("bwa aln " + Arguments.reference + " " + Arguments.pair1 + " -t " + str(Arguments.Processors) + " > " + Arguments.Prefix + "_p1.sai &\n") 157 bwalignmentFile.write("bwa aln " + Arguments.reference + " " + Arguments.pair2 + " -t " + str(Arguments.Processors) + " > " + Arguments.Prefix + "_p2.sai &\n") 158 bwalignmentFile.write("wait\n") 159 160 bwalignmentFile.close() 161 162 #launch job and gets the job ID 163 commandeLine = "ccc_msub bwalignment.sh" 164 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 165 result=process.communicate() 166 bwalignmentJobID = result[0].split()[3] 167 if("\n" in bwalignmentJobID): 168 bwalignmentJobID = bwalignmentJobID[0:len(bwalignmentJobID)-1] 169 print commandeLine + " --> job=" + bwalignmentJobID + result[1] 170 171 ####### bwa process 172 173 bwaProcess = open("bwaProcess.sh", "wb") 174 175 #header 176 bwaProcess.write("#!/bin/bash \n") 177 bwaProcess.write("#MSUB -q " + Arguments.Queue + " \n") 178 bwaProcess.write("#MSUB -o bwaProcess.o \n") 179 bwaProcess.write("#MSUB -e bwaProcess.e \n") 180 if(Arguments.projid): 181 bwaProcess.write("#MSUB -A " + Arguments.projid + " \n") 182 bwaProcess.write("set -x \n") 183 184 #load module 185 bwaProcess.write("module load extenv/fg\n") 186 bwaProcess.write("module load bwa samtools\n") 187 188 #Test 189 bwaProcess.write("if [ ! -f " + Arguments.Prefix + "_p1.sai ] || [ ! -f " + Arguments.Prefix + "_p2.sai ]\nthen\n") 190 bwaProcess.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n") 191 bwaProcess.write("\t\techo 'Error bwa alignment: .sai files not found' > errorProcess.txt\n") 192 bwaProcess.write("\tfi\n") 193 bwaProcess.write("\texit 1\n") 194 bwaProcess.write("fi\n") 195 196 #commands 197 bwaProcess.write("bwa sampe " + Arguments.reference + " " + Arguments.Prefix + "_p1.sai " + Arguments.Prefix + "_p2.sai " + Arguments.pair1 + \ 198 " " + Arguments.pair2 + " > " + Arguments.Prefix + "_p.sam \n") 199 bwaProcess.write("rm " + Arguments.Prefix + "*.sai \n") 200 bwaProcess.write("samtools view -Sb " + Arguments.Prefix + "_p.sam > " + Arguments.Prefix + "_p.bam \n") 201 bwaProcess.write("rm " + Arguments.Prefix + "_p.sam \n") 202 bwaProcess.write("samtools sort -n " + Arguments.Prefix + "_p.bam " + Arguments.Prefix + "_sorted \n") 203 bwaProcess.write("rm " + Arguments.Prefix + "_p.bam \n") 204 bwaProcess.write("samtools view -o " + Arguments.Prefix + "_sorted.sam " + Arguments.Prefix + "_sorted.bam \n") 205 206 bwaProcess.close() 207 208 #launch job and gets the job ID 209 commandeLine = "ccc_msub -a " + bwalignmentJobID + " bwaProcess.sh" 210 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 211 result=process.communicate() 212 bwaProcessJobID = result[0].split()[3] 213 if("\n" in bwaProcessJobID): 214 bwaProcessJobID = bwaProcessJobID[0:len(bwaProcessJobID)-1] 215 print commandeLine + " --> job=" + bwaProcessJobID + result[1] 216 217 218 219 220 ##################### Reads filtering ##################### 221 222 readFilter = open("readFilter.sh", "wb") 223 224 #header 225 readFilter.write("#!/bin/bash \n") 226 readFilter.write("#MSUB -q " + Arguments.Queue + " \n") 227 readFilter.write("#MSUB -o readFilter.o \n") 228 readFilter.write("#MSUB -e readFilter.e \n") 229 if(Arguments.projid): 230 readFilter.write("#MSUB -A " + Arguments.projid + " \n") 231 readFilter.write("set -x \n") 232 233 #load module 234 readFilter.write("module load extenv/fg\n") 235 readFilter.write("module load samtools\n") 236 readFilter.write("module load python/2.7.3\n") 237 238 #Test 239 readFilter.write("if [ ! -f " + Arguments.Prefix + "_sorted.sam ]\nthen\n") 240 readFilter.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n") 241 readFilter.write("\t\techo 'Error bwa process: " + Arguments.Prefix + "_sorted.sam file not found' > errorProcess.txt\n") 242 readFilter.write("\tfi\n") 243 readFilter.write("\texit 1\n") 244 readFilter.write("fi\n") 245 246 #commands 247 readFilter.write("python " + DIGEST_HOME + "/ExtractFastaFromSAM.py -sam " + Arguments.Prefix + \ 248 "_sorted.sam -out1 " + Arguments.Prefix + "_overlap -out2 " + Arguments.Prefix + "_unmap \n") 249 readFilter.write("rm " + Arguments.Prefix + "_sorted.sam\n") 250 251 readFilter.close() 252 253 #launch job and gets the job ID 254 commandeLine = "ccc_msub -a " + bwaProcessJobID + " readFilter.sh" 255 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 256 result=process.communicate() 257 readFilterJobID = result[0].split()[3] 258 if("\n" in readFilterJobID): 259 readFilterJobID = readFilterJobID[0:len(readFilterJobID)-1] 260 print commandeLine + " --> job=" + readFilterJobID + result[1] 261 262 263 264 265 ##################### Assembly with Ray ##################### 266 267 pair1 = Arguments.Prefix + "_overlap_p1.fasta" 268 pair2 = Arguments.Prefix + "_overlap_p2.fasta" 269 rayAssembly = open("rayAssembly.sh", "wb") 270 271 #header 272 rayAssembly.write("#!/bin/bash \n") 273 rayAssembly.write("#MSUB -q " + Arguments.Queue + " \n") 274 rayAssembly.write("#MSUB -o rayAssembly.o \n") 275 rayAssembly.write("#MSUB -e rayAssembly.e \n") 276 rayAssembly.write("#MSUB -n " + str(Arguments.Processors) + " \n") 277 if(Arguments.projid): 278 rayAssembly.write("#MSUB -A " + Arguments.projid + " \n") 279 rayAssembly.write("set -x \n") 280 281 #load module 282 rayAssembly.write("module load extenv/fg\n") 283 rayAssembly.write("module load ray\n") 284 285 #Test 286 rayAssembly.write("if [ ! -f " + pair1 + " ] || [ ! -f " + pair2 + " ]\nthen\n") 287 rayAssembly.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n") 288 rayAssembly.write("\t\techo 'Error ExtractFastaFromSAM.py: " + pair1 + " and/or " + pair2 + " files not found' > errorProcess.txt\n") 289 rayAssembly.write("\tfi\n") 290 rayAssembly.write("\texit 1\n") 291 rayAssembly.write("fi\n") 292 293 #commands 294 rayAssembly.write("ccc_mprun -n " + str(Arguments.Processors) + " Ray -p " + pair1 + " " + pair2 + " -k " + Arguments.kmerLength + " -o " + Arguments.Prefix + "_ray \n") 295 296 rayAssembly.close() 297 298 #launch job and gets the job ID 299 commandeLine = "ccc_msub -a " + readFilterJobID + " rayAssembly.sh" 300 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 301 result=process.communicate() 302 rayAssemblyJobID = result[0].split()[3] 303 if("\n" in rayAssemblyJobID): 304 rayAssemblyJobID = rayAssemblyJobID[0:len(rayAssemblyJobID)-1] 305 print commandeLine + " --> job=" + rayAssemblyJobID + result[1] 306 307 308 309 310 ##################### make contigs index ##################### 311 312 contigs = Arguments.Prefix + "_ray/Contigs.fasta" 313 contigIndex = open("contigIndex.sh", "wb") 314 315 #header 316 contigIndex.write("#!/bin/bash \n") 317 contigIndex.write("#MSUB -q " + Arguments.Queue + " \n") 318 contigIndex.write("#MSUB -o contigIndex.o \n") 319 contigIndex.write("#MSUB -e contigIndex.e \n") 320 if(Arguments.projid): 321 contigIndex.write("#MSUB -A " + Arguments.projid + " \n") 322 contigIndex.write("set -x \n") 323 324 #load module 325 contigIndex.write("module load extenv/fg\n") 326 contigIndex.write("module load bwa\n") 327 328 #Test 329 contigIndex.write("if [ ! -f " + contigs + " ]\nthen\n") 330 contigIndex.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n") 331 contigIndex.write("\t\techo 'Error ray assembly: no contigs found' > errorProcess.txt\n") 332 contigIndex.write("\tfi\n") 333 contigIndex.write("\texit 1\n") 334 contigIndex.write("fi\n") 335 336 #commands 337 contigIndex.write("bwa index -a bwtsw " + contigs + " \n") 338 339 contigIndex.close() 340 341 #launch job and gets the job ID 342 commandeLine = "ccc_msub -a " + rayAssemblyJobID + " contigIndex.sh" 343 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 344 result=process.communicate() 345 contigIndexJobID = result[0].split()[3] 346 if("\n" in contigIndexJobID): 347 contigIndexJobID = contigIndexJobID[0:len(contigIndexJobID)-1] 348 print commandeLine + " --> job=" + contigIndexJobID + result[1] 349 350 351 352 353 ##################### bwa mem mapping ##################### 354 355 bwaMEM = open("bwaMEM.sh", "wb") 356 357 #header 358 bwaMEM.write("#!/bin/bash \n") 359 bwaMEM.write("#MSUB -q " + Arguments.Queue + " \n") 360 bwaMEM.write("#MSUB -o bwaMEM.o \n") 361 bwaMEM.write("#MSUB -n 1 \n") 362 bwaMEM.write("#MSUB -c " + str(Arguments.Processors) + " \n") 363 bwaMEM.write("#MSUB -e bwaMEM.e \n") 364 if(Arguments.projid): 365 bwaMEM.write("#MSUB -A " + Arguments.projid + " \n") 366 bwaMEM.write("set -x \n") 367 368 #load module 369 bwaMEM.write("module load extenv/fg\n") 370 bwaMEM.write("module load bwa\n") 371 372 #Test 373 bwaMEM.write("if [ ! -f " + contigs + ".bwt ]\nthen\n") 374 bwaMEM.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n") 375 bwaMEM.write("\t\techo 'Error make contigs index: no bwa index found' > errorProcess.txt\n") 376 bwaMEM.write("\tfi\n") 377 bwaMEM.write("\texit 1\n") 378 bwaMEM.write("fi\n") 379 380 #commands 381 bwaMEM.write("bwa mem -t " + str(Arguments.Processors) + " " + contigs + " " + Arguments.reference + " > " + Arguments.Prefix + "_BWAmem.sam \n") 382 383 bwaMEM.close() 384 385 #launch job and gets the job ID 386 commandeLine = "ccc_msub -a " + contigIndexJobID + " bwaMEM.sh" 387 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 388 result=process.communicate() 389 bwaMEMJobID = result[0].split()[3] 390 if("\n" in bwaMEMJobID): 391 bwaMEMJobID = bwaMEMJobID[0:len(bwaMEMJobID)-1] 392 print commandeLine + " --> job=" + bwaMEMJobID + result[1] 393 394 395 396 397 ##################### alignment sorting ##################### 398 399 SAMparser = open("SAMparser.sh", "wb") 400 401 #header 402 SAMparser.write("#!/bin/bash \n") 403 SAMparser.write("#MSUB -q " + Arguments.Queue + " \n") 404 SAMparser.write("#MSUB -o SAMparser.o \n") 405 SAMparser.write("#MSUB -n 2 \n") 406 SAMparser.write("#MSUB -e SAMparser.e \n") 407 if(Arguments.projid): 408 SAMparser.write("#MSUB -A " + Arguments.projid + " \n") 409 SAMparser.write("set -x \n") 410 411 #load module 412 SAMparser.write("module load extenv/fg\n") 413 SAMparser.write("module load samtools\n") 414 SAMparser.write("module load python/2.7.3\n") 415 416 #Test 417 SAMparser.write("if [ ! -f " + Arguments.Prefix + "_BWAmem.sam ]\nthen\n") 418 SAMparser.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n") 419 SAMparser.write("\t\techo 'Error mapping bwa mem: sam file not found' > errorProcess.txt\n") 420 SAMparser.write("\tfi\n") 421 SAMparser.write("\texit 1\n") 422 SAMparser.write("fi\n") 423 424 #commands 425 SAMparser.write("python " + DIGEST_HOME + "/SAMparser.py -sam " + Arguments.Prefix + "_BWAmem.sam -unmap & \n") 426 SAMparser.write("python " + DIGEST_HOME + "/SAMparser.py -sam " + Arguments.Prefix + "_BWAmem.sam -map -Q \n wait \n") 427 428 SAMparser.close() 429 430 #launch job and gets the job ID 431 commandeLine = "ccc_msub -a " + bwaMEMJobID + " SAMparser.sh" 432 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 433 result=process.communicate() 434 SAMparserID = result[0].split()[3] 435 if("\n" in SAMparserID): 436 SAMparserID = SAMparserID[0:len(SAMparserID)-1] 437 print commandeLine + " --> job=" + SAMparserID + result[1] 438 439 440 441 442 ##################### extract unmapped target sequence ##################### 443 444 extractUnmapped = open("extractUnmapped.sh", "wb") 445 446 #header 447 extractUnmapped.write("#!/bin/bash \n") 448 extractUnmapped.write("#MSUB -q " + Arguments.Queue + " \n") 449 extractUnmapped.write("#MSUB -o extractUnmapped.o \n") 450 extractUnmapped.write("#MSUB -e extractUnmapped.e \n") 451 if(Arguments.projid): 452 extractUnmapped.write("#MSUB -A " + Arguments.projid + " \n") 453 extractUnmapped.write("set -x \n") 454 455 #Test 456 extractUnmapped.write("if [ ! -f " + Arguments.Prefix + "_BWAmem_unmapped.sam ]\nthen\n") 457 extractUnmapped.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n") 458 extractUnmapped.write("\t\techo 'Error SAMparser.py: BWAmem_unmapped.sam file not found' > errorProcess.txt\n") 459 extractUnmapped.write("\tfi\n") 460 extractUnmapped.write("\texit 1\n") 461 extractUnmapped.write("fi\n") 462 463 #commands 464 extractUnmapped.write("cat " + Arguments.Prefix + "_BWAmem_unmapped.sam | awk '{OFS=\"\t\"; print \">\"$1\"\\n\"$10}' > "\ 465 + Arguments.Prefix + "_unmappedTarget.fasta \n") 466 467 extractUnmapped.close() 468 469 #launch job and gets the job ID 470 commandeLine = "ccc_msub -a " + SAMparserID + " extractUnmapped.sh" 471 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 472 result=process.communicate() 473 extractUnmappedID = result[0].split()[3] 474 if("\n" in extractUnmappedID): 475 extractUnmappedID = extractUnmappedID[0:len(extractUnmappedID)-1] 476 print commandeLine + " --> job=" + extractUnmappedID + result[1] 477 478 479 480 481 ##################### extend genes ##################### 482 483 SAMfile = Arguments.Prefix + "_BWAmem_mapped_sortbyquery.sam" 484 extendGenes = open("extendGenes.sh", "wb") 485 486 #header 487 extendGenes.write("#!/bin/bash \n") 488 extendGenes.write("#MSUB -q " + Arguments.Queue + " \n") 489 extendGenes.write("#MSUB -n " + str(Arguments.Processors) + " \n") 490 extendGenes.write("#MSUB -o extendGenes.o \n") 491 extendGenes.write("#MSUB -e extendGenes.e \n") 492 if(Arguments.projid): 493 extendGenes.write("#MSUB -A " + Arguments.projid + " \n") 494 extendGenes.write("set -x \n") 495 496 #load module 497 extendGenes.write("module load python/2.7.3\n") 498 499 #Test 500 extendGenes.write("if [ ! -f " + SAMfile + " ]\nthen\n") 501 extendGenes.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n") 502 extendGenes.write("\t\techo 'Error SAMparser.py: BWAmem_mapped_sortbyquery.sam file not found' > errorProcess.txt\n") 503 extendGenes.write("\tfi\n") 504 extendGenes.write("\texit 1\n") 505 extendGenes.write("fi\n") 506 507 #commands 508 extendGenes.write("python " + DIGEST_HOME + "/extendTargets_allPossibility.py -sam " + SAMfile + \ 509 " -f " + contigs + " -qual " + Arguments.minMAPQ + " -o " + Arguments.Prefix + "_Extended.fasta \n") 510 511 if(Arguments.Processors==1): 512 extendGenes.write("metagene -m " + Arguments.Prefix + "_Extended.fasta" + " > " + Arguments.Prefix + "_metagene.txt \n") 513 514 else: # parallelization by fasta splitting 515 516 extendGenes.write(DIGEST_HOME + "/fasta-splitter.pl --n-parts " + str(Arguments.Processors) + " " + Arguments.Prefix + "_Extended.fasta \n") 517 for i in range(1,Arguments.Processors+1): 518 metagene = "metagene -m " + Arguments.Prefix + "_Extended.part-" + str(i) + ".fasta > metagene-part" + str(i) + ".txt &\n" 519 extendGenes.write(metagene) 520 extendGenes.write("wait \n") 521 extendGenes.write("cat metagene-part*.txt > " + Arguments.Prefix + "_metagene.txt \n") 522 extendGenes.write("rm " + Arguments.Prefix + "_Extended.part-*.fasta metagene-part*.txt \n") 523 524 extendGenes.write("python " + DIGEST_HOME + "/extractORFsequences.py -m " + Arguments.Prefix + "_metagene.txt -f " + Arguments.Prefix \ 525 + "_Extended.fasta -i info.txt -n " + str(Arguments.limLength) + " -o " + Arguments.Prefix + " \n") 526 527 528 extendGenes.close() 529 530 #launch job and gets the job ID 531 commandeLine = "ccc_msub -a " + SAMparserID + " extendGenes.sh" 532 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 533 result=process.communicate() 534 extendGenesID = result[0].split()[3] 535 if("\n" in extendGenesID): 536 extendGenesID = extendGenesID[0:len(extendGenesID)-1] 537 print commandeLine + " --> job=" + extendGenesID + result[1] 538 539 540 541 542 ##################### remove identical sequences ##################### 543 544 rmIdentiqSeq = open("rmIdentiqSeq.sh", "wb") 545 546 #header 547 rmIdentiqSeq.write("#!/bin/bash \n") 548 rmIdentiqSeq.write("#MSUB -q " + Arguments.Queue + " \n") 549 rmIdentiqSeq.write("#MSUB -o rmIdentiqSeq.o \n") 550 rmIdentiqSeq.write("#MSUB -e rmIdentiqSeq.e \n") 551 rmIdentiqSeq.write("#MSUB -n 2 \n") 552 if(Arguments.projid): 553 rmIdentiqSeq.write("#MSUB -A " + Arguments.projid + " \n") 554 rmIdentiqSeq.write("set -x \n") 555 556 #load module 557 rmIdentiqSeq.write("module load python/2.7.3\n") 558 559 #Test 560 rmIdentiqSeq.write("if [ ! -f " + Arguments.Prefix + "_complete.fasta ]\nthen\n") 561 rmIdentiqSeq.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n") 562 rmIdentiqSeq.write("\t\techo 'Error SAMparser.py: BWAmem_mapped_sortbyquery.sam file not found' > errorProcess.txt\n") 563 rmIdentiqSeq.write("\tfi\n") 564 rmIdentiqSeq.write("\texit 1\n") 565 rmIdentiqSeq.write("fi\n") 566 567 #commands 568 rmIdentiqSeq.write("python " + DIGEST_HOME + "/removeIdenticalSeq.py -i " + Arguments.Prefix + \ 569 "_partial.fasta -o " + Arguments.Prefix + "_partial_unique.fasta & \n") 570 rmIdentiqSeq.write("python " + DIGEST_HOME + "/removeIdenticalSeq.py -i " + Arguments.Prefix + \ 571 "_complete.fasta -o " + Arguments.Prefix + "_complete_unique.fasta & \n") 572 rmIdentiqSeq.write("wait \n") 573 rmIdentiqSeq.write("mv " + Arguments.Prefix + "_partial_unique.fasta " + Arguments.Prefix + "_partial.fasta & \n") 574 rmIdentiqSeq.write("mv " + Arguments.Prefix + "_complete_unique.fasta " + Arguments.Prefix + "_complete.fasta & \n") 575 rmIdentiqSeq.write("wait \n") 576 577 #rmIdentiqSeq.write("cat " + Arguments.Prefix + "_partial.fasta " + Arguments.Prefix + "_unmappedTarget.fasta > "\ 578 #+ Arguments.Prefix + "_incomplete.fasta \n") 579 rmIdentiqSeq.write("mv " + Arguments.Prefix + "_partial.fasta " + Arguments.Prefix + "_incomplete.fasta \n") 580 581 rmIdentiqSeq.close() 582 583 #launch job and gets the job ID 584 commandeLine = "ccc_msub -a " + extendGenesID + "," + extractUnmappedID + " rmIdentiqSeq.sh" 585 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 586 result=process.communicate() 587 rmIdentiqSeqID = result[0].split()[3] 588 if("\n" in rmIdentiqSeqID): 589 rmIdentiqSeqID = rmIdentiqSeqID[0:len(rmIdentiqSeqID)-1] 590 print commandeLine + " --> job=" + rmIdentiqSeqID + result[1] 591 592 593 594 595 ##################### Clustering ##################### 596 597 #complete ORF clustering 598 clustering = "python " + DIGEST_HOME + "/cd-hit-para-CCRT.py -i " + Arguments.Prefix + "_complete.fasta -o " + \ 599 Arguments.Prefix + "_complete -t " + str(Arguments.Processors) + " -q " + Arguments.Queue + " -n " + str(Arguments.Nodes) + \ 600 " -c " + Arguments.clutserThreshold + " -j " + rmIdentiqSeqID + " -aS " + Arguments.minAlig + " -A " + Arguments.projid 601 602 process = subprocess.Popen(clustering,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 603 result=process.communicate() 604 605 clusteringID = str(result[1]) 606 if("\n" in clusteringID): 607 clusteringID = clusteringID[0:len(clusteringID)-1] 608 print clustering + " --> job=" + clusteringID 609 610 611 # incomplete ORF clustering 612 clustering = "python " + DIGEST_HOME + "/cd-hit-para-CCRT.py -i " + Arguments.Prefix + "_incomplete.fasta -o " + \ 613 Arguments.Prefix + "_incomplete -t " + str(Arguments.Processors) + " -q " + Arguments.Queue + " -n " + str(Arguments.Nodes) + \ 614 " -c " + Arguments.clutserThreshold + " -j " + rmIdentiqSeqID + " -aS " + Arguments.minAlig + " -A " + Arguments.projid 615 616 process = subprocess.Popen(clustering,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 617 result=process.communicate() 618 619 clusteringID2 = str(result[1]) 620 if("\n" in clusteringID2): 621 clusteringID2 = clusteringID2[0:len(clusteringID2)-1] 622 print clustering + " --> job=" + clusteringID2 623 624 625 626 ##################### clear ##################### 627 628 clear = open("clear.sh", "wb") 629 630 #header 631 clear.write("#!/bin/bash \n") 632 clear.write("#MSUB -q " + Arguments.Queue + " \n") 633 clear.write("#MSUB -o clear.o \n") 634 clear.write("#MSUB -e clear.e \n") 635 if(Arguments.projid): 636 clear.write("#MSUB -A " + Arguments.projid + " \n") 637 clear.write("set -x \n") 638 639 #if loop --> set current loop number 640 if(Arguments.loop): 641 loopNb = 0 642 if(exist(Arguments.Prefix + "_DIGEST-loop.txt")): 643 fd = open(Arguments.Prefix + "_DIGEST-loop.txt", 'r') 644 while fd.readline(): 645 loopNb += 1 646 outputPrefix = "LOOP" + str(loopNb+1) + "_" + Arguments.Prefix 647 loopNb += 1 648 else: 649 loopNb = 1 650 outputPrefix = "LOOP1_" + Arguments.Prefix 651 else: 652 outputPrefix = Arguments.Prefix 653 654 #commands 655 clear.write("sh " + DIGEST_HOME + "/DIGEST_clear.sh " + outputPrefix) 656 657 clear.close() 658 659 #launch job and gets the job ID 660 commandeLine = "ccc_msub -a " + clusteringID + "," + clusteringID2 + " clear.sh" 661 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 662 result=process.communicate() 663 clearID = result[0].split()[3] 664 if("\n" in clearID): 665 clearID = clearID[0:len(clearID)-1] 666 print commandeLine + " --> job=" + clearID + result[1] 667 668 669 if(Arguments.loop==False): 670 sys.exit(clearID) 671 672 673 674 ##################### check for next loop ##################### 675 676 check = open("check.sh", "wb") 677 678 #header 679 check.write("#!/bin/bash \n") 680 check.write("#MSUB -q " + Arguments.Queue + " \n") 681 check.write("#MSUB -o check.o \n") 682 check.write("#MSUB -e check.e \n") 683 check.write("#MSUB -c " + str(Arguments.Processors) + "\n") 684 if(Arguments.projid): 685 check.write("#MSUB -A " + Arguments.projid + " \n") 686 check.write("set -x \n") 687 688 #load module 689 check.write("module load extenv/fg\n") 690 check.write("module load bwa\n") 691 692 #commands 693 check.write("sh " + DIGEST_HOME + "/DIGEST_check.sh " + Arguments.Prefix + " " + str(loopNb) + " " + \ 694 Arguments.clutserThreshold + " " + str(Arguments.Processors) + " " + Arguments.minAlig) 695 696 check.close() 697 698 #launch job and gets the job ID 699 commandeLine = "ccc_msub -a " + clearID + " check.sh" 700 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 701 result=process.communicate() 702 checkID = result[0].split()[3] 703 if("\n" in checkID): 704 checkID = checkID[0:len(checkID)-1] 705 print commandeLine + " --> job=" + checkID + result[1] 706 707 708 709 ##################### DIGEST new loop ##################### 710 711 newLoop = open("newLoop.sh", "wb") 712 713 #header 714 newLoop.write("#!/bin/bash \n") 715 newLoop.write("#MSUB -q " + Arguments.Queue + " \n") 716 newLoop.write("#MSUB -o newLoop.o \n") 717 newLoop.write("#MSUB -e newLoop.e \n") 718 if(Arguments.projid): 719 newLoop.write("#MSUB -A " + Arguments.projid + " \n") 720 newLoop.write("set -x \n") 721 722 #load module 723 newLoop.write("module load extenv/fg\n") 724 newLoop.write("module load bwa samtools\n") 725 newLoop.write("module load python/2.7.3\n") 726 727 #Test 728 newLoop.write("if [ ! -f " + Arguments.Prefix + "_DIGEST-loop.txt ]\nthen\n") 729 newLoop.write("\texit 1\n") 730 newLoop.write("fi\n") 731 732 #commands 733 newLoop.write("python " + DIGEST_HOME + "/DIGEST.py -source " + DIGEST_HOME + \ 734 " -R newRef.fasta -1 paired1.fasta -2 paired2.fasta -o " + Arguments.Prefix + \ 735 " -q " + Arguments.Queue + " -t " + str(Arguments.Processors) + " -N " + str(Arguments.Nodes) + \ 736 " -k " + Arguments.kmerLength + " -qual " + Arguments.minMAPQ + " -n " + str(Arguments.limLength) + \ 737 " -c " + Arguments.clutserThreshold + " -aS " + Arguments.minAlig + " -A " + Arguments.projid + " --loop") 738 739 newLoop.close() 740 741 #launch job and gets the job ID 742 commandeLine = "ccc_msub -a " + checkID + " newLoop.sh" 743 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE) 744 result=process.communicate() 745 loopID = result[0].split()[3] 746 if("\n" in loopID): 747 loopID = loopID[0:len(loopID)-1] 748 print commandeLine + " --> job=" + loopID + result[1]
749 750 751 if __name__ == "__main__": 752 main() 753