1
2
3 import subprocess
4 import os
5 from DIGEST_functions import *
6 from math import *
7
8 __doc__="""
9 Main script for DIGEST workflow CCRT version
10
11 @requires: U{python<https://www.python.org/downloads/>} (tested with 2.7.3)
12 @requires: U{hashlib<https://pypi.python.org/pypi/hashlib>}
13 @requires: U{BWA<http://sourceforge.net/projects/bio-bwa/files/>}
14 @requires: U{RayMeta<http://sourceforge.net/projects/denovoassembler/files/>}
15 @requires: U{Metagene<http://metagene.nig.ac.jp/metagene/download_mga.html>}
16 @requires: U{cd-hit<https://code.google.com/p/cdhit/downloads/list>} (tested with v4.5.8-2012-03-24)
17
18
19 @requires: DIGEST_functions.py (PYTHONPATH)
20 @requires: SAMparser.py
21 @requires: ExtractFastaFromSAM.py
22 @requires: extendTargets_allPossibility.py
23 @requires: U{hashlib<http://kirill-kryukov.com/study/tools/fasta-splitter/>}fasta-splitter.pl
24 @requires: extractORFsequences.py
25 @requires: removeIdenticalSeq.py
26 @requires: cd-hit-para-CCRT.py
27 @requires: DIGEST_clear.sh
28 @requires: DIGEST_check.sh
29
30
31 """
32
34
35 parser = argparse.ArgumentParser(description='Main script for DIGEST workflow')
36
37 parser.add_argument('-source', action="store", dest='DIGEST_HOME',
38 type=str, help='digest-ccrt path')
39
40 parser.add_argument('-R', action="store", dest='reference',
41 type=str, help='reference in FASTA format with its index files in the same folder')
42
43 parser.add_argument('-1', action="store", dest='pair1',
44 type=str, help='1st FASTQ file from a pair')
45
46 parser.add_argument('-2', action="store", dest='pair2',
47 type=str, help='2nd FASTQ file from a pair')
48
49 parser.add_argument('-o', action="store", dest='Prefix',
50 type=str, default='output', help='output prefix (default:output)')
51
52 parser.add_argument('-qual', action="store", dest='minMAPQ',
53 type=str, default='30', help='MAPQ min to keep alignment (default:30)')
54
55 parser.add_argument('-k', action="store", dest='kmerLength',
56 type=str, default='27', help='kmer length for Ray (default:27)')
57
58 parser.add_argument('-n', action="store", dest='limLength',
59 type=int, default=100, help='min length for partial ORF (default=100)')
60
61 parser.add_argument('-c', action="store", dest='clutserThreshold',
62 type=str, default='0.95', help='sequence identity threshold for clustering (default=0.95)')
63
64 parser.add_argument('-aS', action="store", dest='minAlig',
65 type=str, default='0.0', help='alignment coverage for the shorter sequence (default:0.0) if set to 0.9,the alignment must covers 90 pourcent of the sequence')
66
67 parser.add_argument('-A', action="store", dest='projid',
68 type=str, default=None, help='CCRT project/account name (default:None)')
69
70 parser.add_argument('-q', action="store", dest='Queue',
71 type=str, default='large', help='Job Priority (default:large)')
72
73 parser.add_argument('-t', action="store", dest='Processors',
74 type=int, default=16, help='Numbers of threads requested to run each job (default:16)')
75
76 parser.add_argument('-N', action="store", dest='Nodes',
77 type=int, default=10, help='maximum number of nodes to use together (default:1)')
78
79
80
81
82 parser.add_argument('--loop', dest='loop', action='store_true', help='DIGEST loop (default=False)', default=False)
83
84
85
86 return parser
87
88
90
91
92
93 parser=get_parser()
94
95 if len(sys.argv)==1:
96 parser.print_help()
97 sys.exit(1)
98
99 Arguments=parser.parse_args()
100
101 if('/' in Arguments.Prefix):
102 liste = Arguments.Prefix.split("/")
103 dirName = "/".join(liste[0:len(liste)-1])
104 chdir(dirName)
105 Arguments.Prefix = "".join(liste[len(liste)-1:len(liste)])
106
107
108
109
110 print "Check input files..."
111 if not exist(Arguments.reference):
112 print "reference FASTA file not found!"
113 sys.exit(0)
114 if not exist(Arguments.reference + ".bwt"):
115 print "index BWA " + Arguments.reference + ".bwt not found!"
116 sys.exit(0)
117 if not exist(Arguments.pair1):
118 print "1st FASTQ file not found!"
119 sys.exit(0)
120 if not exist(Arguments.pair2):
121 print "2nd FASTQ file not found!"
122 sys.exit(0)
123 print "\tOK!"
124
125
126
127 DIGEST_HOME = Arguments.DIGEST_HOME
128 if(DIGEST_HOME[len(DIGEST_HOME)-1]=="/"):
129 DIGEST_HOME = DIGEST_HOME[0:len(DIGEST_HOME)-1]
130 DIGEST_HOME = DIGEST_HOME + "/src/main/scripts"
131
132
133
134
135
136
137
138
139 bwalignmentFile = open("bwalignment.sh", "wb")
140
141
142 bwalignmentFile.write("#!/bin/bash\n#MSUB -n 2 \n")
143 bwalignmentFile.write("#MSUB -c " + str(Arguments.Processors) + " \n")
144 bwalignmentFile.write("#MSUB -q " + Arguments.Queue + " \n")
145 bwalignmentFile.write("#MSUB -o bwalignment.o \n")
146 bwalignmentFile.write("#MSUB -e bwalignment.e \n")
147 if(Arguments.projid):
148 bwalignmentFile.write("#MSUB -A " + Arguments.projid + " \n")
149 bwalignmentFile.write("set -x \n")
150
151
152 bwalignmentFile.write("module load extenv/fg\n")
153 bwalignmentFile.write("module load bwa\n")
154
155
156 bwalignmentFile.write("bwa aln " + Arguments.reference + " " + Arguments.pair1 + " -t " + str(Arguments.Processors) + " > " + Arguments.Prefix + "_p1.sai &\n")
157 bwalignmentFile.write("bwa aln " + Arguments.reference + " " + Arguments.pair2 + " -t " + str(Arguments.Processors) + " > " + Arguments.Prefix + "_p2.sai &\n")
158 bwalignmentFile.write("wait\n")
159
160 bwalignmentFile.close()
161
162
163 commandeLine = "ccc_msub bwalignment.sh"
164 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
165 result=process.communicate()
166 bwalignmentJobID = result[0].split()[3]
167 if("\n" in bwalignmentJobID):
168 bwalignmentJobID = bwalignmentJobID[0:len(bwalignmentJobID)-1]
169 print commandeLine + " --> job=" + bwalignmentJobID + result[1]
170
171
172
173 bwaProcess = open("bwaProcess.sh", "wb")
174
175
176 bwaProcess.write("#!/bin/bash \n")
177 bwaProcess.write("#MSUB -q " + Arguments.Queue + " \n")
178 bwaProcess.write("#MSUB -o bwaProcess.o \n")
179 bwaProcess.write("#MSUB -e bwaProcess.e \n")
180 if(Arguments.projid):
181 bwaProcess.write("#MSUB -A " + Arguments.projid + " \n")
182 bwaProcess.write("set -x \n")
183
184
185 bwaProcess.write("module load extenv/fg\n")
186 bwaProcess.write("module load bwa samtools\n")
187
188
189 bwaProcess.write("if [ ! -f " + Arguments.Prefix + "_p1.sai ] || [ ! -f " + Arguments.Prefix + "_p2.sai ]\nthen\n")
190 bwaProcess.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n")
191 bwaProcess.write("\t\techo 'Error bwa alignment: .sai files not found' > errorProcess.txt\n")
192 bwaProcess.write("\tfi\n")
193 bwaProcess.write("\texit 1\n")
194 bwaProcess.write("fi\n")
195
196
197 bwaProcess.write("bwa sampe " + Arguments.reference + " " + Arguments.Prefix + "_p1.sai " + Arguments.Prefix + "_p2.sai " + Arguments.pair1 + \
198 " " + Arguments.pair2 + " > " + Arguments.Prefix + "_p.sam \n")
199 bwaProcess.write("rm " + Arguments.Prefix + "*.sai \n")
200 bwaProcess.write("samtools view -Sb " + Arguments.Prefix + "_p.sam > " + Arguments.Prefix + "_p.bam \n")
201 bwaProcess.write("rm " + Arguments.Prefix + "_p.sam \n")
202 bwaProcess.write("samtools sort -n " + Arguments.Prefix + "_p.bam " + Arguments.Prefix + "_sorted \n")
203 bwaProcess.write("rm " + Arguments.Prefix + "_p.bam \n")
204 bwaProcess.write("samtools view -o " + Arguments.Prefix + "_sorted.sam " + Arguments.Prefix + "_sorted.bam \n")
205
206 bwaProcess.close()
207
208
209 commandeLine = "ccc_msub -a " + bwalignmentJobID + " bwaProcess.sh"
210 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
211 result=process.communicate()
212 bwaProcessJobID = result[0].split()[3]
213 if("\n" in bwaProcessJobID):
214 bwaProcessJobID = bwaProcessJobID[0:len(bwaProcessJobID)-1]
215 print commandeLine + " --> job=" + bwaProcessJobID + result[1]
216
217
218
219
220
221
222 readFilter = open("readFilter.sh", "wb")
223
224
225 readFilter.write("#!/bin/bash \n")
226 readFilter.write("#MSUB -q " + Arguments.Queue + " \n")
227 readFilter.write("#MSUB -o readFilter.o \n")
228 readFilter.write("#MSUB -e readFilter.e \n")
229 if(Arguments.projid):
230 readFilter.write("#MSUB -A " + Arguments.projid + " \n")
231 readFilter.write("set -x \n")
232
233
234 readFilter.write("module load extenv/fg\n")
235 readFilter.write("module load samtools\n")
236 readFilter.write("module load python/2.7.3\n")
237
238
239 readFilter.write("if [ ! -f " + Arguments.Prefix + "_sorted.sam ]\nthen\n")
240 readFilter.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n")
241 readFilter.write("\t\techo 'Error bwa process: " + Arguments.Prefix + "_sorted.sam file not found' > errorProcess.txt\n")
242 readFilter.write("\tfi\n")
243 readFilter.write("\texit 1\n")
244 readFilter.write("fi\n")
245
246
247 readFilter.write("python " + DIGEST_HOME + "/ExtractFastaFromSAM.py -sam " + Arguments.Prefix + \
248 "_sorted.sam -out1 " + Arguments.Prefix + "_overlap -out2 " + Arguments.Prefix + "_unmap \n")
249 readFilter.write("rm " + Arguments.Prefix + "_sorted.sam\n")
250
251 readFilter.close()
252
253
254 commandeLine = "ccc_msub -a " + bwaProcessJobID + " readFilter.sh"
255 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
256 result=process.communicate()
257 readFilterJobID = result[0].split()[3]
258 if("\n" in readFilterJobID):
259 readFilterJobID = readFilterJobID[0:len(readFilterJobID)-1]
260 print commandeLine + " --> job=" + readFilterJobID + result[1]
261
262
263
264
265
266
267 pair1 = Arguments.Prefix + "_overlap_p1.fasta"
268 pair2 = Arguments.Prefix + "_overlap_p2.fasta"
269 rayAssembly = open("rayAssembly.sh", "wb")
270
271
272 rayAssembly.write("#!/bin/bash \n")
273 rayAssembly.write("#MSUB -q " + Arguments.Queue + " \n")
274 rayAssembly.write("#MSUB -o rayAssembly.o \n")
275 rayAssembly.write("#MSUB -e rayAssembly.e \n")
276 rayAssembly.write("#MSUB -n " + str(Arguments.Processors) + " \n")
277 if(Arguments.projid):
278 rayAssembly.write("#MSUB -A " + Arguments.projid + " \n")
279 rayAssembly.write("set -x \n")
280
281
282 rayAssembly.write("module load extenv/fg\n")
283 rayAssembly.write("module load ray\n")
284
285
286 rayAssembly.write("if [ ! -f " + pair1 + " ] || [ ! -f " + pair2 + " ]\nthen\n")
287 rayAssembly.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n")
288 rayAssembly.write("\t\techo 'Error ExtractFastaFromSAM.py: " + pair1 + " and/or " + pair2 + " files not found' > errorProcess.txt\n")
289 rayAssembly.write("\tfi\n")
290 rayAssembly.write("\texit 1\n")
291 rayAssembly.write("fi\n")
292
293
294 rayAssembly.write("ccc_mprun -n " + str(Arguments.Processors) + " Ray -p " + pair1 + " " + pair2 + " -k " + Arguments.kmerLength + " -o " + Arguments.Prefix + "_ray \n")
295
296 rayAssembly.close()
297
298
299 commandeLine = "ccc_msub -a " + readFilterJobID + " rayAssembly.sh"
300 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
301 result=process.communicate()
302 rayAssemblyJobID = result[0].split()[3]
303 if("\n" in rayAssemblyJobID):
304 rayAssemblyJobID = rayAssemblyJobID[0:len(rayAssemblyJobID)-1]
305 print commandeLine + " --> job=" + rayAssemblyJobID + result[1]
306
307
308
309
310
311
312 contigs = Arguments.Prefix + "_ray/Contigs.fasta"
313 contigIndex = open("contigIndex.sh", "wb")
314
315
316 contigIndex.write("#!/bin/bash \n")
317 contigIndex.write("#MSUB -q " + Arguments.Queue + " \n")
318 contigIndex.write("#MSUB -o contigIndex.o \n")
319 contigIndex.write("#MSUB -e contigIndex.e \n")
320 if(Arguments.projid):
321 contigIndex.write("#MSUB -A " + Arguments.projid + " \n")
322 contigIndex.write("set -x \n")
323
324
325 contigIndex.write("module load extenv/fg\n")
326 contigIndex.write("module load bwa\n")
327
328
329 contigIndex.write("if [ ! -f " + contigs + " ]\nthen\n")
330 contigIndex.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n")
331 contigIndex.write("\t\techo 'Error ray assembly: no contigs found' > errorProcess.txt\n")
332 contigIndex.write("\tfi\n")
333 contigIndex.write("\texit 1\n")
334 contigIndex.write("fi\n")
335
336
337 contigIndex.write("bwa index -a bwtsw " + contigs + " \n")
338
339 contigIndex.close()
340
341
342 commandeLine = "ccc_msub -a " + rayAssemblyJobID + " contigIndex.sh"
343 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
344 result=process.communicate()
345 contigIndexJobID = result[0].split()[3]
346 if("\n" in contigIndexJobID):
347 contigIndexJobID = contigIndexJobID[0:len(contigIndexJobID)-1]
348 print commandeLine + " --> job=" + contigIndexJobID + result[1]
349
350
351
352
353
354
355 bwaMEM = open("bwaMEM.sh", "wb")
356
357
358 bwaMEM.write("#!/bin/bash \n")
359 bwaMEM.write("#MSUB -q " + Arguments.Queue + " \n")
360 bwaMEM.write("#MSUB -o bwaMEM.o \n")
361 bwaMEM.write("#MSUB -n 1 \n")
362 bwaMEM.write("#MSUB -c " + str(Arguments.Processors) + " \n")
363 bwaMEM.write("#MSUB -e bwaMEM.e \n")
364 if(Arguments.projid):
365 bwaMEM.write("#MSUB -A " + Arguments.projid + " \n")
366 bwaMEM.write("set -x \n")
367
368
369 bwaMEM.write("module load extenv/fg\n")
370 bwaMEM.write("module load bwa\n")
371
372
373 bwaMEM.write("if [ ! -f " + contigs + ".bwt ]\nthen\n")
374 bwaMEM.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n")
375 bwaMEM.write("\t\techo 'Error make contigs index: no bwa index found' > errorProcess.txt\n")
376 bwaMEM.write("\tfi\n")
377 bwaMEM.write("\texit 1\n")
378 bwaMEM.write("fi\n")
379
380
381 bwaMEM.write("bwa mem -t " + str(Arguments.Processors) + " " + contigs + " " + Arguments.reference + " > " + Arguments.Prefix + "_BWAmem.sam \n")
382
383 bwaMEM.close()
384
385
386 commandeLine = "ccc_msub -a " + contigIndexJobID + " bwaMEM.sh"
387 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
388 result=process.communicate()
389 bwaMEMJobID = result[0].split()[3]
390 if("\n" in bwaMEMJobID):
391 bwaMEMJobID = bwaMEMJobID[0:len(bwaMEMJobID)-1]
392 print commandeLine + " --> job=" + bwaMEMJobID + result[1]
393
394
395
396
397
398
399 SAMparser = open("SAMparser.sh", "wb")
400
401
402 SAMparser.write("#!/bin/bash \n")
403 SAMparser.write("#MSUB -q " + Arguments.Queue + " \n")
404 SAMparser.write("#MSUB -o SAMparser.o \n")
405 SAMparser.write("#MSUB -n 2 \n")
406 SAMparser.write("#MSUB -e SAMparser.e \n")
407 if(Arguments.projid):
408 SAMparser.write("#MSUB -A " + Arguments.projid + " \n")
409 SAMparser.write("set -x \n")
410
411
412 SAMparser.write("module load extenv/fg\n")
413 SAMparser.write("module load samtools\n")
414 SAMparser.write("module load python/2.7.3\n")
415
416
417 SAMparser.write("if [ ! -f " + Arguments.Prefix + "_BWAmem.sam ]\nthen\n")
418 SAMparser.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n")
419 SAMparser.write("\t\techo 'Error mapping bwa mem: sam file not found' > errorProcess.txt\n")
420 SAMparser.write("\tfi\n")
421 SAMparser.write("\texit 1\n")
422 SAMparser.write("fi\n")
423
424
425 SAMparser.write("python " + DIGEST_HOME + "/SAMparser.py -sam " + Arguments.Prefix + "_BWAmem.sam -unmap & \n")
426 SAMparser.write("python " + DIGEST_HOME + "/SAMparser.py -sam " + Arguments.Prefix + "_BWAmem.sam -map -Q \n wait \n")
427
428 SAMparser.close()
429
430
431 commandeLine = "ccc_msub -a " + bwaMEMJobID + " SAMparser.sh"
432 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
433 result=process.communicate()
434 SAMparserID = result[0].split()[3]
435 if("\n" in SAMparserID):
436 SAMparserID = SAMparserID[0:len(SAMparserID)-1]
437 print commandeLine + " --> job=" + SAMparserID + result[1]
438
439
440
441
442
443
444 extractUnmapped = open("extractUnmapped.sh", "wb")
445
446
447 extractUnmapped.write("#!/bin/bash \n")
448 extractUnmapped.write("#MSUB -q " + Arguments.Queue + " \n")
449 extractUnmapped.write("#MSUB -o extractUnmapped.o \n")
450 extractUnmapped.write("#MSUB -e extractUnmapped.e \n")
451 if(Arguments.projid):
452 extractUnmapped.write("#MSUB -A " + Arguments.projid + " \n")
453 extractUnmapped.write("set -x \n")
454
455
456 extractUnmapped.write("if [ ! -f " + Arguments.Prefix + "_BWAmem_unmapped.sam ]\nthen\n")
457 extractUnmapped.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n")
458 extractUnmapped.write("\t\techo 'Error SAMparser.py: BWAmem_unmapped.sam file not found' > errorProcess.txt\n")
459 extractUnmapped.write("\tfi\n")
460 extractUnmapped.write("\texit 1\n")
461 extractUnmapped.write("fi\n")
462
463
464 extractUnmapped.write("cat " + Arguments.Prefix + "_BWAmem_unmapped.sam | awk '{OFS=\"\t\"; print \">\"$1\"\\n\"$10}' > "\
465 + Arguments.Prefix + "_unmappedTarget.fasta \n")
466
467 extractUnmapped.close()
468
469
470 commandeLine = "ccc_msub -a " + SAMparserID + " extractUnmapped.sh"
471 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
472 result=process.communicate()
473 extractUnmappedID = result[0].split()[3]
474 if("\n" in extractUnmappedID):
475 extractUnmappedID = extractUnmappedID[0:len(extractUnmappedID)-1]
476 print commandeLine + " --> job=" + extractUnmappedID + result[1]
477
478
479
480
481
482
483 SAMfile = Arguments.Prefix + "_BWAmem_mapped_sortbyquery.sam"
484 extendGenes = open("extendGenes.sh", "wb")
485
486
487 extendGenes.write("#!/bin/bash \n")
488 extendGenes.write("#MSUB -q " + Arguments.Queue + " \n")
489 extendGenes.write("#MSUB -n " + str(Arguments.Processors) + " \n")
490 extendGenes.write("#MSUB -o extendGenes.o \n")
491 extendGenes.write("#MSUB -e extendGenes.e \n")
492 if(Arguments.projid):
493 extendGenes.write("#MSUB -A " + Arguments.projid + " \n")
494 extendGenes.write("set -x \n")
495
496
497 extendGenes.write("module load python/2.7.3\n")
498
499
500 extendGenes.write("if [ ! -f " + SAMfile + " ]\nthen\n")
501 extendGenes.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n")
502 extendGenes.write("\t\techo 'Error SAMparser.py: BWAmem_mapped_sortbyquery.sam file not found' > errorProcess.txt\n")
503 extendGenes.write("\tfi\n")
504 extendGenes.write("\texit 1\n")
505 extendGenes.write("fi\n")
506
507
508 extendGenes.write("python " + DIGEST_HOME + "/extendTargets_allPossibility.py -sam " + SAMfile + \
509 " -f " + contigs + " -qual " + Arguments.minMAPQ + " -o " + Arguments.Prefix + "_Extended.fasta \n")
510
511 if(Arguments.Processors==1):
512 extendGenes.write("metagene -m " + Arguments.Prefix + "_Extended.fasta" + " > " + Arguments.Prefix + "_metagene.txt \n")
513
514 else:
515
516 extendGenes.write(DIGEST_HOME + "/fasta-splitter.pl --n-parts " + str(Arguments.Processors) + " " + Arguments.Prefix + "_Extended.fasta \n")
517 for i in range(1,Arguments.Processors+1):
518 metagene = "metagene -m " + Arguments.Prefix + "_Extended.part-" + str(i) + ".fasta > metagene-part" + str(i) + ".txt &\n"
519 extendGenes.write(metagene)
520 extendGenes.write("wait \n")
521 extendGenes.write("cat metagene-part*.txt > " + Arguments.Prefix + "_metagene.txt \n")
522 extendGenes.write("rm " + Arguments.Prefix + "_Extended.part-*.fasta metagene-part*.txt \n")
523
524 extendGenes.write("python " + DIGEST_HOME + "/extractORFsequences.py -m " + Arguments.Prefix + "_metagene.txt -f " + Arguments.Prefix \
525 + "_Extended.fasta -i info.txt -n " + str(Arguments.limLength) + " -o " + Arguments.Prefix + " \n")
526
527
528 extendGenes.close()
529
530
531 commandeLine = "ccc_msub -a " + SAMparserID + " extendGenes.sh"
532 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
533 result=process.communicate()
534 extendGenesID = result[0].split()[3]
535 if("\n" in extendGenesID):
536 extendGenesID = extendGenesID[0:len(extendGenesID)-1]
537 print commandeLine + " --> job=" + extendGenesID + result[1]
538
539
540
541
542
543
544 rmIdentiqSeq = open("rmIdentiqSeq.sh", "wb")
545
546
547 rmIdentiqSeq.write("#!/bin/bash \n")
548 rmIdentiqSeq.write("#MSUB -q " + Arguments.Queue + " \n")
549 rmIdentiqSeq.write("#MSUB -o rmIdentiqSeq.o \n")
550 rmIdentiqSeq.write("#MSUB -e rmIdentiqSeq.e \n")
551 rmIdentiqSeq.write("#MSUB -n 2 \n")
552 if(Arguments.projid):
553 rmIdentiqSeq.write("#MSUB -A " + Arguments.projid + " \n")
554 rmIdentiqSeq.write("set -x \n")
555
556
557 rmIdentiqSeq.write("module load python/2.7.3\n")
558
559
560 rmIdentiqSeq.write("if [ ! -f " + Arguments.Prefix + "_complete.fasta ]\nthen\n")
561 rmIdentiqSeq.write("\tif [ ! -f errorProcess.txt ]\n\tthen\n")
562 rmIdentiqSeq.write("\t\techo 'Error SAMparser.py: BWAmem_mapped_sortbyquery.sam file not found' > errorProcess.txt\n")
563 rmIdentiqSeq.write("\tfi\n")
564 rmIdentiqSeq.write("\texit 1\n")
565 rmIdentiqSeq.write("fi\n")
566
567
568 rmIdentiqSeq.write("python " + DIGEST_HOME + "/removeIdenticalSeq.py -i " + Arguments.Prefix + \
569 "_partial.fasta -o " + Arguments.Prefix + "_partial_unique.fasta & \n")
570 rmIdentiqSeq.write("python " + DIGEST_HOME + "/removeIdenticalSeq.py -i " + Arguments.Prefix + \
571 "_complete.fasta -o " + Arguments.Prefix + "_complete_unique.fasta & \n")
572 rmIdentiqSeq.write("wait \n")
573 rmIdentiqSeq.write("mv " + Arguments.Prefix + "_partial_unique.fasta " + Arguments.Prefix + "_partial.fasta & \n")
574 rmIdentiqSeq.write("mv " + Arguments.Prefix + "_complete_unique.fasta " + Arguments.Prefix + "_complete.fasta & \n")
575 rmIdentiqSeq.write("wait \n")
576
577
578
579 rmIdentiqSeq.write("mv " + Arguments.Prefix + "_partial.fasta " + Arguments.Prefix + "_incomplete.fasta \n")
580
581 rmIdentiqSeq.close()
582
583
584 commandeLine = "ccc_msub -a " + extendGenesID + "," + extractUnmappedID + " rmIdentiqSeq.sh"
585 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
586 result=process.communicate()
587 rmIdentiqSeqID = result[0].split()[3]
588 if("\n" in rmIdentiqSeqID):
589 rmIdentiqSeqID = rmIdentiqSeqID[0:len(rmIdentiqSeqID)-1]
590 print commandeLine + " --> job=" + rmIdentiqSeqID + result[1]
591
592
593
594
595
596
597
598 clustering = "python " + DIGEST_HOME + "/cd-hit-para-CCRT.py -i " + Arguments.Prefix + "_complete.fasta -o " + \
599 Arguments.Prefix + "_complete -t " + str(Arguments.Processors) + " -q " + Arguments.Queue + " -n " + str(Arguments.Nodes) + \
600 " -c " + Arguments.clutserThreshold + " -j " + rmIdentiqSeqID + " -aS " + Arguments.minAlig + " -A " + Arguments.projid
601
602 process = subprocess.Popen(clustering,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
603 result=process.communicate()
604
605 clusteringID = str(result[1])
606 if("\n" in clusteringID):
607 clusteringID = clusteringID[0:len(clusteringID)-1]
608 print clustering + " --> job=" + clusteringID
609
610
611
612 clustering = "python " + DIGEST_HOME + "/cd-hit-para-CCRT.py -i " + Arguments.Prefix + "_incomplete.fasta -o " + \
613 Arguments.Prefix + "_incomplete -t " + str(Arguments.Processors) + " -q " + Arguments.Queue + " -n " + str(Arguments.Nodes) + \
614 " -c " + Arguments.clutserThreshold + " -j " + rmIdentiqSeqID + " -aS " + Arguments.minAlig + " -A " + Arguments.projid
615
616 process = subprocess.Popen(clustering,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
617 result=process.communicate()
618
619 clusteringID2 = str(result[1])
620 if("\n" in clusteringID2):
621 clusteringID2 = clusteringID2[0:len(clusteringID2)-1]
622 print clustering + " --> job=" + clusteringID2
623
624
625
626
627
628 clear = open("clear.sh", "wb")
629
630
631 clear.write("#!/bin/bash \n")
632 clear.write("#MSUB -q " + Arguments.Queue + " \n")
633 clear.write("#MSUB -o clear.o \n")
634 clear.write("#MSUB -e clear.e \n")
635 if(Arguments.projid):
636 clear.write("#MSUB -A " + Arguments.projid + " \n")
637 clear.write("set -x \n")
638
639
640 if(Arguments.loop):
641 loopNb = 0
642 if(exist(Arguments.Prefix + "_DIGEST-loop.txt")):
643 fd = open(Arguments.Prefix + "_DIGEST-loop.txt", 'r')
644 while fd.readline():
645 loopNb += 1
646 outputPrefix = "LOOP" + str(loopNb+1) + "_" + Arguments.Prefix
647 loopNb += 1
648 else:
649 loopNb = 1
650 outputPrefix = "LOOP1_" + Arguments.Prefix
651 else:
652 outputPrefix = Arguments.Prefix
653
654
655 clear.write("sh " + DIGEST_HOME + "/DIGEST_clear.sh " + outputPrefix)
656
657 clear.close()
658
659
660 commandeLine = "ccc_msub -a " + clusteringID + "," + clusteringID2 + " clear.sh"
661 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
662 result=process.communicate()
663 clearID = result[0].split()[3]
664 if("\n" in clearID):
665 clearID = clearID[0:len(clearID)-1]
666 print commandeLine + " --> job=" + clearID + result[1]
667
668
669 if(Arguments.loop==False):
670 sys.exit(clearID)
671
672
673
674
675
676 check = open("check.sh", "wb")
677
678
679 check.write("#!/bin/bash \n")
680 check.write("#MSUB -q " + Arguments.Queue + " \n")
681 check.write("#MSUB -o check.o \n")
682 check.write("#MSUB -e check.e \n")
683 check.write("#MSUB -c " + str(Arguments.Processors) + "\n")
684 if(Arguments.projid):
685 check.write("#MSUB -A " + Arguments.projid + " \n")
686 check.write("set -x \n")
687
688
689 check.write("module load extenv/fg\n")
690 check.write("module load bwa\n")
691
692
693 check.write("sh " + DIGEST_HOME + "/DIGEST_check.sh " + Arguments.Prefix + " " + str(loopNb) + " " + \
694 Arguments.clutserThreshold + " " + str(Arguments.Processors) + " " + Arguments.minAlig)
695
696 check.close()
697
698
699 commandeLine = "ccc_msub -a " + clearID + " check.sh"
700 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
701 result=process.communicate()
702 checkID = result[0].split()[3]
703 if("\n" in checkID):
704 checkID = checkID[0:len(checkID)-1]
705 print commandeLine + " --> job=" + checkID + result[1]
706
707
708
709
710
711 newLoop = open("newLoop.sh", "wb")
712
713
714 newLoop.write("#!/bin/bash \n")
715 newLoop.write("#MSUB -q " + Arguments.Queue + " \n")
716 newLoop.write("#MSUB -o newLoop.o \n")
717 newLoop.write("#MSUB -e newLoop.e \n")
718 if(Arguments.projid):
719 newLoop.write("#MSUB -A " + Arguments.projid + " \n")
720 newLoop.write("set -x \n")
721
722
723 newLoop.write("module load extenv/fg\n")
724 newLoop.write("module load bwa samtools\n")
725 newLoop.write("module load python/2.7.3\n")
726
727
728 newLoop.write("if [ ! -f " + Arguments.Prefix + "_DIGEST-loop.txt ]\nthen\n")
729 newLoop.write("\texit 1\n")
730 newLoop.write("fi\n")
731
732
733 newLoop.write("python " + DIGEST_HOME + "/DIGEST.py -source " + DIGEST_HOME + \
734 " -R newRef.fasta -1 paired1.fasta -2 paired2.fasta -o " + Arguments.Prefix + \
735 " -q " + Arguments.Queue + " -t " + str(Arguments.Processors) + " -N " + str(Arguments.Nodes) + \
736 " -k " + Arguments.kmerLength + " -qual " + Arguments.minMAPQ + " -n " + str(Arguments.limLength) + \
737 " -c " + Arguments.clutserThreshold + " -aS " + Arguments.minAlig + " -A " + Arguments.projid + " --loop")
738
739 newLoop.close()
740
741
742 commandeLine = "ccc_msub -a " + checkID + " newLoop.sh"
743 process = subprocess.Popen(commandeLine,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
744 result=process.communicate()
745 loopID = result[0].split()[3]
746 if("\n" in loopID):
747 loopID = loopID[0:len(loopID)-1]
748 print commandeLine + " --> job=" + loopID + result[1]
749
750
751 if __name__ == "__main__":
752 main()
753