1
2
3
4 import os, sys
5 import csv
6 import argparse
7 import re
8 import subprocess
9
10 __doc__="""
11 Class and functions use by the DIGEST workflow
12
13 @requires: jobArrayLSFlauncher_modif.sh
14 @requires: mpirun-genoscope-modif.sh
15 """
16
26
28 """
29 Create class able to launch a list of jobs on SLURM or LSF on a Job Scheduler
30 @cvar queue : queue name to execute jobs
31 @type queue : string
32 @cvar nbProcesseur : number of processor used for each jobs
33 @type nbProcesseur : integer
34 @cvar jobName : job name
35 @type jobName : string
36 @cvar mode : LSF or SLURM
37 @type mode : string
38 @cvar option : resource requirements (default:'select[defined(mem64)],span[hosts=1]')
39 @type option : string
40 """
41
42 - def __init__(self, queue, nbProcesseur, jobName, mode, option="'select[defined(mem64)],span[hosts=1],rh5'"):
43 """
44 Initialize the jobLauncher class
45 @param queue : queue name to execute jobs
46 @type queue : string
47 @param nbProcesseur : number of processor used for each jobs
48 @type nbProcesseur : integer
49 @param jobName : job name
50 @type jobName : string
51 @param mode : LSF or SLURM
52 @type mode : string
53 @param option : resource requirements (default:'select[defined(mem64)],span[hosts=1]')
54 @type option : string
55 """
56 self.queue = queue
57 self.nbProcesseur = nbProcesseur
58 self.jobName = jobName
59 self.option = option
60 self.mode = mode
61
63 """
64 Launch a list of jobs on SLURM or LSF on a Job Scheduler
65 @param jobfile : file name containing one commande by line, one job by line will be launch
66 @type jobfile : string
67 """
68 if(self.mode=="LSF"):
69 commande = '$DIGEST_HOME/scripts/jobArrayLSFlauncher_modif.sh ' + ' -f ' + jobfile + ' -j ' + self.jobName + ' -q ' + self.queue + ' -p ' + str(self.nbProcesseur) + ' -r ' + self.option
70 process = subprocess.Popen(commande, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
71 process.communicate()
72
73 elif(self.mode=="SLURM"):
74 print "not working yet ...exit"
75 sys.exit()
76
78 """
79 Launch a single job on SLURM or LSF on a Job Scheduler
80 @param command : command line to be launched
81 @type command : string
82 """
83 if(self.mode=="LSF"):
84 head = "bsub -R " + self.option + " -q " + self.queue + " -n " + str(self.nbProcesseur) + " -J " + self.jobName + " -o " + self.jobName + ".out -e " + self.jobName+ ".err "
85 if('>' not in command):
86 commandLine = "'" + command + " > " + self.jobName + ".trash'"
87 else:
88 commandLine = "'" + command + "'"
89 process = subprocess.Popen(head + commandLine, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
90 process.communicate()
91
92 elif(self.mode=="SLURM"):
93
94 file = open("commande.sh", "wb")
95 file.write('#!bin/bash\n' + command)
96 commandLine = "ccc_msub -q " + self.queue + " -r " + self.jobName + " -o " + self.jobName + ".out -e " + self.jobName+ ".err commande.sh"
97
98 commandLine = "rm commande.sh"
99 print "not working yet ...exit"
100 sys.exit()
101
103 """
104 Launch a mpi job on SLURM or LSF on a Job Scheduler
105 @param command : command line to be launched
106 @type command : string
107 """
108 if(self.mode=="LSF"):
109 head = "$DIGEST_HOME/scripts/mpirun-genoscope-modif.sh "
110 jobFile = open(self.jobName + ".tmp", "wb")
111 if('>' not in command):
112 commandLine = head + " " + command + " > " + self.jobName + ".trash"
113 else:
114 commandLine = head + " " + command
115 jobFile.write(commandLine + "\n")
116 jobFile.close()
117 self.jobArrayLauncher(jobFile.name)
118 process = subprocess.Popen('rm ' + self.jobName + '.tmp', stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
119 process.communicate()
120
121 elif(self.mode=="SLURM"):
122
123 print "not working yet ...exit"
124 sys.exit()
125
127 """
128 New sequence object
129 @cvar ID : sequence ID
130 @type ID : string
131 @cvar seq : nucleotide sequence
132 @type seq : string
133 @cvar length : sequence length
134 @type length : integer
135 """
136
138 """
139 Initialize the sequence class
140 @param header : sequence header
141 @type header : string
142 @param nucl : nucleotide sequence
143 @type nucl : string
144 """
145 self.ID = header
146 self.seq = nucl
147 self.length = len(nucl)
148
150 """
151 New ORF object from MetaGene output
152 @cvar posSTART : ORF start position in sequence
153 @type posSTART : integer
154 @cvar posEND : ORF stop position in sequence
155 @type posEND : integer
156 @cvar strand : strand (+ or -)
157 @type strand : character
158 @cvar frame : frame (1,2 or 3)
159 @type frame : integer
160 @cvar score : ORF score
161 @type score : float
162 @cvar statut : statut partial or complete
163 @type statut : string
164 """
165
167 """
168 Initialize the ORF class
169 @param ligne : ORF line from a MetaGene output file
170 @type ligne : list
171 """
172 self.posSTART = int(ligne[1])
173 self.posEND = int(ligne[2])
174 self.strand = ligne[3]
175 self.frame = int(ligne[4])
176 self.score = float(ligne[6])
177 if(ligne[5]=="11"):
178 self.statut = "complete"
179 else:
180 self.statut = "partial"
181
183 """
184 New contig ORF object from MetaGene output
185 @cvar GC : GC% in sequence
186 @type GC : float
187 @cvar model : ORF model (bacteria, archaea or eukaryote)
188 @type model : string
189 @cvar ORFlist : list of ORF objects
190 @type ORFlist : list
191 """
192
194 """
195 Initialize the ContigORF class
196 @param ligne1 : 1st line of a block in a MetaGene output file
197 @type ligne1 : list
198 @param ligne2 : 2nd line of a block in a MetaGene output file
199 @type ligne2 : list
200 """
201 pourc = ligne1.split(" ")[3]
202 self.GC = float(pourc[0:len(pourc)-1])
203 self.ORFlist = []
204
206 """
207 New alignment CIGAR object
208 @cvar SumDeletion : number of deletion
209 @type SumDeletion : integer
210 @cvar SumInsertion : number of insertion
211 @type SumInsertion : integer
212 @cvar AlignLength : number of match and mismatch
213 @type AlignLength : integer
214 @cvar Code : code containing int and char compound the CIGAR
215 @type Code : list
216 """
217
219 """
220 Initialize Cigar class
221 @param strcigar : CIGAR from a SAM line
222 @type strcigar : string
223 """
224
225 sumDeletion = 0
226 sumInsertion = 0
227 alignLength = 0
228 code = []
229
230 i = 0
231 j = 0
232 number = 0
233 chiffres = ('1','2','3','4','5','6','7','8','9','0')
234
235 if(len(strcigar)<2):
236 code="0"
237
238 else :
239 while(i<len(strcigar)):
240
241 j = i
242 while(strcigar[j+1] in chiffres):
243 j=j+1
244 number = int(strcigar[i:j+1])
245 code.append(number)
246 i = j
247
248 if(strcigar[i+1]=='M'):
249 alignLength = alignLength + number
250 code.append('M')
251 i=i+2
252
253 elif(strcigar[i+1]=='S'):
254 code.append('S')
255 i=i+2
256
257 elif(strcigar[i+1]=='D'):
258 code.append('D')
259 sumDeletion = sumDeletion + number
260 i=i+2
261
262 elif(strcigar[i+1]=='I'):
263 code.append('I')
264 sumInsertion = sumInsertion + number
265 i=i+2
266
267 self.SumDeletion = sumDeletion
268 self.SumInsertion = sumInsertion
269 self.AlignLength = alignLength
270 self.Code = code
271
273 """
274 New alignmentSAM object from a line of a SAM file,
275 see SAM format for more informations
276 @cvar Query : query sequence ID
277 @type Query : string
278 @cvar Flag : SAM flag
279 @type Flag : integer
280 @cvar Subject : Subject sequence ID
281 @type Subject : string
282 @cvar Pos : start alignment position on the subject sequence
283 @type Pos : integer
284 @cvar MAPQ : mapping quality score
285 @type MAPQ : integer
286 @cvar CIGAR : Cigar
287 @type CIGAR : Cigar
288 @cvar RNEXT : Reference sequence name of the primary alignment of the NEXT read
289 @type RNEXT : string
290 @cvar PNEXT : Position of the primary alignment of the NEXT read
291 @type PNEXT : string
292 @cvar Length : alignment length
293 @type Length : integer
294 @cvar QuerySequence : query nucleotide sequence
295 @type QuerySequence : string
296 """
297
299 """
300 Initialize the alignmentSAM class
301 @param ligne : SAM alignment line
302 @type ligne : list
303 """
304 self.Query = ligne[0]
305 self.Flag = int(ligne[1])
306 self.Subject = ligne[2]
307 self.Pos = int(ligne[3])
308 self.MAPQ = int(ligne[4])
309 self.CIGAR = Cigar(ligne[5])
310 if(ligne[6]=='*'):
311 self.RNEXT = None
312 else:
313 self.RNEXT = ligne[6]
314 if(ligne[7]=='*'):
315 self.PNEXT = None
316 else:
317 self.PNEXT = ligne[7]
318 self.Length = int(ligne[8])
319 self.QuerySequence = ligne[9]
320
322 """
323 New cluster object
324 @cvar listseq : list of clusterSequence object compound the cluster
325 @type listseq : list
326 @cvar ref : ID of the reference sequence
327 @type ref : string
328 @cvar length : cluster length
329 @type length : integer
330 @cvar meanPourcSimilarity : pourcent mean of similarity in the cluster
331 @type meanPourcSimilarity : float
332 """
333
335 """
336 Initialize the cluster class
337 """
338 self.listseq = []
339 self.ref = ""
340 self.length = 0
341 self.meanPourcSimilarity = 0.0
342
344 """Compute the length of the cluster"""
345 self.length = len(self.listseq)
346
348 """Add a clusterSequence in the list
349 @param clusterSequence : a clusterSequence object
350 @type clusterSequence : clusterSequence
351 """
352 self.listseq.append(clusterSequence)
353
355 """Compute the pourcent mean of similarity"""
356 sum = 0.0
357 nbSeq = 0
358 for clusterSequence in self.listseq :
359 nbSeq += 1
360 sum = sum + clusterSequence.pourcSimilarity
361 self.meanPourcSimilarity = sum/float(nbSeq)
362
364 """
365 New sequence in a cluster object
366 @cvar length : sequence length
367 @type length : integer
368 @cvar header : sequence header
369 @type header : string
370 @cvar pourcSimilarity : pourcent of similarity of the sequence against the cluster reference sequence
371 @type pourcSimilarity : float
372 """
373
374 - def __init__(self, length, header, pourc):
375 """
376 @param length : sequence length
377 @type length : integer
378 @param header : sequence header
379 @type header : string
380 @param pourc : pourcent of similarity of the sequence against the cluster reference sequence
381 @type pourc : float
382 """
383 self.length = length
384 self.header = header
385 self.pourcSimilarity = pourc
386
388 """
389 Check the existence of a file.
390 @param fname: file name
391 @type fname: string
392 @return: 1 if the file is present, 0 otherwise
393 @rtype: integer
394 """
395 try:
396 f = open(fname,'r')
397 f.close()
398 return 1
399 except:
400 return 0
401
403 """
404 parse .clstr file from cd hit
405 @param file: file name
406 @type file: string
407 @return: list of cluster object
408 @rtype: list
409 """
410
411 clstr = open(file, "rb")
412 lignes = clstr.readlines()
413 clstr.close()
414
415 clusterList = []
416
417 flag = 0
418
419 for ligne in lignes :
420 if ligne[0]=='>':
421 if flag != 0 :
422 C.computeLen()
423 C.computeMeanPourc()
424 clusterList.append(C)
425
426 C = cluster()
427 flag = 1
428 else :
429 l = ligne.split("\t")[1]
430 l = l.split(" ")
431 length = int(l[0][0:len(l[0])-3])
432 header = l[1][1:len(l[1])-3]
433 if(l[2]=='*\n'):
434 C.ref = header
435 pourc = 100.0
436 else:
437 pourc = float(l[3][0:len(l[3])-2])
438 Cseq = clusterSequence(length ,header ,pourc)
439 C.addSequence(Cseq)
440
441 return clusterList
442
444 """
445 Fasta parser
446 @param file: file name
447 @type file: string
448 @return: dictionnary of sequence object with sequence ID as key
449 @rtype: dictionary
450 """
451
452 fasta = open(file, "rb")
453 lignes = fasta.readlines()
454 fasta.close()
455
456 dict = {}
457 seq = ""
458
459 for ligne in lignes:
460 if(ligne[0]=='>'):
461 if(len(seq)>0):
462 seque = sequence(ID,seq)
463 dict[ID] = seque
464 seq = ""
465 ID = ligne[1:len(ligne)-1]
466 ID = ID.split(" ")[0]
467 else:
468 seq = seq + ligne[0:len(ligne)-1]
469
470 seque = sequence(ID,seq)
471 dict[ID] = seque
472
473 return dict
474
475 -def geneExtended(orfSTART, orfEND, alignSTART, alignEND, orfSTATUT):
476 """
477 Check if a gene has been extended
478 @param orfSTART : ORF start position in sequence extended
479 @type orfSTART : integer
480 @param orfEND : ORF end position in sequence extended
481 @type orfEND : integer
482 @param alignSTART : alignement start position of sequence on contig
483 @type alignSTART : integer
484 @param alignEND : alignement end position of sequence on contig
485 @type alignEND : integer
486 @param orfSTATUT : ORF complete or partial
487 @type orfSTATUT : string
488 @return: True if the gene is completed, Fasle otherwise
489 @rtype: booleen
490 """
491
492 orfLength = orfEND - orfSTART
493 alignLength = alignEND - alignSTART
494
495 if ((alignSTART>=orfSTART and alignSTART<=orfEND) and (alignEND>=orfSTART and alignEND<=orfEND)) and (orfLength > alignLength) and ("complete" in orfSTATUT):
496
497 return True
498
499 elif ((alignSTART>orfSTART and alignSTART<=orfEND) and alignEND>orfEND) and ("complete" in orfSTATUT):
500
501 return True
502
503 elif ((alignEND>=orfSTART and alignEND<orfEND) and alignSTART<orfSTART) and ("complete" in orfSTATUT):
504
505 return True
506
507 else :
508 return False
509
510 -def geneSeen(orfSTART, orfEND, alignSTART, alignEND):
511 """
512 Check if a gene has been seen
513 @param orfSTART : ORF start position in sequence extended
514 @type orfSTART : integer
515 @param orfEND : ORF end position in sequence extended
516 @type orfEND : integer
517 @param alignSTART : alignement start position of sequence on contig
518 @type alignSTART : integer
519 @param alignEND : alignement end position of sequence on contig
520 @type alignEND : integer
521 @return: True if the gene is seen, Fasle otherwise
522 @rtype: booleen
523 """
524
525
526 orfLength = orfEND - orfSTART
527 alignLength = alignEND - alignSTART
528
529 if ((alignSTART>=orfSTART and alignSTART<=orfEND) or (alignEND>=orfSTART and alignEND<=orfEND)):
530 return True
531
532 else :
533 return False
534
536 """
537 make the reverse complement of a sequence
538 @param sequen : nucleotide sequence
539 @type sequen : string
540 @return: reverse complement of sequence
541 @rtype: string
542 """
543
544 seqReverse = []
545
546 for base in sequen :
547 if(base.upper() == 'A') : seqReverse.insert(0,'T')
548 elif(base.upper() == 'T') : seqReverse.insert(0,'A')
549 elif(base.upper() == 'G') : seqReverse.insert(0,'C')
550 elif(base.upper() == 'C') : seqReverse.insert(0,'G')
551 else : seqReverse.append(base.upper())
552
553 return (''.join(seqReverse))
554
600
602 """
603 From an alignment and a length, compute the start and stop alignment position
604 @param alignment : alignmentSAM object
605 @type alignment : alignmentSAM
606 @param subjectLength : subject sequence length
607 @type subjectLength : integer
608 @return: a list with the position start and stop of the alignment (if start = -1 --> alignment start befor the subject sequence ; if stop = -2 --> alignment stop after the subject sequence)
609 @rtype: list
610 """
611
612 if(alignment.CIGAR.Code[1] == "S" and alignment.Pos == 1):
613 posStart = -1
614 else:
615 posStart = alignment.Pos
616
617 posEnd = alignment.Pos + alignment.CIGAR.AlignLength + alignment.CIGAR.SumDeletion -1
618
619 if(alignment.CIGAR.Code[len(alignment.CIGAR.Code)-1] == "S" and posEnd == subjectLength):
620 posEnd = -2
621
622 list = [posStart,posEnd]
623
624 return list
625
627 """
628 Compute the int number of lines from a file
629 @param file : file name
630 @type file : string
631 @return: number of lines
632 @rtype: integer
633 """
634 out = os.popen("wc -l " + file, "r").read()
635 out = int(out.split(" ")[0])
636 return out
637
639 """
640 Compute the number of sequences in a FASTA file
641 @param file : file name
642 @type file : string
643 @return: number of sequences
644 @rtype: integer
645 """
646 out = os.popen("grep -c '>' " + file, "r").read()
647 out = int(out.split(" ")[0])
648 return out
649
650 -def writeORF(ORFlist,prefix,ID,sequence,n):
651 """
652 write ORFs in PREFIX_complete.fasta file or PREFIX_partial.fasta file
653 @param ORFlist : list of ORF object
654 @type ORFlist : list
655 @param prefix : prefix of output file name
656 @type prefix : string
657 @param ID : sequence ID
658 @type ID : string
659 @param sequence : nucleotide sequence
660 @type sequence : string
661 @param n : limte length for partial ORF
662 @type n : integer
663 """
664
665 i = 0
666
667 for ORF in ORFlist:
668
669 if(ORF.statut == "complete"):
670 outfasta = open(prefix + "_complete.fasta","a")
671 outfasta.write(">" + ID + "_ORF" + str(i) + "_complete\n" + sequence[ORF.posSTART-1:ORF.posEND] + "\n")
672 outfasta.close()
673 i += 1
674
675 elif(ORF.posEND - ORF.posSTART > n):
676 outfasta = open(prefix + "_partial.fasta","a")
677 outfasta.write(">" + ID + "_ORF" + str(i) + "_partial\n" + sequence[ORF.posSTART-1:ORF.posEND] + "\n")
678 outfasta.close()
679 i += 1
680