DIGEST_functions

1 #!/usr/bin/python 2 # -*- coding: iso-8859-1 -*- 3 4 import os, sys 5 import csv 6 import argparse 7 import re 8 import subprocess 9 10 __doc__=""" 11 Class and functions use by the DIGEST workflow 12 13 @requires: jobArrayLSFlauncher_modif.sh 14 @requires: mpirun-genoscope-modif.sh 15 """ 16

17 -class MyDialect(csv.Dialect):

18 """csv class use to read csv files""" 19 delimiter = "\t" 20 quotechar = None 21 escapechar = "\\" 22 doublequote = False 23 lineterminator = "\n" 24 quoting = csv.QUOTE_NONE 25 skipinitialspace = False

26

27 -class jobLauncher(object) :

28 """ 29 Create class able to launch a list of jobs on SLURM or LSF on a Job Scheduler 30 @cvar queue : queue name to execute jobs 31 @type queue : string 32 @cvar nbProcesseur : number of processor used for each jobs 33 @type nbProcesseur : integer 34 @cvar jobName : job name 35 @type jobName : string 36 @cvar mode : LSF or SLURM 37 @type mode : string 38 @cvar option : resource requirements (default:'select[defined(mem64)],span[hosts=1]') 39 @type option : string 40 """ 41

42 - def __init__(self, queue, nbProcesseur, jobName, mode, option="'select[defined(mem64)],span[hosts=1],rh5'"):

43 """ 44 Initialize the jobLauncher class 45 @param queue : queue name to execute jobs 46 @type queue : string 47 @param nbProcesseur : number of processor used for each jobs 48 @type nbProcesseur : integer 49 @param jobName : job name 50 @type jobName : string 51 @param mode : LSF or SLURM 52 @type mode : string 53 @param option : resource requirements (default:'select[defined(mem64)],span[hosts=1]') 54 @type option : string 55 """ 56 self.queue = queue 57 self.nbProcesseur = nbProcesseur 58 self.jobName = jobName 59 self.option = option 60 self.mode = mode

61

62 - def jobArrayLauncher(self, jobfile):

63 """ 64 Launch a list of jobs on SLURM or LSF on a Job Scheduler 65 @param jobfile : file name containing one commande by line, one job by line will be launch 66 @type jobfile : string 67 """ 68 if(self.mode=="LSF"): 69 commande = '$DIGEST_HOME/scripts/jobArrayLSFlauncher_modif.sh ' + ' -f ' + jobfile + ' -j ' + self.jobName + ' -q ' + self.queue + ' -p ' + str(self.nbProcesseur) + ' -r ' + self.option 70 process = subprocess.Popen(commande, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 71 process.communicate() 72 73 elif(self.mode=="SLURM"):#incomplete 74 print "not working yet ...exit" 75 sys.exit()

76

77 - def jobOneLauncher(self, command):

78 """ 79 Launch a single job on SLURM or LSF on a Job Scheduler 80 @param command : command line to be launched 81 @type command : string 82 """ 83 if(self.mode=="LSF"): 84 head = "bsub -R " + self.option + " -q " + self.queue + " -n " + str(self.nbProcesseur) + " -J " + self.jobName + " -o " + self.jobName + ".out -e " + self.jobName+ ".err " 85 if('>' not in command): 86 commandLine = "'" + command + " > " + self.jobName + ".trash'" 87 else: 88 commandLine = "'" + command + "'" 89 process = subprocess.Popen(head + commandLine, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 90 process.communicate() 91 92 elif(self.mode=="SLURM"):#incomplete 93 # ccc_msub 94 file = open("commande.sh", "wb") 95 file.write('#!bin/bash\n' + command) 96 commandLine = "ccc_msub -q " + self.queue + " -r " + self.jobName + " -o " + self.jobName + ".out -e " + self.jobName+ ".err commande.sh" 97 98 commandLine = "rm commande.sh" 99 print "not working yet ...exit" 100 sys.exit()

101

102 - def mpiRun(self, command):

103 """ 104 Launch a mpi job on SLURM or LSF on a Job Scheduler 105 @param command : command line to be launched 106 @type command : string 107 """ 108 if(self.mode=="LSF"): 109 head = "$DIGEST_HOME/scripts/mpirun-genoscope-modif.sh " 110 jobFile = open(self.jobName + ".tmp", "wb") 111 if('>' not in command): 112 commandLine = head + " " + command + " > " + self.jobName + ".trash" 113 else: 114 commandLine = head + " " + command 115 jobFile.write(commandLine + "\n") 116 jobFile.close() 117 self.jobArrayLauncher(jobFile.name) 118 process = subprocess.Popen('rm ' + self.jobName + '.tmp', stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 119 process.communicate() 120 121 elif(self.mode=="SLURM"):#incomplete 122 # ccc_mprun 123 print "not working yet ...exit" 124 sys.exit()

125

126 -class sequence(object) :

127 """ 128 New sequence object 129 @cvar ID : sequence ID 130 @type ID : string 131 @cvar seq : nucleotide sequence 132 @type seq : string 133 @cvar length : sequence length 134 @type length : integer 135 """ 136

137 - def __init__(self, header, nucl):

138 """ 139 Initialize the sequence class 140 @param header : sequence header 141 @type header : string 142 @param nucl : nucleotide sequence 143 @type nucl : string 144 """ 145 self.ID = header 146 self.seq = nucl 147 self.length = len(nucl)

148

149 -class ORF(object) :

150 """ 151 New ORF object from MetaGene output 152 @cvar posSTART : ORF start position in sequence 153 @type posSTART : integer 154 @cvar posEND : ORF stop position in sequence 155 @type posEND : integer 156 @cvar strand : strand (+ or -) 157 @type strand : character 158 @cvar frame : frame (1,2 or 3) 159 @type frame : integer 160 @cvar score : ORF score 161 @type score : float 162 @cvar statut : statut partial or complete 163 @type statut : string 164 """ 165

166 - def __init__(self, ligne):

167 """ 168 Initialize the ORF class 169 @param ligne : ORF line from a MetaGene output file 170 @type ligne : list 171 """ 172 self.posSTART = int(ligne[1]) 173 self.posEND = int(ligne[2]) 174 self.strand = ligne[3] 175 self.frame = int(ligne[4]) 176 self.score = float(ligne[6]) 177 if(ligne[5]=="11"): 178 self.statut = "complete" 179 else: 180 self.statut = "partial"

181

182 -class ContigORF(object) :

183 """ 184 New contig ORF object from MetaGene output 185 @cvar GC : GC% in sequence 186 @type GC : float 187 @cvar model : ORF model (bacteria, archaea or eukaryote) 188 @type model : string 189 @cvar ORFlist : list of ORF objects 190 @type ORFlist : list 191 """ 192

193 - def __init__(self, ligne1, ligne2):

194 """ 195 Initialize the ContigORF class 196 @param ligne1 : 1st line of a block in a MetaGene output file 197 @type ligne1 : list 198 @param ligne2 : 2nd line of a block in a MetaGene output file 199 @type ligne2 : list 200 """ 201 pourc = ligne1.split(" ")[3] 202 self.GC = float(pourc[0:len(pourc)-1]) 203 self.ORFlist = []

204

205 -class Cigar(object) :

206 """ 207 New alignment CIGAR object 208 @cvar SumDeletion : number of deletion 209 @type SumDeletion : integer 210 @cvar SumInsertion : number of insertion 211 @type SumInsertion : integer 212 @cvar AlignLength : number of match and mismatch 213 @type AlignLength : integer 214 @cvar Code : code containing int and char compound the CIGAR 215 @type Code : list 216 """ 217

218 - def __init__(self, strcigar):

219 """ 220 Initialize Cigar class 221 @param strcigar : CIGAR from a SAM line 222 @type strcigar : string 223 """ 224 225 sumDeletion = 0 226 sumInsertion = 0 227 alignLength = 0 228 code = [] 229 230 i = 0 231 j = 0 232 number = 0 233 chiffres = ('1','2','3','4','5','6','7','8','9','0') 234 235 if(len(strcigar)<2): # if no mapping 236 code="0" 237 238 else : 239 while(i<len(strcigar)): 240 241 j = i 242 while(strcigar[j+1] in chiffres): 243 j=j+1 244 number = int(strcigar[i:j+1]) 245 code.append(number) 246 i = j 247 248 if(strcigar[i+1]=='M'): 249 alignLength = alignLength + number 250 code.append('M') 251 i=i+2 252 253 elif(strcigar[i+1]=='S'): 254 code.append('S') 255 i=i+2 256 257 elif(strcigar[i+1]=='D'): 258 code.append('D') 259 sumDeletion = sumDeletion + number 260 i=i+2 261 262 elif(strcigar[i+1]=='I'): 263 code.append('I') 264 sumInsertion = sumInsertion + number 265 i=i+2 266 267 self.SumDeletion = sumDeletion 268 self.SumInsertion = sumInsertion 269 self.AlignLength = alignLength 270 self.Code = code

271

272 -class alignmentSAM(object) :

273 """ 274 New alignmentSAM object from a line of a SAM file, 275 see SAM format for more informations 276 @cvar Query : query sequence ID 277 @type Query : string 278 @cvar Flag : SAM flag 279 @type Flag : integer 280 @cvar Subject : Subject sequence ID 281 @type Subject : string 282 @cvar Pos : start alignment position on the subject sequence 283 @type Pos : integer 284 @cvar MAPQ : mapping quality score 285 @type MAPQ : integer 286 @cvar CIGAR : Cigar 287 @type CIGAR : Cigar 288 @cvar RNEXT : Reference sequence name of the primary alignment of the NEXT read 289 @type RNEXT : string 290 @cvar PNEXT : Position of the primary alignment of the NEXT read 291 @type PNEXT : string 292 @cvar Length : alignment length 293 @type Length : integer 294 @cvar QuerySequence : query nucleotide sequence 295 @type QuerySequence : string 296 """ 297

298 - def __init__(self, ligne):

299 """ 300 Initialize the alignmentSAM class 301 @param ligne : SAM alignment line 302 @type ligne : list 303 """ 304 self.Query = ligne[0] 305 self.Flag = int(ligne[1]) 306 self.Subject = ligne[2] 307 self.Pos = int(ligne[3]) 308 self.MAPQ = int(ligne[4]) 309 self.CIGAR = Cigar(ligne[5]) 310 if(ligne[6]=='*'): 311 self.RNEXT = None 312 else: 313 self.RNEXT = ligne[6] 314 if(ligne[7]=='*'): 315 self.PNEXT = None 316 else: 317 self.PNEXT = ligne[7] 318 self.Length = int(ligne[8]) 319 self.QuerySequence = ligne[9]

320

321 -class cluster(object) :

322 """ 323 New cluster object 324 @cvar listseq : list of clusterSequence object compound the cluster 325 @type listseq : list 326 @cvar ref : ID of the reference sequence 327 @type ref : string 328 @cvar length : cluster length 329 @type length : integer 330 @cvar meanPourcSimilarity : pourcent mean of similarity in the cluster 331 @type meanPourcSimilarity : float 332 """ 333

334 - def __init__(self):

335 """ 336 Initialize the cluster class 337 """ 338 self.listseq = [] 339 self.ref = "" 340 self.length = 0 341 self.meanPourcSimilarity = 0.0

342

343 - def computeLen(self):

344 """Compute the length of the cluster""" 345 self.length = len(self.listseq)

346

347 - def addSequence(self,clusterSequence):

348 """Add a clusterSequence in the list 349 @param clusterSequence : a clusterSequence object 350 @type clusterSequence : clusterSequence 351 """ 352 self.listseq.append(clusterSequence)

353

354 - def computeMeanPourc(self):

355 """Compute the pourcent mean of similarity""" 356 sum = 0.0 357 nbSeq = 0 358 for clusterSequence in self.listseq : 359 nbSeq += 1 360 sum = sum + clusterSequence.pourcSimilarity 361 self.meanPourcSimilarity = sum/float(nbSeq)

362

363 -class clusterSequence(object) :

364 """ 365 New sequence in a cluster object 366 @cvar length : sequence length 367 @type length : integer 368 @cvar header : sequence header 369 @type header : string 370 @cvar pourcSimilarity : pourcent of similarity of the sequence against the cluster reference sequence 371 @type pourcSimilarity : float 372 """ 373

374 - def __init__(self, length, header, pourc):

375 """ 376 @param length : sequence length 377 @type length : integer 378 @param header : sequence header 379 @type header : string 380 @param pourc : pourcent of similarity of the sequence against the cluster reference sequence 381 @type pourc : float 382 """ 383 self.length = length 384 self.header = header 385 self.pourcSimilarity = pourc

386

387 -def exist(fname):

388 """ 389 Check the existence of a file. 390 @param fname: file name 391 @type fname: string 392 @return: 1 if the file is present, 0 otherwise 393 @rtype: integer 394 """ 395 try: 396 f = open(fname,'r') 397 f.close() 398 return 1 399 except: 400 return 0

401

402 -def clstrParser(file):

403 """ 404 parse .clstr file from cd hit 405 @param file: file name 406 @type file: string 407 @return: list of cluster object 408 @rtype: list 409 """ 410 411 clstr = open(file, "rb") 412 lignes = clstr.readlines() 413 clstr.close() 414 415 clusterList = [] 416 417 flag = 0 418 419 for ligne in lignes : 420 if ligne[0]=='>': 421 if flag != 0 : 422 C.computeLen() # compute length of the cluster 423 C.computeMeanPourc() 424 clusterList.append(C) 425 426 C = cluster() # new cluster 427 flag = 1 428 else : 429 l = ligne.split("\t")[1] 430 l = l.split(" ") 431 length = int(l[0][0:len(l[0])-3]) 432 header = l[1][1:len(l[1])-3] 433 if(l[2]=='*\n'): 434 C.ref = header 435 pourc = 100.0 436 else: 437 pourc = float(l[3][0:len(l[3])-2]) 438 Cseq = clusterSequence(length ,header ,pourc) 439 C.addSequence(Cseq) 440 441 return clusterList

442

443 -def fastaReader(file) :

444 """ 445 Fasta parser 446 @param file: file name 447 @type file: string 448 @return: dictionnary of sequence object with sequence ID as key 449 @rtype: dictionary 450 """ 451 452 fasta = open(file, "rb") 453 lignes = fasta.readlines() 454 fasta.close() 455 456 dict = {} 457 seq = "" 458 459 for ligne in lignes: 460 if(ligne[0]=='>'): 461 if(len(seq)>0): #sequence not empty 462 seque = sequence(ID,seq) 463 dict[ID] = seque 464 seq = "" 465 ID = ligne[1:len(ligne)-1] 466 ID = ID.split(" ")[0] # keep only header to the first space 467 else: 468 seq = seq + ligne[0:len(ligne)-1] 469 470 seque = sequence(ID,seq) 471 dict[ID] = seque 472 473 return dict

474

475 -def geneExtended(orfSTART, orfEND, alignSTART, alignEND, orfSTATUT):

476 """ 477 Check if a gene has been extended 478 @param orfSTART : ORF start position in sequence extended 479 @type orfSTART : integer 480 @param orfEND : ORF end position in sequence extended 481 @type orfEND : integer 482 @param alignSTART : alignement start position of sequence on contig 483 @type alignSTART : integer 484 @param alignEND : alignement end position of sequence on contig 485 @type alignEND : integer 486 @param orfSTATUT : ORF complete or partial 487 @type orfSTATUT : string 488 @return: True if the gene is completed, Fasle otherwise 489 @rtype: booleen 490 """ 491 492 orfLength = orfEND - orfSTART 493 alignLength = alignEND - alignSTART 494 495 if ((alignSTART>=orfSTART and alignSTART<=orfEND) and (alignEND>=orfSTART and alignEND<=orfEND)) and (orfLength > alignLength) and ("complete" in orfSTATUT): 496 # gene completed 3' and 5' 497 return True 498 499 elif ((alignSTART>orfSTART and alignSTART<=orfEND) and alignEND>orfEND) and ("complete" in orfSTATUT): 500 # gene completed 5' 501 return True 502 503 elif ((alignEND>=orfSTART and alignEND<orfEND) and alignSTART<orfSTART) and ("complete" in orfSTATUT): 504 # gene completed 3' 505 return True 506 507 else : 508 return False

509

510 -def geneSeen(orfSTART, orfEND, alignSTART, alignEND):

511 """ 512 Check if a gene has been seen 513 @param orfSTART : ORF start position in sequence extended 514 @type orfSTART : integer 515 @param orfEND : ORF end position in sequence extended 516 @type orfEND : integer 517 @param alignSTART : alignement start position of sequence on contig 518 @type alignSTART : integer 519 @param alignEND : alignement end position of sequence on contig 520 @type alignEND : integer 521 @return: True if the gene is seen, Fasle otherwise 522 @rtype: booleen 523 """ 524 525 526 orfLength = orfEND - orfSTART 527 alignLength = alignEND - alignSTART 528 529 if ((alignSTART>=orfSTART and alignSTART<=orfEND) or (alignEND>=orfSTART and alignEND<=orfEND)): 530 return True 531 532 else : 533 return False

534

535 -def reverseComplement(sequen):

536 """ 537 make the reverse complement of a sequence 538 @param sequen : nucleotide sequence 539 @type sequen : string 540 @return: reverse complement of sequence 541 @rtype: string 542 """ 543 544 seqReverse = [] 545 546 for base in sequen : 547 if(base.upper() == 'A') : seqReverse.insert(0,'T') 548 elif(base.upper() == 'T') : seqReverse.insert(0,'A') 549 elif(base.upper() == 'G') : seqReverse.insert(0,'C') 550 elif(base.upper() == 'C') : seqReverse.insert(0,'G') 551 else : seqReverse.append(base.upper()) 552 553 return (''.join(seqReverse))

554

555 -def metageneParser(file):

556 """ 557 Sotck ORFs of metagene file 558 @param file : file name 559 @type file : string 560 @return: a a dictionnary with contigs IDs as key and contig object as value 561 @rtype: dictionary 562 """ 563 564 mFile = open(file, "rb") 565 metageneReader = csv.reader(mFile,MyDialect()) 566 567 ligneContig = [] 568 dicoContig = {} 569 contigName = "" 570 idLigne = 0 571 572 for ligne in metageneReader: # ORF and positions stored 573 574 if len(ligne)!=0 : # if the line isn't empty 575 576 if "#" in ligne[0] : 577 ligneContig.append(ligne[0]) 578 idLigne += 1 579 580 if idLigne == 1 : 581 if len(contigName) > 0 : 582 583 dicoContig[contigName] = contig 584 contigName = "" 585 586 contigName = ligne[0].split(" ")[1] 587 588 elif idLigne == 3 : 589 contig = ContigORF(ligneContig[1], ligneContig[2]) 590 idLigne = 0 591 ligneContig = [] 592 593 else : 594 orf = ORF(ligne) 595 contig.ORFlist.append(orf) 596 597 dicoContig[contigName] = contig # add last sequence 598 599 return dicoContig

600

601 -def subjectStartStop(alignment, subjectLength):

602 """ 603 From an alignment and a length, compute the start and stop alignment position 604 @param alignment : alignmentSAM object 605 @type alignment : alignmentSAM 606 @param subjectLength : subject sequence length 607 @type subjectLength : integer 608 @return: a list with the position start and stop of the alignment (if start = -1 --> alignment start befor the subject sequence ; if stop = -2 --> alignment stop after the subject sequence) 609 @rtype: list 610 """ 611 612 if(alignment.CIGAR.Code[1] == "S" and alignment.Pos == 1): 613 posStart = -1 614 else: 615 posStart = alignment.Pos 616 617 posEnd = alignment.Pos + alignment.CIGAR.AlignLength + alignment.CIGAR.SumDeletion -1 618 619 if(alignment.CIGAR.Code[len(alignment.CIGAR.Code)-1] == "S" and posEnd == subjectLength): 620 posEnd = -2 621 622 list = [posStart,posEnd] 623 624 return list

625

626 -def fileLineNumber(file):

627 """ 628 Compute the int number of lines from a file 629 @param file : file name 630 @type file : string 631 @return: number of lines 632 @rtype: integer 633 """ 634 out = os.popen("wc -l " + file, "r").read() 635 out = int(out.split(" ")[0]) 636 return out

637

638 -def nbSequenceFasta(file):

639 """ 640 Compute the number of sequences in a FASTA file 641 @param file : file name 642 @type file : string 643 @return: number of sequences 644 @rtype: integer 645 """ 646 out = os.popen("grep -c '>' " + file, "r").read() 647 out = int(out.split(" ")[0]) 648 return out

649

650 -def writeORF(ORFlist,prefix,ID,sequence,n):

651 """ 652 write ORFs in PREFIX_complete.fasta file or PREFIX_partial.fasta file 653 @param ORFlist : list of ORF object 654 @type ORFlist : list 655 @param prefix : prefix of output file name 656 @type prefix : string 657 @param ID : sequence ID 658 @type ID : string 659 @param sequence : nucleotide sequence 660 @type sequence : string 661 @param n : limte length for partial ORF 662 @type n : integer 663 """ 664 665 i = 0 666 667 for ORF in ORFlist: 668 669 if(ORF.statut == "complete"): 670 outfasta = open(prefix + "_complete.fasta","a") 671 outfasta.write(">" + ID + "_ORF" + str(i) + "_complete\n" + sequence[ORF.posSTART-1:ORF.posEND] + "\n") 672 outfasta.close() 673 i += 1 674 675 elif(ORF.posEND - ORF.posSTART > n): 676 outfasta = open(prefix + "_partial.fasta","a") 677 outfasta.write(">" + ID + "_ORF" + str(i) + "_partial\n" + sequence[ORF.posSTART-1:ORF.posEND] + "\n") 678 outfasta.close() 679 i += 1

680

Source Code for Module DIGEST_functions