Module extractORFsequences
Source Code for Module extractORFsequences

  1  #!/usr/bin/python 
  2  # -*- coding: iso-8859-1 -*- 
  3  from DIGEST_functions import * 
  4   
  5  __doc__=""" 
  6  Extract ORF sequences in FASTA format from a MetaGene output file and a target extended FASTA file
  7   
  8  @requires: DIGEST_functions.py (PYTHONPATH) 
  9   
 10  Output: PREFIX_complete.fasta and PREFIX_partial.fasta; target counts are appended to PREFIX_info.txt
 11  """ 
 12           
 13 -def get_parser(): 
 14           
 15          parser = argparse.ArgumentParser(description='Extract ORFs sequences in FASTA format from MetaGene output file and target extended FASTA file') 
 16   
 17          parser.add_argument('-m', action="store", dest='metageneFile',  
 18                                                  type=str, help='metagene file') 
 19                                                   
 20          parser.add_argument('-f', action="store", dest='fasta',  
 21                                                  type=str, help='FASTA file') 
 22                                           
 23          parser.add_argument('-i', action="store", dest='info',  
 24                                                  type=str, help='info.txt file') 
 25                                                   
 26          parser.add_argument('-n', action="store", dest='limLength',  
 27                                                  type=int, default=100, help='min length for partial ORF (default=100)')                                  
 28                                                   
 29          parser.add_argument('-o', action="store", dest='prefix',  
 30                                                  type=str, help='output prefix')                                  
 31                                                   
 32          return parser 
 33   
 34           
 35           
 36 -def main():      
 37   
 38          parser=get_parser()              
 39           
 40          if len(sys.argv)==1: 
 41                  parser.print_help() 
 42                  sys.exit(1) 
 43           
 44          Arguments=parser.parse_args() 
 45   
 46          # files openning 
 47   
 48          iFile = open(Arguments.info, "rb") 
 49          dicoPos = {} 
 50          for ligne in iFile: 
 51                  ligne=re.split(r'\t+', ligne) 
 52                  dicoPos[ligne[0]]=[int(ligne[1]),int(ligne[2])] 
 53                   
 54                   
 55          mFile = open(Arguments.metageneFile, "rb") 
 56          lignes  = mFile.readlines() 
 57          mFile.close() 
 58          ligneContig = [] 
 59          dicoContig = {} 
 60          contigName = "" 
 61          idLigne = 0 
 62   
 63          for ligne in lignes: # stock ORF and their positions in memory 
 64                  ligne=ligne[0:len(ligne)-1] 
 65                  ligne=re.split(r'\t+', ligne) 
 66                  if len(ligne)!=0 : #if line not empty 
 67                   
 68                          if "#" in ligne[0] : 
 69                                  #print ligne 
 70                                  ligneContig.append(ligne[0]) 
 71                                  idLigne += 1 
 72   
 73                                  if idLigne == 1 : 
 74                                          if len(contigName) > 0 : 
 75                                                   
 76                                                  dicoContig[contigName] = contig 
 77                                                  contigName = "" 
 78                                                                                                                                                           
 79                                          contigName = ligne[0].split(" ")[1] 
 80                                   
 81                                  elif idLigne == 3 : 
 82                                          contig = ContigORF(ligneContig[1], ligneContig[2]) 
 83                                          idLigne = 0 
 84                                          ligneContig = [] 
 85   
 86                          elif("gene_" in ligne[0]) : 
 87                                  print ligne 
 88                                  orf = ORF(ligne) 
 89                                  contig.ORFlist.append(orf) 
 90                   
 91          dicoContig[contigName] = contig  # add last contig       
 92   
 93          targetComplete = 0 
 94          targetIncomplete = 0 
 95                                   
 96          fasta = open(Arguments.fasta, "rb") 
 97   
 98          lignes  = fasta.readlines() 
 99          sequence = "" 
100   
101          IDlist = [] 
102          IDlistIncomplete = []  
103   
104          for ligne in lignes : 
105                  if(ligne[0]=='>'): 
106                           
107                          if(len(sequence)>0 and ID in dicoContig): 
108                                  list = dicoContig[ID].ORFlist                    
109                                   
110                                  pos = dicoPos[ID] 
111                                  realID = ID[0:len(ID)-2] 
112                                  for lORF in list:        
113                                          #if(ORF.statut == "complete" and ((pos[0]>=ORF.posSTART and pos[0]<ORF.posEND) or (pos[1]>ORF.posSTART and pos[1]<=ORF.posEND)) and realID not in IDlist): 
114                                          if(lORF.statut == "complete" and pos[0]>=lORF.posSTART and pos[1]<=lORF.posEND and realID not in IDlist): 
115                                                  targetComplete += 1 
116                                                  IDlist.append(realID) 
117                                                  break 
118                                           
119                                          elif(lORF.statut == "partial" and pos[0]>=lORF.posSTART and pos[1]<=lORF.posEND and realID not in IDlist and realID not in IDlistIncomplete): 
120                                                  targetIncomplete += 1 
121                                                  IDlistIncomplete.append(realID) 
122                                                  break 
123                                   
124                                  #print ID 
125                                  writeORF(list, Arguments.prefix, ID, sequence,Arguments.limLength) 
126                          ID = ligne[1:len(ligne)-1] 
127                          sequence = "" 
128                                   
129                  else: 
130                          sequence = sequence + ligne[0:len(ligne)-1] 
131                           
132          if(ID in dicoContig): 
133                  list = dicoContig[ID].ORFlist 
134                  writeORF(list, Arguments.prefix, ID, sequence,Arguments.limLength) 
135                   
136                   
137          info=open(Arguments.prefix + "_info.txt", "a") 
138          info.write("target completed = " + str(targetComplete) + "\n") 
139          info.write("target incomplete = " + str(targetIncomplete) + "\n") 
140          info.close() 
141   
# Run the extraction only when this file is executed as a script; importing
# the module must not trigger any processing.
if __name__ == "__main__":
    main()
144