Module extractORFsequences
[hide private]
[frames | no frames]

Source Code for Module extractORFsequences

  1  #!/usr/bin/python 
  2  # -*- coding: iso-8859-1 -*- 
  3  from DIGEST_functions import * 
  4   
  5  __doc__=""" 
  6  Extract ORFs sequences in FASTA format from MetaGene output file and target extended FASTA file  
  7   
  8  @requires: DIGEST_functions.py (PYTHONPATH) 
  9   
 10  Output : PREFIX_complete.fasta and PREFIX_partial.fasta 
 11  """ 
 12           
def get_parser():
    """Build the command-line parser used by this script.

    Options (all stored with ``action="store"``):
      -m  MetaGene file          -> ``metageneFile`` (str)
      -f  FASTA file             -> ``fasta`` (str)
      -i  info.txt file          -> ``info`` (str)
      -n  min partial ORF length -> ``limLength`` (int, default 100)
      -o  output prefix          -> ``prefix`` (str)

    Returns the configured ``argparse.ArgumentParser``.
    """
    # One entry per option, in the order they are registered on the parser.
    option_specs = (
        ('-m', dict(dest='metageneFile', type=str, help='metagene file')),
        ('-f', dict(dest='fasta', type=str, help='FASTA file')),
        ('-i', dict(dest='info', type=str, help='info.txt file')),
        ('-n', dict(dest='limLength', type=int, default=100,
                    help='min length for partial ORF (default=100)')),
        ('-o', dict(dest='prefix', type=str, help='output prefix')),
    )

    cli = argparse.ArgumentParser(
        description='Extract ORFs sequences in FASTA format from MetaGene output file and target extended FASTA file'
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, action="store", **settings)
    return cli
33 34 35
def _read_target_positions(path):
    """Load the tab-separated info file.

    Each non-blank line supplies an identifier in its first column and two
    integers in the following two columns.

    Returns a dict mapping identifier -> [first integer, second integer].
    """
    positions = {}
    # Text mode: the str regex below then works on Python 3 as well as 2.
    with open(path, "r") as handle:
        for raw in handle:
            if not raw.strip():
                # A trailing blank line used to raise IndexError.
                continue
            fields = re.split(r'\t+', raw)
            positions[fields[0]] = [int(fields[1]), int(fields[2])]
    return positions


def _read_metagene(path):
    """Parse the MetaGene output file into a dict of contig name -> ContigORF.

    Lines whose first tab-separated field contains "#" are header lines and
    come in groups of three: the contig name is the second space-separated
    token of the first one, and the other two are handed to ``ContigORF``.
    Lines whose first field contains "gene_" are turned into ``ORF`` objects
    and appended to the ``ORFlist`` of the most recently built contig.
    """
    contigs = {}
    header_lines = []
    contig_name = ""
    contig = None  # stays None until a full three-line header has been read

    with open(path, "r") as handle:
        for raw in handle:
            # Strip only the line terminator; blind slicing would eat a real
            # character when the last line has no trailing newline.
            fields = re.split(r'\t+', raw.rstrip("\r\n"))
            first = fields[0]

            if "#" in first:
                header_lines.append(first)
                if len(header_lines) == 1:
                    # A new contig starts: store the one just finished.
                    if contig_name and contig is not None:
                        contigs[contig_name] = contig
                    contig_name = first.split(" ")[1]
                elif len(header_lines) == 3:
                    contig = ContigORF(header_lines[1], header_lines[2])
                    header_lines = []

            elif "gene_" in first:
                if contig is None:
                    raise ValueError(
                        "gene line found before a complete contig header in "
                        + str(path)
                    )
                # Trace output kept from the original script (same text as the
                # former Python 2 ``print`` statement).
                print(fields)
                contig.ORFlist.append(ORF(fields))

    if contig is not None:
        contigs[contig_name] = contig  # add last contig
    return contigs


def _classify_target(orfs, span, real_id, seen_complete, seen_partial):
    """Return "complete", "partial" or None for the target described by *span*.

    The ORFs are scanned in order; the first one that encloses the span
    (``span[0] >= posSTART`` and ``span[1] <= posEND``) and whose ``statut``
    has not been tallied yet for *real_id* decides the result. A "partial"
    hit is only reported when *real_id* is in neither set.
    """
    for orf in orfs:
        if not (span[0] >= orf.posSTART and span[1] <= orf.posEND):
            continue
        if orf.statut == "complete" and real_id not in seen_complete:
            return "complete"
        if (orf.statut == "partial"
                and real_id not in seen_complete
                and real_id not in seen_partial):
            return "partial"
    return None


def main():
    """Script entry point.

    Steps:
      1. Parse the command line (prints help and exits with status 1 when no
         argument is given).
      2. Load the target positions (``-i``) and the MetaGene ORFs (``-m``).
      3. Walk the FASTA file (``-f``); for every record whose ID is a known
         contig, tally whether its target span lies inside a complete or a
         partial ORF, then call ``writeORF`` with the ORF list, the output
         prefix, the ID, the sequence and the minimum length (``-n``).
      4. Append both tallies to ``PREFIX_info.txt``.

    Tallies are de-duplicated on the record ID without its last two
    characters (presumably a per-variant suffix -- TODO confirm).

    Differences from the previous implementation:
      * every file is closed deterministically;
      * the last FASTA record is now included in the tallies (it used to be
        written but never counted);
      * an empty MetaGene or FASTA file no longer fails on an unbound name;
      * a record whose ID is missing from the info file is still written but
        is not tallied, instead of raising ``KeyError``.
    """
    parser = get_parser()

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()

    # files opening
    positions = _read_target_positions(args.info)
    contigs = _read_metagene(args.metageneFile)

    counts = {"complete": 0, "partial": 0}
    # Sets give O(1) membership tests; only membership is ever needed.
    seen = {"complete": set(), "partial": set()}

    def handle_record(seq_id, sequence):
        """Tally the target of one FASTA record and write its ORFs."""
        orfs = contigs[seq_id].ORFlist
        span = positions.get(seq_id)
        if span is not None:
            real_id = seq_id[0:len(seq_id) - 2]
            kind = _classify_target(
                orfs, span, real_id, seen["complete"], seen["partial"]
            )
            if kind is not None:
                counts[kind] += 1
                seen[kind].add(real_id)
        writeORF(orfs, args.prefix, seq_id, sequence, args.limLength)

    seq_id = None  # no header seen yet
    chunks = []

    with open(args.fasta, "r") as fasta:
        for raw in fasta:
            if raw.startswith('>'):
                sequence = "".join(chunks)
                if sequence and seq_id in contigs:
                    handle_record(seq_id, sequence)
                seq_id = raw[1:].rstrip("\r\n")
                chunks = []
            else:
                chunks.append(raw.rstrip("\r\n"))

    # Flush the final record (no following header triggers it in the loop).
    if seq_id in contigs:
        handle_record(seq_id, "".join(chunks))

    with open(args.prefix + "_info.txt", "a") as info:
        info.write("target completed = " + str(counts["complete"]) + "\n")
        info.write("target incomplete = " + str(counts["partial"]) + "\n")


if __name__ == "__main__":
    main()