Module extractORFsequences
|
|
1
2
3 from DIGEST_functions import *
4
5 __doc__="""
6 Extract ORF sequences in FASTA format from a MetaGene output file and a target extended FASTA file
7
8 @requires: DIGEST_functions.py (PYTHONPATH)
9
10 Output : PREFIX_complete.fasta and PREFIX_partial.fasta
11 """
12
14
def get_parser():
    """Build the command-line parser of the ORF extraction script.

    Options:
        -m  MetaGene output file (stored as ``metageneFile``)
        -f  FASTA file (stored as ``fasta``)
        -i  info.txt file (stored as ``info``)
        -n  minimum length for a partial ORF (stored as ``limLength``,
            integer, default 100)
        -o  output prefix (stored as ``prefix``)

    ``-m``, ``-f``, ``-i`` and ``-o`` are declared required: the script
    uses all four unconditionally, so leaving one out is reported by
    argparse as a usage error instead of failing later on a ``None`` value.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    parser = argparse.ArgumentParser(description='Extract ORFs sequences in FASTA format from MetaGene output file and target extended FASTA file')

    parser.add_argument('-m', action="store", dest='metageneFile',
                        type=str, required=True, help='metagene file')

    parser.add_argument('-f', action="store", dest='fasta',
                        type=str, required=True, help='FASTA file')

    parser.add_argument('-i', action="store", dest='info',
                        type=str, required=True, help='info.txt file')

    parser.add_argument('-n', action="store", dest='limLength',
                        type=int, default=100, help='min length for partial ORF (default=100)')

    parser.add_argument('-o', action="store", dest='prefix',
                        type=str, required=True, help='output prefix')

    return parser
33
34
35
37
def _strip_eol(line):
    """Return ``line`` without its trailing end-of-line characters.

    Slicing off the last character unconditionally would eat a real
    character when the last line of a file has no terminator, and would
    leave a carriage return behind on CRLF files; ``rstrip`` avoids both.
    """
    return line.rstrip("\r\n")


def _read_positions(path):
    """Read the info file into ``{name: [int(column 2), int(column 3)]}``.

    Columns are separated by one or more tabs. Blank lines are skipped;
    any other line with fewer than three columns still raises IndexError.
    NOTE(review): the two integers are later compared with ORF
    ``posSTART``/``posEND``, so they presumably are the target's start and
    end coordinates - confirm against the producer of the info file.
    """
    positions = {}
    with open(path, "rb") as handle:
        for raw in handle:
            line = _strip_eol(raw)
            if not line:
                continue
            fields = re.split(r'\t+', line)
            positions[fields[0]] = [int(fields[1]), int(fields[2])]
    return positions


def _read_contigs(path):
    """Parse the MetaGene output file into ``{contig name: ContigORF}``.

    Lines whose first tab-separated field contains "#" are header lines and
    come in groups of three: the contig name is the second space-separated
    token of the first one, and the second and third are handed to
    ``ContigORF``. Lines whose first field contains "gene_" are turned into
    ``ORF`` objects and appended to the current contig's ``ORFlist``.

    Returns an empty dict when the file holds no complete header.
    """
    contigs = {}
    header = []      # "#" lines of the header group currently being read
    name = ""
    contig = None
    with open(path, "rb") as handle:
        for raw in handle:
            fields = re.split(r'\t+', _strip_eol(raw))
            first = fields[0]
            if "#" in first:
                header.append(first)
                if len(header) == 1:
                    # A new header starts: file the contig just finished.
                    if name:
                        contigs[name] = contig
                    name = first.split(" ")[1]
                elif len(header) == 3:
                    contig = ContigORF(header[1], header[2])
                    header = []
            elif "gene_" in first:
                # Progress trace kept from the original script; the
                # single-argument parenthesised form prints the same text
                # as the bare print statement.
                print(fields)
                contig.ORFlist.append(ORF(fields))
    # File the last contig, if the file contained one at all.
    if contig is not None:
        contigs[name] = contig
    return contigs


def _iter_fasta(path):
    """Yield ``(header, sequence)`` for every record of a FASTA file.

    ``header`` is the whole header line without the leading ">" and
    ``sequence`` is the concatenation of the record's lines. Sequence
    lines found before the first header are ignored.
    """
    header = None
    chunks = []
    with open(path, "rb") as handle:
        for raw in handle:
            line = _strip_eol(raw)
            if line.startswith('>'):
                if header is not None:
                    yield header, "".join(chunks)
                header = line[1:]
                chunks = []
            else:
                chunks.append(line)
    if header is not None:
        yield header, "".join(chunks)


def _record_target(orfs, pos, target_id, complete_ids, partial_ids):
    """Register ``target_id`` for the first ORF of ``orfs`` that covers ``pos``.

    ORFs are examined in order. An ORF matches when it spans
    ``pos[0]..pos[1]`` and either
      * its ``statut`` is "complete" and the target is not yet in
        ``complete_ids`` (the target is added there), or
      * its ``statut`` is "partial" and the target is in neither set
        (the target is added to ``partial_ids``).
    The scan stops at the first match; both sets are updated in place.
    """
    for orf in orfs:
        if orf.statut not in ("complete", "partial"):
            continue
        if not (pos[0] >= orf.posSTART and pos[1] <= orf.posEND):
            continue
        if orf.statut == "complete":
            if target_id not in complete_ids:
                complete_ids.add(target_id)
                return
        elif target_id not in complete_ids and target_id not in partial_ids:
            partial_ids.add(target_id)
            return


def main():
    """Extract the ORF sequences and append the target counts to PREFIX_info.txt.

    Steps:
      1. parse the command line (help is printed and the script exits with
         status 1 when no argument is given);
      2. load the info file and the MetaGene output;
      3. for every FASTA record that has a non-empty sequence and a contig
         entry, update the target counters and call ``writeORF``;
      4. append "target completed" / "target incomplete" counts to
         ``<prefix>_info.txt``.

    Every FASTA record, including the last one in the file, goes through
    the same counting and writing path.

    Raises:
        KeyError: a FASTA record has a contig entry but no line in the
            info file.
    """
    parser = get_parser()

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    Arguments = parser.parse_args()

    dicoPos = _read_positions(Arguments.info)
    dicoContig = _read_contigs(Arguments.metageneFile)

    # Sets give O(1) membership tests; each counter equals the number of
    # distinct targets registered in the corresponding set.
    complete_ids = set()
    partial_ids = set()

    for ID, sequence in _iter_fasta(Arguments.fasta):
        if not sequence or ID not in dicoContig:
            continue
        orfs = dicoContig[ID].ORFlist
        # Targets are keyed by the record ID minus its last two characters.
        # NOTE(review): presumably a per-sequence suffix - confirm.
        _record_target(orfs, dicoPos[ID], ID[:-2], complete_ids, partial_ids)
        writeORF(orfs, Arguments.prefix, ID, sequence, Arguments.limLength)

    with open(Arguments.prefix + "_info.txt", "a") as info:
        info.write("target completed = " + str(len(complete_ids)) + "\n")
        info.write("target incomplete = " + str(len(partial_ids)) + "\n")


if __name__ == "__main__":
    main()
144