Module extendTargets_allPossibility
|
|
1
2
3 from DIGEST_functions import *
4
5 __doc__="""
6 Extension of each sequence target with the mapped contig,
7 if a target is mapped on several contigs, each possibility are kept.
8
9 Unmapped contig are lost.
10
11 @requires: DIGEST_functions.py (PYTHONPATH)
12
13 Input : SAM file without header only with mapped line and sort by query
14 Output : target extended FASTA file
15 """
16
17
19
20 parser = argparse.ArgumentParser(description='Extension of each sequence target with the mapped contig, if a target is mapped on several contigs, each possibility are kept.\
21 Unmapped contig are lost.')
22
23 parser.add_argument('-sam', action="store", dest='samFile',
24 type=str, help='SAM file')
25
26 parser.add_argument('-f', action="store", dest='contigFile',
27 type=str, help='contig FASTA file')
28
29 parser.add_argument('-qual', action="store", dest='minMAPQ',
30 type=int, default='30', help='MAPQ min to keep alignment (default:30)')
31
32 parser.add_argument('-o', action="store", dest='prefix',
33 type=str, default='targetExtended.fasta', help='output prefix')
34
35 return parser
36
37
39
40 parser=get_parser()
41
42 if len(sys.argv)==1:
43 parser.print_help()
44 sys.exit(1)
45
46 Arguments=parser.parse_args()
47
48
49 samReader=open(Arguments.samFile,'rb')
50
51
52 dicoContig = fastaReader(Arguments.contigFile)
53
54 targetCompleted = open(Arguments.prefix, "wb")
55 info = open("info.txt", "wb")
56
57 extend5 = ""
58 extend3 = ""
59 nVersionTarget = 0
60
61 lenExtension = []
62
63 targetObj = sequence("", "")
64
65 for ligne in samReader:
66
67 ligne=re.split(r'\t+', ligne)
68 alignment = alignmentSAM(ligne)
69
70 if(alignment.MAPQ >= Arguments.minMAPQ):
71
72 if(targetObj.ID == ""):
73 targetID = alignment.Query
74
75 contig = dicoContig[alignment.Subject]
76
77 position = subjectStartStop(alignment, contig.length)
78
79 cigar = alignment.CIGAR
80 targetSequence = alignment.QuerySequence
81
82
83 targetObj = sequence(alignment.Query, alignment.QuerySequence)
84
85 if(alignment.Pos > 1):
86 if(cigar.Code[1]=='S'):
87 subseq = targetSequence[0:cigar.Code[0]-1]
88 if('N' in subseq):
89 targetSequence = contig.seq[0:alignment.Pos-1] + targetSequence[cigar.Code[0]-1:len(targetSequence)]
90 else:
91 extend5 = contig.seq[0:alignment.Pos-1]
92
93 if(position[1] < contig.length):
94 if(cigar.Code[len(cigar.Code)-1]=='S'):
95 n = cigar.Code[len(cigar.Code)-2]
96 subseq = targetSequence[len(targetSequence)-n:len(targetSequence)]
97 if('N' in subseq):
98 targetSequence = targetSequence[0:len(targetSequence)-n] + contig.seq[position[1]:len(contig.seq)]
99 else:
100 extend3 = contig.seq[position[1]:len(contig.seq)]
101
102 seq = extend5 + targetSequence + extend3
103 lenExtension.append(len(extend5) + len(extend3))
104
105 targetObj.seq = seq
106 targetObj.length = len(seq)
107
108 if(targetID == targetObj.ID):
109 nVersionTarget += 1
110 else:
111 nVersionTarget = 1
112
113 targetID = alignment.Query
114 targetCompleted.write(">" + targetObj.ID + "v" + str(nVersionTarget) + "\n" + targetObj.seq + "\n")
115 start = len(extend5)+1
116 end = start + len(targetSequence) -1
117
118 info.write(targetObj.ID + "v" + str(nVersionTarget) + "\t" + str(start) + "\t" + str(end) + "\n")
119
120 targetCompleted.close()
121 print "mean length of extension = " + str(sum(lenExtension)/len(lenExtension)) + "nt"
122
123 if __name__ == "__main__":
124 main()
125