Module extendTargets_allPossibility
[hide private]
[frames] | no frames]

Source Code for Module extendTargets_allPossibility

  1  #!/usr/bin/python 
  2  # -*- coding: iso-8859-1 -*- 
  3  from DIGEST_functions import * 
  4   
  5  __doc__=""" 
  6  Extension of each sequence target with the mapped contig, 
  7  if a target is mapped on several contigs, each possibility are kept. 
  8   
  9  Unmapped contig are lost. 
 10   
 11  @requires: DIGEST_functions.py (PYTHONPATH) 
 12   
 13  Input : SAM file without header only with mapped line and sort by query 
 14  Output : target extended FASTA file 
 15  """ 
 16   
 17   
18 -def get_parser():
19 20 parser = argparse.ArgumentParser(description='Extension of each sequence target with the mapped contig, if a target is mapped on several contigs, each possibility are kept.\ 21 Unmapped contig are lost.') 22 23 parser.add_argument('-sam', action="store", dest='samFile', 24 type=str, help='SAM file') 25 26 parser.add_argument('-f', action="store", dest='contigFile', 27 type=str, help='contig FASTA file') 28 29 parser.add_argument('-qual', action="store", dest='minMAPQ', 30 type=int, default='30', help='MAPQ min to keep alignment (default:30)') 31 32 parser.add_argument('-o', action="store", dest='prefix', 33 type=str, default='targetExtended.fasta', help='output prefix') 34 35 return parser
36 37
38 -def main():
39 40 parser=get_parser() 41 42 if len(sys.argv)==1: 43 parser.print_help() 44 sys.exit(1) 45 46 Arguments=parser.parse_args() 47 48 #AlignmentSamFile = open(Arguments.samFile, "rb") 49 samReader=open(Arguments.samFile,'rb') 50 #samReader = csv.reader(AlignmentSamFile,MyDialect()) 51 52 dicoContig = fastaReader(Arguments.contigFile) 53 54 targetCompleted = open(Arguments.prefix, "wb") 55 info = open("info.txt", "wb") 56 57 extend5 = "" 58 extend3 = "" 59 nVersionTarget = 0 60 61 lenExtension = [] # length extension of each sequence 62 63 targetObj = sequence("", "") # new empty sequence object 64 65 for ligne in samReader: 66 67 ligne=re.split(r'\t+', ligne) 68 alignment = alignmentSAM(ligne) 69 70 if(alignment.MAPQ >= Arguments.minMAPQ): # target aligned with good MAPQ 71 72 if(targetObj.ID == ""): #1st loop 73 targetID = alignment.Query 74 75 contig = dicoContig[alignment.Subject] # retrieve the mapped contig 76 77 position = subjectStartStop(alignment, contig.length) 78 79 cigar = alignment.CIGAR 80 targetSequence = alignment.QuerySequence 81 82 83 targetObj = sequence(alignment.Query, alignment.QuerySequence) 84 85 if(alignment.Pos > 1): # possible extension on 5' 86 if(cigar.Code[1]=='S'): # unmapped region on 5' 87 subseq = targetSequence[0:cigar.Code[0]-1] 88 if('N' in subseq): # substitute the unmapped region with at least one 'N' by the sequence of the contig 89 targetSequence = contig.seq[0:alignment.Pos-1] + targetSequence[cigar.Code[0]-1:len(targetSequence)] 90 else: 91 extend5 = contig.seq[0:alignment.Pos-1] 92 93 if(position[1] < contig.length): # possible extension on 3' 94 if(cigar.Code[len(cigar.Code)-1]=='S'): # unmapped region on 3' 95 n = cigar.Code[len(cigar.Code)-2] 96 subseq = targetSequence[len(targetSequence)-n:len(targetSequence)] 97 if('N' in subseq): # substitute the unmapped region with at least one 'N' by the sequence of the contig 98 targetSequence = targetSequence[0:len(targetSequence)-n] + contig.seq[position[1]:len(contig.seq)] 99 else: 100 extend3 = contig.seq[position[1]:len(contig.seq)] 101 102 seq = extend5 + targetSequence + extend3 103 lenExtension.append(len(extend5) + len(extend3)) 104 105 targetObj.seq = seq 106 targetObj.length = len(seq) 107 108 if(targetID == targetObj.ID): # if target already writed 109 nVersionTarget += 1 110 else: 111 nVersionTarget = 1 112 113 targetID = alignment.Query 114 targetCompleted.write(">" + targetObj.ID + "v" + str(nVersionTarget) + "\n" + targetObj.seq + "\n") 115 start = len(extend5)+1 116 end = start + len(targetSequence) -1 117 118 info.write(targetObj.ID + "v" + str(nVersionTarget) + "\t" + str(start) + "\t" + str(end) + "\n") 119 120 targetCompleted.close() 121 print "mean length of extension = " + str(sum(lenExtension)/len(lenExtension)) + "nt" 122 123 if __name__ == "__main__": 124 main() 125