Module removeIdenticalSeq
[hide private]
[frames | no frames]

Source Code for Module removeIdenticalSeq

 1  #!/env/cns/home/afelten/agc/afelten/script/Python-2.7.5/python 
 2  """#!/usr/bin/python""" 
 3  # -*- coding: iso-8859-1 -*- 
 4  import hashlib 
 5  import argparse 
 6  import os, sys 
 7   
 8   
 9  __doc__=""" 
10  Remove identical sequences from a FASTA file 
11   
12  @requires: U{hashlib<https://docs.python.org/2/library/hashlib.html>} python library 
13   
14  Input : SEQUENCE.fasta (1 line sequence) 
15  Output : PREFIX.fasta 
16  """ 
17   
def get_parser():
    """
    Build the command-line parser used by this script.

    Two optional string options are declared: C{-i}, stored as C{fasta}
    (the input FASTA file), and C{-o}, stored as C{output} (the output
    FASTA file).

    @return: the configured parser
    @rtype: argparse.ArgumentParser
    """
    cli = argparse.ArgumentParser(
        description='Remove identical sequences from a FASTA file')

    # (flag, destination attribute, help text) for each option.
    options = (
        ('-i', 'fasta', 'input FASTA file'),
        ('-o', 'output', 'output FASTA file'),
    )
    for flag, target, description in options:
        cli.add_argument(flag, action="store", dest=target,
                         type=str, help=description)

    return cli
29
def main():
    """
    Command-line entry point: copy a FASTA file, keeping only the first
    record found for each distinct sequence.

    Each record is expected to be one header line (starting with '>')
    followed by a single sequence line. Two sequences are considered
    identical when their lines have the same MD5 digest once trailing
    whitespace (including the line ending) is ignored. Blank lines are
    skipped. Kept records are written byte-for-byte as they were read.

    The number of headers read minus the number of records written is
    printed as "N sequences removed".

    @raise SystemExit: when run without arguments (help is printed, status
        1), when -i or -o is missing (reported through C{parser.error}),
        or when sequence data appears before the first header.
    """
    parser = get_parser()

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()
    if args.fasta is None or args.output is None:
        # Both options are declared optional; without this check a missing
        # path would only surface as a TypeError from open(None).
        parser.error("both -i (input FASTA) and -o (output FASTA) are required")

    # The input is read completely before the output is opened, so the same
    # path can be given for -i and -o.
    with open(args.fasta, "rb") as fasta:
        lignes = fasta.readlines()

    seen = set()    # digests of sequences already kept; O(1) membership test
    kept = []       # header and sequence lines to write, in input order
    header = None   # most recent header line; None until the first '>' line
    seqNb = 0
    outNb = 0

    for ligne in lignes:
        if ligne.startswith(b">"):  # header
            header = ligne
            seqNb += 1
            continue

        # Hash without the line ending so that an identical sequence on a
        # final line lacking a newline (or using CRLF) is still a duplicate.
        sequence = ligne.rstrip()
        if not sequence:
            continue  # blank line, not a sequence
        if header is None:
            sys.exit("error: sequence data found before the first '>' header in "
                     + args.fasta)

        digest = hashlib.md5(sequence).hexdigest()
        if digest not in seen:
            seen.add(digest)
            kept.append(header)
            kept.append(ligne)
            outNb += 1

    with open(args.output, "wb") as output:
        output.writelines(kept)

    print(str(seqNb - outNb) + " sequences removed")


if __name__ == "__main__":
    main()