Module removeIdenticalSeq
|
|
1
2 """#!/usr/bin/python"""
3
4 import hashlib
5 import argparse
6 import os, sys
7
8
__doc__="""
Remove identical sequences from a FASTA file.

@requires: U{hashlib<https://docs.python.org/2/library/hashlib.html>} python library

Input  : SEQUENCE.fasta (each sequence must be written on a single line)
Output : PREFIX.fasta (only the first record of each distinct sequence is kept)
"""
17
19
def get_parser():
    """Build the command-line parser of the script.

    @return: a parser that accepts C{-i} (path of the input FASTA file,
        stored as C{fasta}) and C{-o} (path of the output FASTA file,
        stored as C{output}). Both options are optional for the parser
        itself and default to C{None} when omitted.
    @rtype: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description='Remove identical sequences from a FASTA file')

    parser.add_argument('-i', action="store", dest='fasta',
                        type=str, help='input FASTA file')

    parser.add_argument('-o', action="store", dest='output',
                        type=str, help='output FASTA file')

    return parser
29
31
def _filter_unique(lines):
    """Keep the first record of every distinct sequence line.

    @param lines: raw lines (bytes) of a FASTA file in which each sequence
        occupies exactly one line.
    @return: C{(records, header_count)} where C{records} is the list of
        C{header + sequence} byte chunks to write, in input order, and
        C{header_count} is the number of C{>} header lines encountered.
    @raise ValueError: if a sequence line appears before any header.
    """
    seen = set()  # set membership is O(1); a list made the scan quadratic
    records = []
    header_count = 0
    header = None

    for line in lines:
        # startswith() works on bytes under both Python 2 and Python 3,
        # whereas indexing bytes yields an int on Python 3.
        if line.startswith(b'>'):
            header = line
            header_count += 1
            continue

        # Blank lines carry no sequence; hashing them would emit a bogus
        # record and skew the "removed" count.
        if not line.strip():
            continue

        if header is None:
            raise ValueError("sequence data found before the first '>' header")

        # Hash without the line terminator so that a final line lacking a
        # newline still matches an identical sequence seen earlier.
        digest = hashlib.md5(line.rstrip(b'\r\n')).digest()
        if digest not in seen:
            seen.add(digest)
            records.append(header + line)

    return records, header_count


def main():
    """Command-line entry point: write the de-duplicated FASTA file.

    Prints the help text and exits with status 1 when called without any
    argument; reports a usage error when C{-i} or C{-o} is missing.
    Otherwise reads the input file, writes the first record of each
    distinct sequence to the output file and prints how many sequences
    were removed.
    """
    parser = get_parser()

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    arguments = parser.parse_args()
    if arguments.fasta is None or arguments.output is None:
        parser.error("both -i (input FASTA) and -o (output FASTA) are required")

    # Read everything before opening the output so that using the same
    # path for -i and -o does not truncate the data being read.
    with open(arguments.fasta, "rb") as fasta:
        lines = fasta.readlines()

    records, header_count = _filter_unique(lines)

    with open(arguments.output, "wb") as output:
        output.writelines(records)

    # Single-argument print() behaves identically on Python 2 and 3.
    print(str(header_count - len(records)) + " sequences removed")
63
# Run the de-duplication only when executed as a script, not on import.
if __name__ == "__main__":
    main()
66