#!/usr/bin/env python
# -*- coding: utf-8 -*-
#################################################################
#                                                               #
#   Convert a transcriptome alignment file (SAM/BAM) into a    #
#   genome alignment file (SAM/BAM)                             #
#                                                               #
#################################################################
# Audric Cologne (AC) 2017-03-20 :
## Goal : modify 3 fields of a SAM file :
### RNAME (field 2) : ENSXXX -> chrName
### START (field 3) : [0/1?-len(transcript)] (startOnTranscript) -> startOnChr
### CIGAR (field 5) : 10M5D20M -> 5M325N5M5D500N20M (introns added)
## Questions :
### Does the alignment start at position 0 or 1 ? -> 1
### Can a "D"/"I" and an "N" follow each other in a CIGAR ? -> Yes, but the whole stretch will be displayed as a deletion. A "0M" must be inserted between a "D"/"I" field and an "N" field of a CIGAR for the display to be correct
### Difference between soft and hard clipping ? -> hard clipping when the clipped sequence aligns elsewhere in the genome. In that case the clipped bases do not appear in the SAM sequence (with 10H10M10H the sequence is 10 bases long ; with 10S10M10S the sequence is 30 bases long)
### The SAM will need a new header. Which value should be reported for a chromosome length ? -> For now, the chromosome is considered to end at the last base of its last gene
## Method :
### The method used is somewhat brute force. We do not know in advance on which chromosome a read of the input SAM/BAM aligns. Therefore :
### 1. Save the useful GTF information into dictionaries. Too large a data structure ?
### 2. Walk through the SAM and translate the transcriptome information into genome information for each alignment
# Remarks :
### RNAMEs other than ENSXXX (GENSCAN for instance) are not taken into account
# AC 2017-03-22
## -> Modify the sequence if the strand is "-" -> REVCOMP
## -> If the strand is "-", the start of the alignment is actually its end (because the alignment is on the transcript)
### -> HENCE, debGeno = startGene + tailleTranscrit - (startAli + lenAli - 1)
### -> The CIGAR string also has to be inverted

import os
import sys
import gzip
import time
import argparse
import subprocess
from string import maketrans

REVCOMP=lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n'}[B] for B in x][::-1])

BAM2SAM_COMMAND="samtools view "
SAM2BAM_COMMAND="samtools view -bS "
SORTBAM_COMMAND="samtools sort "
INDEXBAM_COMMAND="samtools index "
FAIDX_COMMAND="samtools faidx "

SAM_MULTIMAP="0"
SAM_NAME=0
SAM_FLAG=1
SAM_RNAME=2
SAM_START=3
SAM_SCORE=4
SAM_CIGAR=5
SAM_SEQ=9
SAM_QUALITY=10
SAM_OTHERSTART=11

GTF_CHROM=0
GTF_SOURCE=1
GTF_FEATURE=2
GTF_START=3
GTF_END=4
GTF_SCORE=5
GTF_STRAND=6
GTF_FRAME=7
GTF_ATT=8

GTF_ATTRIBUTE_GENENAME="gene_name"
GTF_ATTRIBUTE_GENEID="gene_id"
GTF_ATTRIBUTE_GENEBIO="gene_biotype"
GTF_ATTRIBUTE_TRANSCRIPTNAME="transcript_name"
GTF_ATTRIBUTE_TRANSCRIPTID="transcript_id"
GTF_ATTRIBUTE_EXONNUMBER="exon_number"
GTF_ATTRIBUTE_EXONID="exon_id"

GTF_FEATURE_UTR="UTR"
GTF_FEATURE_EXON="exon"
GTF_FEATURE_TRANSCRIPT="transcript"
GTF_FEATURE_GENE="gene"

DGTF_CHRLEN=0
DGTF_TRLIST=1

DGTFBIN_STRAND=0
DGTFBIN_TRSTART=1
DGTFBIN_GAPS=2
DGTFBIN_TRLEN=3

SAM_VERSION=1.4
T2G_VERSION=0.3

def main():
    parser = argparse.ArgumentParser(description='Convert a transcriptome alignment file into a genome alignment file ! By default, filter out supplementary and multi-mapped reads.')
    parser.add_argument("--AlignmentFiles", action="append", dest="AlignmentFiles", type=str, help="One or more transcriptome alignment files, SAM or BAM (extension must be .sam or .bam). If multiple, do --AlignmentFiles file1.sam --AlignmentFiles file2.bam ... OR --AlignmentFiles file1.sam,file2.bam ... ", required=True)
    parser.add_argument("--GTF", action="store", dest="gtf", type=str, help="Reference GTF file. ", required=True)
    parser.add_argument("--OutputFolder", action="store", dest="OutputFolder", type=str, help="Folder where the output will be saved. If you use this option, a generic output name is created (ALIGNMENTFILENAME_genome_convert.XAM). ")
    parser.add_argument("--OutputFiles", action="append", dest="OutputFiles", type=str, help="Names of output folder and files (./out/folder/name/output_file). A .sam or .bam extension will be added if not specified. This option must have the same number of arguments as --AlignmentFiles. If multiple, do --OutputFiles ./out/folder/name/output_file1 --OutputFiles ./out/folder/name/output_file2 ... OR --OutputFiles ./out/folder/name/output_file1,./out/folder/name/output_file2 ... ")
    parser.add_argument("--OutputType", action="store", dest="OutputType", type=str, choices=["SAM", "BAM", "sortedBAM", "sortedBAMindex"], default="sortedBAMindex", help="Output files type (either SAM, BAM, sortedBAM or sortedBAMindex ; default is sortedBAMindex). ")
    parser.add_argument("--RefFasta", action="store", dest="RefFasta", type=str, help="Reference genomic fasta file (needed if you want the program to output the correct chromosome length in the SAM header). ")
    parser.add_argument("--RefFai", action="store", dest="RefFai", type=str, help="Reference genomic fasta.fai file (needed if you want the program to output the correct chromosome length in the SAM header). ")
    parser.add_argument("--tmp", action="store", dest="tmp", type=str, default="t2g_tmp/", help="Name of the folder containing the temporary files. ")
    parser.add_argument("--keepSupplementary", action="store_true", dest="kS", help="Should we keep supplementary aligned reads ?")
    parser.add_argument("--keepMultiMap", action="store_true", dest="kMM", help="Should we keep multi-mapped aligned reads ?")
    parser.add_argument("--maxLenMultiMap", action="store", dest="mLMM", type=int, default=500, help="If a multi-mapped read has an alignment length greater than or equal to this value, it is kept (the read is considered multi-mapped because of transcript redundancy rather than low-complexity regions). Set to -1 in order to delete all multi-mapped reads. Default is 500.")
    parser.add_argument("--minAliSeqOverSeq", action="store", dest="miASOS", type=float, default=0, help="Minimum percentage of the aligned sequence length (seq length without soft-clipping) over the sequence length in order to keep the alignment. Default is 0%%.")
    parser.add_argument("--maxAliSeqOverSeq", action="store", dest="maASOS", type=float, default=100, help="Maximum percentage of the aligned sequence length (seq length without soft-clipping) over the sequence length in order to keep the alignment. Default is 100%%.")
    parser.add_argument("--S2M", action="store_true", dest="S2M", help="Should we transform soft-clipped bases into match bases ?")
    parser.add_argument("--outputCoverageInformations", action="store_true", dest="oCI", help="Should we output some coverage information ? (format : readID transcriptID readLength alignedReadLength transcriptLength alignmentLength leftSoftClipping rightSoftClipping)")
    options = parser.parse_args()

    # Parameters verifications
    if not os.path.isfile(options.gtf):
        print " ERROR:\tThe gtf file "+options.gtf+" does not exist."
        exit(1)
    if options.gtf[-4:]!=".gtf":
        print " ERROR:\tThe gtf file "+options.gtf+" does not have the .gtf extension."
        exit(1)
    if options.OutputFolder==None and options.OutputFiles==None:
        print " ERROR:\tYou must set an output (--OutputFolder or --OutputFiles)."
        exit(1)
    if options.OutputFolder!=None and options.OutputFiles!=None:
        print " ERROR:\tYou must set only one output (--OutputFolder or --OutputFiles)."
        exit(1)
    if options.RefFasta!=None and options.RefFai!=None:
        print " ERROR:\tYou must either set RefFasta or RefFai but not both."
        exit(1)
    if options.RefFasta!=None:
        if not os.path.isfile(options.RefFasta):
            print " ERROR:\tThe RefFasta file "+options.RefFasta+" does not exist."
            exit(1)
        if options.RefFasta.split(".")[-1] not in ["fa", "fasta"]:
            print " ERROR:\tThe RefFasta file "+options.RefFasta+" does not have a .fa/.fasta extension. If compressed, please uncompress it."
            exit(1)
    if options.RefFai!=None:
        if not os.path.isfile(options.RefFai):
            print " ERROR:\tThe RefFai file "+options.RefFai+" does not exist."
            exit(1)
        if ".".join(options.RefFai.split(".")[-2:]) not in ["fa.fai", "fasta.fai"]:
            print " ERROR:\tThe RefFai file "+options.RefFai+" does not have a .fa.fai/.fasta.fai extension."
            exit(1)
    if options.mLMM<-1:
        print " ERROR:\t--maxLenMultiMap should be given a number greater or equal to -1."
        exit(1)
    if options.miASOS<0 or options.miASOS>100:
        print " ERROR:\t--minAliSeqOverSeq should be given a number between 0 and 100."
        exit(1)
    if options.maASOS<0 or options.maASOS>100:
        print " ERROR:\t--maxAliSeqOverSeq should be given a number between 0 and 100."
        exit(1)
    if options.maASOS

    # a. BAM->SAM conversion of the input file
    if inFile[-3]=="b":
        print " -- Converting input bam file "+inFile+" into sam"
        inFile=BAM2SAM(inFile, tmpDir)
        print " -- Done !"
    tmpSAM=tmpDir+"tmp.sam"
    fTmp=open(tmpSAM, "w")
    fTmp.write(SAMheader)
    if oCI:
        fCI=open(outFile+".coverageInfo.txt", "w")
        fCI.write("@1:readID\t2:transcriptID\t3:readLength\t4:transcriptLength\t5:alignedReadLength\t6:alignmentLength\t7:leftSoftClipping\t8:rightSoftClipping\t9:multimap?\t10:supplementary?")

    # b. Reading each aligned read
    print " -- Reading alignment"
    lChrom=dGTF.keys()
    nNotENS=0
    nNotAligned=0
    nNotFound=0
    nSup=0 # supplementary reads
    nMM=0 # multi-mapped reads
    nMMfiltered=0 # multi-mapped reads filtered
    nNotGoodPercentageBelow=0 # reads with len(aliSeq)/len(seq)*100 < miASOS
    nNotGoodPercentageOver=0 # reads with len(aliSeq)/len(seq)*100 > maASOS
    nSMM=0 # sup+multimatch
    nSALI=0 # sup+NotGoodPercentage
    nSMMALI=0 # sup+multimatch+NotGoodPercentage
    nMMALI=0 # multimatch+NotGoodPercentage
    nOK=0
    f=open(inFile,"r")
    line=f.readline() # We don't care about the header. Should we ?
    while line[0]=="@":
        line=f.readline()
    iRead=0
    while line:
        iRead+=1
        if iRead%10000==0:
            print " --- "+str(iRead)+" reads treated..."
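        # Illustrative SAM fields used below (hypothetical, tab-separated read line) :
        #   read1  0  ENST00000456328.2  101  60  10S40M  *  0  0  SEQ  QUAL
        # giving lLine[SAM_RNAME]="ENST00000456328.2", lLine[SAM_START]="101",
        # lLine[SAM_SCORE]="60" (MAPQ) and lLine[SAM_CIGAR]="10S40M".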
        line=line.strip()
        lLine=line.split("\t")
        if lLine[SAM_CIGAR]=="*":
            nNotAligned+=1
        elif lLine[SAM_RNAME][:3] not in ["ENS", "SIR"]: # AC, 2018-01-12 : added SIR as a correct beginning for transcript IDs of spike-ins
            nNotENS+=1
        else:
            startAli=int(lLine[SAM_START])
            trID=lLine[SAM_RNAME].split(".")[0]
            trIDbin=int(trID[-2:])
            found=False
            iChrom=0
            while not found and iChrom < len(lChrom):
                chrom=lChrom[iChrom]
                if trID in dGTF[chrom][DGTF_TRLIST][trIDbin].keys():
                    found=True
                    trList=dGTF[chrom][DGTF_TRLIST][trIDbin][trID]
                iChrom+=1
            if not found:
                #print " WARNING:\tThis transcript ID was not found in the GTF file : "+trID
                nNotFound+=1
            else:
                if oCI:
                    fCI.write("\n"+lLine[SAM_NAME]+"\t"+lLine[SAM_RNAME]+"\t"+str(getLengthFromCIGAR(lLine[SAM_CIGAR]))+"\t"+str(trList[DGTFBIN_TRLEN])+"\t"+str(getLengthFromCIGAR(lLine[SAM_CIGAR], 2))+"\t"+str(getLengthFromCIGAR(lLine[SAM_CIGAR], 1))+"\t"+str(getLengthStartSoftClipping(lLine[SAM_CIGAR]))+"\t"+str(getLengthStartSoftClipping(lLine[SAM_CIGAR],True)))
                # AC 2017-03-24
                # Determining if the read alignment length over the read length is OK
                alignedSeqLen=float(getLengthFromCIGAR(lLine[SAM_CIGAR], 2))
                ratio=(alignedSeqLen/getLengthFromCIGAR(lLine[SAM_CIGAR]))*100
                if maASOS>=ratio>=miASOS:
                    isOKaliOverSeq=True
                else:
                    isOKaliOverSeq=False
                    if maASOS>=ratio:
                        nNotGoodPercentageBelow+=1
                    else:
                        nNotGoodPercentageOver+=1
                # AC 2017-03-24
                # Determining if the read is multi-mapped
                isMM=False
                if lLine[SAM_SCORE]==SAM_MULTIMAP:
                    if not kMM and (mLMM==-1 or alignedSeqLenthisChrStop:
                        baseToRemoveFromCIGAR=seqStop-thisChrStop
                        lCIGAR=lLine[SAM_CIGAR].split("M")
                        lLine[SAM_CIGAR]="M".join(lCIGAR[:-2])+"M"+str(int(lCIGAR[-2])+baseToRemoveFromCIGAR)+"M"
                        lLine[SAM_SEQ]=lLine[SAM_SEQ][:baseToRemoveFromCIGAR]
                        lLine[SAM_QUALITY]=lLine[SAM_QUALITY][:baseToRemoveFromCIGAR]
                        ## ADDED END
                    else:
                        lLine[SAM_START]=str(trueStart)
                        lLine[SAM_CIGAR]=trueCIGAR
                lLine[SAM_RNAME]=chrom
                fTmp.write("\n"+"\t".join(lLine))
        line=f.readline()
    f.close()
    fTmp.close()
    if oCI:
        fCI.close()
    aliRead=iRead-nNotAligned-nNotENS-nNotFound
    nNotGoodPercentage=nNotGoodPercentageBelow+nNotGoodPercentageOver
    print " -- Done ! "+str(iRead)+" reads treated, of which :\n\t"+str(nNotAligned)+"/"+str(iRead)+" were not aligned\n\t\t"+str(nNotENS)+"/"+str(iRead-nNotAligned)+" did not have an ENSEMBL transcript ID\n\t\t\t"+str(nNotFound)+"/"+str(iRead-nNotAligned-nNotENS)+" aligned on an unknown transcript (may be from a PATCH chromosome)\n\t\t\t\t"+str(nSup)+"/"+str(aliRead)+" reads are supplementary reads (SR)\n\t\t\t\t"+str(nMM)+"/"+str(aliRead)+" reads are multi-mapped reads (MMR), among which "+str(nMMfiltered)+" were filtered (alignment length < "+str(mLMM)+")\n\t\t\t\t"+str(nNotGoodPercentage)+"/"+str(aliRead)+" had a percentage of aligned sequence over the complete sequence (ALI) < "+str(miASOS)+"% ("+str(nNotGoodPercentageBelow)+") or > "+str(maASOS)+"% ("+str(nNotGoodPercentageOver)+")\n\t\tSR + MMR : "+str(nSMM)+"\n\t\tSR + ALI : "+str(nSALI)+"\n\t\tMMR + ALI : "+str(nMMALI)+"\n\t\tSR + MMR + ALI : "+str(nSMMALI)
    totRead=str(nOK)
    if kS:
        if kMM:
            totRead+=" (with supplementary and multi-map)"
        else:
            totRead+=" (with supplementary, without multi-map (alignment length < "+str(mLMM)+"))"
    else:
        if kMM:
            totRead+=" (without supplementary, with multi-map)"
        else:
            totRead+=" (without supplementary and multi-map (alignment length < "+str(mLMM)+"))"
    print "\tTotal output : "+totRead+" reads, with "+str(maASOS)+"% >= percentage of aligned sequence over the complete sequence >= "+str(miASOS)+"%."
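    # Worked example for the --minAliSeqOverSeq/--maxAliSeqOverSeq filter above
    # (illustrative CIGAR) : for "5S20M2I30M3S", getLengthFromCIGAR returns 60 in
    # mode 0 (full read, M+S+I) and 52 in mode 2 (aligned bases, M+I), so the
    # ratio is 52/60*100 ~ 86.7%, which passes with the default 0-100% bounds.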
    # AC 2017-03-23
    # Creating the correct output file
    if oT=="SAM":
        os.rename(tmpSAM, outFile)
        return
    tmpBAM=tmpDir+"tmp.bam"
    print " -- Doing : "+SAM2BAM_COMMAND+tmpSAM+" > "+tmpBAM
    subprocess.call(SAM2BAM_COMMAND+tmpSAM+" > "+tmpBAM, shell=True)
    if oT=="BAM":
        os.rename(tmpBAM, outFile)
        return
    tmpSortedBAM=tmpDir+"tmp.sorted"+".bam"
    print " -- Doing : "+SORTBAM_COMMAND+"-o "+tmpSortedBAM+" "+tmpBAM
    subprocess.call(SORTBAM_COMMAND+"-o "+tmpSortedBAM+" "+tmpBAM, shell=True)
    os.rename(tmpSortedBAM, outFile)
    if oT=="sortedBAM":
        return
    sortedBAMindex=outFile+".bai"
    print " -- Doing : "+INDEXBAM_COMMAND+outFile+" "+sortedBAMindex
    subprocess.call(INDEXBAM_COMMAND+outFile+" "+sortedBAMindex, shell=True)

def haveFlag(SAMflag, searchedFlag):
    # AC 2017-03-24
    ## Tell if a flag is in the SAMflag.
    ## INPUT :
    ### SAMflag : int
    ### searchedFlag : int, 0<=searchedFlag<=11. 4 is reverse-aligned read, 11 is supplementary alignment. See the SAM manual for flag info
    ## OUTPUT :
    ### bool
    if searchedFlag<0 or searchedFlag>11:
        print(" ERROR:\tThe searchedFlag should be a power-of-2 exponent, between 0 and 11 (inclusive). If you want reverse-aligned reads, then searchedFlag should be 4 (16=2**4)")
        exit(1)
    START_AT=11
    current=START_AT
    flag=SAMflag
    while current!=searchedFlag:
        if flag>=2**current:
            flag-=2**current
        current-=1
    if flag>=2**current:
        return True
    return False

def correctMDZ(lSAM):
    # AC 2017-03-22
    ## Invert the MD:Z: field of a SAM line if any.
    ## INPUT :
    ### lSAM : list, list of SAM elements
    i=SAM_OTHERSTART
    while iT etc...) MD:Z: field.
    ## INPUT :
    ### MDfield : str, MD:Z:... format
    ## OUTPUT :
    ### str, complemented MDfield
    intab="atgcATGC"
    outtab="tacgTACG"
    trantab=maketrans(intab,outtab)
    lMD=MDfield.split(":")
    MD=lMD[2]
    lMD[2]=MD.translate(trantab)
    return ":".join(lMD)

def invertMDZ(MDfield):
    # AC 2017-03-22
    ## Invert the MD:Z: field.
    ## INPUT :
    ### MDfield : str, MD:Z:... format
    ## OUTPUT :
    ### str, inverted MDfield
    lMD=MDfield.split(":")
    MD=lMD[2]
    iMD=""
    cLen=""
    cCirc=""
    cLetters=""
    wasDigit=True
    for letter in MD:
        if letter.isdigit():
            if not wasDigit:
                iMD=cCirc+cLetters[::-1]+iMD
                cCirc=""
                cLetters=""
            cLen+=letter
            wasDigit=True
        else:
            iMD=cLen+iMD
            cLen=""
            if letter=="^":
                cCirc="^"
            else:
                cLetters+=letter
            wasDigit=False
    if wasDigit:
        iMD=cLen+iMD
    else:
        iMD=cCirc+cLetters[::-1]+iMD
    lMD[2]=iMD
    return ":".join(lMD)

def getLengthStartSoftClipping(CIGAR,reverse=False):
    # AC 2017-03-22
    ## Get the length of the starting soft clipping, if any.
    ## INPUT :
    ### CIGAR : str
    ### reverse : bool, reverse the CIGAR (get the ending soft clipping)
    ## OUTPUT :
    ### lSSC : int
    lSSC=0
    cLen=""
    if reverse:
        end=len(CIGAR)-1
        if CIGAR[end]=="S":
            end-=1
            while CIGAR[end].isdigit():
                cLen+=CIGAR[end]
                end-=1
            lSSC=int(cLen[::-1])
        return lSSC
    else:
        for letter in CIGAR:
            if letter.isdigit():
                cLen+=letter
            else:
                if letter=="S":
                    lSSC+=int(cLen)
                return lSSC

def invertCIGAR(CIGAR):
    # AC 2017-03-22
    ## Invert a CIGAR string (the end becomes the beginning).
    ## INPUT :
    ### CIGAR : str
    ## OUTPUT :
    ### iCIGAR : str, inverted CIGAR
    iCIGAR=""
    cLen=""
    for letter in CIGAR:
        if letter.isdigit():
            cLen+=letter
        else:
            iCIGAR=cLen+letter+iCIGAR
            cLen=""
    return iCIGAR

def t2gCIGAR(tCIGAR, start, lGaps):
    # AC 2017-03-21
    ## Translate a transcriptomic CIGAR string into a genomic CIGAR string.
    ## INPUT :
    ### tCIGAR : str, transcriptomic CIGAR string
    ### start : int, start of the alignment
    ### lGaps : list, [startGap-endGap, ...]
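    ### Example (illustrative values, 1-based coordinates) : with start=101 and
    ### lGaps=["111-210"] (a 100-base intron), the transcriptomic CIGAR "20M"
    ### becomes the genomic CIGAR "10M100N10M".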
    ## OUTPUT :
    ### gCIGAR : str, genomic CIGAR string
    # Creating gCIGAR
    currentStart=start
    gCIGAR=""
    cLen="" # CIGAR element length
    for letter in tCIGAR:
        if letter.isdigit():
            cLen+=letter
        else:
            if letter in ["M", "D", "=", "X"]: # Insertions and soft/hard clipping do not change the start ; deletions behave like matches
                #cLenSave=int(cLen)
                stop=currentStart+int(cLen)
                lStartLenGap=getGapLength(currentStart, stop, lGaps, 1)
                ## KEEP ?
                #if lStartLenGap==[]:
                #    currentStart+=cLenSave
                ##
                for gap in lStartLenGap:
                    startGap,lenGap=getStartLenGap(gap)
                    subLen=startGap-currentStart # subLen = part of the tCIGAR before the gap
                    cLen=str(int(cLen)-subLen) # cLen = part of the tCIGAR after the gap
                    gCIGAR+=str(subLen)+letter+str(lenGap)+"N"
                    currentStart+=lenGap+subLen
                    #currentStart+=lenGap
                currentStart+=int(cLen)
            #elif letter=="D":
            #    stop=currentStart+int(cLen)
            #    lStartLenGap=getGapLength(currentStart, stop, lGaps, 1)
            #    for gap in lStartLenGap:
            #        currentStart+=lenGap
            elif letter=="N":
                #print " WARNING:\tFound an unexplainable N in the transcriptomic CIGAR code"
                currentStart+=int(cLen)
            elif letter not in ["I", "S", "H"]:
                print " ERROR:\tUnknown CIGAR ID : "+letter
                exit(1)
            gCIGAR+=cLen+letter
            cLen=""
    gCIGAR=correctCIGAR(gCIGAR)
    return gCIGAR

def correctCIGAR(CIGAR):
    # AC 2017-03-22
    ## Add "0M" where needed in a CIGAR string.
    ## INPUT :
    ### CIGAR : str
    ## OUTPUT :
    ### cCIGAR : str, corrected CIGAR
    wasN=False
    wasDI=False
    cLen=""
    cCIGAR=""
    for letter in CIGAR:
        if letter.isdigit():
            cLen+=letter
        else:
            if letter=="N":
                if wasDI:
                    cLen="0M"+cLen
                wasN=True
                wasDI=False
            elif letter in ["D", "I"]:
                if wasN:
                    cLen="0M"+cLen
                wasDI=True
                wasN=False
            else:
                wasN=False
                wasDI=False
            cCIGAR+=cLen+letter
            cLen=""
    return cCIGAR

def getStartLenGap(gap):
    # AC 2017-03-21
    ## Get the start and length of a gap (format start:len).
    ## INPUT :
    ### gap : str
    ## OUTPUT :
    ### list, 1st element is start, 2nd element is length (integer)
    lGap=gap.split(":")
    return [int(lGap[0]), int(lGap[1])]

def getLengthFromCIGAR(CIGAR, mode=0):
    # AC 2017-03-21
    ## Give the length of a sequence from a CIGAR string.
    ## INPUT :
    ### CIGAR : str
    ### mode : int (0, 1, 2), 0 is for full seq length (with hard clipping), 1 is for alignment length, 2 is for aligned seq length
    ## OUTPUT :
    ### length : int
    cLen=""
    sLen=0
    for letter in CIGAR:
        if letter.isdigit():
            cLen+=letter
        else:
            if mode==0:
                if letter in ["M", "S", "I", "H", "=", "X"]:
                    sLen+=int(cLen)
            elif mode==1:
                if letter in ["M", "D", "N", "=", "X"]:
                    sLen+=int(cLen)
            elif mode==2:
                if letter in ["M", "I", "=", "X"]:
                    sLen+=int(cLen)
            cLen=""
    return sLen

def getGapLength(start, stop, lGaps, withStart=0):
    # AC 2017-03-20
    ## Give a list of the lengths of the gaps between start and stop.
    ## INPUT :
    ### start : int, inclusive
    ### stop : int, inclusive
    ### lGaps : list, [startGap-endGap, ...]
    ### withStart : int (0,1), change output format
    ## OUTPUT :
    ### lGapLen : list, [lenGap, ...] OR [startGap:lenGap, ...] if withStart!=0
    lGapLen=[]
    #lGaps.sort()
    lGaps=croisSort(lGaps)
    for gap in lGaps:
        startGap,stopGap=getStartStop(gap)
        if startGap>stop:
            return lGapLen
        if startGap>start:
            lenGap=stopGap-startGap+1
            if withStart==0:
                lGapLen.append(lenGap)
            else:
                lGapLen.append(str(startGap)+":"+str(lenGap))
            stop+=lenGap
    return lGapLen

def croisSort(l):
    # AC 2018-01-12
    ## Sort a start-end type list by increasing start.
    ## INPUT :
    ### l : list, format : ['start-end1', 'start-end2', ...]
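    ### Example (illustrative) : croisSort(['300-450', '20-75', '120-180'])
    ### returns ['20-75', '120-180', '300-450'].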
    ## OUTPUT :
    ### list, sorted (by start) elements of l
    d={} # d[start]='start-stop'
    lStart=[]
    for e in l:
        start=int(e.split("-")[0])
        d[start]=e
        lStart.append(start)
    lStart.sort()
    lRes=[]
    for start in lStart:
        lRes.append(d[start])
    return lRes

def getStartStop(startStop):
    # AC 2017-03-20
    ## Return integers of start and stop (start-stop format).
    ## INPUT :
    ### startStop : str
    ## OUTPUT :
    ### list, first element is start, second/last element is stop
    return [int(startStop.split("-")[0]),int(startStop.split("-")[1])]

def makeHeader(dGTF):
    # AC 2017-03-20
    ## Create a SAM header from a filled GTF dictionary.
    ## INPUT :
    ### dGTF : dict, filled GTF dictionary
    ## OUTPUT :
    ### header : str
    header="@HD\tVN:"+str(SAM_VERSION)+"\tSO:unknown"
    for chrom in dGTF.keys():
        header+="\n@SQ\tSN:"+chrom+"\tLN:"+str(dGTF[chrom][DGTF_CHRLEN])
    header+="\n@PG\tID:t2g\tPN:ali_transcriptome2genome.py\tVN:"+str(T2G_VERSION)+"\tCL:"+" ".join(sys.argv)
    return header

def getGTFdict(fGTF):
    # AC 2017-03-20
    ## Create a dictionary from a GTF.
    ## INPUT :
    ### fGTF : str, gtf file name
    ## OUTPUT :
    ### dGTF : dict
    # dGTF[chr]=[chrLength, [{RNAMEendIs0: strand, STARTtranscript, [gaps], lenTranscript}, {RNAMEendIs1: strand, STARTtranscript, [gaps], lenTranscript}, ...]]
    dGTF={}
    f=open(fGTF,"r")
    line=f.readline() # We don't care about the header
    while line[0]=="#":
        line=f.readline()
    # We've got to fill the transcript dictionary
    lLine=line.split("\t")
    chrSave=lLine[GTF_CHROM] # saved chromosome
    print " -- Analysing chromosome "+chrSave
    maxGeneEnd=int(lLine[GTF_END])
    line=f.readline()
    lLine=line.split("\t")
    initDictGtf(dGTF, chrSave)
    strand=lLine[GTF_STRAND] # the strand says how to read the exons
    transcriptID=getAttributeValue(lLine[GTF_ATT], GTF_ATTRIBUTE_TRANSCRIPTID)
    transcriptID=transcriptID.strip("\"").split(".")[0]
    transcriptBin=int(transcriptID[-2:])
    dGTFbin=dGTF[chrSave][DGTF_TRLIST][transcriptBin]
    dGTFbin[transcriptID]=[strand, int(lLine[GTF_START]), [], "NA"]
    intronStart=None
    intronStop=None
    trLen=0
    line=f.readline()
    while line:
        line=line.strip()
        lLine=line.split("\t")
        chrom=lLine[GTF_CHROM]
        feature=lLine[GTF_FEATURE]
        if feature==GTF_FEATURE_GENE: # If we have a new gene
            dGTFbin[transcriptID][DGTFBIN_TRLEN]=trLen
            if chrom!=chrSave: # If we have a new chromosome
                dGTF[chrSave][DGTF_CHRLEN]=maxGeneEnd+1
                chrSave=chrom
                maxGeneEnd=int(lLine[GTF_END])
                initDictGtf(dGTF, chrSave)
                print " -- Analysing chromosome "+chrSave
            elif maxGeneEnd < int(lLine[GTF_END]):
                maxGeneEnd=int(lLine[GTF_END])
        elif feature==GTF_FEATURE_TRANSCRIPT: # If we have a new transcript
            dGTFbin[transcriptID][DGTFBIN_TRLEN]=trLen
            strand=lLine[GTF_STRAND] # the strand says how to read the exons
            transcriptID=getAttributeValue(lLine[GTF_ATT], GTF_ATTRIBUTE_TRANSCRIPTID)
            transcriptID=transcriptID.strip("\"").split(".")[0]
            transcriptBin=int(transcriptID[-2:])
            dGTFbin=dGTF[chrSave][DGTF_TRLIST][transcriptBin]
            dGTFbin[transcriptID]=[strand, int(lLine[GTF_START]), [], "NA"]
            intronStart=None
            intronStop=None
            trLen=0
        elif feature==GTF_FEATURE_EXON: # If we have a new exon
            trLen+=int(lLine[GTF_END])-int(lLine[GTF_START])+1
            if strand=="+":
                if intronStart!=None:
                    intronStop=int(lLine[GTF_START])-1
                else:
                    intronStart=int(lLine[GTF_END])+1
            else:
                if intronStop!=None:
                    intronStart=int(lLine[GTF_END])+1
                else:
                    intronStop=int(lLine[GTF_START])-1
            if intronStart!=None and intronStop!=None:
                dGTFbin[transcriptID][DGTFBIN_GAPS].append(str(intronStart)+"-"+str(intronStop))
                if strand=="+":
                    intronStart=int(lLine[GTF_END])+1
                else:
                    intronStop=int(lLine[GTF_START])-1
        line=f.readline()
    f.close()
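    # Illustrative layout of the returned structure (hypothetical values) :
    #   dGTF["1"][DGTF_TRLIST][28]["ENST00000000028"] ==
    #       ["+", 11869, ["12228-12612", "12722-12974"], 1903]
    # i.e. [strand, transcript start, intron gaps (start-end), transcript length].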
    dGTFbin[transcriptID][DGTFBIN_TRLEN]=trLen
    dGTF[chrSave][DGTF_CHRLEN]=maxGeneEnd+1
    return dGTF

def getAttributeValue(attributes, searchAttribute):
    # AC 2017-03-20
    ## Get the value of an attribute in an attribute line
    ## INPUT :
    ### attributes : str, attribute line
    ### searchAttribute : str
    return attributes.split(searchAttribute)[1].split(";")[0].strip()

def initDictGtf(dGTF, chrom):
    # AC 2017-03-20
    ## Initialise the gtf dictionary for a chromosome
    ## INPUT :
    ### dGTF : dict
    ### chrom : str
    dGTF[chrom]=[None, []]
    for i in range(100): # We add 100 empty dictionaries. The first one will contain transcript IDs ending with 00, the second one transcript IDs ending with 01, ...
        dGTF[chrom][DGTF_TRLIST].append(dict())

def BAM2SAM(f, tmpDir):
    # AC 2017-03-20
    ## Convert a bam file into a sam file written in the temporary directory.
    ## INPUT :
    ### f : str, file name
    ### tmpDir : str
    ## OUTPUT :
    ### newFile : str, name of the converted file
    ensure_dir(tmpDir)
    fName=f.split("/")[-1]
    newFile=tmpDir+fName[:-3]+"s"+fName[-2:]
    print " -- Doing : "+BAM2SAM_COMMAND+f+" > "+newFile
    subprocess.call(BAM2SAM_COMMAND+f+" > "+newFile, shell=True)
    return newFile

def checkInput(lName):
    # AC 2017-03-20
    ## Check if the input file name extensions are correct
    ## INPUT :
    ### lName : list, input file names
    for name in lName:
        if not os.path.isfile(name):
            print " ERROR:\tInput file "+name+" does not exist."
            exit(1)
        if name[-4:] not in [".bam", ".sam"]:
            print " ERROR:\tInput file names should all have either the .sam or .bam extension. "
            exit(1)

def addExtension(lOut, extType):
    # AC 2017-03-20
    ## Modify a list by adding an extension to its elements if needed
    ## INPUT :
    ### lOut : list
    ### extType : str, element of the OutputType parameter
    ext=None
    if extType=="SAM":
        ext=".sam"
    elif extType=="BAM":
        ext=".bam"
    else:
        ext=".sorted.bam"
    for i in range(len(lOut)):
        if len(ext)>len(lOut[i]) or lOut[i][-1*len(ext):]!=ext:
            if lOut[i][-4:] in [".sam", ".bam"]:
                lOut[i]=lOut[i][:-4]+ext
            else:
                lOut[i]+=ext

def makeOutputGenericName(lInName, folderName, add="_genome_convert"):
    # AC 2017-03-20
    ## Create a list of file names given input names, a folder and a suffix (folderName/inputNameWithoutExtension+add).
    ## INPUT :
    ### lInName : list
    ### folderName : str
    ### add : str, suffix
    ## OUTPUT :
    ### lOutName : list
    lOutName=[]
    for inName in lInName:
        lOutName.append(folderName+"/"+".".join(inName.split("/")[-1].split(".")[:-1])+add)
    return lOutName

def makeParamList(param, name):
    # AC 2017-03-20
    ## Create a list by splitting on "," each element of a parameter list
    ## INPUT :
    ### param : list
    ### name : str, name of an element in the param list
    ## OUTPUT :
    ### l : list of unique parameters
    l=[]
    for p in param:
        for psplit in p.split(","):
            if psplit in l:
                print " WARNING:\tThe "+name+" "+psplit+" appears several times."
            else:
                l.append(psplit)
    return l

def ensure_dir(d):
    # AC 2017-03-20
    ## Create a directory if it does not exist
    ## INPUT :
    ### d : str, directory name
    if not os.path.exists(d):
        os.makedirs(d)

if __name__ == '__main__':
    main()
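# Example invocation (illustrative ; file names are placeholders) :
#   python ali_transcriptome2genome.py \
#       --AlignmentFiles reads_on_transcriptome.bam \
#       --GTF annotation.gtf \
#       --OutputFolder ./genome_aligned/ \
#       --OutputType sortedBAMindex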