273486dcbd4a0fd80d7f09be639f32cd58598881 max Wed Jul 20 15:03:46 2016 -0700 adding uniprot track for hg38 diff --git src/utils/uniprotLift src/utils/uniprotLift index fe49425..d29868c 100755 --- src/utils/uniprotLift +++ src/utils/uniprotLift @@ -277,50 +277,53 @@ seq, size = line.rstrip("\n").split() seqSizes[seq] = int(size)/3 # get uniprot IDs speciesAccs = set([line.split()[0] for line in open("work/chromSizes")]) print "got %d accessions" % len(speciesAccs) # read data, write bed to file # this annotates in protein coordinates print "converting tab to bed" ofh = open("work/temp.bed", "w") uniProtMutFname = join(uniprotDir, "uniprot.%s.annots.tab" % taxonId) print "reading %s" % uniProtMutFname for mut in iterTsvRows(open(uniProtMutFname)): if mut.acc not in speciesAccs: + logging.debug("skipping %s, not in work/chromSizes" % mut.acc) continue if options.filterAnnots and mut.featType in removeTypes: + logging.debug("skipping %s, explicitely removed annotation" % str(mut)) continue mutName = mut.featType.replace(" ", "_")+":"+mut.acc+":"+mut.origAa+mut.begin+mut.mutAa # remove annotations that cover the whole protein, probably not very useful #print mut.begin, mut.end, mut.acc, seqSizes[mut.acc] if int(mut.begin)==1 and int(mut.end)==int(seqSizes[mut.acc])+1: + logging.debug("skipping %s, covers whole protein" % str(mut)) continue mutPos = 3*(int(mut.begin)-1) mutPosEnd = 3*(int(mut.end)-1) if mutName not in mutData: ofh.write("\t".join([mut.acc, str(mutPos), str(mutPosEnd), mutName])+"\n") mutData[mutName].append(mut) ofh.close() # lift from protein sequence bed to target bed print "lifting" - cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname + cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | LC_COLLATE=C sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname run(cmd) print "adding extra fields" # read lifted bed and add uniprot annotations to it as additional fields mutBed = open("spMut.bed", "w") annotBed = open("spAnnot.bed" , "w") structBed = open("spStruct.bed" , "w") count = 0 blackCount = 0 noInfoCount = 0 doneAnnots = set() # to skip overlaps: a set with (seq, start, end, name) for line in open("work/temp.lifted.bed"): bed = line.strip().split()