273486dcbd4a0fd80d7f09be639f32cd58598881
max
  Wed Jul 20 15:03:46 2016 -0700
adding uniprot track for hg38

diff --git src/utils/uniprotLift src/utils/uniprotLift
index fe49425..d29868c 100755
--- src/utils/uniprotLift
+++ src/utils/uniprotLift
@@ -277,50 +277,53 @@
         seq, size = line.rstrip("\n").split()
         seqSizes[seq] = int(size)/3
         
     # get uniprot IDs 
     speciesAccs = set([line.split()[0] for line in open("work/chromSizes")])
     print "got %d accessions" % len(speciesAccs)
 
     # read data, write bed to file
     # this annotates in protein coordinates
     print "converting tab to bed"
     ofh = open("work/temp.bed", "w")
     uniProtMutFname = join(uniprotDir, "uniprot.%s.annots.tab" % taxonId)
     print "reading %s" % uniProtMutFname
     for mut in iterTsvRows(open(uniProtMutFname)):
         if mut.acc not in speciesAccs:
+            logging.debug("skipping %s, not in work/chromSizes" % mut.acc)
             continue
         if options.filterAnnots and mut.featType in removeTypes:
+            logging.debug("skipping %s, explicitely removed annotation" % str(mut))
             continue
         mutName = mut.featType.replace(" ", "_")+":"+mut.acc+":"+mut.origAa+mut.begin+mut.mutAa
 
         # remove annotations that cover the whole protein, probably not very useful
         #print mut.begin, mut.end, mut.acc, seqSizes[mut.acc]
         if int(mut.begin)==1 and int(mut.end)==int(seqSizes[mut.acc])+1:
+            logging.debug("skipping %s, covers whole protein" % str(mut))
             continue
 
         mutPos = 3*(int(mut.begin)-1)
         mutPosEnd = 3*(int(mut.end)-1)
         if mutName not in mutData:
             ofh.write("\t".join([mut.acc, str(mutPos), str(mutPosEnd), mutName])+"\n")
         mutData[mutName].append(mut)
     ofh.close()
     
     # lift from protein sequence bed to target bed
     print "lifting"
-    cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname
+    cmd = "bedToPsl work/chromSizes work/temp.bed stdout | pslMap stdin %s stdout | pslToBed stdin stdout | LC_COLLATE=C sort -k1,1 -k2,2n | bedFixBlockOverlaps /dev/stdin > work/temp.lifted.bed" % liftFname
     run(cmd)
 
     print "adding extra fields"
     # read lifted bed and add uniprot annotations to it as additional fields
     mutBed = open("spMut.bed", "w")
     annotBed = open("spAnnot.bed" , "w")
     structBed = open("spStruct.bed" , "w")
 
     count = 0
     blackCount = 0
     noInfoCount = 0
     doneAnnots = set() # to skip overlaps: a set with (seq, start, end, name)
 
     for line in open("work/temp.lifted.bed"):
         bed = line.strip().split()