2adcc68f17866e32eab65cfd5c423583ddb50e3b
max
  Mon Apr 27 07:53:47 2020 -0700
fixing duplicates in avada track, refs #25433

diff --git src/hg/makeDb/avada/toBed.py src/hg/makeDb/avada/toBed.py
index 1cb90fd..5d233df 100644
--- src/hg/makeDb/avada/toBed.py
+++ src/hg/makeDb/avada/toBed.py
@@ -1,37 +1,50 @@
 # convert avada to bed
 import gzip
-for line in gzip.open("avada_v1.00_2016.vcf.gz", "rt"):
-    if line.startswith("#"):
+#import unidecode # install with 'pip install unidecode'
+alreadyDone = set()
+for line in gzip.open("avada_v1.00_2016.vcf.gz", "rb"):
+    # strange hack, but was necessary
+    line = line.replace(b"\xe2\xac\x8e", b">") # very weird - what type of encoding was this before? 
+    ucLine = line.decode("utf8")
+    if ucLine.startswith("#"):
         continue
-    row = line.rstrip("\n").split("\t")
+    row = ucLine.rstrip("\n").split("\t")
     # ['1', '45797228', '.', 'C', 'T', '.', '.', 'PMID=23361220;GENE_SYMBOL=MUTYH;ENSEMBL_ID=ENSG00000132781;ENTREZ_ID=4595;REFSEQ_ID=NM_001128425.1;STRAND=-;ORIGINAL_VARIANT_STRING=c.1187G4A']
     chrom, start, qual, ref, alt, dot1, dot2, otherStr = row
     otherParts = otherStr.split(';')
     annots = {}
     for p in otherParts:
         key, val = p.split("=")
         annots[key] = val
 
+    # if a string from a paper has already been mapped to the same position and gene, then don't 
+    # convert it again. Variants can be mapped through two similar transcripts, to the same location.
+    annotKey = (chrom, start, annots["PMID"], annots["GENE_SYMBOL"], annots["ORIGINAL_VARIANT_STRING"])
+    if annotKey in alreadyDone:
+        #print("annot %s already done" % repr(annotKey))
+        continue
+    alreadyDone.add(annotKey)
+
     # bed9
     chrom = "chr"+chrom
-    start = int(start)
+    start = int(start)-1 # VCF is 1-based. Argh.
     end = start+len(ref)
     name = annots["ORIGINAL_VARIANT_STRING"]
     score = "0"
     strand = annots["STRAND"]
     thickStart = start
     thickEnd = end
     itemRgb = "0"
     # extraFields:
     pmid = annots["PMID"]
     variant = name # easier for users, so variant is together with gene / paper
     geneSym = annots["GENE_SYMBOL"]
     ensId = annots["ENSEMBL_ID"]
     entrez = annots["ENTREZ_ID"]
     refseq = annots["REFSEQ_ID"]
     
     bed = [chrom, str(start), str(end), name, score, strand, str(thickStart), str(thickEnd), itemRgb, \
-            pmid, geneSym, variant, ensId, entrez, refseq]
+            geneSym, variant, ensId, entrez, refseq, pmid]
 
     print("\t".join(bed))