cbbdd7331c4477e84f01020a41e44d8ccec900a0
chmalee
  Thu Feb 13 12:56:22 2020 -0800
Fixing bad decoding problem in clinvar build script

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index ee04313..97866d6 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -367,45 +367,45 @@
 # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1
 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445)
 # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106)
 # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
 cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*')
 
 hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)')
 
 # e.g.
 # c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA
 # c.80A>T
 # c.740+356C>T
 # c.3839_3855del17
 posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$')
 
-ifh = gzip.open(filename, "rt")
+ifh = gzip.open(filename, "rt", encoding="latin_1")
 
 # check the header
 line1 = ifh.readline()
 if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again
     logging.error("ClinVar has changed their header line again")
     logging.error("current header line: %s" % line1.replace("\t","|"))
     logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|"))
     raise Exception("code needs fixing")
 
 # open out files
-hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w")
-hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w")
-hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w")
-hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w")
+hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w", encoding="latin1")
+hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w", encoding="latin1")
+hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w", encoding="latin1")
+hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w", encoding="latin1")
 
 longCount = 0
 noAssCount = 0
 
 # convert lines
 for line in ifh:
     line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions
     line = line.rstrip("\n")
     row = line.split("\t")
     row = [f.replace("\0", "") for f in row ]
 
     alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
       irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
       refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
       submCategories, varId = row