cbbdd7331c4477e84f01020a41e44d8ccec900a0 chmalee Thu Feb 13 12:56:22 2020 -0800 Fixing bad decoding problem in clinvar build script diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index ee04313..97866d6 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -367,45 +367,45 @@ # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445) # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106) # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*') hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)') # e.g. # c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA # c.80A>T # c.740+356C>T # c.3839_3855del17 posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$') -ifh = gzip.open(filename, "rt") +ifh = gzip.open(filename, "rt", encoding="latin_1") # check the header line1 = ifh.readline() if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again logging.error("ClinVar has changed their header line again") logging.error("current header line: %s" % line1.replace("\t","|")) logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|")) raise Exception("code needs fixing") # open out files -hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w") -hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w") -hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w") -hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w") +hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w", encoding="latin1") +hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w", encoding="latin1") +hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w", encoding="latin1") +hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w", encoding="latin1") longCount = 0 noAssCount = 0 # convert lines for line in ifh: line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions line = line.rstrip("\n") row = line.split("\t") row = [f.replace("\0", "") for f in row ] alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \ irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \ refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \ submCategories, varId = row