0ce9210dc769e64e79daf51e11ab0f08774c287f max Mon Nov 9 05:12:43 2020 -0800 making clinvar otto a little bit less verbose, no redmine diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index a23a3c0..77d6de0 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -660,30 +660,31 @@ if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again logging.error("ClinVar has changed their header line again") logging.error("current header line: %s" % line1.replace("\t","|")) logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|")) raise Exception("code needs fixing") # open out files hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w", encoding="latin1") hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w", encoding="latin1") hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w", encoding="latin1") hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w", encoding="latin1") longCount = 0 noAssCount = 0 noMcCount = 0 +minusAccs = [] # convert lines for line in ifh: line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions line = line.rstrip("\n") row = line.split("\t") row = [f.replace("\0", "") for f in row ] alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \ irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \ refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \ submCategories, varId, posVcf, refAllVcf, altAllVcf = row # changed in July 2018 when the "varId" was merged into this table. It used to be # in allToVar @@ -710,30 +711,34 @@ if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ? if assembly=="GRCh37": chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020 else: chrom = "chrM" shortName, longName = shortenName(name) if len(shortName)>20: shortName = shortName[:17]+"..." longCount+=1 if len(longName)>60: longName = longName[:60]+"..." + if start=="-1" and end=="-1": + minusAccs.append(irvcAcc) + continue + if start=="" or end=="": print("empty start or end coordinate. record %s, start/end = %s/%s "% (irvcAcc, start, end)) continue if int(start)<0 or int(end)<0: print("illegal start or end coordinate. record %s, start/end = %s/%s"% (irvcAcc, start, end)) continue # sometimes they seem to inverse their coordinates if int(start)>int(end): start, end = end, start if chrom=="chr17" and (end=="217748468" or end=="84062405"): print("blacklisted feature") continue @@ -856,30 +861,31 @@ else: noAssCount +=1 #for o in row: #print(o, type(o)) ofh.write("\t".join(row)) ofh.write("\n") hg19Bed.close() hg38Bed.close() hg19BedCnv.close() hg38BedCnv.close() logging.info("%d lines with feature name that was too long, shortened them" % longCount) logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount) +logging.info("%d lines with -1/-1 start/end positions skipped. Examples: %s" % (len(minusAccs), minusAccs[:3])) fnames = [hg19Bed.name, hg38Bed.name, hg19BedCnv.name, hg38BedCnv.name] # sort the files for fname in fnames: cmd = "bedSort %s %s" % (fname, fname) logging.debug(cmd) assert(os.system(cmd)==0) # check new bed files against last months's bed files if options.maxDiff: for fname in fnames: compareWithLastMonth(fname, options.maxDiff) # convert to bigBed without the date in the fname, like clinvarMain.hg19.bb