6548404563a511e928e6813b7569614d56526a29 max Mon Sep 2 05:17:51 2024 -0700 adding VSV accessions to clinvar track, refs #34377 diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index fa88597..959a831 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -750,30 +750,33 @@ line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions line = line.rstrip("\n") row = line.split("\t") row = [f.replace("\0", "") for f in row ] alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \ irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \ refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \ submCategories, varId, posVcf, refAllVcf, altAllVcf, somClinImpact, somClinLastEval, revStatusClinImpact, oncogen, oncogenLastEval, revStatusOncogen = row # SomaticClinicalImpact|SomaticClinicalImpactLastEvaluated|ReviewStatusClinicalImpact|Oncogenicity|OncogenicityLastEvaluated|ReviewStatusOncogenicity # changed in July 2018 when the "varId" was merged into this table. It used to be # in allToVar if varId=="": logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc)) + vcvId = "" + else: + vcvId = "VCV%08d.1" % int(varId) if chrom=="" or assembly=="" or assembly=="NCBI36" or assembly=="na": noAssCount += 1 continue if assembly=="GRCh37": hgvs = hg19Hgvs elif assembly=="GRCh38": hgvs = hg38Hgvs else: print("Invalid assembly: %s, row %s" % (repr(assembly), repr(row))) assert(False) #hgvsTable = ";".join(hgvs[alleleId]) hgvsTable = json.dumps(hgvs[alleleId]) @@ -954,31 +957,31 @@ oncogenDesc = "" if oncogen!="" and oncogen!="-": oncogenDesc = "%s (%s, %s)" % (oncogen, oncogenLastEval, revStatusOncogen) row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \ blockCount, blockSizes, blockStarts, name, clinSign, starRatingHtml, allType, geneStr, molConseq, snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic, hgvsTable, "", numberSubmitters, lastEval, guidelines, otherIds, mouseOver, vcfDesc, somImpactDesc, oncogenDesc, # these fields are the for the filters, not for the display pathoCode, originCode, allTypeCode, str(varLen), str(starCount), # the variant ID got forgotten in the original bigBed schema, it was only part of the origName field - varId, dbVarSsvAcc] + varId, dbVarSsvAcc, vcvId] # replace clinvar's placeholders with real empty fields newRow = [] for x in row: if x in ["-1", "-"]: newRow.append("") else: newRow.append(x) row = newRow if assembly=="GRCh37": ofh = hg19Bed if isCnv: ofh = hg19BedCnv elif assembly=="GRCh38": @@ -1018,31 +1021,31 @@ bigBedDir = "bigBedAlpha" # convert to bigBed without the date in the fname, like clinvarMain.hg19.bb tmpFnames = [] for fname in fnames: parts = fname.split(".") if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed base, db, date, ext = parts else: base, db, ext = parts base = basename(base) tmpFname = "%s.%s.temp.bb" % (base, db) - cmd = "bedToBigBed -extraIndex=_dbVarSsvId -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname) + cmd = "bedToBigBed -extraIndex=_dbVarSsvId,snpId -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname) mustRun(cmd) finalFname = "%s/%s.%s.bb" % (bigBedDir, base, db) tableName = finalFname.split('.')[0] # e.g. clinvarMain tmpFnames.append( (db, tableName, tmpFname, finalFname) ) # make it atomic by renaming the temp files to the final file names at the very end for db, tableName, tmpFname, finalFname in tmpFnames: logging.debug("Renaming %s to %s" % (tmpFname, finalFname)) os.rename(tmpFname, finalFname) # create the version.txt file clinvarVersion = open(versionFname).read() versionString = "ClinVar Release: %s, converted at UCSC on %s" % (clinvarVersion, date.today().strftime("%Y-%m-%d"))