0026e52591c39e5a0644b478f8d3f7409b7f9af1 max Mon Dec 6 02:48:07 2021 -0800 fixing clinvar B/P codes back to BN/PG, refs #28562 diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index c40f520..79f795a 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -29,113 +29,116 @@ parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed") parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing") parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.") (options, args) = parser.parse_args() if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID PositionVCF ReferenceAlleleVCF AlternateAlleleVCF\n" # ==== FUNCTIONs ===== -# benign = B, likely benign = LB, conflicting = CF, likely pathogenic= LP, -# pathogenic = P, other = OT, uncertain = VUS, RF=risk factor +# benign = BN, likely benign = LB, conflicting = CF, likely pathogenic= LP, +# pathogenic = PG, other = OT, uncertain = VUS, RF=risk factor # colors were defined by Ana Benet Pages +# - WARNING ON FILTER CHANGES: - see #28562 +# Never 
change these codes. They are in old sessions. If you change them, change them across the whole script +# and also change the name of the .as field, so the cart variables will be different. You can add new values through: cnvColors = { "INS" : { "OT" : "225,165,0", - "B" : "250,214,69", + "BN" : "250,214,69", "LB" : "250,214,69", "CF" : "225,165,0", "RF" : "225,165,0", "VUS" : "225,165,0", "LP" : "199,114,3", - "P" : "199,114,3" + "PG" : "199,114,3" }, "LOSS" : { "OT" : "255,0,0", - "B" : "255,98,119", + "BN" : "255,98,119", "LB" : "255,98,119", "CF" : "255,0,0", "VUS" : "255,0,0", "RF" : "255,0,0", "LP" : "153,0,0", - "P" : "153,0,0" + "PG" : "153,0,0" }, "GAIN" : { "OT" : "0,0,225", - "B" : "77,184,255", + "BN" : "77,184,255", "LB" : "77,184,255", "CF" : "0,0,225", "VUS" : "0,0,225", "RF" : "0,0,225", "LP" : "0,0,179", - "P" : "0,0,179" + "PG" : "0,0,179" }, "STRUCT" : { "OT" : "0,210,0", - "B" : "0,255,0", + "BN" : "0,255,0", "LB" : "0,255,0", "CF" : "0,210,0", "VUS" : "0,210,0", "RF" : "0,210,0", "LP" : "0,128,0", - "P" : "0,128,0" + "PG" : "0,128,0" }, "INV" : { "OT" : "192,0,192", - "B" : "255,128,255", + "BN" : "255,128,255", "LB" : "255,128,255", "CF" : "192,0,192", "VUS" : "192,0,192", "RF" : "192,0,192", "LP" : "128,0,128", - "P" : "128,0,128" + "PG" : "128,0,128" }, "SEQALT" : { "OT" : "128,128,128", - "B" : "191,191,191", + "BN" : "191,191,191", "LB" : "191,191,191", "CF" : "128,128,128", "VUS" : "128,128,128", "RF" : "128,128,128", "LP" : "64,64,64", - "P" : "64,64,64" + "PG" : "64,64,64" }, "SEQLEN" : { "OT" : "0,179,179", - "B" : "0,255,255", + "BN" : "0,255,255", "LB" : "0,255,255", "CF" : "0,179,179", "VUS" : "0,179,179", "RF" : "0,179,179", "LP" : "0,102,102", - "P" : "0,102,102" + "PG" : "0,102,102" }, "SUBST" : { "OT" : "128,128,128", - "B" : "191,191,191", + "BN" : "191,191,191", "LB" : "191,191,191", "CF" : "128,128,128", "VUS" : "128,128,128", "RF" : "128,128,128", "LP" : "64,64,64", - "P" : "64,64,64" + "PG" : "64,64,64" }, } def mustRun(cmd): 
logging.debug(cmd) ret = os.system(cmd) if ret!=0: print(("Could not run: %s" % cmd)) sys.exit(1) def shortenName(name): "make the clinvar name shorter and keep a long version for the mouse over " cnvMatch = cnvRe.match(name) #hgvsMatch = hgvsRe.match(name) @@ -317,44 +320,46 @@ not provided 8860 1.440% Benign/Likely benign 19489 3.167% Likely pathogenic 30200 4.907% Conflicting interpretations of pathogenicity 30910 5.023% Pathogenic 66679 10.835% Benign 77551 12.602% Likely benign 154870 25.166% Uncertain significance 217838 35.399% """ pathoStr = pathoStr.lower() # don't change the order of these unless you tested it. The order is crucial. # the aim of this order is to err on the side of caution: if a variant is VUS and pathogenic, # we rather classify it as VUS (=triggering manual review). If it's pathogenic and benign at the same # time, we rather classify it as pathogenic + # + # Do not change these codes. See WARNING ON FILTER CHANGES in this script. if "conflicting" in pathoStr: return "CF" if "uncertain" in pathoStr: return "VUS" if "risk factor" in pathoStr: return "RF" if "likely pathogenic" in pathoStr: return "LP" if "pathogenic" in pathoStr: - return "P" + return "PG" if "likely benign" in pathoStr: return "LB" if "benign" in pathoStr: - return "B" + return "BN" return "OT" def lastMonth(d,x=1): """ returns same day as this month, but last month, e.g. 
for 2013-01-01 return 2012-12-01 """ days_of_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] newmonth = ((( d.month - 1) - x ) % 12 ) + 1 newyear = d.year + ((( d.month - 1) - x ) // 12 ) if d.day > days_of_month[newmonth-1]: newday = days_of_month[newmonth-1] else: newday = d.day return datetime( newyear, newmonth, newday) @@ -804,33 +809,33 @@ pathoCode = clinSignToCode(clinSign) originCode = originSimpleToCode(originSimple) allTypeCode = allTypeToCode(allType, "|".join(row)) if isCnv: typeColors = cnvColors[allTypeCode] #else: #print("CNV encountered with unknown type: %s / %s"%(allType, allTypeCode)) #assert(False) itemRgb = typeColors.get(pathoCode) if itemRgb is None: print("CNV encountered with unknown pathoCode: "+pathoCode) assert(False) else: - if pathoCode in ["P", "LP"]: # pathogenic or likely pathogenic + if pathoCode in ["PG", "LP"]: # pathogenic or likely pathogenic itemRgb = "210,0,0" # red - elif pathoCode in ["B", "LB"]: # benign or likely benign + elif pathoCode in ["BN", "LB"]: # benign or likely benign itemRgb = "0,210,0" # green elif pathoCode in ["VUS", "RF"]: # uncertain itemRgb = "0,0,128" # dark-blue elif pathoCode in ["CF"]: # conflicting itemRgb = "137,121,212" # light-blue elif pathoCode in ["OT"]: # other itemRgb = "128,128,128" # grey else: assert(False)# should never happen varLen = int(end)-int(start) geneStr = geneSymbol if geneId.isdigit() and geneId!="-1": geneStr = geneId+"|"+geneSymbol