57105e3e6e8c590537fedcc559cb0fac2812d56d
lrnassar
  Thu Feb 22 14:33:24 2024 -0800
Staging the ENIGMA tracks, refs #32919

diff --git src/hg/makeDb/scripts/enigma/BRCAmla.py src/hg/makeDb/scripts/enigma/BRCAmla.py
new file mode 100644
index 0000000..b26cd8d
--- /dev/null
+++ src/hg/makeDb/scripts/enigma/BRCAmla.py
@@ -0,0 +1,175 @@
+import subprocess
+
+def bash(cmd):
+    """Run the cmd in bash subprocess"""
+    try:
+        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
+                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+        bashStdoutt = rawBashOutput.stdout
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+    return(bashStdoutt)
+
+def assignRGBcolorByLR(LR):
+    LR = float(LR)
+    if LR > 2.08:
+        itemRgb = '255,0,0'
+    elif LR <=0.48:
+        itemRgb = '0,128,0'
+    else:
+        itemRgb = '91,91,91'
+    return(itemRgb)
+
+def assignACMGcode(LR):
+    LR = float(LR)
+    if LR > 2.08 and LR <= 4.3:
+        ACMGcode = 'PP4 - Pathogenic - Supporting'
+    elif LR > 4.3 and LR <= 18.7:
+        ACMGcode = 'PP4 - Pathogenic - Moderate'
+    elif LR > 18.7 and LR <= 350:
+        ACMGcode = 'PP4 - Pathogenic - Strong'
+    elif LR > 350:
+        ACMGcode = 'PP4 - Pathogenic - Very strong'
+    elif LR <= .48 and LR > .23:
+        ACMGcode = 'BP5 - Benign - Supporting'
+    elif LR <= .22 and LR > .05:
+        ACMGcode = 'BP5 - Benign - Moderate'
+    elif LR <= .04 and LR > .00285:
+        ACMGcode = 'BP5 - Benign - Strong'
+    elif LR <= 0.00285:
+        ACMGcode = 'BP5 - Benign - Very strong'
+    else: 
+        print("This code did not match: "+str(LR))
+        ACMGcode = "Not informative"
+    return(ACMGcode)
+
+
+bash("tail -n +3 /hive/data/inside/enigmaTracksData/HUMU-40-1557-s001-1.txt > /hive/data/inside/enigmaTracksData/HUMUdataMinusHeader.txt")
+rawFileNoHeader = open('/hive/data/inside/enigmaTracksData/HUMUdataMinusHeader.txt','r', encoding='latin-1')
+varsToConvertToVcf = open('/hive/data/inside/enigmaTracksData/varsToConvert.txt','w')
+
+for line in rawFileNoHeader:
+    line = line.rstrip().split("\t")
+    if line[0].startswith("BRCA1"):
+        varsToConvertToVcf.write("NM_007294.3:"+line[1]+"\n")
+    elif line[0].startswith("BRCA2"):
+        varsToConvertToVcf.write("NM_000059.3:"+line[1]+"\n")
+    else:
+        print("ERROR! Missing transcript")
+
+rawFileNoHeader.close()
+varsToConvertToVcf.close()
+
+bash("hgvsToVcf -noLeftShift hg38 /hive/data/inside/enigmaTracksData/varsToConvert.txt /hive/data/inside/enigmaTracksData/tempVcfFile")
+
+tempVcfFile = open('/hive/data/inside/enigmaTracksData/tempVcfFile','r')
+
+vcfVarCoords = {}
+for line in tempVcfFile:
+    line = line.rstrip().split("\t")
+    if line[0].startswith("#"):
+        continue
+    else:
+        #Have the dic be the hgnsc as key (e.g. NM_007294.3:c.1081T>C) with a list of chr + vcfLocation
+        #Also subtracting 1 to make a chr chromStart chromEnd format
+        vcfVarCoords[line[2]] = [line[0],str(int(line[1])-1),line[1]]
+
+tempVcfFile.close()
+
+rawFileNoHeader = open('/hive/data/inside/enigmaTracksData/HUMUdataMinusHeader.txt','r', encoding='latin-1')
+outputBedFile = open("/hive/data/inside/enigmaTracksData/outputBedFile.bed",'w')
+#Reiterate through the file matching coordinates now
+for line in rawFileNoHeader:
+    line = line.rstrip("\n").split("\t")
+    
+    geneSymbol = line[0]
+    if line[0].startswith("BRCA1"):
+        nmAccession = "NM_007294.3"
+    elif line[0].startswith("BRCA2"):
+        nmAccession = "NM_000059.3"
+    else:
+        print("Error: Line didn't start with BRCA*")
+    
+    ACMGcode = assignACMGcode(line[10])
+    nameToMatch = nmAccession+":"+line[1]
+    _mouseOver = "<b>HGVSc:</b> "+nmAccession+":"+line[1]+"<br><b>HGVSp:</b> "+\
+        line[2]+"<br><b>Combined LR:</b> "+line[10]+\
+        "<br><b>ACMG Code:</b> "+ACMGcode
+        
+    itemRGB = assignRGBcolorByLR(line[10])
+    
+    if nameToMatch in vcfVarCoords.keys():
+        lineToWrite = ("\t".join(vcfVarCoords[nameToMatch])+"\t"+line[1]+\
+                            "\t0\t.\t"+"\t".join(vcfVarCoords[nameToMatch][1:])+\
+                            "\t"+itemRGB+"\t"+nmAccession+"\t"+line[0]+"\t"+\
+                            ACMGcode+"\t"+"\t".join(line[2:])+"\t"+_mouseOver+"\n")
+            
+        outputBedFile.write(lineToWrite)
+    else:
+        print("PROBLEM! A key did not match.")
+        print(nameToMatch)
+            
+outputBedFile.close()
+rawFileNoHeader.close()
+
+bash("bedSort /hive/data/inside/enigmaTracksData/outputBedFile.bed \
+/hive/data/inside/enigmaTracksData/outputBedFile.bed")
+
+startOfAsFile="""table BRCAmla
+"BRCA1/BRCA2 multifactorial likelihood analysis"
+   (
+   string chrom;       "Reference sequence chromosome or scaffold"
+   uint   chromStart;  "Start position in chromosome"
+   uint   chromEnd;    "End position in chromosome"
+   string name;        "HGVS Nucleotide"
+   uint score;         "Not used, all 0"
+   char[1] strand;     "Not used, all ."
+   uint thickStart;    "Same as chromStart"
+   uint thickEnd;      "Same as chromEnd"
+   uint reserved;       "RGB value (use R,G,B string in input file)"
+   string NMaccession;        "RefSeq NM Accession"
+   string geneSymbol;        "Gene symbol"
+   string ACMGcode;        "Translating the LR into the ACMG codes PP4 and BP5 alongside supporting strength."
+"""
+
+line1 = bash("head -1 /hive/data/inside/enigmaTracksData/HUMU-40-1557-s001-1.txt")
+line2 = bash("head -2 /hive/data/inside/enigmaTracksData/HUMU-40-1557-s001-1.txt | tail -1")
+line1 = line1.rstrip("\n").split("\t")
+line2 = line2.rstrip("\n").split("\t")
+
+name = []
+for i in range(65):
+    if line2[i] != "":
+        line2[i] = line2[i].replace('"','')
+        name.append(line1[i]+" - "+line2[i])
+    else:
+        name.append(line1[i])
+n=0
+for i in name[2:]:
+    n+=1 
+    asFileAddition = "   lstring extraField"+str(n)+';\t"'+i+'"\n'
+    startOfAsFile = startOfAsFile+asFileAddition
+startOfAsFile = startOfAsFile+"   string _mouseOver;"+'\t"'+'Field only used as mouseOver'+'"\n'
+    
+asFileOutput = open("/hive/data/inside/enigmaTracksData/BRCAmla.as","w")
+for line in startOfAsFile.split("\n"):
+    if "_mouseOver" in line:
+        asFileOutput.write(line+"\n   )")
+    else:
+        asFileOutput.write(line+"\n")
+asFileOutput.close()
+
+bash("bedToBigBed -as=/hive/data/inside/enigmaTracksData/BRCAmla.as -type=bed9+67 -tab \
+/hive/data/inside/enigmaTracksData/outputBedFile.bed /cluster/data/hg38/chrom.sizes \
+/hive/data/inside/enigmaTracksData/BRCAmfaHg38.bb")
+
+bash("liftOver -bedPlus=9 -tab /hive/data/inside/enigmaTracksData/outputBedFile.bed \
+/hive/data/genomes/hg38/bed/liftOver/hg38ToHg19.over.chain.gz \
+/hive/data/inside/enigmaTracksData/outputBedFileHg19.bed /hive/data/inside/enigmaTracksData/unmapped.bed")
+
+bash("bedToBigBed -as=/hive/data/inside/enigmaTracksData/BRCAmla.as -type=bed9+67 -tab \
+/hive/data/inside/enigmaTracksData/outputBedFileHg19.bed /cluster/data/hg19/chrom.sizes \
+/hive/data/inside/enigmaTracksData/BRCAmfaHg19.bb")
+
+bash("ln -sf /hive/data/inside/enigmaTracksData/BRCAmfaHg38.bb /gbdb/hg38/bbi/enigma/BRCAmfa.bb")
+bash("ln -sf /hive/data/inside/enigmaTracksData/BRCAmfaHg19.bb /gbdb/hg19/bbi/enigma/BRCAmfa.bb")