2b58971a98de87361a625375a7b1bf1fcc5ded03
lrnassar
Fri Feb 28 17:36:45 2025 -0800
Fixing the mitomap track, I incorrectly assumed that the hg19 chrM was the same as hg38 chrM, but it was chrMT that matches with chrM on hg38. Refs #24849
diff --git src/hg/utils/otto/mitoMap/buildMitoMap.py src/hg/utils/otto/mitoMap/buildMitoMap.py
index 09e10fc6597..306b091fd63 100755
--- src/hg/utils/otto/mitoMap/buildMitoMap.py
+++ src/hg/utils/otto/mitoMap/buildMitoMap.py
@@ -1,258 +1,268 @@
import subprocess
def bash(cmd):
"""Run the cmd in bash subprocess"""
try:
rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
bashStdoutt = rawBashOutput.stdout
except subprocess.CalledProcessError as e:
raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
return(bashStdoutt)
# Need -1 to all start positions
# 3 T-C needs to end up as chrM 2 3
# 46 TG-del needs to end up as chrM 45 47
# One item at 16289 with no AA change
# 3 items with 366 G-GAAAG
pwd = "/hive/data/outside/otto/mitoMap/"
mitoMapControlRegionVarsFileInputFile = pwd+"variantsControl.latest.tsv"
mitoMapVarsOutputFile = pwd+"mitoMapVars.bed"
fh = open(mitoMapControlRegionVarsFileInputFile,"r", encoding="utf-8", errors="replace")
fh2 = open(mitoMapVarsOutputFile,"w")
# $ head VariantsControlMITOMAPFoswiki.tsv
-# Position Locus Nucleotide Change GB FreqFL (CR)*‡ GB Seqstotal (FL/CR)* Curated References
+# Position Locus Nucleotide Change GB FreqFL (CR)*‡ GB Seqstotal (FL/CR)* Curated References
# 3 Control Region T-C 0.000%(0.000%) 0 2
# I want the output to be bed9 for itemRgb and mouseover with 10 additional fields. The 6 original, plus 3 more from the coding file plus mouseOver:
# chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb position locus nucleotideChange GBfreq GBseqs curatedRefs _mouseOver
for line in fh:
if line.startswith("Position"):
continue
else:
line = line.rstrip().split("\t")
position = line[0]
nucleotideChange = line[2]
chrom = "chrM"
chromStart = int(position)-1
if nucleotideChange == "":
nucleotideChange = "Blank"
chromEnd = chromStart + 1
else:
if " " in nucleotideChange: #3 items with 366 G-GAAAG
# print(nucleotideChange) #For troubleshooting
if "or" in nucleotideChange:
chromEnd = chromStart + len(nucleotideChange.split(" or ")[0].split("-")[0])
else:
chromEnd = chromStart + len(nucleotideChange.split("-")[0].split(" ")[1])
else:
chromEnd = chromStart + len(nucleotideChange.split("-")[0])
name = nucleotideChange
score = "0"
strand = "."
thickStart = chromStart
thickEnd = chromEnd
itemRgb = "83,175,1"
locus = line[1]
codonNumber = ""
codonPosition = ""
AAchange = ""
GBfreq = line[3]
GBseqs = line[4]
curatedRefs = line[5]
varType = "VariantsControl"
mouseOver = "Nucl change: "+name+"
Position: "+position+"
Locus: "+locus+"
"+\
"GenBank freq FL(CR): "+GBfreq+"
GenBank seqs FL(CR): "+GBseqs+"
# of curated refs: "+curatedRefs
fh2.write("\t".join(map(str, [chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, \
position, locus, nucleotideChange, codonNumber, codonPosition, AAchange, GBfreq, GBseqs, curatedRefs, varType, mouseOver])) + "\n")
fh2.close()
fh.close()
mitoMapCodingRegionVarsFileInputFile = pwd+"variantsCoding.latest.tsv"
mitoMapVarsOutputFile = pwd+"mitoMapVars.bed"
+mitoMapVarsOutputFileHg19 = mitoMapVarsOutputFile.split(".")[0]+".hg19.bed"
+
fh = open(mitoMapCodingRegionVarsFileInputFile,"r", encoding="utf-8", errors="replace")
fh2 = open(mitoMapVarsOutputFile,"a")
# $ head VariantsCodingMITOMAPFoswiki.tsv
#Position Locus Nucleotide Change Codon Number Codon Position Amino Acid Change GB Freq‡ GB Seqs Curated References
#577 MT-TF G-A - - tRNA 0.000% 0 1
# I want the output to be bed9 for itemRgb and mouseover with 10 additional fields. The 6 original, plus 3 more from the coding file plus mouseOver:
# chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb position locus nucleotideChange GBfreq GBseqs curatedRefs _mouseOver
for line in fh:
if line.startswith("Position"):
continue
else:
line = line.rstrip().split("\t")
position = line[0]
nucleotideChange = line[2]
chrom = "chrM"
chromStart = int(position)-1
if nucleotideChange == "":
nucleotideChange = "Blank"
chromEnd = chromStart + 1
else:
if " " in nucleotideChange: #3 items with 366 G-GAAAG
# print(nucleotideChange) #For troubleshooting
if "or" in nucleotideChange:
chromEnd = chromStart + len(nucleotideChange.split(" or ")[0].split("-")[0])
else:
chromEnd = chromStart + len(nucleotideChange.split("-")[0].split(" ")[1])
else:
chromEnd = chromStart + len(nucleotideChange.split("-")[0])
name = nucleotideChange
score = "0"
strand = "."
thickStart = chromStart
thickEnd = chromEnd
itemRgb = "62,137,211"
locus = line[1]
codonNumber = line[3]
codonPosition = line[4]
AAchange = line[5]
GBfreq = line[6]
GBseqs = line[7]
curatedRefs = line[8]
varType = "VariantsCoding"
mouseOver = "Nucl change: "+name+"
Position: "+position+"
Locus: "+locus+"
"+\
"Codon num: "+codonNumber+"
Codon pos: "+codonPosition+"
AA change: "+AAchange+\
"
GenBank freq FL(CR): "+GBfreq+"
GenBank seqs FL(CR): "+GBseqs+"
# of curated refs: "+curatedRefs
fh2.write("\t".join(map(str, [chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, \
position, locus, nucleotideChange, codonNumber, codonPosition, AAchange, GBfreq, GBseqs, curatedRefs, varType, mouseOver])) + "\n")
fh2.close()
fh.close()
bash("bedSort "+mitoMapVarsOutputFile+" "+mitoMapVarsOutputFile)
bash("bedToBigBed -as="+pwd+"mitoMapVars.as -type=bed9+11 -tab "+mitoMapVarsOutputFile+" /cluster/data/hg38/chrom.sizes "+mitoMapVarsOutputFile.split(".")[0]+".new.bb")
+bash("sed 's/chrM/chrMT/g' "+mitoMapVarsOutputFile+" > "+mitoMapVarsOutputFileHg19)
+bash("bedToBigBed -as="+pwd+"mitoMapVars.as -type=bed9+11 -tab "+mitoMapVarsOutputFileHg19+" /cluster/data/hg19/chrom.sizes "+mitoMapVarsOutputFileHg19.split(".")[0]+".hg19.new.bb")
print("Final file created: "+mitoMapVarsOutputFile.split(".")[0]+".new.bb")
######
# BEGIN PROCESSING OF DISEASE FILES
######
# Need -1 to all start positions
# Everything is point mutations - only one of the files needs size calculation
mitoMapRNAmutsFileInputFile = pwd+"mutationsRNA.latest.tsv"
mitoMapDiseaseMutsOutputFile = pwd+"mitoMapDiseaseMuts.bed"
+mitoMapDiseaseMutOutputFileHg19 = mitoMapDiseaseMutsOutputFile.split(".")[0]+".hg19.bed"
+
fh = open(mitoMapRNAmutsFileInputFile,"r", encoding="utf-8", errors="replace")
fh2 = open(mitoMapDiseaseMutsOutputFile,"w")
# $ head -2 MutationsRNAMITOMAPFoswiki.tsv
-#Position Locus Disease Allele RNA Homoplasmy Heteroplasmy Status MitoTIP† GB Freq FL (CR)*‡ GB Seqs FL (CR)* References
+#Position Locus Disease Allele RNA Homoplasmy Heteroplasmy Status MitoTIP† GB Freq FL (CR)*‡ GB Seqs FL (CR)* References
#576 MT-CR Hearing loss patient A576G noncoding MT-TF precursor nr nr Reported N/A 0.005%(0.012%) 3 (10) 1
# I want the output to be bed9 for itemRgb and mouseover with 14 additional fields.
# chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb position locus disease nucleotideChange allele rnaOrAAchange plasmy clinStatus mitoTIP GBfreq GBseqs curatedRefs _mouseOver
for line in fh:
if line.startswith("Position"):
continue
else:
line = line.rstrip().split("\t")
position = line[0]
nucleotideChange = line[3]
chrom = "chrM"
chromStart = int(position)-1
chromEnd = chromStart + 1 #These are all base mutations
name = nucleotideChange
score = "0"
strand = "."
thickStart = chromStart
thickEnd = chromEnd
itemRgb = "153,51,255"
locus = line[1]
disease = line[2]
allele = line[3]
rnaOrAAchange = line[4]
plasmy = line[5] + "/" + line[6]
clinStatus = line[7]
mitoTIP = line[8]
GBfreq = line[9]
GBseqs = line[10]
curatedRefs = line[11]
mutsCodingOrRNA = "MutationsRNA"
mouseOver = "Allele: "+allele+"
Position: "+position+"
Locus: "+locus+"
"+\
"Disease: "+disease+"
RNA: "+rnaOrAAchange+"
Plasmy (Hom/Het): "+plasmy+"
"+\
"Status: "+clinStatus+"
MitoTIP: "+mitoTIP+"
"+\
"GenBank freq FL(CR): "+GBfreq+"
GenBank seqs FL(CR): "+GBseqs+"
# of curated refs: "+curatedRefs
fh2.write("\t".join(map(str, [chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, \
position, locus, disease, nucleotideChange, allele, rnaOrAAchange, plasmy, clinStatus, \
mitoTIP, GBfreq, GBseqs, curatedRefs, mutsCodingOrRNA, mouseOver])) + "\n")
fh2.close()
fh.close()
mitoMapDiseaseMutsFileInputFile = pwd+"mutationsCodingControl.latest.tsv"
mitoMapDiseaseMutsOutputFile = pwd+"mitoMapDiseaseMuts.bed"
fh = open(mitoMapDiseaseMutsFileInputFile,"r", encoding="utf-8", errors="replace")
fh2 = open(mitoMapDiseaseMutsOutputFile,"a")
# $ head -2 MutationsCodingControlMITOMAPFoswiki.tsv
-#Locus Allele Position NucleotideChange Amino AcidChange Plasmy Reports(Homo/Hetero) Disease Status GB Freq FL (CR)*‡ GB Seqs FL (CR)* References
+#Locus Allele Position NucleotideChange Amino AcidChange Plasmy Reports(Homo/Hetero) Disease Status GB Freq FL (CR)*‡ GB Seqs FL (CR)* References
#MT-ATP6 m.8573G>A 8573 G-A G16D +/- Patient with suspected mitochondrial disease Reported by paper as Benign 0.107%(0.000%) 66 (0) 1
# I want the output to be bed9 for itemRgb and mouseover with 14 additional fields.
# chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb position locus disease nucleotideChange allele rnaOrAAchange plasmy clinStatus mitoTIP GBfreq GBseqs curatedRefs mutsCodingOrRNA _mouseOver
for line in fh:
if line.startswith("Locus"):
continue
else:
line = line.rstrip().split("\t")
position = line[2]
nucleotideChange = line[3]
chrom = "chrM"
chromStart = int(position)-1
if nucleotideChange == "":
print("This is an error!")
chromEnd = chromStart + 1
else:
if " " in nucleotideChange: #3 items with 366 G-GAAAG
print("Look at this entry:")
print(nucleotideChange)
if "or" in nucleotideChange:
chromEnd = chromStart + len(nucleotideChange.split(" or ")[0].split("-")[0])
else:
chromEnd = chromStart + len(nucleotideChange.split("-")[0].split(" ")[1])
elif "_" in nucleotideChange: #3 18bp_deletion
chromEnd = chromStart + len(nucleotideChange.split("bp_")[0])
else: # ACCTTGC-GCAAGGT
chromEnd = chromStart + len(nucleotideChange.split("-")[0])
name = nucleotideChange
score = "0"
strand = "."
thickStart = chromStart
thickEnd = chromEnd
itemRgb = "1,85,135"
locus = line[0]
disease = line[6]
allele = line[1]
rnaOrAAchange = line[4]
plasmy = line[5]
clinStatus = line[7]
mitoTIP = ""
GBfreq = line[8]
GBseqs = line[9]
curatedRefs = line[10]
mutsCodingOrRNA = "MutationsCodingControl"
mouseOver = "Allele: "+allele+"
Nucl change: "+nucleotideChange+"
Position: "+position+"
Locus: "+locus+"
"+\
"Disease: "+disease+"
RNA: "+rnaOrAAchange+"
Plasmy (Hom/Het): "+plasmy+"
"+\
"Status: "+clinStatus+"
"+\
"GenBank freq FL(CR): "+GBfreq+"
GenBank seqs FL(CR): "+GBseqs+"
# of curated refs: "+curatedRefs
fh2.write("\t".join(map(str, [chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, \
position, locus, disease, nucleotideChange, allele, rnaOrAAchange, plasmy, clinStatus, \
mitoTIP, GBfreq, GBseqs, curatedRefs, mutsCodingOrRNA, mouseOver])) + "\n")
fh2.close()
fh.close()
bash("bedSort "+mitoMapDiseaseMutsOutputFile+" "+mitoMapDiseaseMutsOutputFile)
bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/mitomap/mitoMapDiseaseMuts.as -type=bed9+14 -tab "+mitoMapDiseaseMutsOutputFile+" /cluster/data/hg38/chrom.sizes "+mitoMapDiseaseMutsOutputFile.split(".")[0]+".new.bb")
+
+bash("sed 's/chrM/chrMT/g' "+mitoMapDiseaseMutsOutputFile+" > "+mitoMapDiseaseMutOutputFileHg19)
+bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/mitomap/mitoMapDiseaseMuts.as -type=bed9+14 -tab "+mitoMapDiseaseMutOutputFileHg19+" /cluster/data/hg19/chrom.sizes "+mitoMapDiseaseMutOutputFileHg19.split(".")[0]+".hg19.new.bb")
+
print("Final file created: "+mitoMapDiseaseMutsOutputFile.split(".")[0]+".new.bb")