2596c3765d9f8fb8ffe464d7d11ba843374f672a lrnassar Wed Feb 12 12:49:11 2025 -0800 MITOMAP track is now ready for QA, refs #24849 diff --git src/hg/makeDb/scripts/buildMitoMap.py src/hg/makeDb/scripts/buildMitoMap.py new file mode 100755 index 00000000000..d8135b46a76 --- /dev/null +++ src/hg/makeDb/scripts/buildMitoMap.py @@ -0,0 +1,256 @@ +import subprocess + +def bash(cmd): + """Run the cmd in bash subprocess""" + try: + rawBashOutput = subprocess.run(cmd, check=True, shell=True,\ + stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) + bashStdoutt = rawBashOutput.stdout + except subprocess.CalledProcessError as e: + raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) + return(bashStdoutt) + +# Need -1 to all start positions +# 3 T-C needs to end up as chrM 2 3 +# 46 TG-del needs to end up as chrM 45 47 +# One item at 16289 with no AA change +# 3 items with 366 G-GAAAG + +mitoMapControlRegionVarsFileInputFile = "/hive/data/genomes/hg38/bed/mitomap/VariantsControlMITOMAPFoswiki.tsv" +mitoMapVarsOutputFile = "/hive/data/genomes/hg38/bed/mitomap/mitoMapVars.bed" +fh = open(mitoMapControlRegionVarsFileInputFile,"r") +fh2 = open(mitoMapVarsOutputFile,"w") + +# $ head VariantsControlMITOMAPFoswiki.tsv +# Position Locus Nucleotide Change GB FreqFL (CR)*‡ GB Seqstotal (FL/CR)* Curated References +# 3 Control Region T-C 0.000%(0.000%) 0 2 + +# I want the output to be bed9 for itemRgb and mouseover with 10 additional fields. The 6 original, plus 3 more from the coding file plus mouseOver: +# chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb position locus nucleotideChange GBfreq GBseqs curatedRefs _mouseOver +for line in fh: + if line.startswith("Position"): + continue + else: + line = line.rstrip().split("\t") + position = line[0] + nucleotideChange = line[2] + chrom = "chrM" + chromStart = int(position)-1 + if nucleotideChange == "": + nucleotideChange = "Blank" + chromEnd = chromStart + 1 + else: + if " " in nucleotideChange: #3 items with 366 G-GAAAG + print(nucleotideChange) + if "or" in nucleotideChange: + chromEnd = chromStart + len(nucleotideChange.split(" or ")[0].split("-")[0]) + else: + chromEnd = chromStart + len(nucleotideChange.split("-")[0].split(" ")[1]) + else: + chromEnd = chromStart + len(nucleotideChange.split("-")[0]) + name = nucleotideChange + score = "0" + strand = "." + thickStart = chromStart + thickEnd = chromEnd + itemRgb = "83,175,1" + locus = line[1] + codonNumber = "" + codonPosition = "" + AAchange = "" + GBfreq = line[3] + GBseqs = line[4] + curatedRefs = line[5] + varType = "VariantsControl" + mouseOver = "Nucl change: "+name+"
Position: "+position+"
Locus: "+locus+"
"+\ + "GenBank freq FL(CR): "+GBfreq+"
GenBank seqs FL(CR): "+GBseqs+"
# of curated refs: "+curatedRefs + + fh2.write("\t".join(map(str, [chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, \ + position, locus, nucleotideChange, codonNumber, codonPosition, AAchange, GBfreq, GBseqs, curatedRefs, varType, mouseOver])) + "\n") + +fh2.close() +fh.close() + +mitoMapCodingRegionVarsFileInputFile = "/hive/data/genomes/hg38/bed/mitomap/VariantsCodingMITOMAPFoswiki.tsv" +mitoMapVarsOutputFile = "/hive/data/genomes/hg38/bed/mitomap/mitoMapVars.bed" +fh = open(mitoMapCodingRegionVarsFileInputFile,"r") +fh2 = open(mitoMapVarsOutputFile,"a") + +# $ head VariantsCodingMITOMAPFoswiki.tsv +#Position Locus Nucleotide Change Codon Number Codon Position Amino Acid Change GB Freq‡ GB Seqs Curated References +#577 MT-TF G-A - - tRNA 0.000% 0 1 + +# I want the output to be bed9 for itemRgb and mouseover with 10 additional fields. The 6 original, plus 3 more from the coding file plus mouseOver: +# chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb position locus nucleotideChange GBfreq GBseqs curatedRefs _mouseOver +for line in fh: + if line.startswith("Position"): + continue + else: + line = line.rstrip().split("\t") + position = line[0] + nucleotideChange = line[2] + chrom = "chrM" + chromStart = int(position)-1 + if nucleotideChange == "": + nucleotideChange = "Blank" + chromEnd = chromStart + 1 + else: + if " " in nucleotideChange: #3 items with 366 G-GAAAG + print(nucleotideChange) + if "or" in nucleotideChange: + chromEnd = chromStart + len(nucleotideChange.split(" or ")[0].split("-")[0]) + else: + chromEnd = chromStart + len(nucleotideChange.split("-")[0].split(" ")[1]) + else: + chromEnd = chromStart + len(nucleotideChange.split("-")[0]) + name = nucleotideChange + score = "0" + strand = "." + thickStart = chromStart + thickEnd = chromEnd + itemRgb = "62,137,211" + locus = line[1] + codonNumber = line[3] + codonPosition = line[4] + AAchange = line[5] + GBfreq = line[6] + GBseqs = line[7] + curatedRefs = line[8] + varType = "VariantsCoding" + mouseOver = "Nucl change: "+name+"
Position: "+position+"
Locus: "+locus+"
"+\ + "Codon num: "+codonNumber+"
Codon pos: "+codonPosition+"
AA change: "+AAchange+\ + "
GenBank freq FL(CR): "+GBfreq+"
GenBank seqs FL(CR): "+GBseqs+"
# of curated refs: "+curatedRefs + + fh2.write("\t".join(map(str, [chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, \ + position, locus, nucleotideChange, codonNumber, codonPosition, AAchange, GBfreq, GBseqs, curatedRefs, varType, mouseOver])) + "\n") + +fh2.close() +fh.close() + +bash("bedSort "+mitoMapVarsOutputFile+" "+mitoMapVarsOutputFile) +bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/mitomap/mitoMapVars.as -type=bed9+11 -tab "+mitoMapVarsOutputFile+" /cluster/data/hg38/chrom.sizes "+mitoMapVarsOutputFile.split(".")[0]+".bb") +print("Final file created: "+mitoMapVarsOutputFile.split(".")[0]+".bb") + +###### +# BEGIN PROCESSING OF DISEASE FILES +###### + +# Need -1 to all start positions +# Everything is point mutations - only one of the files needs size calculation + +mitoMapRNAmutsFileInputFile = "/hive/data/genomes/hg38/bed/mitomap/MutationsRNAMITOMAPFoswiki.tsv" +mitoMapDiseaseMutsOutputFile = "/hive/data/genomes/hg38/bed/mitomap/mitoMapDiseaseMuts.bed" +fh = open(mitoMapRNAmutsFileInputFile,"r") +fh2 = open(mitoMapDiseaseMutsOutputFile,"w") + +# $ head -2 MutationsRNAMITOMAPFoswiki.tsv +#Position Locus Disease Allele RNA Homoplasmy Heteroplasmy Status MitoTIP† GB Freq  FL (CR)*‡ GB Seqs FL (CR)* References +#576 MT-CR Hearing loss patient A576G noncoding MT-TF precursor nr nr Reported N/A 0.005%(0.012%) 3 (10) 1 + +# I want the output to be bed9 for itemRgb and mouseover with 14 additional fields. +# chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb position locus disease nucleotideChange allele rnaOrAAchange plasmy clinStatus mitoTIP GBfreq GBseqs curatedRefs _mouseOver +for line in fh: + if line.startswith("Position"): + continue + else: + line = line.rstrip().split("\t") + position = line[0] + nucleotideChange = line[3] + chrom = "chrM" + chromStart = int(position)-1 + chromEnd = chromStart + 1 #These are all base mutations + name = nucleotideChange + score = "0" + strand = "." + thickStart = chromStart + thickEnd = chromEnd + itemRgb = "153,51,255" + locus = line[1] + disease = line[2] + allele = line[3] + rnaOrAAchange = line[4] + plasmy = line[5] + "/" + line[6] + clinStatus = line[7] + mitoTIP = line[8] + GBfreq = line[9] + GBseqs = line[10] + curatedRefs = line[11] + mutsCodingOrRNA = "MutationsRNA" + mouseOver = "Allele: "+allele+"
Position: "+position+"
Locus: "+locus+"
"+\ + "Disease: "+disease+"
RNA: "+rnaOrAAchange+"
Plasmy (Hom/Het): "+plasmy+"
"+\ + "Status: "+clinStatus+"
MitoTIP: "+mitoTIP+"
"+\ + "GenBank freq FL(CR): "+GBfreq+"
GenBank seqs FL(CR): "+GBseqs+"
# of curated refs: "+curatedRefs + + fh2.write("\t".join(map(str, [chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, \ + position, locus, disease, nucleotideChange, allele, rnaOrAAchange, plasmy, clinStatus, \ + mitoTIP, GBfreq, GBseqs, curatedRefs, mutsCodingOrRNA, mouseOver])) + "\n") + +fh2.close() +fh.close() + +mitoMapDiseaseMutsFileInputFile = "/hive/data/genomes/hg38/bed/mitomap/MutationsCodingControlMITOMAPFoswiki.tsv" +mitoMapDiseaseMutsOutputFile = "/hive/data/genomes/hg38/bed/mitomap/mitoMapDiseaseMuts.bed" +fh = open(mitoMapDiseaseMutsFileInputFile,"r") +fh2 = open(mitoMapDiseaseMutsOutputFile,"a") + +# $ head -2 MutationsCodingControlMITOMAPFoswiki.tsv +#Locus Allele Position NucleotideChange Amino AcidChange Plasmy Reports(Homo/Hetero) Disease Status GB Freq  FL (CR)*‡ GB Seqs FL (CR)* References +#MT-ATP6 m.8573G>A 8573 G-A G16D +/- Patient with suspected mitochondrial disease Reported by paper as Benign 0.107%(0.000%) 66 (0) 1 + +# I want the output to be bed9 for itemRgb and mouseover with 14 additional fields. +# chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb position locus disease nucleotideChange allele rnaOrAAchange plasmy clinStatus mitoTIP GBfreq GBseqs curatedRefs mutsCodingOrRNA _mouseOver +for line in fh: + if line.startswith("Locus"): + continue + else: + line = line.rstrip().split("\t") + position = line[2] + nucleotideChange = line[3] + chrom = "chrM" + chromStart = int(position)-1 + if nucleotideChange == "": + print("This is an error!") + chromEnd = chromStart + 1 + else: + if " " in nucleotideChange: #3 items with 366 G-GAAAG + print("Look at this entry:") + print(nucleotideChange) + if "or" in nucleotideChange: + chromEnd = chromStart + len(nucleotideChange.split(" or ")[0].split("-")[0]) + else: + chromEnd = chromStart + len(nucleotideChange.split("-")[0].split(" ")[1]) + elif "_" in nucleotideChange: #3 18bp_deletion + chromEnd = chromStart + len(nucleotideChange.split("bp_")[0]) + else: # ACCTTGC-GCAAGGT + chromEnd = chromStart + len(nucleotideChange.split("-")[0]) + name = nucleotideChange + score = "0" + strand = "." + thickStart = chromStart + thickEnd = chromEnd + itemRgb = "1,85,135" + locus = line[0] + disease = line[6] + allele = line[1] + rnaOrAAchange = line[4] + plasmy = line[5] + clinStatus = line[7] + mitoTIP = "" + GBfreq = line[8] + GBseqs = line[9] + curatedRefs = line[10] + mutsCodingOrRNA = "MutationsCodingControl" + mouseOver = "Allele: "+allele+"
Nucl change: "+nucleotideChange+"
Position: "+position+"
Locus: "+locus+"
"+\ + "Disease: "+disease+"
RNA: "+rnaOrAAchange+"
Plasmy (Hom/Het): "+plasmy+"
"+\ + "Status: "+clinStatus+"
"+\ + "GenBank freq FL(CR): "+GBfreq+"
GenBank seqs FL(CR): "+GBseqs+"
# of curated refs: "+curatedRefs + + fh2.write("\t".join(map(str, [chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, \ + position, locus, disease, nucleotideChange, allele, rnaOrAAchange, plasmy, clinStatus, \ + mitoTIP, GBfreq, GBseqs, curatedRefs, mutsCodingOrRNA, mouseOver])) + "\n") +fh2.close() +fh.close() + +bash("bedSort "+mitoMapDiseaseMutsOutputFile+" "+mitoMapDiseaseMutsOutputFile) +bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/mitomap/mitoMapDiseaseMuts.as -type=bed9+14 -tab "+mitoMapDiseaseMutsOutputFile+" /cluster/data/hg38/chrom.sizes "+mitoMapDiseaseMutsOutputFile.split(".")[0]+".bb") +print("Final file created: "+mitoMapDiseaseMutsOutputFile.split(".")[0]+".bb")