e67106d5cf1204021cada70c1ca43a92bd3b346d
chmalee
  Wed Aug 21 16:54:25 2019 -0700
Adding links to dbSnp hgc pages if the position is supported over at MuPIT, refs #24020

diff --git src/hg/makeDb/doc/hg38/mupit.txt src/hg/makeDb/doc/hg38/mupit.txt
new file mode 100644
index 0000000..fd2e161
--- /dev/null
+++ src/hg/makeDb/doc/hg38/mupit.txt
@@ -0,0 +1,75 @@
+# This file describes how to build the knownToMupit table and mupitRanges table.
+# knownToMupit is a link between knownGene IDs and PDB IDs supported by MuPIT, used
+#   by hgGene for links to 3D protein structures.
+# mupitRanges is used on hgc pages for dbSnp to decide whether to show a link
+#   to MuPIT, i.e. whether the SNP position is supported by MuPIT.
+cd /hive/data/outside/
+mkdir mupit
+cd mupit
+
+# mupit-pdbids.txt was emailed from Kyle Moad (kmoad@insilico.us.com)
+
+# wc -l mupit-pdbids.txt
+for db in "hg38" "hg19" "hg18"; do \
+    # get knownGene IDs and associated PDB IDs
+    # the extDb{Ref} parts come from hg/hgGene/domains.c:domainsPrint()
+    hgsql -Ne "select kgID, extAcc1 from $db.kgXref x \
+        inner join sp180404.extDbRef sp on x.spID = sp.acc \
+        inner join sp180404.extDb e on sp.extDb=e.id \
+        where x.spID != '' and e.val='PDB' order by kgID" \
+        > $db.knownToPdb.txt;
+    # filter out pdbIds not found in mupit
+    cat mupit-pdbids.txt | tr '[a-z]' '[A-Z]' | \
+        grep -Fwf - $db.knownToPdb.txt >  $db.knownToMupit.txt;
+    # check that it filtered correctly:
+    # cut -f2 $db.knownToMupit.txt | sort -u | wc -l;
+    # load new table for hgGene/hgc
+    hgLoadSqlTab $db knownToMupit ~/kent/src/hg/lib/knownTo.sql $db.knownToMupit.txt
+done
+
+# mupit.sqlite came from Rick Kim: rkim@insilico.us.com
+# Get all the entries from the database:
+sqlite3 mupit.sqlite ".databases"
+# seq  name             file                                                      
+# ---  ---------------  ----------------------------------------------------------
+# 0    main             /hive/data/outside/mupit/mupit.sqlite                     
+sqlite3 mupit.sqlite ".tables"
+# mupit
+# The -separator argument is a literal tab, inserted by pressing "ctrl-v tab"
+sqlite3 -separator '	' mupit.sqlite "select * from mupit;" | \
+    tawk '{print $1,$2-1,$3}' | sort -k1,1 -k2,2n > hg38.mupitPositions.bed
+
+# mupit_hg19_chrompos.txt also came from Rick Kim
+# this file is only two columns, make it into a range table:
+tawk '{print $1,$2-1,$2}' mupit_hg19_chrompos.txt | sort -k1,1 -k2,2n | \
+    uniq > hg19.mupitPositionsZeroBased.txt
+bedtools merge -i hg19.mupitPositionsZeroBased.txt > hg19.mupitPositions.bed
+
+for db in hg19 hg38; do hgLoadBed -tab $db mupitRanges $db.mupitPositions.bed; done
+# Reading hg19.mupitPositions.bed
+# Read 171024 elements of size 3 from hg19.mupitPositions.bed
+# Sorted
+# Creating table definition for mupitRanges, bedSize: 3
+# Saving bed.tab
+# Loading hg19
+# Reading hg38.mupitPositions.bed
+# ERROR: line 10547:'chr1 248858130       248857859'
+# chromStart after chromEnd (248858130 > 248857859)
+
+# Redo hg38 and throw out positions where chromStart > chromEnd
+sqlite3 -separator '	' mupit.sqlite "select * from mupit;" | tawk '{if ($2 <= $3) print $1, $2-1, $3}' | sort -k1,1 -k2,2n > hg38.mupitPositions.bed
+# now reload is clean:
+for db in hg19 hg38; do hgLoadBed -tab $db mupitRanges $db.mupitPositions.bed; done
+# Reading hg19.mupitPositions.bed
+# Read 171024 elements of size 3 from hg19.mupitPositions.bed
+# Sorted
+# Creating table definition for mupitRanges, bedSize: 3
+# Saving bed.tab
+# Loading hg19
+# Reading hg38.mupitPositions.bed
+# Read 116709 elements of size 3 from hg38.mupitPositions.bed
+# Sorted
+# Creating table definition for mupitRanges, bedSize: 3
+# Saving bed.tab
+# Loading hg38
+