c11dda620b2fd111b46877fa8b05e7250a61b9f8
angie
  Tue Feb 11 12:50:33 2020 -0800
Instructions for loading up NCBI's ReMap alignments as a chain track.  refs #24449

diff --git src/hg/makeDb/doc/hg38/hg38.txt src/hg/makeDb/doc/hg38/hg38.txt
index 425d388..88a1e90 100644
--- src/hg/makeDb/doc/hg38/hg38.txt
+++ src/hg/makeDb/doc/hg38/hg38.txt
@@ -6869,15 +6869,59 @@
     ln -s lastzSelf.2020-01-27 lastz.self
     ln -s lastzSelf.2020-01-27 lastz.hg38
     cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain
     cp /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/axtChain/README.txt .
     $EDITOR README.txt
     md5sum hg38.hg38.all.chain.gz > md5sum.txt
     # Make sure that the old download dir has only symlinks, no real files, then remove and rebuild.
     ls -lR /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/
     rm -r /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/
     cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/
     ln -s /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/{README.txt,hg38.hg38.all.chain.gz,md5sum.txt} .
 
 
 #########################################################################
+# NCBI ReMap alignments (DONE 2020-02-11 Angie)
+# RM 24449
+    mkdir /hive/data/genomes/hg38/bed/chainHg19ReMap
+    cd /hive/data/genomes/hg38/bed/chainHg19ReMap
+    wget ftp://ftp.ncbi.nlm.nih.gov/pub/remap/Homo_sapiens/current/GCF_000001405.39_GRCh38.p13/GCF_000001405.25_GRCh37.p13/GCF_000001405.39-GCF_000001405.25.gff
+    # We will need to substitute all the RefSeq chrom and contig IDs with our own names.
+    # The same alt contig can appear in both assemblies with the same name, so replace
+    # hg19 names at the beginning of the line and hg38 names after "Target=".
+    hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \
+    | sed -re 's/\./\\./;' \
+    | awk '{print "s/^" $1 "\\b/" $2 "/;";}' \
+      > hg38.hg19.chromAlias.sed
+    hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \
+    | sed -re 's/\./\\./;' \
+    | awk '{print "s/Target=" $1 "\\b/Target=" $2 "/;";}' \
+      >> hg38.hg19.chromAlias.sed
+
+    # There are some GRCh38.p13 sequences that we have not yet imported into hg38 -- use -dropT.
+    sed -f hg38.hg19.chromAlias.sed GCF_000001405.39-GCF_000001405.25.gff \
+    | gff3ToPsl -dropT /hive/data/genomes/{hg19,hg38}/chrom.sizes stdin stdout \
+    | pslPosTarget stdin stdout \
+    | sort -k14,14 -k16n,16n > remap.hg38.hg19.psl
+
+    # Convert to chain for browser display.  Some of the remap chains have minScore < 1000 and
+    # by default would be dropped by chainScore... use -minScore=0 to prevent that.
+    time pslToChain remap.hg38.hg19.psl stdout \
+    | chainScore -minScore=0 stdin /hive/data/genomes/{hg38/hg38.2bit,hg19/hg19.2bit} \
+        remap.hg38.hg19.chain
+#real    9m31.900s
+#user    9m1.624s
+#sys     0m20.863s
+    hgLoadChain hg38 -tIndex chainHg19ReMap remap.hg38.hg19.chain
+#Loading 5315 chains into hg38.chainHg19ReMap
+    time axtChain -psl -linearGap=medium -verbose=0 remap.hg38.hg19.psl \
+      /hive/data/genomes/hg38/hg38.2bit /hive/data/genomes/hg19/hg19.2bit \
+      remap.axtChain.hg38.hg19.chain
+#real    2m26.333s
+#user    2m4.237s
+#sys     0m22.071s
+    hgLoadChain hg38 -tIndex chainHg19ReMapAxtChain remap.axtChain.hg38.hg19.chain
+#Loading 2115 chains into hg38.chainHg19ReMapAxtChain
+
+
+#########################################################################