c11dda620b2fd111b46877fa8b05e7250a61b9f8 angie Tue Feb 11 12:50:33 2020 -0800 Instructions for loading up NCBI's ReMap alignments as a chain track. refs #24449 diff --git src/hg/makeDb/doc/hg38/hg38.txt src/hg/makeDb/doc/hg38/hg38.txt index 425d388..88a1e90 100644 --- src/hg/makeDb/doc/hg38/hg38.txt +++ src/hg/makeDb/doc/hg38/hg38.txt @@ -6869,15 +6869,59 @@ ln -s lastzSelf.2020-01-27 lastz.self ln -s lastzSelf.2020-01-27 lastz.hg38 cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain cp /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/axtChain/README.txt . $EDITOR README.txt md5sum hg38.hg38.all.chain.gz > md5sum.txt # Make sure that the old download dir has only symlinks, no real files, then remove and rebuild. ls -lR /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ rm -r /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ ln -s /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/{README.txt,hg38.hg38.all.chain.gz,md5sum.txt} . ######################################################################### +# NCBI ReMap alignments (DONE 2020-02-11 Angie) +# RM 24449 + mkdir /hive/data/genomes/hg38/bed/chainHg19ReMap + cd /hive/data/genomes/hg38/bed/chainHg19ReMap + wget ftp://ftp.ncbi.nlm.nih.gov/pub/remap/Homo_sapiens/current/GCF_000001405.39_GRCh38.p13/GCF_000001405.25_GRCh37.p13/GCF_000001405.39-GCF_000001405.25.gff + # We will need to substitute all the RefSeq chrom and contig IDs with our own names. + # The same alt contig can appear in both assemblies with the same name, so replace + # hg19 names at the beginning of the line and hg38 names after "Target=". + hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \ + | sed -re 's/\./\\./;' \ + | awk '{print "s/^" $1 "\\b/" $2 "/;";}' \ + > hg38.hg19.chromAlias.sed + hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \ + | sed -re 's/\./\\./;' \ + | awk '{print "s/Target=" $1 "\\b/Target=" $2 "/;";}' \ + >> hg38.hg19.chromAlias.sed + + # There are some GRCh38.p13 sequences that we have not yet imported into hg38 -- use -dropT. + sed -f hg38.hg19.chromAlias.sed GCF_000001405.39-GCF_000001405.25.gff \ + | gff3ToPsl -dropT /hive/data/genomes/{hg19,hg38}/chrom.sizes stdin stdout \ + | pslPosTarget stdin stdout \ + | sort -k14,14 -k16n,16n > remap.hg38.hg19.psl + + # Convert to chain for browser display. Some of the remap chains have minScore < 1000 and + # by default would be dropped by chainScore... use -minScore=0 to prevent that. + time pslToChain remap.hg38.hg19.psl stdout \ + | chainScore -minScore=0 stdin /hive/data/genomes/{hg38/hg38.2bit,hg19/hg19.2bit} \ + remap.hg38.hg19.chain +#real 9m31.900s +#user 9m1.624s +#sys 0m20.863s + hgLoadChain hg38 -tIndex chainHg19ReMap remap.hg38.hg19.chain +#Loading 5315 chains into hg38.chainHg19ReMap + time axtChain -psl -linearGap=medium -verbose=0 remap.hg38.hg19.psl \ + /hive/data/genomes/hg38/hg38.2bit /hive/data/genomes/hg19/hg19.2bit \ + remap.axtChain.hg38.hg19.chain +#real 2m26.333s +#user 2m4.237s +#sys 0m22.071s + hgLoadChain hg38 -tIndex chainHg19ReMapAxtChain remap.axtChain.hg38.hg19.chain +#Loading 2115 chains into hg38.chainHg19ReMapAxtChain + + +#########################################################################