c11dda620b2fd111b46877fa8b05e7250a61b9f8 angie Tue Feb 11 12:50:33 2020 -0800 Instructions for loading up NCBI's ReMap alignments as a chain track. refs #24449 diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index bf70be9..f8a601f 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -34174,15 +34174,54 @@ # minDepth: 1.000000 # maxDepth: 2.000000 # std of depth: 0.042340 # symlinks to /gbdb/hg19: ln -s `pwd`/haploInsufficiency.bb \ /gbdb/hg19/doseSensitivity/clinGenHaploInsufficiency.bb ln -s `pwd`/triploSensitivity.bb \ /data/hg19/doseSensitivity/clinGenTriploSensitivity.bb # trackDb composite perhaps in ClinVar CNVs composite ############################################################################## +# NCBI ReMap alignments (DONE 2020-02-11 Angie) +# RM 24449 + mkdir /hive/data/genomes/hg19/bed/chainHg38ReMap + cd /hive/data/genomes/hg19/bed/chainHg38ReMap + wget ftp://ftp.ncbi.nlm.nih.gov/pub/remap/Homo_sapiens/current/GCF_000001405.25_GRCh37.p13/GCF_000001405.39_GRCh38.p13/GCF_000001405.25-GCF_000001405.39.gff + # We will need to substitute all the RefSeq chrom and contig IDs with our own names. + # The same alt contig can appear in both assemblies with the same name, so replace + # hg38 names at the beginning of the line and hg19 names after "Target=". + hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \ + | sed -re 's/\./\\./;' \ + | awk '{print "s/^" $1 "\\b/" $2 "/;";}' \ + > hg19.hg38.chromAlias.sed + hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \ + | sed -re 's/\./\\./;' \ + | awk '{print "s/Target=" $1 "\\b/Target=" $2 "/;";}' \ + >> hg19.hg38.chromAlias.sed + sed -f hg19.hg38.chromAlias.sed GCF_000001405.25-GCF_000001405.39.gff \ + | gff3ToPsl -dropQ /hive/data/genomes/{hg38,hg19}/chrom.sizes stdin stdout \ + | pslPosTarget stdin stdout \ + | sort -k14,14 -k16n,16n > remap.hg19.hg38.psl + # Convert to chain for browser display. Some of the remap chains have minScore < 1000 and + # by default would be dropped by chainScore... use -minScore=0 to prevent that. + time pslToChain remap.hg19.hg38.psl stdout \ + | chainScore -minScore=0 stdin /hive/data/genomes/{hg19/hg19.2bit,hg38/hg38.2bit} \ + remap.hg19.hg38.chain +#real 5m55.241s + hgLoadChain hg19 -tIndex chainHg38ReMap remap.hg19.hg38.chain +#Loading 5315 chains into hg19.chainHg38ReMap + # Chaining the ReMap alignments makes it a lot easier to see when separate alignments + # to the same sequence are in the same order and orientation. + time axtChain -psl -linearGap=medium -verbose=0 remap.hg19.hg38.psl \ + /hive/data/genomes/hg19/hg19.2bit /hive/data/genomes/hg38/hg38.2bit \ + remap.axtChain.hg19.hg38.chain +#real 1m41.773s + hgLoadChain hg19 -tIndex chainHg38ReMapAxtChain remap.axtChain.hg19.hg38.chain +#Loading 2141 chains into hg19.chainHg38ReMapAxtChain + + +##############################################################################