41f199e88fc710d1f86b96b606d15a5b975dda71 markd Tue Apr 19 22:05:14 2022 -0700 fixed chain ids in hgLiftOver diff --git src/hg/makeDb/doc/chm13v2.0userData/build.txt src/hg/makeDb/doc/chm13v2.0userData/build.txt index fe44bcc..57fe414 100644 --- src/hg/makeDb/doc/chm13v2.0userData/build.txt +++ src/hg/makeDb/doc/chm13v2.0userData/build.txt @@ -109,73 +109,83 @@ # obtain sequence fastas http://courtyard.gi.ucsc.edu/~mhauknes/T2T/t2t_Y/annotation_set/CHM13.v2.0.fasta http://courtyard.gi.ucsc.edu/~mhauknes/T2T/t2t_Y/annotation_set/CHM13.v2.0.protein.fasta mv CHM13.v2.0.fasta catLiftOffGenesV1.rna.fa mv CHM13.v2.0.protein.fasta catLiftOffGenesV1.protein.fa pigz *.fa ================================================================ * hgLiftOver (2022-03-26 markd) ---------------------------------------------------------------- GRCh38 & GRCh37 Nae-Chyun Chen # 2022-04-09 it was noted that chrM was left out of above alignments, so obtain them and repeat +# 2022-04-19 it was discovered that chains render oddly due to the lack of chain ids. 
Use chainMergeSort +# to fix this globus: /team-liftover/v1_nflo/with_chrM/ chm13v2-grch38.chain grch38-chm13v2.chain chm13v2-hg19_chrM.chain chm13v2-hg19_chrMT.chain hg19_chrM-chm13v2.chain hg19_chrMT-chm13v2.chain cd trackData/hgLiftOver # rename to match UCSC conventions - mv chm13v2-grch38.chain chm13v2-hg38.over.chain - mv grch38-chm13v2.chain hg38-chm13v2.over.chain - mv chm13v2-hg19_chrM.chain chm13v2-hg19_chrM.over.chain - mv chm13v2-hg19_chrMT.chain chm13v2-hg19_chrMT.over.chain - mv hg19_chrM-chm13v2.chain hg19_chrM-chm13v2.over.chain - mv hg19_chrMT-chm13v2.chain hg19_chrMT-chm13v2.over.chain + mv chm13v2-grch38.chain chm13v2-hg38.over.no-id.chain + mv grch38-chm13v2.chain hg38-chm13v2.over.no-id.chain + mv chm13v2-hg19_chrM.chain chm13v2-hg19_chrM.over.no-id.chain + mv chm13v2-hg19_chrMT.chain chm13v2-hg19_chrMT.over.no-id.chain + mv hg19_chrM-chm13v2.chain hg19_chrM-chm13v2.over.no-id.chain + mv hg19_chrMT-chm13v2.chain hg19_chrMT-chm13v2.over.no-id.chain + +# add chain ids + chainMergeSort chm13v2-hg19_chrM.over.no-id.chain > chm13v2-hg19_chrM.over.chain + chainMergeSort chm13v2-hg19_chrMT.over.no-id.chain > chm13v2-hg19_chrMT.over.chain + chainMergeSort chm13v2-hg19.over.no-id.chain > chm13v2-hg19.over.chain + chainMergeSort chm13v2-hg38.over.no-id.chain > chm13v2-hg38.over.chain + chainMergeSort hg19-chm13v2.over.no-id.chain > hg19-chm13v2.over.chain + chainMergeSort hg19_chrM-chm13v2.over.no-id.chain > hg19_chrM-chm13v2.over.chain + chainMergeSort hg19_chrMT-chm13v2.over.no-id.chain > hg19_chrMT-chm13v2.over.chain + chainMergeSort hg38-chm13v2.over.no-id.chain > hg38-chm13v2.over.chain + # create hg19 chains that combine chrM and chrMT for use in browser. 
- cp chm13v2-hg19_chrM.over.chain chm13v2-hg19.over.chain - chainFilter -q=chrMT chm13v2-hg19_chrMT.over.chain >>chm13v2-hg19.over.chain - cp hg19_chrM-chm13v2.over.chain hg19-chm13v2.over.chain - chainFilter -t=chrMT hg19_chrMT-chm13v2.over.chain >>hg19-chm13v2.over.chain + chainFilter -q=chrMT chm13v2-hg19_chrMT.over.chain | chainMergeSort stdin chm13v2-hg19_chrM.over.chain > chm13v2-hg19.over.chain + chainFilter -t=chrMT hg19_chrMT-chm13v2.over.chain | chainMergeSort stdin hg19_chrM-chm13v2.over.chain > hg19-chm13v2.over.chain pigz *.chain # build tracks hgLoadChain -noBin -test none bigChain chm13v2-hg38.over.chain.gz sed 's/\.000000//' chain.tab | awk 'BEGIN {OFS="\t"} {print $2, $4, $5, $11, 1000, $8, $3, $6, $7, $9, $10, $1}' > bigChainIn.tab bedToBigBed -type=bed6+6 -as=${HOME}/kent/src/hg/lib/bigChain.as -tab bigChainIn.tab ../chromAlias/ucsc.sizes.txt chm13v2-hg38.over.chain.bb tawk '{print $1, $2, $3, $5, $4}' link.tab | csort -k1,1 -k2,2n --parallel=64 > bigLinkIn.tab bedToBigBed -type=bed4+1 -as=${HOME}/kent/src/hg/lib/bigLink.as -tab bigLinkIn.tab ../chromAlias/ucsc.sizes.txt chm13v2-hg38.over.link.bb hgLoadChain -noBin -test none bigChain chm13v2-hg19.over.chain.gz sed 's/\.000000//' chain.tab | awk 'BEGIN {OFS="\t"} {print $2, $4, $5, $11, 1000, $8, $3, $6, $7, $9, $10, $1}' > bigChainIn.tab bedToBigBed -type=bed6+6 -as=${HOME}/kent/src/hg/lib/bigChain.as -tab bigChainIn.tab ../chromAlias/ucsc.sizes.txt chm13v2-hg19.over.chain.bb tawk '{print $1, $2, $3, $5, $4}' link.tab | csort -k1,1 -k2,2n --parallel=64 > bigLinkIn.tab bedToBigBed -type=bed4+1 -as=${HOME}/kent/src/hg/lib/bigLink.as -tab bigLinkIn.tab ../chromAlias/ucsc.sizes.txt chm13v2-hg19.over.link.bb rm *.tab - pigz *.chain # make available in liftOver directory as well ln -f *.chain.gz ../../liftOver/ # GRCh38 mask used in liftover. 
This is based on: # https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/references/GRCh38/GCA_000001405.15_GRCh38_GRC_exclusions_T2Tv2.bed # plus UCSC hg38 centromeres track GRCh38: /team-liftover/grch38_masked_fasta/grch38-centromere_and_falsedup.bed (edited) rename to hg38.liftover-mask.bed ln -f hg38.liftover-mask.bed ../../liftOver/ ================================================================ * hgCactus (2022-03-28 markd) ----------------------------------------------------------------