704efd7a1be11b0d69aff4a9101f87ed5b62734d angie Thu Jan 23 10:10:10 2020 -0800 Add NC_012920 (rCRS) mitochondrion to hg19 as chrMT. refs #24648 diff --git src/hg/makeDb/doc/hg19.patchUpdate.13.txt src/hg/makeDb/doc/hg19.patchUpdate.13.txt index 53684ad..b5fd706 100644 --- src/hg/makeDb/doc/hg19.patchUpdate.13.txt +++ src/hg/makeDb/doc/hg19.patchUpdate.13.txt @@ -1,729 +1,740 @@ # for emacs: -*- mode: sh; -*- # This file describes how hg19 was extended with patch sequences and annotations from grcH37P13 ############################################################################## # Extend main database 2bit, chrom.sizes, chromInfo (DONE - 2018-09-28 - Angie) # first done 2018-09-17 # UNDONE 2018-09-28 (see ROLL IT ALL BACK below) # REDONE 2018-09-28 cd /hive/data/genomes/hg19 # main 2bit time faToTwoBit <(twoBitToFa hg19.2bit stdout) \ <(twoBitToFa /hive/data/genomes/grcH37P13/grcH37P13.2bit stdout) \ hg19.p13.2bit #real 0m43.817s # unmasked 2bit time twoBitMask -type=.bed hg19.p13.2bit /dev/null hg19.p13.unmasked.2bit #real 0m1.803s # chrom.sizes sort -k2nr,2nr chrom.sizes /hive/data/genomes/grcH37P13/chrom.sizes > chrom.sizes.p13 # chromInfo cd /hive/data/genomes/hg19/bed/chromInfo awk '{print $1 "\t" $2 "\t/gbdb/hg19/hg19.2bit";}' ../../chrom.sizes.p13 \ > chromInfo.p13.tab wc -l chromInfo*.tab # 297 chromInfo.p13.tab # 93 chromInfo.tab # Install cd /hive/data/genomes/hg19 # For the first update only, move initial release files to .initial. Don't do this next update! mv hg19.2bit hg19.initial.2bit mv hg19.unmasked.2bit hg19.initial.unmasked.2bit mv chrom.sizes chrom.sizes.initial # End of first-update-only stuff ln -sf hg19.p13.2bit hg19.2bit ln -sf hg19.p13.unmasked.2bit hg19.unmasked.2bit ln -sf chrom.sizes.p13 chrom.sizes cd /hive/data/genomes/hg19/bed/chromInfo hgLoadSqlTab hg19 chromInfo $HOME/kent/src/hg/lib/chromInfo.sql chromInfo.p13.tab ############################################################################## # Extend main database tables for fileless tracks (DONE - 2018-09-28 - Angie) # first done 2018-09-17 # UNDONE 2018-09-28 (see ROLL IT ALL BACK below) # REDONE 2018-09-28 # Just add the patch table rows to the main database tables for table in gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene; do echo $table hgsql hg19 -e "insert into hg19.$table select * from grcH37P13.$table" done ############################################################################## # Extend main database gc5BaseBw.bw (DONE - 2018-09-28 - Angie) # first done 2018-09-17 # UNDONE 2018-09-28 (see ROLL IT ALL BACK belaow) # REDONE (except for first-update-only mv, using initial files) 2018-09-28 cd /hive/data/genomes/hg19/bed/gc5Base/ # Concatenate original assembly results with grcH37P13 results time (zcat hg19.gc5Base.txt.gz \ /hive/data/genomes/grcH37P13/bed/gc5Base/grcH37P13.gc5Base.wigVarStep.gz \ | gzip -c \ > hg19.p13.gc5Base.wigVarStep.gz) #real 5m48.392s # Make a new gc5BaseBw.bw time wigToBigWig hg19.p13.gc5Base.wigVarStep.gz ../../chrom.sizes.p13 \ hg19.p13.gc5Base.bw #real 10m57.806s # Install cd /hive/data/genomes/hg19/bed/gc5Base/ # For the first update only, move initial release files to .initial. Don't do this next update! mv hg19.gc5Base.txt.gz hg19.initial.gc5Base.wigVarStep.gz mv gc5Base.bw hg19.initial.gc5Base.bw # End of first-update-only stuff ln -sf hg19.p13.gc5Base.wigVarStep.gz hg19.gc5Base.wigVarStep.gz ln -sf hg19.p13.gc5Base.bw hg19.gc5Base.bw ln -sf /hive/data/genomes/hg19/bed/gc5Base/hg19.gc5Base.wigVarStep.gz \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/gc5Base/hg19.gc5Base.txt.gz ############################################################################## # Extend main database download files (DONE - 2019-01-12 - Angie) # first done 2018-09-18 # UNDONE 2018-09-28 (see ROLL IT ALL BACK below) # REDONE (except for first-update-only mv) 2018-09-? cd /hive/data/genomes/hg19/goldenPath/bigZips mkdir p13 # hg19.2bit and chrom.sizes were already extended above. ln -sf /hive/data/genomes/hg19/hg19.p13.2bit p13/ ln -sf /hive/data/genomes/hg19/chrom.sizes.p13 p13/hg19.p13.chrom.sizes # AGP: zcat hg19.agp.gz \ /hive/data/genomes/grcH37P13/goldenPath/bigZips/grcH37P13.agp.gz \ | grep -v ^# \ | gzip -c > p13/hg19.p13.agp.gz # FASTA twoBitToFa ../../hg19.p13.2bit stdout \ | gzip -c > p13/hg19.p13.fa.gz twoBitToFa ../../hg19.p13.2bit stdout \ | maskOutFa stdin hard stdout \ | gzip -c > p13/hg19.p13.fa.masked.gz # RepeatMasker (don't include header of patch file): cat <(zcat hg19.fa.out.gz) \ <(zcat /hive/data/genomes/grcH37P13/goldenPath/bigZips/grcH37P13.fa.out.gz | tail -n +4) \ | gzip -c > p13/hg19.p13.fa.out.gz # SimpleRepeats/TRF: zcat hg19.trf.bed.gz \ /hive/data/genomes/grcH37P13/goldenPath/bigZips/grcH37P13.trf.bed.gz \ | gzip -c > p13/hg19.p13.trf.bed.gz # We don't expect a complete set of chroms to have simpleRepeats, but at least an increase: zcat hg19.trf.bed.gz | cut -f 1 | uniq | wc -l #88 zcat p13/hg19.p13.trf.bed.gz | cut -f 1 | uniq | wc -l #292 # hg19 also has download files with the old tar-bundle structure -- update those too. # Per-chrom AGP: rm -rf agp && mkdir agp && cd agp tar xvzf ../chromAgp.tar.gz splitFileByColumn -chromDirs -ending=.agp \ /hive/data/genomes/grcH37P13/goldenPath/bigZips/grcH37P13.agp.gz . tar cvzf ../p13/hg19.p13.chromAgp.tar.gz * cd .. rm -rf agp # Per-chrom soft-masked FASTA: rm -rf chroms && mkdir chroms && cd chroms tar xvzf ../chromFa.tar.gz cd .. faSplit byname /hive/data/genomes/grcH37P13/goldenPath/bigZips/grcH37P13.fa.gz chroms/ ls -1 chroms | wc -l #297 tar cvzf p13/hg19.p13.chromFa.tar.gz ./chroms rm -rf chroms # Per-chrom hard-masked FASTA: rm -rf maskedChroms && mkdir maskedChroms && cd maskedChroms tar xvzf ../chromFaMasked.tar.gz cd .. faSplit byname /hive/data/genomes/grcH37P13/goldenPath/bigZips/grcH37P13.fa.masked.gz \ maskedChroms/ +# NOTE FOR NEXT TIME: fix up names, .fa --> .fa.masked like this: + cd maskedChroms + for f in *.fa; do + mv $f $f.masked + done + cd .. +# END NEXT TIME + ls -1 maskedChroms | wc -l #297 tar cvzf p13/hg19.p13.chromFaMasked.tar.gz ./maskedChroms rm -rf maskedChroms # Per-chrom RepeatMasker .out: rm -rf out && mkdir out && cd out tar xvzf ../chromOut.tar.gz zcat /hive/data/genomes/grcH37P13/goldenPath/bigZips/grcH37P13.fa.out.gz \ | head -3 > RepeatMaskerHeader.txt zcat /hive/data/genomes/grcH37P13/goldenPath/bigZips/grcH37P13.fa.out.gz \ | tail -n +4 \ | splitFileByColumn -col=5 -chromDirs -head=RepeatMaskerHeader.txt -ending=.out \ stdin . +# NOTE FOR NEXT TIME: clean up this file: + rm RepeatMaskerHeader.txt +# END NEXT TIME tar cvzf ../p13/hg19.p13.chromOut.tar.gz * cd .. rm -r out # Per-chrom TRF output: rm -rf trfMaskChrom tar xvzf chromTrf.tar.gz cd trfMaskChrom splitFileByColumn -ending=.bed \ /hive/data/genomes/grcH37P13/goldenPath/bigZips/grcH37P13.trf.bed.gz . cd .. tar cvzf p13/hg19.p13.chromTrf.tar.gz ./trfMaskChrom rm -rf trfMaskChrom # RepeatMasker .align files: zcat hg19.fa.align.gz /hive/data/genomes/grcH37P13/bed/repeatMasker/grcH37P13.fa.align.gz \ | gzip -c > p13/hg19.p13.fa.align.gz # TODO: regenerate upstream* files for p13 # Make new md5sum.txt cd p13 md5sum hg19.* > md5sum.txt # Install cd /hive/data/genomes/hg19/goldenPath/bigZips # For the first update only, move initial release files to initial/. Don't do this next update! mkdir initial mv chrom* hg19.* up* md5sum.txt initial/ ln -sf /hive/data/genomes/hg19/hg19.initial.2bit initial/hg19.2bit ln -sf /hive/data/genomes/hg19/chrom.sizes.initial initial/hg19.chrom.sizes ln -sf /hive/data/genomes/hg19/bed/repeatMasker/hg19.fa.align.gz initial/hg19.fa.align.gz # Replace top-level files with links to initial/ files ln -sf initial/* . # End of first-update-only stuff ln -s /hive/data/genomes/hg19/goldenPath/bigZips/{initial,p13} \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/bigZips/ #TODO # Edit README.txt # TODO: /hive/data/genomes/hg19/goldenPath/chromosomes/ ############################################################################# # Build perSeqMax file for gfServer (hgBlat) (DONE 19-08-14 angie) # When the blat server is restarted with the updated hg19.2bit file, # hg19.altsAndFixes needs to be copied over along with the new hg19.2bit file, # and gfServer needs to be restarted with -perSeqMax=hg19.altsAndFixes. cd /hive/data/genomes/hg19 cut -f 1 chrom.sizes.p13 \ | grep -E '_(alt|fix|hap.*)$' \ | sed -re 's/^/hg19.2bit:/;' \ > hg19.altsAndFixes.p13 ############################################################################# # Extend cytoBandIdeo (DONE 2018-09-28 angie) # UNDONE 2018-09-28 (see ROLL IT ALL BACK below) # REDONE 2018-09-28 cd /hive/data/genomes/hg19/bed/cytoBand tawk '{print $1, 0, $2, "", "gneg";}' /hive/data/genomes/grcH37P13/chrom.sizes \ > cytoBand.p13.tab hgLoadSqlTab -oldTable hg19 cytoBandIdeo - cytoBand.p13.tab ######################################################################### # Regenerate idKeys with extended hg19 (DONE - 2018-09-28 - Angie) # UNDONE 2018-09-28 (see ROLL IT ALL BACK below) # REDONE (except for first-update-only mv) 2018-09-28 mkdir /hive/data/genomes/hg19/bed/idKeys.p13 cd /hive/data/genomes/hg19/bed/idKeys.p13 time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl \ -twoBit=/hive/data/genomes/hg19/hg19.p13.unmasked.2bit \ -bigClusterHub=ku -smallClusterHub=ku \ -buildDir=`pwd` hg19) > do.log 2>&1 & tail -f do.log #real 1m7.891s cat hg19.keySignature.txt #e7d0d6d259fd7e0898b183cd73b3500b # Install # For the first update only, move initial release files to .initial. Don't do this next update! mv /hive/data/genomes/hg19/bed/idKeys{,.initial} cd /hive/data/genomes/hg19/bed/ rm -f idKeys ln -s idKeys.p13 idKeys ############################################################################# # ROLL IT ALL BACK (DONE - 2018-09-28 - Angie) # I went through a couple rounds of confusion about how to reconcile hg19's alt naming # convention (e.g. "chr6_ssto_hap7", "chr4_ctg9_hap1") with 204 new sequences with much # more varied and arbitrary-looking Assembly sequence names. In the end I decided to # use accession_{alt,fix}-style names -- that is more consistent with the hg38 naming, # and also with Hiram's hg19Patch13 naming. # So halfway through the process, I restored hg19 to its initital state (at least, linking # to *.initial* files that had been created), rebuilt grcH37P13 with different seq naming, # then updated hg19 with p13 again. # This will go more or less in reverse order of all the previous steps. # idKeys cd /hive/data/genomes/hg19/bed/ rm idKeys ln -sf idKeys.initial idKeys # cytoBandIdeo hgsql hg19 -e 'delete from cytoBandIdeo where chrom like "%alt" or chrom like "%fix"' # downloads cd /hive/data/genomes/hg19/goldenPath/bigZips mv md5sum.txt{,.bak} ln -sf initial/* . # gc5Base ln -sf /hive/data/genomes/hg19/bed/gc5Base/hg19.initial.gc5Base.wigVarStep.gz \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/gc5Base/hg19.gc5Base.txt.gz cd /hive/data/genomes/hg19/bed/gc5Base/ ln -sf hg19.initial.gc5Base.wigVarStep.gz hg19.gc5Base.wigVarStep.gz ln -sf hg19.initial.gc5Base.bw hg19.gc5Base.bw # database tables for fileless tracks for table in gap gold simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene; do echo $table hgsql hg19 -e "delete from hg19.$table where chrom like '%alt' or chrom like '%fix'" done hgsql hg19 -e "delete from hg19.rmsk where genoName like '%alt' or genoName like '%fix'" # 2bit, chrom.sizes, chromInfo cd /hive/data/genomes/hg19 ln -sf hg19.initial.2bit hg19.2bit ln -sf hg19.initial.unmasked.2bit hg19.unmasked.2bit ln -sf chrom.sizes.initial chrom.sizes hgsql hg19 -e 'delete from chromInfo where chrom like "%alt" or chrom like "%fix"' ############################################################################## # UCSC to RefSeq, INSDC, Assembly; chromAlias (DONE 2018-10-01 angie) # need to have idKeys for the genbank and refseq assemblies: mkdir -p /hive/data/genomes/hg19/bed/ucscToINSDC/genbankP13 cd /hive/data/genomes/hg19/bed/ucscToINSDC/genbankP13 ln -s /hive/data/genomes/grcH37P13/genbank/GCA_000001405.14_GRCh37.p13_genomic.fna.gz . faToTwoBit GCA_000001405.14_GRCh37.p13_genomic.fna.gz genbankP13.2bit time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=genbankP13.2bit \ -bigClusterHub=ku -smallClusterHub=ku \ genbankP13) > do.log 2>&1 #real 1m4.236s mkdir /hive/data/genomes/hg19/bed/ucscToINSDC/refseqP13 cd /hive/data/genomes/hg19/bed/ucscToINSDC/refseqP13 ln -s /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz ./ faToTwoBit GCF_000001405.25_GRCh37.p13_genomic.fna.gz refseqP13.2bit time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=refseqP13.2bit \ -bigClusterHub=ku -smallClusterHub=ku \ refseqP13) > do.log 2>&1 #real 1m7.528s # with the three idKeys available, join them to make the table bed files: cd /hive/data/genomes/hg19/bed/ucscToINSDC sed -re 's/gi\|[0-9]+\|gb\|([A-Z0-9.]+)\|/\1/' genbankP13/genbankP13.idKeys.txt \ | join -t$'\t' ../idKeys/hg19.idKeys.txt - \ | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToINSDC.p13.bed # Add our "oops" pre-hg38 mitochondrion sequence: echo -e "chrM\t0\t16571\tNC_001807.4" >> ucscToINSDC.p13.bed join -t$'\t' ../idKeys/hg19.idKeys.txt refseqP13/refseqP13.idKeys.txt \ | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToRefSeq.p13.bed # loading tables: export db=hg19 export chrSize=`cut -f1 ucscToINSDC.p13.bed | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab ${db} ucscToINSDC stdin ucscToINSDC.p13.bed export chrSize=`cut -f1 ucscToRefSeq.p13.bed | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' \ | hgLoadSqlTab ${db} ucscToRefSeq stdin ucscToRefSeq.p13.bed # must be exactly 100% coverage featureBits -countGaps ${db} ucscToINSDC #3234834691 bases of 3234834691 (100.000%) in intersection # except for chrM (no refSeq): featureBits -countGaps ${db} ucscToRefSeq #3234818120 bases of 3234834691 (99.999%) in intersection expr 3234834691 - 3234818120 #16571 # construct chromAlias: cd /hive/data/genomes/hg19/bed/chromAlias hgsql -N -e 'select chrom,name from ucscToRefSeq;' ${db} \ | sort -k1,1 > ucsc.refseq.p13.tab hgsql -N -e 'select chrom,name from ucscToINSDC;' ${db} \ | sort -k1,1 > ucsc.genbank.p13.tab # add NCBI sequence names from assembly report grep -v ^# \ /hive/data/genomes/grcH37P13/genbank/GCA_000001405.14_GRCh37.p13_assembly_report.txt \ | tawk '{print $5, $1;}' | sort \ > genbankToAssembly.txt tawk '{print $2, $1;}' ucsc.genbank.p13.tab | sort \ | join -t$'\t' -o 1.2,2.2 - genbankToAssembly.txt \ | sort -k1,1 > ucsc.assembly.p13.tab ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.p13.tab \ | sed -re 's/\.p13//;' \ > ${db}.chromAlias.p13.tab # verify all there: for t in refseq genbank assembly do c0=`cat ucsc.$t.p13.tab | wc -l` c1=`grep $t hg19.chromAlias.p13.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 296 =? 296 OK # checking genbank: 297 =? 297 OK # checking assembly: 296 =? 296 OK hgLoadSqlTab hg19 chromAlias $HOME/kent/src/hg/lib/chromAlias.sql ${db}.chromAlias.p13.tab ############################################################################## # UCSC to Ensembl (TODO 18-? angie) # doc?? ############################################################################ # altLocations and fixLocations (DONE - 2018-10-01 - Angie) # indicate corresponding locations between haplotypes and reference; take care with the # 9 initial assembly chr*_hap* alt names. mkdir /hive/data/genomes/hg19/bed/altLocations.p13 cd /hive/data/genomes/hg19/bed/altLocations.p13 ~/kent/src/hg/utils/automation/altScaffoldPlacementToBed.pl \ -db hg19 -assemblyReport /hive/data/genomes/grcH37P13/genbank/GC*_assembly_report.txt \ /hive/data/genomes/grcH37P13/genbank/{ALT_*,PATCHES}/alt_scaffolds/alt_scaffold_placement.txt \ | sed -e 's/chr6_gl000250_alt/chr6_apd_hap1/; s/chr6_gl000251_alt/chr6_cox_hap2/; s/chr6_gl000252_alt/chr6_dbb_hap3/; s/chr6_gl000253_alt/chr6_mann_hap4/; s/chr6_gl000254_alt/chr6_mcf_hap5/; s/chr6_gl000255_alt/chr6_qbl_hap6/; s/chr6_gl000256_alt/chr6_ssto_hap7/; s/chr4_gl000257_alt/chr4_ctg9_hap1/; s/chr17_gl000258_alt/chr17_ctg5_hap1/;' \ | sort -k1,1 -k2n,2n \ > altAndFixLocations.bed wc -l altAndFixLocations.bed #426 altAndFixLocations.bed grep _alt altAndFixLocations.bed > altLocations.bed grep _hap altAndFixLocations.bed >> altLocations.bed grep _fix altAndFixLocations.bed > fixLocations.bed hgLoadBed hg19 altLocations{,.bed} #Read 164 elements of size 4 from altLocations.bed hgLoadBed hg19 fixLocations{,.bed} #Read 262 elements of size 4 from fixLocations.bed featureBits -countGaps hg19 altLocations #76740788 bases of 3234834691 (2.372%) in intersection featureBits -countGaps hg19 fixLocations #154171650 bases of 3234834691 (4.766%) in intersection ############################################################################# # Check for new chrX alts/patches to add to par (DONE 2018-10-01 angie) # Thanks to Hiram for pointing out that intersecting chrX positions in # altLocations/fixLocations and par shows whether a chrX alt overlaps a PAR. cd /hive/data/genomes/hg19/bed/par hgsql hg19 -e 'select l.chrom, l.chromStart, l.chromEnd, l.name from altLocations l, par \ where par.chrom = l.chrom and l.chromEnd > par.chromStart and l.chromStart < par.chromEnd;' # No output hgsql hg19 -e 'select l.chrom, l.chromStart, l.chromEnd, l.name from fixLocations l, par \ where par.chrom = l.chrom and l.chromEnd > par.chromStart and l.chromStart < par.chromEnd;' #+-------+------------+----------+-------------------+ #| chrom | chromStart | chromEnd | name | #+-------+------------+----------+-------------------+ #| chrX | 803877 | 1227822 | chrX_gl877877_fix | #+-------+------------+----------+-------------------+ # Here's what's already in par: hgsql hg19 -e 'select * from par where chrom like "chrX%"' #+-------+------------+-----------+------+ #| chrom | chromStart | chromEnd | name | #+-------+------------+-----------+------+ #| chrX | 60000 | 2699520 | PAR1 | #| chrX | 154931043 | 155260560 | PAR2 | #+-------+------------+-----------+------+ # So we need to add chrX_gl877877_fix! It is completely contained in PAR1. grep chrX_gl877877_fix ../../chrom.sizes #chrX_gl877877_fix 284527 echo -e "chrX_gl877877_fix\t0\t284527\tPAR1" > hg19.p13.par.bed hgLoadBed -oldTable -noBin hg19 par hg19.p13.par.bed ############################################################################## # altSeqLiftOver (DONE 19-01-08 Angie) # originally done 2018-10-01; redone 2018-11-06 w/fixed gff3ToPsl to get correct - strand alignments # mainToPatch over.chain regenerated 2018-12-03 w/fixed pslToChain # altSeqLiftOver.psl fixed to include _hap and mainToAllAltPatch.over.chain added 19-01-08) mkdir /hive/data/genomes/hg19/bed/altSeqLiftOver.p13 cd /hive/data/genomes/hg19/bed/altSeqLiftOver.p13 # Use chromAlias to make a .sed file to substitute Genbank accessions to UCSC names hgsql hg19 -NBe 'select alias,chrom from chromAlias where find_in_set("genbank", source);' \ | awk '{print "s@" $1 "@" $2 "@;";}' > gbToUcsc.sed cp /dev/null altToChrom.noScore.psl for f in /hive/data/genomes/grcH37P13/genbank/ALT*/alt_scaffolds/alignments/*.gff \ /hive/data/genomes/grcH37P13/genbank/PATCHES/alt_scaffolds/alignments/*.gff; do e=$(basename $f .gff | sed -e 's/_/|/g;') s=$(grep -E $e gbToUcsc.sed) sed -re "$s" $f | gff3ToPsl ../../chrom.sizes{,} stdin stdout \ | pslPosTarget stdin stdout \ >> altToChrom.noScore.psl done pslCheck altToChrom.noScore.psl #checked: 294 failed: 0 errors: 0 time pslRecalcMatch altToChrom.noScore.psl ../../hg19.2bit{,} altToChrom.psl #real 0m28.942s pslSwap altToChrom.psl stdout | pslPosTarget stdin chromToAlt.psl sort -k14,14 -k16n,16n -k10,10 -k12n,12n altToChrom.psl chromToAlt.psl \ > altAndPatches.psl grep _alt altAndPatches.psl > altSeqLiftOver.psl grep _hap altAndPatches.psl >> altSeqLiftOver.psl grep _fix altAndPatches.psl > fixSeqLiftOver.psl # Load tables hgLoadPsl hg19 -table=altSeqLiftOverPsl altSeqLiftOver.psl hgLoadPsl hg19 -table=fixSeqLiftOverPsl fixSeqLiftOver.psl # Make chrom-to-alt PSL file for genbank process. ln -f -s `pwd`/chromToAlt.psl \ /hive/data/genomes/hg19/jkStuff/hg19.p13.alt.psl # Make a liftOver chain file for mapping annotations on main chroms to new patch sequences # Exclude alts that were already in hg19 before p13. # Redone 12/3/18 after Braney fixed pslToChain cut -f 1 ../../chrom.sizes.initial | grep _ \ | grep -vwf - chromToAlt.psl \ | pslToChain stdin stdout \ | chainScore stdin ../../hg19.2bit{,} ../../jkStuff/hg19.mainToPatch.p13.over.chain grep chain ../../jkStuff/hg19.mainToPatch.p13.over.chain | wc -l #281 # 1/8/19 also make a liftOver that includes the original alts, for tracks that have # annotations only on main chromosomes. Exclude alt-to-fix alignments. # This is necessary only for the first time we add a patch update. awk '($14 !~ /_/)' chromToAlt.psl \ | pslToChain stdin stdout \ | chainScore stdin ../../hg19.2bit{,} ../../jkStuff/hg19.mainToAllAltPatch.p13.over.chain ######################################################################### # ncbiRefSeq.p13 Genes (DONE - 2018-10-01 - Angie) mkdir /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2018-10-01 cd /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2018-10-01 time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.25_GRCh37.p13 hg19) > do.log 2>&1 & tail -f do.log # *** All done ! Elapsed time: 5m14s #real 5m13.784s cat fb.ncbiRefSeq.hg19.txt #85414465 bases of 2991694177 (2.855%) in intersection # Coverage is a lot lower than hg38.ncbiRefSeq because there are no predicted transcripts, # only curated. ############################################################################## # GRC Incident Database (TODO - 2018-? - Angie) # Wait until the updated hg19 files have been pushed to RR because GRC Incident update is # automated. Then update the file used to map GRC's RefSeq accessions to our names: hgsql hg19 -NBe 'select alias,chrom from chromAlias where source = "refseq" order by alias;' \ > /hive/data/outside/grc/incidentDb/GRCh37/refSeq.chromNames.tab ############################################################################## # Extend wgEncodeReg bigWig tracks (DONE 19-01-09 angie) #NOTE: these have not been liftOver'd to original alts, so use mainToAllAltPatch.over.chain # The original files don't live in hg19/bed/ -- they're in the ENCODE DCC pipeline dir. # Make a new bed/ dir for the extended files. mkdir /hive/data/genomes/hg19/bed/wgEncodeReg.p13 cd /hive/data/genomes/hg19/bed/wgEncodeReg.p13 for dir in /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeReg{Mark*,Txn}; do composite=$(basename $dir) echo $composite mkdir -p $composite for f in $dir/wg*.bigWig; do track=$(basename $f .bigWig) ~/kent/src/hg/utils/liftOverBigWigToPatches $f \ /hive/data/genomes/hg19/jkStuff/hg19.mainToAllAltPatch.p13.over.chain \ /hive/data/genomes/hg19/chrom.sizes \ $composite/$track.plusP13.bigWig & done wait done # Install: don't update hgdownload link, only /gbdb/ link. All flat in hg19/bbi/, no subdirs. # RegMark* links are .bigWig, Txn links are .bw (but .bigWig in the downloads dir above). for dir in wgEncodeRegMark*; do composite=$(basename $dir) echo $composite for f in $composite/wg*.plusP13.bigWig; do track=$(basename $f .plusP13.bigWig) ln -sf `pwd`/$f /gbdb/hg19/bbi/$track.bigWig done done composite=wgEncodeRegTxn for f in $composite/wg*.plusP13.bigWig; do track=$(basename $f .plusP13.bigWig) ln -sf `pwd`/$f /gbdb/hg19/bbi/$track.bw done ############################################################################## # Extend wgEncodeRegDnaseClusteredV3 (DONE 19-01-09 angie) #NOTE: this has not been liftOver'd to original alts, so use mainToAllAltPatch.over.chain cd /hive/data/genomes/hg19/bed/wgEncodeReg.p13 origFile=/hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegDnaseClustered/wgEncodeRegDnaseClusteredV3.bed.gz track=$(basename $origFile .bed.gz) liftOver -multiple -bedPlus=5 -noSerial $origFile \ /hive/data/genomes/hg19/jkStuff/hg19.mainToAllAltPatch.p13.over.chain \ $track.p13.bed /dev/null sort -k1,1 -k2n,2n <(zcat $origFile) $track.p13.bed \ > $track.plusP13.bed hgLoadBed -type=bed5+ -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql -renameSqlTable \ hg19 $track $track.plusP13.bed gzip $track.plusP13.bed ############################################################################## # Extend wgEncodeRegTfbsClusteredV{2,3} (DONE 19-01-10 angie) #NOTE: this has not been liftOver'd to original alts, so use mainToAllAltPatch.over.chain cd /hive/data/genomes/hg19/bed/wgEncodeReg.p13 # wgEncodeRegTfbsClusteredV3 is bed5+ origFile=/hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegTfbsClustered/wgEncodeRegTfbsClusteredV3.bed.gz track=$(basename $origFile .bed.gz) liftOver -multiple -bedPlus=5 -noSerial $origFile \ /hive/data/genomes/hg19/jkStuff/hg19.mainToAllAltPatch.p13.over.chain \ $track.p13.bed /dev/null sort -k1,1 -k2n,2n <(zcat $origFile) $track.p13.bed \ | gzip -c \ > $track.plusP13.bed.gz hgLoadBed -type=bed5+ -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql -renameSqlTable \ hg19 $track $track.plusP13.bed.gz # wgEncodeRegTfbsClusteredV2 is bed15 origFile=/hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegTfbsClustered/wgEncodeRegTfbsClusteredV2.bed.gz track=$(basename $origFile .bed.gz) liftOver -multiple -bedPlus=15 -noSerial $origFile \ /hive/data/genomes/hg19/jkStuff/hg19.mainToAllAltPatch.p13.over.chain \ $track.p13.bed /dev/null sort -k1,1 -k2n,2n <(zcat $origFile) $track.p13.bed \ | gzip -c \ > $track.plusP13.bed.gz hgLoadBed hg19 $track $track.plusP13.bed.gz checkTableCoords hg19 $track # No output, good. rm bed.tab gzip *.bed ############################################################################## # Extend GTEX GENE (DONE 19-01-15 angie) # first done 19-01-10; gtexGeneModel redone 19-01-15 after implementing liftOver -multiple -genePred. #NOTE: this has not been liftOver'd to original alts, so use mainToAllAltPatch.over.chain cd /hive/data/genomes/hg19/bed/gtex origFile=gtexGeneV6.bed liftOver -multiple -bedPlus=6 -noSerial $origFile \ /hive/data/genomes/hg19/jkStuff/hg19.mainToAllAltPatch.p13.over.chain \ gtexGeneV6.p13.bed /dev/null sort -k1,1 -k2n,2n $origFile gtexGeneV6.p13.bed \ > gtexGeneV6.plusP13.bed hgLoadBed -noBin -type=bed6+ -sqlTable=$HOME/kent/src/hg/lib/gtexGeneBed.sql -renameSqlTable \ hg19 gtexGene gtexGeneV6.plusP13.bed #Warning 1062 Duplicate entry 'chr10_jh591181_fix-ENSG00000239883.4' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr10_jh591181_fix-ENSG00000189090.7' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr10_jh591181_fix-ENSG00000223477.3' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr10_jh591181_fix-ENSG00000223477.3' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr10_jh591181_fix-ENSG00000223477.3' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr10_jh591181_fix-ENSG00000165874.8' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr10_jh591181_fix-ENSG00000204164.6' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr1_jh636052_fix-ENSG00000203832.6' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr1_jh636052_fix-ENSG00000203832.6' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr1_jh636052_fix-ENSG00000163386.16' for key 'PRIMARY' # gtexGeneBed.sql has a unique index on (chrom, name). Some genes are lifted over in fragments, # causing multiple results that violate the unique constraint. Merge the multiple items # to make one overall region in each case. grep ENSG00000239883 gtexGeneV6.plusP13.bed | grep chr10_jh591181_fix | cut -f 1-3 #chr10_jh591181_fix 125207 142676 #chr10_jh591181_fix 142676 187286 # Those are adjacent -- edit gtexGeneV6.plusP13.bed to merge them --> 125207 187286 grep ENSG00000189090 gtexGeneV6.plusP13.bed | grep chr10_jh591181_fix | cut -f 1-3 #chr10_jh591181_fix 640208 642237 #chr10_jh591181_fix 642237 644679 # Adjacent, merge. grep ENSG00000223477 gtexGeneV6.plusP13.bed | grep chr10_jh591181_fix | cut -f 1-3 #chr10_jh591181_fix 669028 732149 #chr10_jh591181_fix 732158 760961 #chr10_jh591181_fix 761395 797300 #chr10_jh591181_fix 797759 808979 # There, GTEx is using a really long transcript from the Comprehensive set to represent # the gene. So the region spans a lot more than the RefSeq transcript, which is only in # the first block above. But it's GENCODE, so merge all those to get the larger region. grep ENSG00000165874 gtexGeneV6.plusP13.bed | grep chr10_jh591181_fix | cut -f 1-3 #chr10_jh591181_fix 881588 918539 #chr10_jh591181_fix 918539 923140 # Adjacent, merge. grep ENSG00000204164 gtexGeneV6.plusP13.bed | grep chr10_jh591181_fix | cut -f 1-3 #chr10_jh591181_fix 1059504 1106460 #chr10_jh591181_fix 1106460 1109375 # Adjacent, merge. grep ENSG00000203832 gtexGeneV6.plusP13.bed | grep chr1_jh636052_fix | cut -f 1-3 #chr1_jh636052_fix 2105312 2120519 #chr1_jh636052_fix 2164543 2219303 #chr1_jh636052_fix 2219303 2221435 # Interesting repetitive gene NBPF20 - merge. grep ENSG00000163386 gtexGeneV6.plusP13.bed | grep chr1_jh636052_fix | cut -f 1-3 #chr1_jh636052_fix 2880111 2889757 #chr1_jh636052_fix 2892953 2963817 # Similar situation with NBPF10, merge. hgLoadBed -noBin -type=bed6+ -sqlTable=$HOME/kent/src/hg/lib/gtexGeneBed.sql -renameSqlTable \ hg19 gtexGene gtexGeneV6.plusP13.bed # Argh, there are still some... If this happens with another track, it's worth writing # a program to merge nearby ranges for same name. #Warning 1062 Duplicate entry 'chr1_jh636052_fix-ENSG00000162836.7' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr1_jh636052_fix-ENSG00000188092.10' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr1_jh636052_fix-ENSG00000178104.15' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr1_jh636052_fix-ENSG00000168614.13' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr1_jh636052_fix-ENSG00000225241.3' for key 'PRIMARY' #Warning 1062 Duplicate entry 'chr1_jh636052_fix-ENSG00000234232.2' for key 'PRIMARY' hgLoadBed -noBin -type=bed6+ -sqlTable=$HOME/kent/src/hg/lib/gtexGeneBed.sql -renameSqlTable \ hg19 gtexGene gtexGeneV6.plusP13.bed # Yay, it works. # I don't see the original file for gtexGeneModelV6, so I'll dump genePred from the table. # gtexGeneModel has a bin.... but gtexGeneModelV6 does not. hgsql hg19 -NBe 'select * from gtexGeneModelV6' > gtexGeneModelV6.initial.gp # 19-01-15: update now that liftOver -multiple -genePred works. liftOver -multiple -genePred gtexGeneModelV6.initial.gp \ /hive/data/genomes/hg19/jkStuff/hg19.mainToAllAltPatch.p13.over.chain \ gtexGeneModelV6.p13.gp /dev/null sort -k2,2 -k3n,3n gtexGeneModelV6.initial.gp gtexGeneModelV6.p13.gp \ | hgLoadGenePred hg19 gtexGeneModel stdin ##############################################################################