282d648d8896a7bb45af20bbc306873f6a5ffb87
angie
Wed Jan 17 11:03:32 2024 -0800
Added chrUn_KI270752v1 to hg38.{p11,p12,p13}.chromAlias.txt files after user pointed out in MLQ #32874 that it was missing.
It is present in hg38.p14.chromAlias.txt and all *chromAlias.bb files.
diff --git src/hg/makeDb/doc/hg38/patchUpdate.12.txt src/hg/makeDb/doc/hg38/patchUpdate.12.txt
index 156fc8f..71ff217 100644
--- src/hg/makeDb/doc/hg38/patchUpdate.12.txt
+++ src/hg/makeDb/doc/hg38/patchUpdate.12.txt
@@ -1,795 +1,816 @@
# for emacs: -*- mode: sh; -*-
# This file describes how hg38 was extended with patch sequences and annotations from grcH38P12,
# after having been extended with grcH38P1 (see patchUpdate.11.txt).
##############################################################################
# Extend main database 2bit, chrom.sizes, chromInfo (DONE - 2018-08-10 - Angie)
cd /hive/data/genomes/hg38
# main 2bit
time faToTwoBit <(twoBitToFa hg38.2bit stdout) \
<(twoBitToFa /hive/data/genomes/grcH38P12/grcH38P12.2bit stdout) \
hg38.p12.2bit
#real 1m33.136s
# unmasked 2bit
twoBitMask -type=.bed hg38.p12.2bit /dev/null hg38.p12.unmasked.2bit
# chrom.sizes
sort -k2nr,2nr chrom.sizes /hive/data/genomes/grcH38P12/chrom.sizes > chrom.sizes.p12
# chromInfo
cd /hive/data/genomes/hg38/bed/chromInfo
awk '{print $1 "\t" $2 "\t/gbdb/hg38/hg38.2bit";}' ../../chrom.sizes.p12 \
> chromInfo.p12.tab
wc -l chromInfo*.tab
# 578 chromInfo.p11.tab
# 595 chromInfo.p12.tab
# 455 chromInfo.tab
# Install
cd /hive/data/genomes/hg38
ln -sf hg38.p12.2bit hg38.2bit
ln -sf hg38.p12.unmasked.2bit hg38.unmasked.2bit
ln -sf chrom.sizes.p12 chrom.sizes
cd /hive/data/genomes/hg38/bed/chromInfo
hgLoadSqlTab hg38 chromInfo chromInfo.sql chromInfo.p12.tab
##############################################################################
# Extend main database tables for fileless tracks (DONE - 2018-08-10 - Angie)
# Just add the patch table rows to the main database tables
for table in gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene; do
echo $table
hgsql hg38 -e "insert into hg38.$table select * from grcH38P12.$table"
done
#*** NOTE for NEXT TIME: check results with positionalTblCheck in case they need to be resorted:
for table in gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene; do
positionalTblCheck hg38 $table
done
##############################################################################
# Extend main database gc5BaseBw.bw (DONE - 2018-08-10 - Angie)
cd /hive/data/genomes/hg38/bed/gc5Base/
# Concatenate original assembly results with grcH38P12 results
time (zcat hg38.gc5Base.wigVarStep.gz \
/hive/data/genomes/grcH38P12/bed/gc5Base/grcH38P12.gc5Base.wigVarStep.gz \
| gzip -c \
> hg38.p12.gc5Base.wigVarStep.gz)
#real 8m9.913s
# Make a new gc5BaseBw.bw
time wigToBigWig hg38.p12.gc5Base.wigVarStep.gz ../../chrom.sizes.p12 \
hg38.p12.gc5Base.bw
#real 16m33.792s
# Install
cd /hive/data/genomes/hg38/bed/gc5Base/
ln -sf hg38.p12.gc5Base.wigVarStep.gz hg38.gc5Base.wigVarStep.gz
ln -sf hg38.p12.gc5Base.bw hg38.gc5Base.bw
##############################################################################
# Extend main database download files (DONE - 2019-07-24 - Angie)
# Previously done 2018-11-11
# NOTE FOR NEXT TIME: set up hg38*.chrom.sizes links correctly the first time (see 7/24/19)
cd /hive/data/genomes/hg38/goldenPath/bigZips
mkdir p12
# hg38.2bit was already extended above.
ln -sf /hive/data/genomes/hg38/hg38.p12.2bit p12/
# AGP:
zcat hg38.agp.gz \
/hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.agp.gz \
| grep -v ^# \
| gzip -c > p12/hg38.p12.agp.gz
# FASTA
twoBitToFa ../../hg38.p12.2bit stdout \
| gzip -c > p12/hg38.p12.fa.gz
faSize p12/hg38.p12.fa.gz
#3257347282 bases (161368694 N's 3095978588 real 1483113183 upper 1612865405 lower) in 595 sequences in 1 files
#Total size: mean 5474533.2 sd 27729929.3 min 970 (chrUn_KI270394v1) max 248956422 (chr1) median 166200
twoBitToFa hg38.2bit stdout \
| maskOutFa stdin hard stdout \
| gzip -c > p12/hg38.p12.fa.masked.gz
# RepeatMasker (don't include header of patch file):
cat <(zcat hg38.fa.out.gz) \
<(zcat /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.fa.out.gz | tail -n +4) \
| gzip -c > p12/hg38.p12.fa.out.gz
# SimpleRepeats/TRF:
zcat hg38.trf.bed.gz \
/hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.trf.bed.gz \
| gzip -c > p12/hg38.p12.trf.bed.gz
# We don't expect a complete set of chroms to have simpleRepeats, but at least an increase:
zcat initial/hg38.trf.bed.gz | cut -f 1 | uniq | wc -l
#363
zcat p12/hg38.p12.trf.bed.gz | cut -f 1 | uniq | wc -l
#502
# hg38 files that are not built by makeDownloads.pl because hg38 is treated as 'scaffold-based':
# Per-chrom soft-masked FASTA:
rm -rf chroms
tar xvzf hg38.chromFa.tar.gz
faSplit byname /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.fa.gz chroms/
ls -1 chroms | wc -l
#595
tar cvzf p12/hg38.p12.chromFa.tar.gz ./chroms
rm -rf chroms
# Per-chrom hard-masked FASTA:
rm -rf maskedChroms
tar xvzf hg38.chromFaMasked.tar.gz
faSplit byname /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.fa.masked.gz \
maskedChroms/
ls -1 maskedChroms | wc -l
#595
tar cvzf p12/hg38.p12.chromFaMasked.tar.gz ./maskedChroms
rm -rf maskedChroms
# RepeatMasker .align files:
zcat hg38.fa.align.gz /hive/data/genomes/grcH38P12/bed/repeatMasker/grcH38P12.fa.align.gz \
| gzip -c > p12/hg38.p12.fa.align.gz
# Make new md5sum.txt
cd p12
md5sum hg38.* > md5sum.txt
# Install
# 11/1/18 -- leave bigZips/ top-level files unchanged (links to initial not p12)
cd /hive/data/genomes/hg38/goldenPath/bigZips
for file in initial/*; do
ln -sf $file .
done
# 4/10/19: make latest a real dir with versionless filenames.
rm -rf latest
mkdir latest
cd latest
for file in ../p12/*; do
noVersion=$(echo $(basename $file) | sed -e 's/.p12//')
ln -s $file $noVersion
done
rm md5sum.txt
md5sum hg38* > md5sum.txt
echo GRCh38.p12 > LATEST_VERSION
rm -f /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/p12
ln -s /hive/data/genomes/hg38/goldenPath/bigZips/p12 \
/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/p12
rm -f /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/latest
ln -s /hive/data/genomes/hg38/goldenPath/bigZips/latest \
/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/latest
# 7/24/19: make correct links for chrom.sizes
ln -sf /hive/data/genomes/hg38/chrom.sizes.initial \
/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/hg38.chrom.sizes
ln -sf /hive/data/genomes/hg38/chrom.sizes.initial \
/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/initial/hg38.chrom.sizes
ln -sf /hive/data/genomes/hg38/chrom.sizes \
/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/latest/hg38.chrom.sizes
ln -sf /hive/data/genomes/hg38/chrom.sizes.p12 \
/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/p12/hg38.p12.chrom.sizes
#############################################################################
# Build perSeqMax file for gfServer (hgBlat) (DONE 19-08-14 angie)
# When the blat server is restarted with the updated hg38.2bit file,
# hg38.altsAndFixes needs to be copied over along with the new hg38.2bit file,
# and gfServer needs to be restarted with -perSeqMax=hg38.altsAndFixes.
cd /hive/data/genomes/hg38
cut -f 1 chrom.sizes.p12 \
| grep -E '_(alt|fix)$' \
| sed -re 's/^/hg38.2bit:/;' \
> hg38.altsAndFixes.p12
# Link for blat server installation convenience:
ln -sf hg38.altsAndFixes.p12 altsAndFixes
#########################################################################
# Regenerate idKeys with extended hg38 (DONE - 2018-08-10 - Angie)
mkdir /hive/data/genomes/hg38/bed/idKeys.p12
cd /hive/data/genomes/hg38/bed/idKeys.p12
# ku down... use hgwdev this time:
time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl \
-twoBit=/hive/data/genomes/hg38/hg38.p12.unmasked.2bit \
-bigClusterHub=hgwdev -smallClusterHub=hgwdev \
-buildDir=`pwd` hg38) > do.log 2>&1 &
tail -f do.log
#real 1m21.903s
cat hg38.keySignature.txt
#c9c5d621a52f96886fa9cd785c99248f
# Install
cd /hive/data/genomes/hg38/bed/
rm idKeys
ln -s idKeys.p12 idKeys
#############################################################################
# Extend cytoBand{,Ideo} (DONE 2018-08-10 angie)
cd /hive/data/genomes/hg38/bed/cytoBand
tawk '{print $1, 0, $2, "", "gneg";}' /hive/data/genomes/grcH38P12/chrom.sizes \
> cytoBand.p12.tab
# Install
hgLoadSqlTab -oldTable hg38 cytoBand - cytoBand.p12.tab
hgLoadSqlTab -oldTable hg38 cytoBandIdeo - cytoBand.p12.tab
#########################################################################
# ncbiRefSeq.p12 Genes (DONE - 2018-08-10 - Angie)
mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p12.2018-08-10
cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p12.2018-08-10
# Adding the -toGpWarnOnly flag because there are a handful of cases of CDS extending
# beyond exon coordinates. Terence Murphy says they'll eventually fix it but not soon.
# So, make sure to check do.log for warnings from gff3ToGenePred:
time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
-toGpWarnOnly \
refseq vertebrate_mammalian Homo_sapiens \
GCF_000001405.38_GRCh38.p12 hg38) > do.log 2>&1 & tail -f do.log
# gff3ToGenePred warnings:
#Warning: skipping: no exon in id1912382 contains CDS 555851-556197
#Warning: skipping: no exon in id1790907 contains CDS 22922593-22922913
#Warning: skipping: no exon in id1790877 contains CDS 22906341-22906661
#Warning: skipping: no exon in id1790824 contains CDS 22822981-22823289
#Warning: skipping: no exon in id1365744 contains CDS 106088082-106088428
#5 warnings converting GFF3 file: stdin
# *** All done ! Elapsed time: 17m34s
#real 17m33.906s
cat fb.ncbiRefSeq.hg38.txt
#134109466 bases of 3095998939 (4.332%) in intersection
#############################################################################
# UCSC to RefSeq, INSDC, Assembly; chromAlias (DONE 18-08-10 angie)
# need to have idKeys for the genbank and refseq assemblies:
mkdir -p /hive/data/genomes/hg38/bed/ucscToINSDC/genbankP12
cd /hive/data/genomes/hg38/bed/ucscToINSDC/genbankP12
ln -s /hive/data/outside/ncbi/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.27_GRCh38.p12/GCA_000001405.27_GRCh38.p12_genomic.fna.gz .
faToTwoBit GCA_000001405.27_GRCh38.p12_genomic.fna.gz genbankP12.2bit
time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=genbankP12.2bit \
-bigClusterHub=hgwdev -smallClusterHub=hgwdev \
genbankP12) > do.log 2>&1
#real 1m50.109s
mkdir /hive/data/genomes/hg38/bed/ucscToINSDC/refseqP12
cd /hive/data/genomes/hg38/bed/ucscToINSDC/refseqP12
ln -s /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz ./
faToTwoBit GCF_000001405.38_GRCh38.p12_genomic.fna.gz refseqP12.2bit
time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=refseqP12.2bit \
-bigClusterHub=hgwdev -smallClusterHub=hgwdev \
refseqP12) > do.log 2>&1
#real 1m19.941s
# with the three idKeys available, join them to make the table bed files:
cd /hive/data/genomes/hg38/bed/ucscToINSDC
join -t$'\t' ../idKeys/hg38.idKeys.txt genbankP12/genbankP12.idKeys.txt \
| cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
| awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
| sort -k1,1 -k2,2n > ucscToINSDC.p12.bed
join -t$'\t' ../idKeys/hg38.idKeys.txt refseqP12/refseqP12.idKeys.txt \
| cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
| awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
| sort -k1,1 -k2,2n > ucscToRefSeq.p12.bed
# loading tables:
export db=hg38
export chrSize=`cut -f1 ucscToINSDC.p12.bed | awk '{print length($0)}' | sort -n | tail -1`
sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
| hgLoadSqlTab ${db} ucscToINSDC stdin ucscToINSDC.p12.bed
export chrSize=`cut -f1 ucscToRefSeq.p12.bed | awk '{print length($0)}' | sort -n | tail -1`
sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
| sed -e 's/INSDC/RefSeq/g;' \
| hgLoadSqlTab ${db} ucscToRefSeq stdin ucscToRefSeq.p12.bed
# must be exactly 100% coverage
featureBits -countGaps ${db} ucscToINSDC
#3257347282 bases of 3257347282 (100.000%) in intersection
featureBits -countGaps ${db} ucscToRefSeq
#3257319537 bases of 3257347282 (99.999%) in intersection
# uh-oh! not 100%
featureBits -countGaps ${db} \!ucscToRefSeq -bed=stdout
#chrUn_KI270752v1 0 27745 chrUn_KI270752v1.1
grep KI270752 \
/hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_assembly_report.txt
#HSCHRUN_RANDOM_CTG29 unplaced-scaffold na na KI270752.1 <> na Primary Assembly 27745 chrUn_KI270752v1
# Yep, no RefSeq accession there. Guess it was dropped from the RefSeq p12 assembly???
# Will ask Hiram and probably Terence.
# construct chromAlias:
cd /hive/data/genomes/hg38/bed/chromAlias
hgsql -N -e 'select chrom,name from ucscToRefSeq;' ${db} \
| sort -k1,1 > ucsc.refseq.tab
hgsql -N -e 'select chrom,name from ucscToINSDC;' ${db} \
| sort -k1,1 > ucsc.genbank.tab
# add NCBI sequence names from assembly report
grep -v ^# \
/hive/data/genomes/grcH38P12/genbank/GCA_000001405.27_GRCh38.p12_assembly_report.txt \
| tawk '{print $5, $1;}' | sort \
> genbankToAssembly.txt
tawk '{print $2, $1;}' ucsc.genbank.tab | sort \
| join -t$'\t' -o 1.2,2.2 - genbankToAssembly.txt \
| sort -k1,1 > ucsc.assembly.tab
~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \
> ${db}.chromAlias.tab
# verify all there:
for t in refseq genbank assembly
do
c0=`cat ucsc.$t.tab | wc -l`
c1=`grep $t hg38.chromAlias.tab | wc -l`
ok="OK"
if [ "$c0" -ne "$c1" ]; then
ok="ERROR"
fi
printf "# checking $t: $c0 =? $c1 $ok\n"
done
# checking refseq: 594 =? 594 OK
# checking genbank: 595 =? 595 OK
# checking assembly: 595 =? 595 OK
# Note how there's one fewer refseq, consistent with featureBits above.
hgLoadSqlTab hg38 chromAlias $HOME/kent/src/hg/lib/chromAlias.sql ${db}.chromAlias.tab
##############################################################################
# UCSC to Ensembl (TODO 18-08-06 angie)
# doc??
############################################################################
# altLocations and patchLocations (DONE - 2018-08-10 - Angie)
# indicate corresponding locations between haplotypes and reference
mkdir /hive/data/genomes/hg38/bed/altLocations.p12
cd /hive/data/genomes/hg38/bed/altLocations.p12
~/kent/src/hg/utils/automation/altScaffoldPlacementToBed.pl \
/hive/data/genomes/grcH38P12/genbank/GCA_000001405.27_GRCh38.p12_assembly_structure/{ALT_*,PATCHES}/alt_scaffolds/alt_scaffold_placement.txt \
| sort -k1,1 -k2n,2n \
> altAndFixLocations.bed
wc -l altAndFixLocations.bed
#802 altAndFixLocations.bed
grep _alt altAndFixLocations.bed > altLocations.bed
grep _fix altAndFixLocations.bed > fixLocations.bed
hgLoadBed hg38 altLocations{,.bed}
#Read 664 elements of size 4 from altLocations.bed
hgLoadBed hg38 fixLocations{,.bed}
#Read 140 elements of size 4 from fixLocations.bed
featureBits -countGaps hg38 altLocations
#200094386 bases of 3257347282 (6.143%) in intersection
featureBits -countGaps hg38 fixLocations
#63654955 bases of 3257347282 (1.954%) in intersection
#############################################################################
# Check for new chrX alts/patches to add to par (DONE 2018-08-10 angie)
# Thanks to Hiram for pointing out that intersecting chrX positions in
# altLocations and par shows whether a chrX alt overlaps a PAR.
cd /hive/data/genomes/hg38/bed/par
hgsql hg38 -e 'select * from altLocations where chrom = "chrX"'
#+-----+-------+------------+----------+---------------------+
#| bin | chrom | chromStart | chromEnd | name |
#+-----+-------+------------+----------+---------------------+
#| 73 | chrX | 319337 | 601516 | chrX_KI270880v1_alt |
#| 73 | chrX | 326487 | 601516 | chrX_KI270913v1_alt |
#| 77 | chrX | 4950956 | 5129468 | chrX_KV766199v1_alt |
#| 149 | chrX | 79965153 | 80097082 | chrX_KI270881v1_alt |
#+-----+-------+------------+----------+---------------------+
hgsql hg38 -e 'select * from par where chrom like "chrX%"'
#+-----+---------------------+------------+-----------+------+
#| bin | chrom | chromStart | chromEnd | name |
#+-----+---------------------+------------+-----------+------+
#| 9 | chrX | 10000 | 2781479 | PAR1 |
#| 221 | chrX | 155701382 | 156030895 | PAR2 |
#| 73 | chrX_KI270880v1_alt | 0 | 284869 | PAR1 |
#| 73 | chrX_KI270913v1_alt | 0 | 274009 | PAR1 |
#+-----+---------------------+------------+-----------+------+
# chrX_KI270881v1_alt and chrX_KV766199v1_alt are not in either PAR.
# chrX_KI270880v1_alt and chrX_KI270913v1_alt are entirely contained in PAR1 --
# and are already in the PAR table, so nothing to add.
##############################################################################
# altSeqLiftOver (DONE 2018-11-06 Angie)
# originally done 2018-08-10; redone 2018-11-06 w/fixed gff3ToPsl to get correct - strand alignments
# mainToPatch over.chain regenerated 2018-12-03 w/fixed pslToChain
mkdir /hive/data/genomes/hg38/bed/altSeqLiftOver.p12
cd /hive/data/genomes/hg38/bed/altSeqLiftOver.p12
# Eventually these will be under the /hive/data/genomes/.../genbank/... directory
# that points to /hive/data/outside/ncbi/genomes/... but at the moment the contents
# of the alignments/ directories are not included in the sync. So for now,
# manually download them here.
# Original alts -- reuse the ones downloaded for p11:
ln -s ../altSeqLiftOver.p11/initialAlts .
# New alts and patches too:
mkdir patches
cd patches
wget --timestamping --no-verbose\
ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.27_GRCh38.p12/GCA_000001405.27_GRCh38.p12_assembly_structure/PATCHES/alt_scaffolds/alignments/\*.gff
cd ..
# Use chromAlias to make a .sed file to substitute Genbank accessions to UCSC names
hgsql hg38 -NBe 'select alias,chrom from chromAlias where find_in_set("genbank", source);' \
| awk '{print "s@" $1 "@" $2 "@;";}' > gbToUcsc.sed
wc -l gbToUcsc.sed
#595 gbToUcsc.sed
cp /dev/null altToChrom.noScore.psl
for f in initialAlts/*.gff patches/*.gff;
do
e=`basename $f .gff | sed -e 's/_/|/g;'`
s=`grep -E $e gbToUcsc.sed`
sed -re "$s" $f | gff3ToPsl ../../chrom.sizes{,} stdin stdout \
| pslPosTarget stdin stdout \
>> altToChrom.noScore.psl
done
pslCheck altToChrom.noScore.psl
#checked: 421 failed: 0 errors: 0
time pslRecalcMatch altToChrom.noScore.psl ../../hg38.2bit{,} altToChrom.psl
#real 0m29.571s
pslSwap altToChrom.psl stdout | pslPosTarget stdin chromToAlt.psl
sort -k14,14 -k16n,16n -k10,10 -k12n,12n altToChrom.psl chromToAlt.psl \
> altAndPatches.psl
grep _alt altAndPatches.psl > altSeqLiftOver.psl
grep _fix altAndPatches.psl > fixSeqLiftOver.psl
# Load tables
hgLoadPsl hg38 -table=altSeqLiftOverPsl altSeqLiftOver.psl
hgLoadPsl hg38 -table=fixSeqLiftOverPsl fixSeqLiftOver.psl
# Make chrom-to-alt PSL file for genbank process.
ln -f -s `pwd`/chromToAlt.psl \
/hive/data/genomes/hg38/jkStuff/hg38.p12.alt.psl
wc -l /hive/data/genomes/hg38/jkStuff/hg38.p12.alt.psl
#421 /hive/data/genomes/hg38/jkStuff/hg38.p12.alt.psl
# Make a liftOver chain file for mapping annotations on main chroms to new patch sequences
# exclude alts that were already in hg38 before p12
# Redone 12/3/18 after Braney fixed pslToChain
cut -f 1 ../../chrom.sizes.p11 | grep _ \
| grep -vwf - chromToAlt.psl \
| pslToChain stdin stdout \
| chainScore stdin ../../hg38.2bit{,} ../../jkStuff/hg38.mainToPatch.p12.over.chain
grep chain ../../jkStuff/hg38.mainToPatch.p12.over.chain | wc -l
#17
##############################################################################
# Extend wgEncodeReg bigWig tracks (DONE 19-01-08 angie)
# originally done 18-08-29; redone 18-11-06, 19-01-08 with recomputed .plusP11
# files and updated .chain
#NOTE: this has not been liftOver'd to original alts!
# Use the *.plusP11.bigWig files and add p12.
for dir in /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/{*Mark*,*Txn}; do
composite=$(basename $dir)
echo $composite
cd $dir
for f in wg*.plusP11.bigWig; do
track=$(basename $f .plusP11.bigWig)
~/kent/src/hg/utils/liftOverBigWigToPatches $f \
/hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \
/hive/data/genomes/hg38/chrom.sizes \
$track.plusP12.bigWig &
done
wait
done
# Install (not necessary after updating .plusP12 files, links already point there)
for dir in /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/{*Mark*,*Txn}; do
composite=$(basename $dir)
echo $composite
cd $dir
for f in wg*.plusP12.bigWig; do
track=$(basename $f .plusP12.bigWig)
ln -sf `pwd`/$track.plusP12.bigWig /gbdb/hg38/bbi/wgEncodeReg/$composite/$track.bigWig
done
done
##############################################################################
# Extend wgEncodeRegDnase (DONE 19-01-08 angie)
# originally done 18-08-28; redone 18-11-08, 19-01-08 with recomputed .plusP11
# files and updated .chain
#NOTE: this has not been liftOver'd to original alts!
cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase
origFile=wgEncodeRegDnaseClustered.plusP11.bed
liftOver -multiple -bedPlus=5 -noSerial $origFile \
/hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \
wgEncodeRegDnaseClustered.p12.bed /dev/null
sort -k1,1 -k2n,2n $origFile wgEncodeRegDnaseClustered.p12.bed \
> wgEncodeRegDnaseClustered.plusP12.bed
hgLoadBed hg38 wgEncodeRegDnaseClustered wgEncodeRegDnaseClustered.plusP12.bed \
-sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql \
-renameSqlTable -as=$HOME/kent/src/hg/lib/bed5SourceVals.as
##############################################################################
# Extend wgEncodeRegTfbsClusteredV3 (DONE 19-01-08 angie)
# originally done 18-08-28; redone 18-11-08, 19-01-08 with recomputed .plusP11
# files and updated .chain
#NOTE: this has not been liftOver'd to original alts!
cd /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegTfbsClusteredV3/
origFile=wgEncodeRegTfbsClusteredV3.plusP11.bed
liftOver -multiple -bedPlus=5 -noSerial $origFile \
/hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \
wgEncodeRegTfbsClusteredV3.p12.bed /dev/null
sort -k1,1 -k2n,2n $origFile wgEncodeRegTfbsClusteredV3.p12.bed \
> wgEncodeRegTfbsClusteredV3.plusP12.bed
hgLoadBed hg38 wgEncodeRegTfbsClusteredV3 wgEncodeRegTfbsClusteredV3.plusP12.bed \
-sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql \
-renameSqlTable -as=$HOME/kent/src/hg/lib/bed5SourceVals.as
##############################################################################
# Extend GTEX GENE (DONE 19-01-15 angie)
# originally done 18-08-28; redone 18-11-08, 19-01-08 with recomputed .plusP11
# files and updated .chain
# gtexGeneModel redone 19-01-15 after implementing liftOver -multiple -genePred.
mkdir /hive/data/genomes/hg38/bed/gtex.p12
cd /hive/data/genomes/hg38/bed/gtex.p12
liftOver -multiple -bedPlus=6 -noSerial ../gtex.p11/gtexGene.plusP11.bed \
/hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \
gtexGene.p12.bed /dev/null
sort -k1,1 -k2n,2n ../gtex.p11/gtexGene.plusP11.bed gtexGene.p12.bed \
> gtexGene.plusP12.bed
# There is actually no bin column in gtexGene.
hgLoadSqlTab hg38 gtexGene $HOME/kent/src/hg/lib/gtexGeneBed.sql gtexGene.plusP12.bed
# Two of the genes fall on inversions in the mapping of chr to alt/fix, so part of a gene
# maps on a + chain and part on a - chain. The SQL table has a unique index on
# (chr, geneId) so having two results (+ and -) makes it error out. When that happens,
# remove the smaller inversion mapping -- the larger gene region is still mapped.
grep -v '^chr12_KN538369v1_fix.*-.ENSG00000165714' gtexGene.plusP12.bed \
| grep -v '^chr7_KV880765v1_fix.*+.ENSG00000164597' \
| hgLoadSqlTab hg38 gtexGene $HOME/kent/src/hg/lib/gtexGeneBed.sql stdin
# 19-01-15: recompute now that liftOver -multiple -genePred works.
liftOver -multiple -genePred ../gtex.p11/gtexGeneModel.plusP11.gp \
/hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \
gtexGeneModel.p12.gp /dev/null
sort -k2,2 -k3n,3n ../gtex.p11/gtexGeneModel.plusP11.gp gtexGeneModel.p12.gp \
> gtexGeneModel.plusP12.gp
hgLoadGenePred hg38 gtexGeneModel gtexGeneModel.plusP12.gp
#############################################################################
# UPDATE /scratch/data/ 2bit (DONE 2018-09-26 angie)
cp -p /hive/data/genomes/hg38/hg38.p12.2bit /hive/data/staging/data/hg38/
mv /hive/data/staging/data/hg38/hg38.2bit{,.bak}
mv /hive/data/staging/data/hg38/hg38{.p12,}.2bit
cmp /hive/data/genomes/hg38/hg38.initial.2bit /hive/data/staging/data/hg38/hg38.2bit.bak
# No output -- the .bak copy is identical as expected, so clean it up.
rm /hive/data/staging/data/hg38/hg38.2bit.bak
##############################################################################
# Extend wgEncodeRegDnase (DNase HS) (DONE 19-01-23 angie)
# 95 Peak view subtracks
mkdir /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHS.p12
cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHS.p12
for f in ../wgEncodeRegDnaseHS.p11/*.plusP11.bed; do
track=$(basename $f .plusP11.bed)
liftOver -multiple -bedPlus=5 -noSerial $f \
/hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \
$track.p12.bed /dev/null
sort -k1,1 -k2n,2n $f $track.p12.bed > $track.plusP12.bed
done
# Install
for f in *.plusP12.bed; do
table=$(basename $f .plusP12.bed)
echo $table
hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \
-type=bed6+4 -as=$HOME/kent/src/hg/lib/bigNarrowPeak.as -noNameIx \
hg38 $table $f
done
rm bed.tab
# 95 Hotspots view subtracks
mkdir /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHotspot.p12
cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHotspot.p12
cat >runOne <<'_EOF_'
#!/bin/bash
set -beEu -o pipefail
track=$1
origFile=$2
liftOver -multiple -bedPlus=6 -noSerial $origFile \
/hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \
$track.broadPeak.p12.bed /dev/null
sort -k1,1 -k2n,2n $origFile $track.broadPeak.p12.bed > $track.broadPeak.plusP12.bed
bedToBigBed -as=$HOME/kent/src/hg/lib/encode/broadPeak.as -type=bed6+3 $track.broadPeak.plusP12.bed \
/hive/data/genomes/hg38/chrom.sizes $track.broadPeak.plusP12.bb
_EOF_
chmod a+x runOne
cp /dev/null jobList
for origFile in ../wgEncodeRegDnaseHotspot.p11/*.broadPeak.plusP11.bed; do
track=$(basename $origFile .broadPeak.plusP11.bed)
echo ./runOne $track $origFile >> jobList
done
para make jobList
para time
#Completed: 95 of 95 jobs
#CPU time in finished jobs: 206s 3.43m 0.06h 0.00d 0.000 y
#IO & Wait Time: 251s 4.19m 0.07h 0.00d 0.000 y
#Average job time: 5s 0.08m 0.00h 0.00d
#Longest finished job: 8s 0.13m 0.00h 0.00d
#Submission to last job: 22s 0.37m 0.01h 0.00d
# Install
for f in *.broadPeak.plusP12.bb; do
track=$(basename $f .broadPeak.plusP12.bb)
ln -sf `pwd`/$f /gbdb/hg38/bbi/wgEncodeRegDnase/$track.broadPeak.bb
done
# Don't do wgEncodeRegDnaseSignal view... the data files are same as DnaseWig below!
#############################################################################
# Extend wgEncodeRegDnaseWig (DNase Signal) (DONE 19-01-23 angie)
mkdir /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseWig.p12
cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseWig.p12
cp /dev/null jobList
for origFile in ../wgEncodeRegDnaseWig.p11/*.plusP11.bw; do
track=$(basename $origFile .plusP11.bw)
echo ~/kent/src/hg/utils/liftOverBigWigToPatches $origFile \
/hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \
/hive/data/genomes/hg38/chrom.sizes \
{check out exists $track.plusP12.bw} \
>> jobList
done
para make jobList
para time
#Completed: 95 of 95 jobs
#CPU time in finished jobs: 140913s 2348.55m 39.14h 1.63d 0.004 y
#IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y
#Average job time: 428s 7.14m 0.12h 0.00d
#Longest finished job: 828s 13.80m 0.23h 0.01d
#Submission to last job: 1534s 25.57m 0.43h 0.02d
# Install by updating /gbdb/ links.
for f in *.plusP12.bw; do
track=$(basename $f .plusP12.bw)
ln -sf `pwd`/$f /gbdb/hg38/bbi/wgEncodeRegDnase/$track.bw
done
#############################################################################
# Rebuild ncbiRefSeqGenomicDiff (DONE 19-01-25 angie)
mkdir /hive/data/genomes/hg38/bed/ncbiRefSeqAnomalies.p12
cd /hive/data/genomes/hg38/bed/ncbiRefSeqAnomalies.p12
db=hg38
pre=ncbiRefSeqGenomicDiff
buildDir=/hive/data/genomes/hg38/bed/ncbiRefSeq.p12.2018-08-10
asmId=GCF_000001405.38_GRCh38.p12
time (zcat $buildDir/process/$asmId.rna.cds.gz \
| egrep '[0-9]+\.\.[0-9]+' \
| pslMismatchGapToBed -cdsFile=stdin -db=$db -ignoreQNamePrefix=X \
$buildDir/process/$asmId.$db.psl.gz \
/hive/data/genomes/$db/$db.2bit \
$buildDir/$db.rna.fa \
$pre)
#real 0m40.604s
bedToBigBed -type=bed9+ -tab -as=$HOME/kent/src/hg/lib/txAliDiff.as $pre.bed \
/hive/data/genomes/$db/chrom.sizes $pre.bb
ln -sf `pwd`/$pre.bb /gbdb/hg38/ncbiRefSeq/$pre.bb
#############################################################################
# DBSNP B151 / SNP151 (TODO 19-? angie)
# b151 is for GRCh38.p7 (orgDir human_9606_b151_GRCh38p7) so it doesn't have
# sequences added in p8 and later.
How do we do this without disrupting the existing snp151 track? Can we build with a suffix and rename? Or just stop before loading...
mkdir -p /hive/data/outside/dbSNP/151/human_hg38
cd /hive/data/outside/dbSNP/151/human_hg38
# Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/
# to find the subdir name to use as orgDir below (human_9606_b151_GRCh38p7 in this case).
# Go to that subdirectory, then to database/organism_data/ and look for files
# whose names start with b151_* and may or may not end with a suffix that identifies
# the build assembly version or some annotation version. If there is a suffix shared
# by all b151_* files, add that to config.ra as the "buildAssembly".
# dbSNP has all NT_/NW_ contig IDs, but our 2bit has UCSC-ified genbank names.
# make a lift file.
cat > config.ra <<EOF
db hg38
orgDir human_9606_b151_GRCh38p7
build 151
buildAssembly 108
refAssemblyLabel GRCh38.p7
EOF
# Skip the download step -- link to files already downloaded for hg38.
~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -debug
rmdir data schema rs_fasta
ln -s ../human_hg38/{data,schema,rs_fasta} .
# And the last bits of ../download_human_hg38_151.csh:
# Make all files group writeable so others can update them if necessary
find /hive/data/outside/dbSNP/151 -user $USER -not -perm -660 \
| xargs --no-run-if-empty chmod ug+w
# Extract the set of assembly labels in case we need to exclude any.
zcat /hive/data/outside/dbSNP/151/human_hg38/data/b151_ContigInfo_108.bcp.gz \
| cut -f 12 | uniq | sort -u \
> /hive/data/outside/dbSNP/151/human_hg38/assemblyLabels.txt
# Start the usual pipeline at the loadDbSnp step.
~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log &
tail -f do.log
#*** b151_ContigInfo_108 has coords for 305 sequences; these have been written to
#*** /hive/data/outside/dbSNP/151/human_hg38/suggested.lft .
#
#*** GCF_000001405.33_GRCh38.p7_assembly_report.txt has mappings for 500 sequences;
#*** these have been written to
#*** /hive/data/outside/dbSNP/151/human_hg38/suggested.lft .
#
#*** You must account for all 805 contig_acc values in config.ra,
#*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output).
#*** Check the auto-generated suggested.lft to see if it covers all
#*** 805 contigs; if it does, add 'liftUp suggested.lft' to config.ra.
#*** Then run again with -continue=loadDbSnp .
cp suggested.lft hg38.lft
cat >> config.ra <<EOF
liftUp hg38.lft
EOF
# Try again from the loadDbSnp step.
~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log &
tail -f do.log
# *** All done! (through the 'bigBed' step)
# Make a liftOver chain from main chromosomes to patches added after p7.
# Do the liftOver for all positional snp151* tables, load with -oldTable
##############################################################################
# OMIM tracks (TODO 19-? angie)
# the otto process builds the omim* tables; edit otto/omim/buildOmimTracks.sh to make sure
# the most recent dbSNP version is listed for the db. After the snpNNN table is updated to
# include patch sequences, the next otto update will include patches.
# omimGene2 is still using refGene, but I think it would be better if it used ncbiRefSeqCurated
# if it exists.
# TODO: OMIM Genes needs liftOver to new alts and fixes (or redo from ncbiRefSeq).
# OMIM Phenotypes needs liftOvers to all alts and fixes. Sometimes it spans a region larger
# than an alt/fix, so maybe lower the percentage that has to map?
##############################################################################
# GRC Incident Database (TODO - 19-? - Angie)
# Wait until the updated hg19 files have been pushed to RR because GRC Incident update is
# automated. Then update the file used to map GRC's RefSeq accessions to our names:
hgsql hg19 -NBe 'select alias,chrom from chromAlias where source = "refseq" order by alias;' \
> /hive/data/outside/grc/incidentDb/GRCh38/refSeq.chromNames.tab
#############################################################################
+# Update hg38.p12.chromAlias.txt (DONE 2024-01-17 Angie)
+
+ # In MLQ#32874, the user reported that chrUn_KI270752v1 is missing from hg38.p12.chromAlias.txt.
+ # That's because when the hg38.{p11,p12,p13}.chromAlias.txt files were initially created
+ # in patchUpdate.13.txt ("Correctly versioned hg38.chromAlias.txt files in downloads"),
+ # chrUn_KI270752v1 was omitted because it had been removed from the RefSeq assembly as
+ # contamination. However, it is confusing for users to have a sequence in the db with no
+ # aliases despite it having Assembly, Ensembl and INSDC aliases. So add it back.
+ hgsql hg38 -NBe 'select * from chromAlias where chrom = "chrUn_KI270752v1"'
+#HSCHRUN_RANDOM_CTG29 chrUn_KI270752v1 assembly
+#KI270752.1 chrUn_KI270752v1 ensembl,genbank
+ cd /hive/data/genomes/hg38/goldenPath/bigZips
+ echo -e "chrUn_KI270752v1\tHSCHRUN_RANDOM_CTG29\tKI270752.1\t" >> hg38.p12.chromAlias.txt
+ # p12/hg38.p12.chromAlias.txt is a symlink to the one in this directory.
+ # p12/hg38.p12.chromAlias.bb is a file in p12/ . -- But it already has chrUn_KI270752v1
+ # so it does not need to be updated, great.
+ bigBedToBed -chrom=chrUn_KI270752v1 p12/hg38.p12.chromAlias.bb stdout
+#chrUn_KI270752v1 0 27745 chrUn_KI270752v1 HSCHRUN_RANDOM_CTG29 KI270752.1 KI270752.1
+
+
+#############################################################################