5000aa2e1caad21d2066d15d087deb95945b17db max Fri Jul 1 09:12:41 2022 -0700 adding missing makedoc for special regions track on hg19, refs #29685 diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index 208ba0f..db1103a 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -33671,30 +33671,48 @@ wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterConflicting.bed.gz wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterCov.bed.gz wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterHapNoVar.bed.gz wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterMap.bed.gz wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterSSE.bed.gz wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterlt2Datasets.bed.gz gunzip *.gz cd .. 
for i in orig/*.bed; do out=`echo $i | sed -e 's|orig/VQSRv2.18_||g'`; out=`basename $out .bed`; echo $out; chromToUcsc -a hg19.chromAlias.tsv -i $i -o bed/$out.bed; done for i in bed/filter*.bed; do echo $i; bedSort $i $i; bedToBigBed $i /hive/data/genomes/hg19/chrom.sizes bb/`basename $i .bed`.bed -type=bed3; done cd /gbdb/hg19/bbi/special; for i in /hive/data/genomes/hg19/bed/specialRegions/bb/filter*.bb; do ln -s $i; done bedSort orig/hg19-blacklist.v2.bed orig/hg19-blacklist.v2.bed bedToBigBed orig/hg19-blacklist.v2.bed /hive/data/genomes/hg19/chrom.sizes bb/encBlacklist.bb -tab +# a few special regions and manual regions - extract a few from NCBI and chromInfo first + +hgsql hg19 -e 'select * from chromInfo' | grep chrUn | cut -f-2 |tawk '{$3=$2; $2=0; $4="unplaced"; $5="ChrUn contains clone contigs that cannot be confidently placed on a specific chromosome."; $6="none"; print}' > chrUn.bed +hgsql hg19 -e 'select * from chromInfo' | grep fixed | cut -f-2 |tawk '{$3=$2; $2=0; $4="fix"; $5="The chr_fix chromosomes, such as chr1_KN538361v1_fix, are fix patches currently available for the hg19 and hg38 assemblies that represent changes to the existing sequence. These are generally error corrections (such as base changes, component replacements/updates, switch point updates or tiling path changes) or assembly improvements (such as extension of sequence into gaps). These fix patch scaffold sequences are given chromosome context through alignments to the corresponding chromosome regions."; $6="none"; print}' > chrFix.bed +hgsql hg19 -e 'select * from chromInfo' | grep alt | cut -f-2 |tawk '{$3=$2; $2=0; $4="alt"; $5="The chr_alt chromosomes, such as chr5_KI270794v1_alt, are alternative sequences that differ from the reference genome currently available for a few assemblies including danRer11, hg19, and hg38. These are regions of the genome that exhibit sufficient variability to prevent adequate representation by a single sequence. 
UCSC labels these haplotype sequences by appending "_alt" to their names. These alternative loci scaffolds (such as KI270794.1 in the hg38 assembly, referenced as chr5_KI270794v1_alt in the browser), are mapped to the genome and provide supplemental genomic information on these variable locations. To find the regions these alternate sequences correspond to in the genome you may use the Alt Haplotypes track if one is available."; $6="none"; print}' > chrAlt.bed +hgsql hg19 -e 'select * from chromInfo' | grep chrUn | cut -f-2 |tawk '{$3=$2; $2=0; $4="unplaced"; $5="ChrUn contains clone contigs that cannot be confidently placed on a specific chromosome."; $6="none"; print}' > chrUn.bed +bigBedToBed /gbdb/hg19/ncbiRefSeq/ncbiRefSeqOther.bb stdout | less | grep -v pseudo | grep -v "T cell" -i | grep -v tRNA | grep -v immuno | grep -v constant | grep -v miR | grep -v UGT1A | cut -f1-4,18 | tawk '{$5=$5" HGNC ID:"$4" This is a cluster of many very similar genes based on the NCBI RefSeq Other Track"; $4="cluster"; print}' > chrClusters.bed + +# manual.bed was manually typed up, based on what I know from textbooks, from email from HGNC and reading Wikipedia + +# put these together +cat manual.bed chr*.bed | sort -k1,1 -k2,2n | tawk '{desc=$5; desc2=$6; $5="0"; $6="+"; $7=$2; $8=$3; $9="0,0,0"; $10=desc; $11=desc2; print}' > all.bed +bedToBigBed all.bed /hive/data/genomes/hg19/chrom.sizes comments.bb -tab -as=manual.as -type=bed9+ -extraIndex=name + +cut -f4,10 all.bed > notes.txt +ixIxx notes.txt notes.ix notes.ixx + + ############################################################################# # GTEx V8 (April 2020) Kate # Create BED from hgFixed tables (see doc/gtex) cd /hive/data/outside/gtex/V8/rnaSeq # Lift GTEx LDACC gene models (GENCODE V26 isoforms collapsed to single gene model) from hg38 annotation by GTEx LDACC set chain = /hive/data/genomes/hg38/bed/liftOver/hg38ToHg19.over.chain.gz liftOver -genePred gencodeV26.hg38.genePred $chain 
gencodeV26.hg19.lifted.genePred \ gencodeV26.hg19.unmapped # 1300 gencodeV26.hg19.unmapped # (was 925 in V6 lift hg19 to hg38)