481aac4c5d50c29d04226490eb99913927250981
max
Tue Apr 11 02:13:10 2023 -0700
small addition to problematic regions track, email from Markd, refs #30801
diff --git src/hg/makeDb/doc/hg38/problematic.txt src/hg/makeDb/doc/hg38/problematic.txt
index 6e0b556..7ed6ea6 100644
--- src/hg/makeDb/doc/hg38/problematic.txt
+++ src/hg/makeDb/doc/hg38/problematic.txt
@@ -23,15 +23,21 @@
# fix patches: one BED line per chromInfo entry whose name contains _fix, spanning 0..size, name "fix", explanatory text in column 5 (chrom, size are the two columns the former "select * | cut -f-2" produced)
hgsql hg38 -e 'select chrom, size from chromInfo' | grep _fix | tawk '{$3=$2; $2=0; $4="fix"; $5="The chr_fix chromosomes, such as chr1_KN538361v1_fix, are fix patches currently available for the hg19 and hg38 assemblies that represent changes to the existing sequence. These are generally error corrections (such as base changes, component replacements/updates, switch point updates or tiling path changes) or assembly improvements (such as extension of sequence into gaps). These fix patch scaffold sequences are given chromosome context through alignments to the corresponding chromosome regions, in the \"Mapping and Sequencing > GRC Patch Release\" track. See also the FAQ for more information."; $6="none"; print}' > chrFix.bed
# alt haplotypes: same layout as the fix patches; the quotes around _alt must be backslash-escaped, otherwise awk ends the string there and silently drops the word
hgsql hg38 -e 'select chrom, size from chromInfo' | grep alt | tawk '{$3=$2; $2=0; $4="alt"; $5="The chr_alt chromosomes, such as chr5_KI270794v1_alt, are alternative sequences that differ from the reference genome currently available for a few assemblies including danRer11, hg19, and hg38. These are regions of the genome that exhibit sufficient variability to prevent adequate representation by a single sequence. UCSC labels these haplotype sequences by appending \"_alt\" to their names. These alternative loci scaffolds (such as KI270794.1 in the hg38 assembly, referenced as chr5_KI270794v1_alt in the browser), are mapped to the genome and provide supplemental genomic information on these variable locations. To find the regions these alternate sequences correspond to in the genome you may use the Alt Haplotypes track if one is available. See also the FAQ for more information."; $6="none"; print}' > chrAlt.bed
# other gene clusters from the RefSeq Other bigBed: drop rows matching any of the listed patterns ("T cell" case-insensitively), keep fields 1-4 and 18, append the explanatory text to the 5th output field and name every row "cluster"
bigBedToBed /gbdb/hg38/ncbiRefSeq/ncbiRefSeqOther.bb stdout | grep -v -e pseudo -e tRNA -e immuno -e constant -e miR -e UGT1A -e PCDHA -e PCDHB | grep -v -i "T cell" | cut -f1-4,18 | tawk '{$5=$5" HGNC ID:"$4" This is a cluster of many very similar genes based on the Genes and Gene Predictions > NCBI RefSeq > RefSeq Other Track"; $4="cluster"; print}' > chrClusters.bed
# combine manual.bed with the generated chr*.bed files into a sorted bed9+ file (descriptions moved to columns 10 and 11), then build the bigBed and the trix index
cat manual.bed chr*.bed | sort -k1,1 -k2,2n | tawk '{note=$5; extra=$6; $5="0"; $6="+"; $7=$2; $8=$3; $9="0,0,0"; $10=note; $11=extra; print}' > all.bed
bedToBigBed -tab -as=manual.as -type=bed9+ -extraIndex=name all.bed /hive/data/genomes/hg38/chrom.sizes comments.bb
cut -f4,10 all.bed > notes.txt
ixIxx notes.txt notes.ix notes.ixx
+
+# Tue Apr 11 02:12:18 PDT 2023
+# GRC exclusion list supplied by MarkD: drop lines containing "description", sort in place, convert to a bed4 bigBed
+grep -v description ~markd/public_html/browser/grc-bad/GCA_000001405.15_GRCh38_GRC_exclusions.bed > grcExclusions.bed
+bedSort grcExclusions.bed grcExclusions.bed
+bedToBigBed -tab -type=bed4 grcExclusions.bed ../../chrom.sizes bb/grcExclusions.bb