687239fe5e1d9407369eb985448dc1f844b261b0
gperez2
  Tue Nov 5 11:24:23 2024 -0800
Moving the problematicGIAB.txt text to the problematic.txt makedoc, refs #32715

diff --git src/hg/makeDb/doc/hg38/problematic.txt src/hg/makeDb/doc/hg38/problematic.txt
index be4a6c6..a655cfd 100644
--- src/hg/makeDb/doc/hg38/problematic.txt
+++ src/hg/makeDb/doc/hg38/problematic.txt
@@ -1,75 +1,126 @@
 cd /hive/data/genomes/hg38/bed/problematic/
 # note that the hg19 directory is /hive/data/genomes/hg19/bed/specialRegions/, the track name used to be different
 
 # add the ENCODE blacklist
 wget 'https://github.com/Boyle-Lab/Blacklist/blob/master/lists/hg38-blacklist.v2.bed.gz?raw=true' -O hg38-blacklist.v2.bed.gz
 gunzip hg38-blacklist.v2.bed.gz 
 mkdir bb
 bedSort hg38-blacklist.v2.bed hg38-blacklist.v2.bed
 bedToBigBed hg38-blacklist.v2.bed /hive/data/genomes/hg38/chrom.sizes bb/encBlacklist.bb  -type=bed4 -tab
 
 # bring over the manual comments track, mostly manual work
 cd ../manualComments
 ln -s ../../../../hg19/bed/specialRegions/manualComments
 mv manualComments hg19Comments
 
 cat hg19Comments/manual.bed | cut -f1-4 > hg19.bed
 ~/bin/x86_64/liftOver -tab hg19.bed /hive/data/genomes/hg19/bed/liftOver/hg19ToHg38.over.chain.gz manual.lift.bed unmapped.lift.bed
 # -- manual step! -- fixed up manual.lift.bed manually adding PAR regions
 cat hg19Comments/manual.bed | cut -f4- | sort > annots.tsv
 tabJoin annots.tsv 0 manual.lift.bed 3  > manual.bed
 # -- manual step -- fixed up manual.bed 
 hgsql hg38 -e 'select * from chromInfo' | grep chrUn | cut -f-2 |tawk '{$3=$2; $2=0; $4="unplaced"; $5="ChrUn contains clone contigs that cannot be confidently placed on a specific chromosome."; $6="none"; print}' > chrUn.bed
 
 # fixes
 hgsql hg38 -e 'select * from chromInfo' | grep _fix | cut -f-2 |tawk '{$3=$2; $2=0; $4="fix"; $5="The chr_fix chromosomes, such as chr1_KN538361v1_fix, are fix patches currently available for the hg19 and hg38 assemblies that represent changes to the existing sequence. These are generally error corrections (such as base changes, component replacements/updates, switch point updates or tiling path changes) or assembly improvements (such as extension of sequence into gaps). These fix patch scaffold sequences are given chromosome context through alignments to the corresponding chromosome regions, in the \"Mapping and Sequencing &gt; GRC Patch Release\" track. See also <a href=\"https://genome.ucsc.edu/FAQ/FAQdownloads#downloadFix\" target=_blank>the FAQ</a> for more information."; $6="none"; print}' > chrFix.bed
 
 # alts
 hgsql hg38 -e 'select * from chromInfo' | grep alt | cut -f-2 |tawk '{$3=$2; $2=0; $4="alt"; $5="The chr_alt chromosomes, such as chr5_KI270794v1_alt, are alternative sequences that differ from the reference genome currently available for a few assemblies including danRer11, hg19, and hg38. These are regions of the genome that exhibit sufficient variability to prevent adequate representation by a single sequence. UCSC labels these haplotype sequences by appending "_alt" to their names. These alternative loci scaffolds (such as KI270794.1 in the hg38 assembly, referenced as chr5_KI270794v1_alt in the browser), are mapped to the genome and provide suppemental genomic information on these variable locations. To find the regions these alternate sequences correspond to in the genome you may use the Alt Haplotypes track if one is available. See also <a href=\"https://genome.ucsc.edu/FAQ/FAQdownloads#downloadAlt\" target=_blank>the FAQ</a> for more information."; $6="none"; print}'  > chrAlt.bed
 
 # various other gene clusters
 bigBedToBed /gbdb/hg38/ncbiRefSeq/ncbiRefSeqOther.bb stdout  | less | grep -v pseudo | grep -v "T cell" -i  | grep -v tRNA | grep -v immuno | grep -v constant | grep -v miR | grep -v UGT1A | grep -v PCDHA | grep -v PCDHB | cut -f1-4,18 | tawk '{$5=$5" HGNC ID:"$4" This is a cluster of many very similar genes based on the <b>Genes and Gene Predictions &gt; NCBI RefSeq &gt; RefSeq Other</b> Track"; $4="cluster"; print}' > chrClusters.bed
 
 # put everything together and make trix files
 cat manual.bed chr*.bed | sort -k1,1 -k2,2n | tawk '{desc=$5; desc2=$6; $5="0"; $6="+"; $7=$2; $8=$3; $9="0,0,0"; $10=desc; $11=desc2; print}' > all.bed 
 bedToBigBed all.bed /hive/data/genomes/hg38/chrom.sizes comments.bb -tab -as=manual.as -type=bed9+ -extraIndex=name
 cut -f4,10 all.bed > notes.txt
 ixIxx notes.txt notes.ix notes.ixx
 
 # Tue Apr 11 02:12:18 PDT 2023
 # add the GRC exclude list, from MarkD
 cat ~markd/public_html/browser/grc-bad/GCA_000001405.15_GRCh38_GRC_exclusions.bed | grep -v description > grcExclusions.bed
 bedSort grcExclusions.bed grcExclusions.bed 
 bedToBigBed grcExclusions.bed ../../chrom.sizes bb/grcExclusions.bb -tab -type=bed4
 
 # Add highly reproducible regions (#31122):
 # https://zenodo.org/record/5275189#.ZEhKRezMI-S
 cd /hive/data/genomes/hg38/bed/problematic/
 mkdir highRepro
 
 # The zenodo link has a bunch of links with "?download=1" on the url, copy and paste them
 # into a text file to remove that and then download them
 vim urls.txt
 wget --quiet --input-file=urls.txt
 # all of the beds are just bed3's
 head *.bed
 ==> CQ-56.bed <==
 chr1    12857   12904
 chr1    12932   13028
 chr1    13129   13367
 chr1    13520   13633
 ...
 
 # so turn those into bigBeds and link everything to /gbdb
 for f in $(ls *.bed); do base=$(basename -s .bed $f); bedSort ${f} ${f}.sorted; bedToBigBed ${f}.sorted ../../../chrom.sizes ${base}.bb; done
 # for some reason tabix doesn't like to be run in a for loop from ls, needs find
 for f in $(find . -name "*.vcf.gz"); do tabix --verbosity 3 -p vcf "${f}"; done
 mkdir -p /gbdb/hg38/problematic/highRepro
 ln -s `pwd`/*.bb /gbdb/hg38/problematic/highRepro/
 ln -s `pwd`/*.vcf.gz* /gbdb/hg38/problematic/highRepro/
 
 # turns the beds into one single bed with the overlapped regions:
 for f in $(ls *.bed); do echo $f; n=${f/.bed/}; tawk -v name=${n} '{print $0,name}' $f > ${n}.bed4; done
 cat *.bed4 | sort -k1,1 -k2,2n > highRepro.allRegions
 bedOverlapMerge highRepro.allRegions | tawk '{print $0,0,".",$2,$2,"0,0,0",gensub(/\//, ",", "g",$4)}' > highRepro.merged
 bedToBigBed -type=bed9+1 -tab -as=highRepro.as highRepro.merged ../../../chrom.sizes highRepro.bb
+
+#############################################################################
+# problematicGIAB: Difficult regions from GIAB via NCBI - Megna/Gerardo
+# Redmine #34253
+
+# Megna workflow for v3.3:
+cp /cluster/home/mchalama/public_html/tracks/CoLoRSdb/genomes.txt /cluster/home/mchalama/public_html/tracks/difficult
+cp /cluster/home/mchalama/public_html/tracks/CoLoRSdb/hub.txt /cluster/home/mchalama/public_html/tracks/difficult
+cp /cluster/home/mchalama/public_html/tracks/CoLoRSdb/hg38/CoLoRSdb.html /cluster/home/mchalama/public_html/tracks/difficult
+cp /cluster/home/mchalama/public_html/tracks/CoLoRSdb/hg38/trackDb.txt /cluster/home/mchalama/public_html/tracks/difficult
+mkdir hg38
+mv /cluster/home/mchalama/public_html/tracks/difficult/CoLoRSdb.html /cluster/home/mchalama/public_html/tracks/difficult/hg38
+mv /cluster/home/mchalama/public_html/tracks/difficult/trackDb.txt /cluster/home/mchalama/public_html/tracks/difficult/hg38
+#edit genomes.txt
+#edit hub.txt; the PI contact information came from: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.3/README.md
+wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.3/GRCh38@all/Union/GRCh38_alldifficultregions.bed.gz
+wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.3/GRCh38@all/Union/GRCh38_alllowmapandsegdupregions.bed.gz
+wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.3/GRCh38@all/Union/GRCh38_notinalldifficultregions.bed.gz
+wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.3/GRCh38@all/Union/GRCh38_notinalllowmapandsegdupregions.bed.gz
+#edit trackDb.txt to include the 4 datasets
+#Convert the beds to bigBed, because plain bed files do not display properly on the hub
+bedToBigBed /cluster/home/mchalama/public_html/tracks/difficult/hg38/GRCh38_alldifficultregions.bed.gz /cluster/home/mchalama/public_html/tracks/difficult/hg38.chrom.sizes GRCh38_alldifficultregions.bb
+bedToBigBed /cluster/home/mchalama/public_html/tracks/difficult/hg38/GRCh38_alllowmapandsegdupregions.bed.gz /cluster/home/mchalama/public_html/tracks/difficult/hg38.chrom.sizes GRCh38_alllowmapandsegdupregions.bb
+bedToBigBed /cluster/home/mchalama/public_html/tracks/difficult/hg38/GRCh38_notinalldifficultregions.bed.gz /cluster/home/mchalama/public_html/tracks/difficult/hg38.chrom.sizes GRCh38_notinalldifficultregions.bb
+bedToBigBed /cluster/home/mchalama/public_html/tracks/difficult/hg38/GRCh38_notinalllowmapandsegdupregions.bed.gz /cluster/home/mchalama/public_html/tracks/difficult/hg38.chrom.sizes GRCh38_notinalllowmapandsegdupregions.bb
+#edit the html file to include information about the track
+#open genome browser
+#navigate to custom hub and paste the following URL
+#https://hgwdev.gi.ucsc.edu/~mchalama/tracks/difficult/hub.txt
+
+
+# Gerardo workflow for v3.5:
+# Copied the trackDb.txt stanzas into human/hg38/problematic.ra and copied the html description into problematic.html
+# Downloaded the v3.5 data
+cd /hive/data/genomes/hg38/bed/
+mkdir problematic; cd problematic
+mkdir GIAB; cd GIAB
+wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.5/GRCh38@all/Union/GRCh38_alldifficultregions.bb
+wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.5/GRCh38@all/Union/GRCh38_alllowmapandsegdupregions.bb
+wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.5/GRCh38@all/Union/GRCh38_notinalldifficultregions.bb
+wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.5/GRCh38@all/Union/GRCh38_notinalllowmapandsegdupregions.bb
+cd /gbdb/hg38/
+mkdir problematic; cd problematic
+mkdir GIAB; cd GIAB
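+# Made symlinks
+# NOTE(review): the files downloaded above are named GRCh38_*.bb, while the targets below omit the GRCh38_ prefix - confirm the actual file names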
+ln -s /hive/data/genomes/hg38/bed/problematic/GIAB/alldifficultregions.bb
+ln -s /hive/data/genomes/hg38/bed/problematic/GIAB/notinalldifficultregions.bb
+ln -s /hive/data/genomes/hg38/bed/problematic/GIAB/alllowmapandsegdupregions.bb
+ln -s /hive/data/genomes/hg38/bed/problematic/GIAB/notinalllowmapandsegdupregions.bb
+# Updated the bigDataUrl settings in problematic.ra and updated problematic.html
+#############################################################################