9e0c4de768fdd79c8c5332814c2b13db5a84fe12
chmalee
  Wed May 10 16:09:20 2023 -0700
Add highly reproducible regions track, refs #31122

diff --git src/hg/makeDb/doc/hg38/problematic.txt src/hg/makeDb/doc/hg38/problematic.txt
index 7ed6ea6..27dd8cf 100644
--- src/hg/makeDb/doc/hg38/problematic.txt
+++ src/hg/makeDb/doc/hg38/problematic.txt
@@ -29,15 +29,41 @@
 
 # various other gene clusters
 bigBedToBed /gbdb/hg38/ncbiRefSeq/ncbiRefSeqOther.bb stdout  | less | grep -v pseudo | grep -v "T cell" -i  | grep -v tRNA | grep -v immuno | grep -v constant | grep -v miR | grep -v UGT1A | grep -v PCDHA | grep -v PCDHB | cut -f1-4,18 | tawk '{$5=$5" HGNC ID:"$4" This is a cluster of many very similar genes based on the <b>Genes and Gene Predictions &gt; NCBI RefSeq &gt; RefSeq Other</b> Track"; $4="cluster"; print}' > chrClusters.bed
 
 # put everything together and make trix files
 cat manual.bed chr*.bed | sort -k1,1 -k2,2n | tawk '{desc=$5; desc2=$6; $5="0"; $6="+"; $7=$2; $8=$3; $9="0,0,0"; $10=desc; $11=desc2; print}' > all.bed 
 bedToBigBed all.bed /hive/data/genomes/hg38/chrom.sizes comments.bb -tab -as=manual.as -type=bed9+ -extraIndex=name
 cut -f4,10 all.bed > notes.txt
 ixIxx notes.txt notes.ix notes.ixx
 
 # Tue Apr 11 02:12:18 PDT 2023
 # add the GRC exclude list, from MarkD
 cat ~markd/public_html/browser/grc-bad/GCA_000001405.15_GRCh38_GRC_exclusions.bed | grep -v description > grcExclusions.bed
 bedSort grcExclusions.bed grcExclusions.bed 
 bedToBigBed grcExclusions.bed ../../chrom.sizes bb/grcExclusions.bb -tab -type=bed4
+
+# Add highly reproducible regions (#31122):
+# https://zenodo.org/record/5275189#.ZEhKRezMI-S
+cd /hive/data/genomes/hg38/bed/problematic/
+mkdir -p highRepro && cd highRepro
+
+# The Zenodo record lists every file with "?download=1" appended to its URL. Paste those
+# links into urls.txt, strip that suffix, then let wget fetch the whole list into highRepro/
+vim urls.txt
+wget --quiet --input-file=urls.txt
+# all of the BED files are plain bed3 (chrom, start, end), as head shows:
+head *.bed
+==> CQ-56.bed <==
+chr1    12857   12904
+chr1    12932   13028
+chr1    13129   13367
+chr1    13520   13633
+...
+
+# so turn those into bigBeds and link everything to /gbdb
+for f in $(ls *.bed); do base=$(basename -s .bed $f); bedSort ${f} ${f}.sorted; bedToBigBed ${f}.sorted../../../chrom.sizes ${base}.bb; done
+# index each VCF; tabix did not work in a for loop over ls output, so let find pass it each path directly
+find . -name "*.vcf.gz" -exec tabix --verbosity 3 -p vcf {} \;
+mkdir -p /gbdb/hg38/problematic/highRepro
+ln -s "$PWD"/*.bb /gbdb/hg38/problematic/highRepro/
+ln -s "$PWD"/*.vcf.gz* /gbdb/hg38/problematic/highRepro/