cbe7391bbaffe17d27056884a9c33d0489e57f90
max
  Mon Mar 9 11:39:29 2020 -0700
adding hg19 analysis set, refs #25100

diff --git src/hg/makeDb/doc/hg19.analysisSet.txt src/hg/makeDb/doc/hg19.analysisSet.txt
new file mode 100644
index 0000000..aa327af
--- /dev/null
+++ src/hg/makeDb/doc/hg19.analysisSet.txt
@@ -0,0 +1,41 @@
+# get NCBI's analysis set fasta files
+mkdir /hive/data/genomes/hg19/bed/analysisSet
+cd /hive/data/genomes/hg19/bed/analysisSet
+wget https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/seqs_for_alignment_pipelines/GCA_000001405.14_GRCh37.p13_full_analysis_set.fna.gz 
+wget https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/seqs_for_alignment_pipelines/GCA_000001405.14_GRCh37.p13_no_alt_analysis_set.fna.gz 
+
+# 298 unique sequence lengths so can use them to join with UCSC sequences
+cat /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/bigZips/p13.plusMT/hg19.p13.plusMT.chrom.sizes | grep -v 'chrM'$'\t' | sort -k2n > ucscSizes.txt
+faSize GCA_000001405.14_GRCh37.p13_full_analysis_set.fna.gz -detailed | sort -k2n | grep -v chrEBV > ncbiSizes.txt
+# make sure that they match
+paste ncbiSizes.txt ucscSizes.txt  |awk '($1!=$3)' | awk '($2!=$4)'
+# create sed replacement file
+paste ncbiSizes.txt ucscSizes.txt  | awk '($1!=$3) {OFS="\t"; print($1,$3);} ' > ncbiToUcsc.txt
+cat ncbiToUcsc.txt  | awk '{print ("s/"$1"/"$2"/g")}' > ncbiToUcsc.sed
+
+# convert NCBI chroms to UCSC names; both conversions run in the foreground so each .fa is completely written before gzip touches it
+zcat GCA_000001405.14_GRCh37.p13_full_analysis_set.fna.gz | sed -f ncbiToUcsc.sed > hg19.p13.plusMT.full_analysis_set.fa
+zcat GCA_000001405.14_GRCh37.p13_no_alt_analysis_set.fna.gz | sed -f ncbiToUcsc.sed > hg19.p13.plusMT.no_alt_analysis_set.fa
+gzip hg19.p13.plusMT.no_alt_analysis_set.fa
+gzip hg19.p13.plusMT.full_analysis_set.fa
+
+# not sure if anyone needs a 2bit file for these?
+faToTwoBit hg19.p13.plusMT.no_alt_analysis_set.fa.gz hg19.p13.plusMT.no_alt_analysis_set.2bit 
+faToTwoBit hg19.p13.plusMT.full_analysis_set.fa.gz hg19.p13.plusMT.full_analysis_set.2bit
+
+# make table for g1k genome: names are joined by sequence length, so BOTH size lists must be sorted by size before paste pairs them row by row
+twoBitInfo hg19.p13.plusMT.no_alt_analysis_set.2bit stdout | grep -v 'chrEBV' | grep -v 'chrM'$'\t' | sort -k2n > ucscSizes.noAlt.txt
+wget ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz
+faSize -detailed human_g1k_v37.fasta.gz | sort -k2n > g1k.sizes
+paste g1k.sizes ucscSizes.noAlt.txt | cut -f1,3 > g1kToUcsc.txt
+
+# make table for Ensembl genome: sizes are sorted by length to line up with ucscSizes.noAlt.txt, and faSize runs in the foreground so the file is complete before paste reads it
+wget http://ftp.ensembl.org/pub/grch37/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna_sm.primary_assembly.fa.gz
+faSize -detailed Homo_sapiens.GRCh37.dna_sm.primary_assembly.fa.gz | sort -k2n > ensemblSizes.txt
+paste ensemblSizes.txt ucscSizes.noAlt.txt | cut -f1,3 > ensemblToUcsc.txt
+
+# copy stuff to webserver
+mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/bigZips/analysisSet/chromAlias -p
+mv *.2bit /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/bigZips/analysisSet/
+mv hg19.*.gz /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/bigZips/analysisSet/
+mv *toUcsc.txt /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/bigZips/analysisSet/chromAlias/