src/hg/makeDb/doc/hg38/ncbiRefSeq.txt be4311c07e14feb728abc6425ee606ffaa611a58

be4311c07e14feb728abc6425ee606ffaa611a58
markd
  Fri Jan 22 06:46:58 2021 -0800
merge with master

diff --git src/hg/makeDb/doc/hg38/ncbiRefSeq.txt src/hg/makeDb/doc/hg38/ncbiRefSeq.txt
index b443fcb..d65f817 100644
--- src/hg/makeDb/doc/hg38/ncbiRefSeq.txt
+++ src/hg/makeDb/doc/hg38/ncbiRefSeq.txt
@@ -1,339 +1,340 @@
 # for emacs: -*- mode: sh; -*-
 
 # This file describes the construction of NCBI RefSeq track -- latest first
 
 #########################################################################
 # ncbiRefSeq.p11 Genes (DONE - 2018-01-09 - Angie)
 # Previously done 2017-09-28; redone 11-16 to include mito "rna" from chrM genomic seq;
 # redone 2018-01-09 to pick up changes in NCBI download files from 2017-12-22
     mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p11.2018-01-09
     cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p11.2018-01-09
 
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       refseq vertebrate_mammalian Homo_sapiens \
       GCF_000001405.37_GRCh38.p11 hg38) > do.log 2>&1 & tail -f do.log
     # *** All done !  Elapsed time: 23m51s
     # real    23m51.091s
 
     cat fb.ncbiRefSeq.hg38.txt
     # 129593530 bases of 3049335806 (4.250%) in intersection
 
 #########################################################################
 
 #########################################################################
 # ncbiRefSeq.p8 Genes (DONE - 2016-08-18 - Hiram)
 
     mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p8
     cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p8
 
     # running step wise as this script is still under development
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -bigClusterHub=ku -dbHost=hgwdev \
       -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
       refseq vertebrate_mammalian Homo_sapiens \
       GCF_000001405.34_GRCh38.p8 hg38) > download.log 2>&1
     # real    12m35.601s
 
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -continue=process -bigClusterHub=ku -dbHost=hgwdev \
       -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
       refseq vertebrate_mammalian Homo_sapiens \
       GCF_000001405.34_GRCh38.p8 hg38) > process.log 2>&1
     # real    14m18.953s
 
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -continue=load -bigClusterHub=ku -dbHost=hgwdev \
       -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
       refseq vertebrate_mammalian Homo_sapiens \
       GCF_000001405.34_GRCh38.p8 hg38) > load.log 2>&1
     # real    0m59.189s
 
     cat fb.ncbiRefSeq.hg38.txt
     #  141131948 bases of 3049335806 (4.628%) in intersection
 
 #############################################################################
 
 #########################################################################
 # ncbiRefSeq.p7 Genes (DONE - 2016-05-20 - Hiram)
 
     mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p7
     cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p7
 
     # running step wise as this script is still under development
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -bigClusterHub=ku -dbHost=hgwdev \
       -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
       refseq vertebrate_mammalian Homo_sapiens \
       GCF_000001405.33_GRCh38.p7 hg38) > download.log 2>&1
     # real    12m35.601s
 
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -continue=process -bigClusterHub=ku -dbHost=hgwdev \
       -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
       refseq vertebrate_mammalian Homo_sapiens \
       GCF_000001405.33_GRCh38.p7 hg38) > process.log 2>&1
     # real    14m18.953s
 
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -continue=load -bigClusterHub=ku -dbHost=hgwdev \
       -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
       refseq vertebrate_mammalian Homo_sapiens \
       GCF_000001405.33_GRCh38.p7 hg38) > load.log 2>&1
     # real    0m59.189s
 
     cat fb.ncbiRefSeq.hg38.txt
     #  141131948 bases of 3049335806 (4.628%) in intersection
 
 #############################################################################
 mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq
 cd /hive/data/genomes/hg38/bed/ncbiRefSeq
 
 export asmName="GCF_000001405.31_GRCh38.p5"
 
 # most of these files could be linked in from the refseq download, for example:
 ln -s /hive/data/genomes/hg38/refseq/${asmName}_rna.gbff.gz ./
 
 # or, fetched freshly from  NCBI refseq assemblies:
 for F in ${asmName}_genomic.fna.gz ${asmName}_genomic.gff.gz \
    ${asmName}_rna.fna.gz ${asmName}_assembly_report.txt
 do
  time rsync -L -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/${asmName}/${F} ./
 done
 
 # measure these items:
 
 faCount ${asmName}_rna.fna.gz > rna.faCount.txt
 faCount ${asmName}_genomic.fna.gz > genomic.faCount.txt
 
 egrep -v "^#seq|^total" rna.faCount.txt | cut -f1,2 \
    | sort -k2nr > rna.chrom.sizes
 egrep -v "^#seq|^total" genomic.faCount.txt | cut -f1,2 \
    | sort -k2nr > genomic.chrom.sizes
 
 # this constructes the unlifted genePred file with NCBI chrom names
 gff3ToGenePred -useName -attrsOut=${asmName}.attrs.useName.txt \
     ${asmName}_genomic.gff.gz \
        ${asmName}.useName.gp
 
 # extract additional names from column nine to use as an index for
 #  the variety of names that identify each item
 ~/kent/src/hg/makeDb/doc/hg38/gffToIx.pl ${asmName}_genomic.gff.gz \
    > ${asmName}_genomic.ix.txt
 
 # for psl alignments, only need to work on the items with Gap specifications
 # The NG_ named items do not function correctly
 zgrep -v "NG_" ${asmName}_genomic.gff.gz | grep "Gap=" >> only.gap.gff
 
 # construct the PSL alignments from the Gap records
 sed -e 's/cDNA_match/match/;' only.gap.gff \
   | gff3ToPsl genomic.chrom.sizes rna.chrom.sizes stdin ${asmName}.psl
 
 # this doesn't work, too many UCSC names missing from the NCBI file:
 grep -v "^#" ${asmName}_assembly_report.txt | sed -e 's/^M//g;' \
     | cut -f7,9,10 \
        |awk -F'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1, $2, $3, $2}' \
            > insdcToHg38.lift
 
 # the automated hubs process does construct a good NCBI to UCSC lift file:
 
 export liftFile="/hive/data/inside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/${asmName}/${asmName}.ncbiToUcsc.lift"
 
 # liftUp the genePred from NCBI chrom names to UCSC chrom names
 liftUp -extGenePred -type=.gp stdout \
      ${liftFile} warn ${asmName}.useName.gp | gzip -c \
         > ${asmName}.ucsc.gp.gz
 touch -r ${asmName}.useName.gp ${asmName}.ucsc.gp.gz
 # some items will fail because they are on patches or alternative sequence
 # from Patch 5 that does not exist in hg38
 genePredCheck -db=hg38 ${asmName}.ucsc.gp.gz
 # checked: 150709 failed: 1455
 
 # filter out the patches and alternative sequences that are not in hg38
 genePredFilter -db=hg38 ${asmName}.ucsc.gp.gz stdout \
     | gzip -c > ${asmName}.hg38.gp.gz
 # this should be perfectly good
 genePredCheck -db=hg38 ${asmName}.hg38.gp.gz
 # checked: 149254 failed: 0
 
 
 # liftUp the psl file from NCBI chrom names to UCSC chrom names
 pslSwap ${asmName}.psl stdout \
    | liftUp -type=.psl stdout ${liftFile} drop stdin \
       | gzip -c > ${asmName}.ucsc.psl.gz
 touch -r ${asmName}.psl ${asmName}.ucsc.psl.gz
 # same problem here as above, not everything is in hg38,
 #   filter patch 5 items out:
 pslCheck -pass=stdout -db=hg38 ${asmName}.ucsc.psl.gz 2> /dev/null \
   | gzip -c > ${asmName}.hg38.psl.gz
 pslCheck -db=hg38 ${asmName}.hg38.psl.gz
 # checked: 1062 failed: 0 errors: 0
 
 # separate categories of genes:
 
 zegrep "^N(M|R)|^YP" ${asmName}.hg38.gp.gz > curated.gp
 zegrep "^X(M|R)" ${asmName}.hg38.gp.gz > predicted.gp
 zegrep -v "^N(M|R)|^YP|X(M|R)" ${asmName}.hg38.gp.gz > other.gp
 
 # verify none lost
 wc -l curated.gp predicted.gp other.gp
    54869 curated.gp
    88703 predicted.gp
     5682 other.gp
   149254 total
 
 zcat ${asmName}.hg38.gp.gz | wc -l
 149254
 
 # extract CDS records from the rna.gbff.gz for bigPsl construction:
 ~/kent/src/hg/makeDb/doc/hg38/gbffToCds.pl ${asmName}_rna.gbff.gz > rna.cds
 
 # ensure simple names are in the rna.fa file
 zcat ${asmName}_rna.fna.gz | sed -e 's/ .*//;' > rna.fa
 pslToBigPsl -fa=rna.fa -cds=rna.cds ${asmName}.hg38.psl.gz test.bigPsl
 sort -k1,1 -k2,2n test.bigPsl > ${asmName}.hg38.bigPsl
 bedToBigBed -type=bed12+12 -tab -as=$HOME/kent/src/hg/lib/bigPsl.as \
   ${asmName}.hg38.bigPsl ../../chrom.sizes ${asmName}.hg38.bigPsl.bb
 
 # loading genePred tracks:
 
 hgLoadGenePred -genePredExt hg38 ncbiRefSeqCurated curated.gp
 genePredCheck -db=hg38 ncbiRefSeqCurated
 # checked: 54869 failed: 0
 
 hgLoadGenePred -genePredExt hg38 ncbiRefSeqPredicted predicted.gp
 genePredCheck -db=hg38 ncbiRefSeqPredicted
 # checked: 88703 failed: 0
 
 hgLoadGenePred -genePredExt hg38 ncbiRefSeqOther other.gp
 genePredCheck -db=hg38 ncbiRefSeqOther
 # checked: 5682 failed: 0
 
 ########## early experiment, not used later
 # # and the bigPsl file:
 # mkdir -p /gbdb/hg38/bbi/ncbiRefSeq
 # ln -s `pwd`/${asmName}.hg38.bigPsl.bb /gbdb/hg38/bbi/ncbiRefSeqBigPsl.bb
 # hgBbiDbLink hg38 ncbiRefSeqBigPsl /gbdb/hg38/bbi/ncbiRefSeqBigPsl.bb
 ########## early experiment, not used later
 
 #############################################################################
 # addition of HGMD-restricted subset, Max, Jan 29 2019
 # updated Dec 2019
-cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2019-12-06/
+# updated Dec 2020
+cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2020-10-27/
 # change in 2019: ignore the version numbers, otherwise only 1815 transcripts left, big update by HGMD in 2019?
 # adding "." so NM_123 doesn't match NM_123123                                                                             
 cat /hive/data/outside/hgmd/$year.4-hgmd-public_hg38.tsv | cut -f7 | cut -d. -f1 | sort -u | awk '{print $1"."}' > hgmdTranscripts.txt
 zcat process/hg38.curated.gp.gz | fgrep -f hgmdTranscripts.txt - > hgmd.curated.gp
 hgLoadGenePred -genePredExt hg38 ncbiRefSeqHgmd hgmd.curated.gp
 #############################################################################
 # ncbiRefSeq.p13 update (DONE - 2019-12-06 - Hiram)
 
 # current version information
     cat /gbdb/hg38/ncbiRefSeq/ncbiRefSeqVersion.txt
     # NCBI Homo sapiens Annotation Release 109 (2018-03-29)
 
 # Version information from the file:
 
 # /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/
 #   all_assembly_versions/GCF_000001405.39_GRCh38.p13/
 #   GCF_000001405.39_GRCh38.p13_genomic.gff.gz
 
 #!annotation-date 09/05/2019
 #!annotation-source NCBI Homo sapiens Updated Annotation Release 109.20190905
 
     mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2019-12-06
     cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2019-12-06
 
     ### BEFORE loading this updated table
 
     featureBits -countGaps hg38 ncbiRefSeq
  # 134109466 bases of 3257347282 (4.117%) in intersection
 
     featureBits -enrichment hg38 refGene ncbiRefSeq
  # refGene 3.098%, ncbiRefSeq 4.332%, both 2.920%, cover 94.23%, enrich 21.75x
 
     featureBits -enrichment hg38 ncbiRefSeq refGene
  # ncbiRefSeq 4.332%, refGene 3.098%, both 2.920%, cover 67.40%, enrich 21.75x
 
     featureBits -enrichment hg38 ncbiRefSeqCurated refGene
  # ncbiRefSeqCurated 2.880%, refGene 3.098%, both 2.846%, cover 98.84%, enrich 31.90x
 
     featureBits -enrichment hg38 refGene ncbiRefSeqCurated
  # refGene 3.098%, ncbiRefSeqCurated 2.880%, both 2.846%, cover 91.86%, enrich 31.90x
 
     # running step wise just to be careful
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -bigClusterHub=ku -dbHost=hgwdev \
       -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
       refseq vertebrate_mammalian Homo_sapiens \
       GCF_000001405.39_GRCh38.p13 hg38) > download.log 2>&1
     # real    3m23.090s
 
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -continue=process -bigClusterHub=ku -dbHost=hgwdev \
       -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
       refseq vertebrate_mammalian Homo_sapiens \
       GCF_000001405.39_GRCh38.p13 hg38) > process.log 2>&1
     # real    6m10.922s
 
 
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -continue=load -bigClusterHub=ku -dbHost=hgwdev \
       -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
       refseq vertebrate_mammalian Homo_sapiens \
       GCF_000001405.39_GRCh38.p13 hg38) > load.log 2>&1
     # real    0m41.366s
 
     ### AFTER loading this updated table
     # compare this result:
     cat fb.ncbiRefSeq.hg38.txt
     # 136778258 bases of 3095998939 (4.418%) in intersection
 
     # with previous version existing table (from fb above):
     # 134109466 bases of 3257347282 (4.117%) in intersection
 
     featureBits -enrichment hg38 refGene ncbiRefSeq
  # refGene 3.098%, ncbiRefSeq 4.418%, both 3.073%, cover 99.19%, enrich 22.45x
  # previous:
  # refGene 3.098%, ncbiRefSeq 4.332%, both 2.920%, cover 94.23%, enrich 21.75x
 
     featureBits -enrichment hg38 ncbiRefSeq refGene
  # ncbiRefSeq 4.418%, refGene 3.098%, both 3.073%, cover 69.56%, enrich 22.45x
  # previous:
  # ncbiRefSeq 4.332%, refGene 3.098%, both 2.920%, cover 67.40%, enrich 21.75x
 
     featureBits -enrichment hg38 ncbiRefSeqCurated refGene
  # ncbiRefSeqCurated 3.073%, refGene 3.098%, both 3.067%, cover 99.81%, enrich 32.22x
  # previous:
  # ncbiRefSeqCurated 2.880%, refGene 3.098%, both 2.846%, cover 98.84%, enrich 31.90x
 
     featureBits -enrichment hg38 refGene ncbiRefSeqCurated
  # refGene 3.098%, ncbiRefSeqCurated 3.073%, both 3.067%, cover 98.99%, enrich 32.22x
  # previous:
  # refGene 3.098%, ncbiRefSeqCurated 2.880%, both 2.846%, cover 91.86%, enrich 31.90x
 
 #########################################################################
 # addition of RefSeq Select-restricted subset, Max, Feb 10 2019
 cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2019-12-06/
 zcat download/*_genomic.gff.gz | egrep 'tag=(RefSeq|MANE) Select'  | cut -f9- | tr ';' '\n' | grep Name= | grep -v NP_ | cut -d= -f2 | sort -u > refseqSelectTranscripts.txt
 cat process/hg38.curated.gp | fgrep -f refseqSelectTranscripts.txt - > refseqSelect.curated.gp
 hgLoadGenePred -genePredExt hg38 ncbiRefSeqSelect refseqSelect.curated.gp
 wc -l refseqSelect.curated.gp
 21071 refseqSelect.curated.gp
 #############################################################################
 
 #############################################################################
 # update 2020-10-27 (DONE - Hiram - 2020-10-27)
 
   mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2020-10-27
   cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2020-10-27
 
   time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -bigClusterHub=ku -dbHost=hgwdev \
       -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \
       GCF_000001405.39_GRCh38.p13 hg38) > do.log 2>&1 &
   # real    11m46.506s
 
   cat fb.ncbiRefSeq.hg38.txt
   # 137385668 bases of 3110768607 (4.416%) in intersection
 
 #############################################################################