d13887bf1bc4a13e590d7c1ff16b2d81b2cdb18e
hiram
  Mon Sep 9 10:01:11 2019 -0700
added chromAlias table no redmine

diff --git src/hg/makeDb/doc/pteVam1.txt src/hg/makeDb/doc/pteVam1.txt
index 78e0e90..f208627 100644
--- src/hg/makeDb/doc/pteVam1.txt
+++ src/hg/makeDb/doc/pteVam1.txt
@@ -1,325 +1,374 @@
 # for emacs: -*- mode: sh; -*-
 
 # Pteropus vampyrus
 
 #	http://www.ncbi.nlm.nih.gov/bioproject/20325
 #	http://www.ncbi.nlm.nih.gov/genome/757
 #	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRP00
 
 #########################################################################
 # DOWNLOAD SEQUENCE
     ssh kkstore05
     mkdir /cluster/store12/pteVam1
     ln -s /cluster/store12/pteVam1 /cluster/data
     mkdir /cluster/data/pteVam1/broad
     cd /cluster/data/pteVam1/broad
 
     wget --timestamping \
 ftp://ftp.broad.mit.edu/pub/assemblies/mammals/megabat/Ptevap1.0/assembly.agp \
 ftp://ftp.broad.mit.edu/pub/assemblies/mammals/megabat/Ptevap1.0/assembly.bases.gz \
 ftp://ftp.broad.mit.edu/pub/assemblies/mammals/megabat/Ptevap1.0/assembly.quals.gz
     md5sum ass* > assembly.md5sum
 
     qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin pteVam1.qual.qac
 
     cut -f 1 assembly.agp | uniq -c | wc -l
 # Number of scaffolds: 96944
 
 #########################################################################
 # Create .ra file and run makeGenomeDb.pl
     ssh kolossus
     cd /cluster/data/pteVam1
 cat << _EOF_ >pteVam1.config.ra
 # Config parameters for makeGenomeDb.pl:
 db pteVam1
 clade mammal
 genomeCladePriority 35
 scientificName Pteropus vampyrus
 commonName Megabat
 assemblyDate Jul. 2008
 assemblyLabel Broad Institute pteVam1
 orderKey 233.5
 #mitoAcc AJ222767
 mitoAcc none
 fastaFiles /cluster/data/pteVam1/broad/assembly.bases.gz
 agpFiles /cluster/data/pteVam1/broad/assembly.agp
 qualFiles /cluster/data/pteVam1/broad/pteVam1.qual.qac
 dbDbSpeciesDir megabat
 _EOF_
 
 # use 'screen' make sure on kkstore05
     makeGenomeDb.pl -workhorse kolossus pteVam1.config.ra > makeGenomeDb.out 2>&1 &
 
     cut -f 2 chrom.sizes | ave stdin
 # Q1 1162.000000
 # median 1777.000000
 # Q3 9074.500000
 # average 20589.994327
 # min 601.000000
 # max 1019178.000000
 # count 96944
 # total 1996076410.000000
 # standard deviation 54685.855293
 
 #########################################################################
 ## Repeat Masker (DONE - 2008-10-16 - Hiram)
     screen	# to manage this several day job
     mkdir /hive/data/genomes/pteVam1/bed/repeatMasker
     cd /hive/data/genomes/pteVam1/bed/repeatMasker
     time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
 	-workhorse=hgwdev -bigClusterHub=swarm \
 	-buildDir=`pwd` pteVam1 > do.log 2>&1 &
     #	real    806m48.065s
     twoBitToFa pteVam1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt
     #	1996076410 bases (156639750 N's 1839436660 real 1256029530 upper
     #	583407130 lower) in 96944 sequences in 1 files
     #	%29.23 masked total, %31.72 masked real
 
 #########################################################################
 # SIMPLE REPEATS TRF (WORKING - 2008-10-15 - Hiram)
     screen # use a screen to manage this job
     mkdir /cluster/data/pteVam1/bed/simpleRepeat
     cd /cluster/data/pteVam1/bed/simpleRepeat
     #
     time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
 	-buildDir=/cluster/data/pteVam1/bed/simpleRepeat pteVam1 > do.log 2>&1 &
     #	real    98m59.097
     cat fb.simpleRepeat
     #	33179383 bases of 1839436660 (1.804%) in intersection
 
 XXX - waiting for RM to complete 2008-10-15 13:32
     #	after RM run is done, add this mask:
     cd /hive/data/genomes/pteVam1
     rm pteVam1.2bit
     twoBitMask pteVam1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed pteVam1.2bit
     #	can safely ignore warning about >=13 fields in bed file
 
     twoBitToFa pteVam1.2bit stdout | faSize stdin > pteVam1.2bit.faSize.txt
 # 1996076410 bases (156639750 N's 1839436660 real 1255425444 upper 584011216
 # lower) in 96944 sequences in 1 files
 # %29.26 masked total, %31.75 masked rea
 
     #	link to gbdb
     ln -s `pwd`/pteVam1.2bit /gbdb/pteVam1
 
 ###########################################################################
 # prepare for kluster runs (DONE _ 2008-10-16 - Hiram)
     # compare to size of real bases to adjust the repMatch
     #	hg18: 2881421696
     #	pteVam1: 1839436660
     # thus: 1024 * 1839436660/2881421696 = 653
     #	rounding up to 700 for a bit more conservative masking
     cd /hive/data/genomes/pteVam1
     time blat pteVam1.2bit \
 	/dev/null /dev/null -tileSize=11 -makeOoc=pteVam1.11.ooc -repMatch=700
     #	Wrote 19840 overused 11-mers to pteVam1.11.ooc
     #	real    1m44.676s
 
     #	and staging data for push to kluster nodes
     mkdir /hive/data/staging/data/pteVam1
     cp -p pteVam1.2bit chrom.sizes pteVam1.11.ooc \
 	/hive/data/staging/data/pteVam1
     #	request to cluster admin to push this to the kluster nodes
     #	/scratch/data/
 
 ###########################################################################
 # add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
     hgsql -e 'update dbDb set
 sourceName="Broad Institute pteVam1 (NCBI project 20325, ABRP01000000)" where name="pteVam1";' hgcentraltest
 
 ############################################################################
 #  pteVam1 - Megabat - Ensembl Genes version 51  (DONE - 2008-12-02 - hiram)
     ssh kkr14u07
     cd /hive/data/genomes/pteVam1
     cat << '_EOF_' > pteVam1.ensGene.ra
 # required db variable
 db pteVam1
 # do we need to translate geneScaffold coordinates
 geneScaffolds yes
 # ignore genes that do not properly convert to a gene pred, and contig
 #       names that are not in the UCSC assembly
 skipInvalid yes
 # ignore the two genes that have invalid structures from Ensembl:
 #       6381: ENSPVAT00000012919 no exonFrame on CDS exon 14
 #       23522: ENSPVAT00000010661 no exonFrame on CDS exon 0
 '_EOF_'
 #  << happy emacs
 
     doEnsGeneUpdate.pl -ensVersion=51 pteVam1.ensGene.ra
     ssh hgwdev
     cd /hive/data/genomes/pteVam1/bed/ensGene.51
     featureBits pteVam1 ensGene
     # 28722584 bases of 1839436660 (1.561%) in intersection
 
  *** All done!  (through the 'makeDoc' step)
  *** Steps were performed in /hive/data/genomes/pteVam1/bed/ensGene.51
 
 ############################################################################
 # cpgIslands - (DONE - 2011-04-24 - Hiram)
     mkdir /hive/data/genomes/pteVam1/bed/cpgIslands
     cd /hive/data/genomes/pteVam1/bed/cpgIslands
     time doCpgIslands.pl pteVam1 > do.log 2>&1
     #   real    239m18.855s
 
     cat fb.pteVam1.cpgIslandExt.txt
     #   49493178 bases of 1839436660 (2.691%) in intersection
 
 #########################################################################
 # genscan - (DONE - 2011-04-26 - Hiram)
     mkdir /hive/data/genomes/pteVam1/bed/genscan
     cd /hive/data/genomes/pteVam1/bed/genscan
     time doGenscan.pl pteVam1 > do.log 2>&1
     #   Elapsed time: 356m2s
 
     cat fb.pteVam1.genscan.txt
     #   55912473 bases of 1839436660 (3.040%) in intersection
     cat fb.pteVam1.genscanSubopt.txt
     #   60619613 bases of 1839436660 (3.296%) in intersection
 
 #########################################################################
 # windowMasker - (DONE - 2012-05-02 - Hiram)
     screen -S pteVam1
     mkdir /hive/data/genomes/pteVam1/bed/windowMasker
     cd /hive/data/genomes/pteVam1/bed/windowMasker
     # trying out new version of the script that does all the usual steps
     #   that used to be performed manually after it was done
     time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
         -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev pteVam1 > do.log 2>&1
     #   real    278m34.535s
     # fixing the last couple of commands from doLoadClean step
     time ./lastClean.csh
     # real    4m47.606s
 
     cat fb.pteVam1.windowmaskerSdust.clean.txt
     #   514392905 bases of 1996076410 (25.770%) in intersection
 
     cat faSize.pteVam1.cleanWMSdust.txt
     #   1996076410 bases (156639750 N's 1839436660 real 1325043755 upper 514392905 lower) in 96944 sequences in 1 files
     #   Total size: mean 20590.0 sd 54686.1 min 601 (scaffold_96943) max 1019178 (scaffold_0) median 1777
     #   %25.77 masked total, %27.96 masked real
 
      cat fb.pteVam1.rmsk.windowmaskerSdust.txt
     #   238443817 bases of 1996076410 (11.946%) in intersection
 
     time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
         -continue=cleanup -workhorse=hgwdev -buildDir=`pwd` \
         -dbHost=hgwdev pteVam1 > cleanup.log 2>&1
     #   real    2m19.430s
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
     # Use -repMatch=900, based on size -- for human we use 1024
     # use the "real" number from the faSize measurement,
     # hg19 is 2897316137, calculate the ratio factor for 1024:
     calc \( 1839436660 / 2897316137 \) \* 1024
     #	( 1839436660 / 2897316137 ) * 1024 = 650.113088
 
     # round up to 700
 
     cd /hive/data/genomes/pteVam1
     time blat pteVam1.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=jkStuff/pteVam1.11.ooc -repMatch=700
     #   Wrote 19840 overused 11-mers to jkStuff/pteVam1.11.ooc
     #   real    0m31.002s
 
     # there are no non-bridged gaps, no lift file needed for genbank
     hgsql -N -e "select bridge from gap;" pteVam1 | sort | uniq -c
     #   291864 yes
 #    cd /hive/data/genomes/pteVam1/jkStuff
 #    gapToLift pteVam1 pteVam1.nonBridged.lift -bedFile=pteVam1.nonBridged.bed
     # largest non-bridged contig:
 #    awk '{print $3-$2,$0}' pteVam1.nonBridged.bed | sort -nr | head
     #   123773608 chrX  95534   123869142       chrX.01
 
 #########################################################################
 # AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
     # examine the file:
     /cluster/data/genbank/data/organism.lst
     # for your species to see what counts it has for:
 # organism       mrnaCnt estCnt  refSeqCnt
 # Pteropus vampyrus	2	1	0
     # to decide which "native" mrna or ests you want to specify in genbank.conf
 
     ssh hgwdev
     cd $HOME/kent/src/hg/makeDb/genbank
     git pull
     # edit etc/genbank.conf to add pteVam1 just after ce2
 # pteVam1 (megabat)
 pteVam1.serverGenome = /hive/data/genomes/pteVam1/pteVam1.2bit
 pteVam1.clusterGenome = /hive/data/genomes/pteVam1/pteVam1.2bit
 pteVam1.ooc = /hive/data/genomes/pteVam1/jkStuff/pteVam1.11.ooc
 pteVam1.lift = no
 pteVam1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
 pteVam1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
 pteVam1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
 pteVam1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
 pteVam1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
 pteVam1.refseq.mrna.native.load = no
 pteVam1.refseq.mrna.xeno.load = yes
 pteVam1.genbank.mrna.xeno.load = yes
 pteVam1.genbank.est.native.load = no
 pteVam1.downloadDir = pteVam1
 pteVam1.perChromTables = no
 
     # end of section added to etc/genbank.conf
     git commit -m "adding pteVam1 megabat" etc/genbank.conf
     git push
     make etc-update
 
     git pull
     # Edit src/lib/gbGenome.c to add new species.
     git commit -m "adding definition for pteVamNames" src/lib/gbGenome.c
     git push
     make install-server
 
     ssh hgwdev			# used to do this on "genbank" machine
     screen -S pteVam1           # long running job managed in screen
     cd /cluster/data/genbank
     time nice -n +19 ./bin/gbAlignStep -initial pteVam1 &
     #   var/build/logs/2012.06.08-09:55:28.pteVam1.initalign.log
     #   real    1862m46.698s
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad pteVam1 &
     #	logFile:  var/dbload/hgwdev/logs/2012.06.11-13:09:05.dbload.log
     #   real    14m48.000s
 
     # enable daily alignment and update of hgwdev (DONE - 2012-05-11 - Hiram)
     cd ~/kent/src/hg/makeDb/genbank
     git pull
     # add pteVam1 to:
     vi etc/align.dbs etc/hgwdev.dbs
     git commit -m "Added pteVam1." etc/align.dbs etc/hgwdev.dbs
     git push
     make etc-update
 
 #########################################################################
 # set default position to RHO gene displays  (DONE - 2012-07-24 - Hiram)
     hgsql -e \
 'update dbDb set defaultPos="scaffold_8765:35746-39000" where name="pteVam1";' \
 	hgcentraltest
 
 ############################################################################
 # pushQ entry (DONE - 2012-07-24 - Hiram)
     mkdir /hive/data/genomes/pteVam1/pushQ
     cd /hive/data/genomes/pteVam1/pushQ
     # Mark says don't let the transMap track get there
     time makePushQSql.pl pteVam1 2> stderr.txt | grep -v transMap > pteVam1.sql
     #   real    3m39.590s
     # check the stderr.txt for bad stuff, these kinds of warnings are OK:
 # WARNING: hgwdev does not have /gbdb/pteVam1/wib/gc5Base.wib
 # WARNING: hgwdev does not have /gbdb/pteVam1/wib/quality.wib
 # WARNING: hgwdev does not have /gbdb/pteVam1/bbi/quality.bw
 # WARNING: pteVam1 does not have seq
 # WARNING: pteVam1 does not have extFile
 # WARNING: pteVam1 does not have estOrientInfo
 
     scp -p pteVam1.sql hgwbeta:/tmp
     ssh hgwbeta "hgsql qapushq < /tmp/pteVam1.sql"
 
 ############################################################################
 ##############################################################################
 # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
 ##############################################################################
+# add chromAlias table (DONE - 2016-10-15 - Hiram)
+
+    mkdir /hive/data/genomes/pteVam1/bed/chromAlias
+    cd /hive/data/genomes/pteVam1/bed/chromAlias
+
+    hgsql -N -e 'select chrom,name from ucscToRefSeq;' pteVam1 \
+        | sort -k1,1 > ucsc.refseq.tab
+    hgsql -N -e 'select chrom,name from ucscToINSDC;' pteVam1 \
+        | sort -k1,1 > ucsc.genbank.tab
+
+    ### Adding Ensembl alias with v95 release, after idKeys made: 2019-01-16
+    join -t$'\t' ../idKeys/pteVam1.idKeys.txt \
+        ../../ens95/ensPteVam1.idKeys.txt | cut -f2- \
+          | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
+            | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
+               | sort -k1,1 -k2,2n > ucscToEns.bed
+    cut -f1,4 ucscToEns.bed | sort > ucsc.ensembl.tab
+    # and chrM name for genbank is: AJ421455.1
+    wc -l *.tab
+#    6835 ucsc.ensembl.tab
+#    6835 ucsc.genbank.tab
+#    6835 ucsc.refseq.tab
+
+    grep chrM *.tab
+# ucsc.ensembl.tab:chrM   MT
+# ucsc.genbank.tab:chrM   AJ421455.1
+# ucsc.refseq.tab:chrM    NC_004299.1
+
+    ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \
+	> pteVam1.chromAlias.tab
+
+for t in refseq genbank ensembl
+do
+  c0=`cat ucsc.$t.tab | wc -l`
+  c1=`grep $t pteVam1.chromAlias.tab | wc -l`
+  ok="OK"
+  if [ "$c0" -ne "$c1" ]; then
+     ok="ERROR"
+  fi
+  printf "# checking $t: $c0 =? $c1 $ok\n"
+done
+# checking refseq: 6835 =? 6835 OK
+# checking genbank: 6835 =? 6835 OK
+# checking ensembl: 6835 =? 6835 OK
+
+    hgLoadSqlTab pteVam1 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
+        pteVam1.chromAlias.tab
+
+#########################################################################