f47753d62c5f4905a2ffa14547117e322d19c699 hiram Fri Mar 12 12:50:55 2021 -0800 added ucscToEnsembl table and update chromAlias table refs #27194 diff --git src/hg/makeDb/doc/mm39/initialBuild.txt src/hg/makeDb/doc/mm39/initialBuild.txt index 03ac9d4..ec8c4ce 100644 --- src/hg/makeDb/doc/mm39/initialBuild.txt +++ src/hg/makeDb/doc/mm39/initialBuild.txt @@ -599,30 +599,73 @@ ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking assembly: 61 =? 61 OK # checking genbank: 61 =? 61 OK # verify chrM is here properly: grep chrM mm39.chromAlias.tab # AY172335.1 chrM genbank # MT chrM assembly # NC_005089.1 chrM refseq hgLoadSqlTab mm39 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ mm39.chromAlias.tab + + # Adding Ensembl 2021-03-12 upon release of v103 (needs the ucscToEnsembl table, built in its own section below). + # And refseq names exist now too + + cd /hive/data/genomes/mm39/bed/chromAlias + hgsql -N -e 'select * from ucscToEnsembl;' mm39 > ucsc.ensembl.tab + + join -t$'\t' ../idKeys/mm39.idKeys.txt \ + /hive/data/genomes/asmHubs/refseqBuild/GCF/000/001/635/GCF_000001635.27_GRCm39/idKeys/GCF_000001635.27_GRCm39.idKeys.txt \ + | cut -f2-3 | sort > ucsc.refseq.tab + + mv mm39.chromAlias.tab mm39.chromAlias.tab.0 + + ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ + > mm39.chromAlias.tab +# working: assembly +# working: ensembl +# working: genbank +# working: refseq + +for t in assembly ensembl genbank refseq +do + c0=`cat ucsc.$t.tab | wc -l` + c1=`grep $t mm39.chromAlias.tab | wc -l` + ok="OK" + if [ "$c0" -ne "$c1" ]; then + ok="ERROR" + fi + printf "# checking $t: $c0 =? $c1 $ok\n" +done +# checking assembly: 61 =? 61 OK +# checking ensembl: 61 =? 61 OK +# checking genbank: 61 =? 61 OK +# checking refseq: 61 =? 
61 OK + + # verify chrM is here properly: + grep chrM mm39.chromAlias.tab +# AY172335.1 chrM genbank +# MT chrM assembly,ensembl +# NC_005089.1 chrM refseq + + hgLoadSqlTab mm39 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ + mm39.chromAlias.tab ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2020-07-27 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/mouse/mm39 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" mm39 \ | sed -e 's/[0-9.]\+//;' | sort | uniq -c | sed -e 's/^/# /;' # 15228 AC # 816 AEKQ # 8 AEKR # 1 AF # 3876 AL # 1 AY # 844 BX # 191 CAAA @@ -1344,15 +1387,42 @@ # update 2020-10-27 (DONE - Hiram - 2020-10-27) mkdir /hive/data/genomes/mm39/bed/ncbiRefSeq.2020-10-27 cd /hive/data/genomes/mm39/bed/ncbiRefSeq.2020-10-27 time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \ GCF_000001635.27_GRCm39 mm39) > do.log 2>&1 & # real 10m2.220s cat fb.ncbiRefSeq.mm39.txt # 128640844 bases of 2654624157 (4.846%) in intersection ############################################################################# +# create ucscToEnsembl name mapping (DONE - 2021-03-12 - Hiram) + # this allows the "ensembl" blue bar button to appear + mkdir /hive/data/genomes/mm39/bed/ucscToEnsembl + cd /hive/data/genomes/mm39/bed/ucscToEnsembl + + join -t$'\t' ../idKeys/mm39.idKeys.txt \ +/hive/data/outside/ensembl/genomes/release-103/idKeys/Mus_musculus/Mus_musculus.GRCm39.idKeys.txt \ + | cut -f2-3 | sort > ucscToEnsembl.tab + + # determine size of PRIMARY KEY index + awk '{print length($1)}' *.tab | sort -n | tail + # 22 + + printf '# UCSC to Ensembl chr name translation +CREATE TABLE ucscToEnsembl ( + ucsc varchar(255) not null, # UCSC chromosome name + ensembl varchar(255) not null, # Ensembl chromosome name + #Indices + PRIMARY KEY(ucsc(22)) +); +' > ucscToEnsembl.sql + + 
hgLoadSqlTab mm39 ucscToEnsembl ucscToEnsembl.sql ucscToEnsembl.tab + + # verify the blue bar "ensembl" link is now available under the 'View' + # tab +#############################################################################