src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh a9c9cf2828c124535a1562f709bd4437b1bf60a2

a9c9cf2828c124535a1562f709bd4437b1bf60a2
braney
  Mon Jun 22 16:41:31 2020 -0700
starting on making hgGene use an external database for the know* tables

diff --git src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh
new file mode 100644
index 0000000..1c91213
--- /dev/null
+++ src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh
@@ -0,0 +1,97 @@
+# This doc assumes that the gencode* tables have already been built on $db.
+db=hg38                   # database that holds the gencode* source tables
+GENCODE_VERSION=V35       # suffix appended to the gencode* table names queried below
+dir=/hive/data/genomes/$db/bed/gencode$GENCODE_VERSION   # working directory for this build
+genomes=/hive/data/genomes   # root used below to locate the $db 2bit file
+tempDb=knownGeneV35       # separate database that receives the known* tables
+kent=$HOME/kent           # source checkout; the table .sql definitions are read from here
+# Work inside $dir: the commands below read and write their files by relative path.
+mkdir -p $dir
+cd $dir
+# Create $tempDb and give it its own copy of $db.chromInfo.
+echo "create database $tempDb" | hgsql ""
+echo "create table chromInfo like  $db.chromInfo" | hgsql $tempDb
+echo "insert into chromInfo select * from   $db.chromInfo" | hgsql $tempDb
+
+hgsql -e "select * from gencodeAnnot$GENCODE_VERSION" --skip-column-names $db | cut -f 2-16 |  genePredToBed stdin stdout | sort -k1,1 -k2,2n | gzip -c > gencode${GENCODE_VERSION}.bed.gz
+
+touch oldToNew.tab
+
+zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed
+txBedToGraph ucscGenes.bed ucscGenes ucscGenes.txg
+txgAnalyze ucscGenes.txg $genomes/$db/$db.2bit stdout | sort | uniq | bedClip stdin /cluster/data/hg38/chrom.sizes  ucscSplice.bed
+
+zcat gencode${GENCODE_VERSION}.bed.gz |  awk '{print $4}' | sort > newGencodeName.txt
+hgsql $db -Ne "select name,alignId from knownGene" | sort > oldGenToUcsc.txt
+kgAllocId oldGenToUcsc.txt newGencodeName.txt 5070122 stdout | sort -u >  txToAcc.tab
+# lastId 5072896
+
+makeUcscGeneTables -justKnown $db $tempDb $GENCODE_VERSION txToAcc.tab
+hgLoadSqlTab -notOnServer $tempDb knownGene $kent/src/hg/lib/knownGene.sql knownGene.gp
+hgLoadGenePred -genePredExt $tempDb  knownGeneExt knownGeneExt.gp
+hgMapToGene -type=psl -all -tempDb=$tempDb $db all_mrna knownGene knownToMrna
+hgMapToGene -tempDb=$tempDb $db refGene knownGene knownToRefSeq
+hgMapToGene -type=psl -tempDb=$tempDb $db all_mrna knownGene knownToMrnaSingle
+makeUcscGeneTables $db $tempDb $GENCODE_VERSION txToAcc.tab
+hgLoadSqlTab -notOnServer $tempDb knownCanonical $kent/src/hg/lib/knownCanonical.sql knownCanonical.tab
+sort knownIsoforms.tab | uniq | hgLoadSqlTab -notOnServer $tempDb knownIsoforms $kent/src/hg/lib/knownIsoforms.sql stdin
+genePredToProt knownGeneExt.gp /cluster/data/$db/$db.2bit tmp.faa
+faFilter -uniq tmp.faa ucscGenes.faa
+hgPepPred $tempDb generic knownGenePep ucscGenes.faa
+
+
+hgLoadSqlTab -notOnServer $tempDb kgXref $kent/src/hg/lib/kgXref.sql kgXref.tab
+
+#ifdef NOTNOW
+# calculate score field with bitfields
+hgsql $db -Ne "select * from gencodeAnnot$GENCODE_VERSION" | cut -f 2- | sort > gencodeAnnot$GENCODE_VERSION.txt
+hgsql $db -Ne "select name,2 from gencodeAnnot$GENCODE_VERSION" | sort  > knownCanon.txt
+hgsql $db -Ne "select * from gencodeTag$GENCODE_VERSION" | grep basic |  sed 's/basic/1/' | sort > knownTag.txt
+hgsql $db -Ne "select transcriptId,transcriptClass from gencodeAttrs$GENCODE_VERSION where transcriptClass='pseudo'" |  sed 's/pseudo/4/' > knownAttrs.txt
+sort knownCanon.txt knownTag.txt knownAttrs.txt | awk '{if ($1 != last) {print last, sum; sum=$2; last=$1}  else {sum += $2; }} END {print last, sum}' | tail -n +2  > knownScore.txt
+#endif
+
+hgsql -e "select * from gencodeAnnot$GENCODE_VERSION" --skip-column-names $db | cut -f 2-16 |  tawk '{print $1,$13,$14,$8,$15}' | sort | uniq > knownCds.tab
+hgLoadSqlTab -notOnServer $tempDb knownCds $kent/src/hg/lib/knownCds.sql knownCds.tab
+
+
+cat  << __EOF__  > colors.sed
+s/coding/12\t12\t120/
+s/nonCoding/0\t100\t0/
+s/pseudo/255\t51\t255/
+s/other/254\t0\t0/
+__EOF__
+
+hgsql $db -Ne "select * from gencodeAttrs$GENCODE_VERSION" | tawk '{print $4,$10}' | sed -f colors.sed > colors.txt
+
+#ifdef NOTNOW
+hgsql $db -Ne "select * from gencodeToUniProt$GENCODE_VERSION" | tawk '{print $1,$2}'|  sort > uniProt.txt
+hgsql $db -Ne "select * from gencodeAnnot$GENCODE_VERSION" | tawk '{print $1,$12}' | sort > gene.txt
+join -a 1 gene.txt uniProt.txt > geneNames.txt
+#endif
+
+# Build gencodeAnnot$GENCODE_VERSION.bgpInput from knownGene.gp; the "#genePredToBigGenePred" and "#hgsql" lines are earlier variants kept for reference.
+#genePredToBigGenePred -score=knownScore.txt  -colors=colors.txt -geneNames=geneNames.txt  -known gencodeAnnot$GENCODE_VERSION.txt  gencodeAnnot$GENCODE_VERSION.bgpInput
+hgsql $db -Ne "select * from gencodeAnnot$GENCODE_VERSION" | cut -f 2- > gencodeAnnot$GENCODE_VERSION.txt   # within this doc only the disabled variants read this file
+#genePredToBigGenePred -colors=colors.txt gencodeAnnot$GENCODE_VERSION.txt stdout | sort -k1,1 -k2,2n >  gencodeAnnot$GENCODE_VERSION.bgpInput
+hgsql $tempDb -Ne "select kgId, geneSymbol, spID from kgXref" > geneNames.txt   # names come from the freshly loaded kgXref table
+#hgsql $tempDb -Ne "select * from knownCds" > knownCds.txt
+#genePredToBigGenePred -colors=colors.txt -known knownGene.gp stdout | sort -k1,1 -k2,2n >  gencodeAnnot$GENCODE_VERSION.bgpInput
+genePredToBigGenePred -colors=colors.txt -geneNames=geneNames.txt -known -cds=knownCds.tab   knownGene.gp  stdout | sort -k1,1 -k2,2n >  gencodeAnnot$GENCODE_VERSION.bgpInput
+
+tawk '{print $4,$0}' gencodeAnnot$GENCODE_VERSION.bgpInput | sort > join1   # prefix each row with its column 4 as join key; each join below appends one column matched on transcriptId
+hgsql $db -Ne "select transcriptId, transcriptClass from gencodeAttrs$GENCODE_VERSION" | sort > attrs.txt
+join -t $'\t'   join1 attrs.txt > join2   # no -a option: rows without a match in attrs.txt are dropped
+hgsql $db -Ne "select transcriptId, source from gencodeTranscriptSource$GENCODE_VERSION" | sort > source.txt
+join -t $'\t'   join2 source.txt > join3
+hgsql $db -Ne "select transcriptId, transcriptType from gencodeAttrs$GENCODE_VERSION" | sort > biotype.txt
+join -t $'\t'   join3 biotype.txt > join4
+hgsql $db -Ne "select transcriptId, tag from gencodeTag$GENCODE_VERSION" | sort | tawk '{if ($1 != last) {print last,buff; buff=$2}else {buff=buff "," $2} last=$1} END {print last,buff}' | tail -n +2 > tags.txt   # one row per transcript with its tags comma-joined; tail drops the empty first record
+join -t $'\t' -a 1  -e"none" -o auto   join4 tags.txt > join5   # -a 1 keeps rows that have no tags; -e fills the missing field with "none"
+hgsql $db -Ne "select transcriptId, level from gencodeAttrs$GENCODE_VERSION" | sort > level.txt
+join -t $'\t'   join5 level.txt > join6
+cut -f 2- -d $'\t' join6 | sort -k1,1 -k2,2n > bgpInput.txt   # strip the join key again, then sort by column 1 and numerically by column 2
+
+bedToBigBed -type=bed12+15 -tab -as=$HOME/kent/src/hg/lib/gencodeBGP.as bgpInput.txt /cluster/data/$db/chrom.sizes $db.gencode$GENCODE_VERSION.bb
+
+ln -s `pwd`/$db.gencode$GENCODE_VERSION.bb /gbdb/$db/gencode/gencode$GENCODE_VERSION.bb