beded063af0162677da2324db2f15ac8ada64936
braney
  Fri Jul 3 17:26:50 2020 -0700
add tiers to gencode file..  fix up filters.

diff --git src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh
index 9548366..225c5cb 100644
--- src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh
+++ src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh
@@ -80,33 +80,38 @@
 #hgsql $tempDb -Ne "select * from knownCds" > knownCds.txt
 #genePredToBigGenePred -colors=colors.txt -known knownGene.gp stdout | sort -k1,1 -k2,2n >  gencodeAnnot$GENCODE_VERSION.bgpInput
 genePredToBigGenePred -colors=colors.txt -geneNames=geneNames.txt -known -cds=knownCds.tab   knownGene.gp  stdout | sort -k1,1 -k2,2n >  gencodeAnnot$GENCODE_VERSION.bgpInput
 
 tawk '{print $4,$0}' gencodeAnnot$GENCODE_VERSION.bgpInput | sort > join1
 hgsql $db -Ne "select transcriptId, transcriptClass from gencodeAttrs$GENCODE_VERSION" | sort > attrs.txt
 join -t $'\t'   join1 attrs.txt > join2
 hgsql $db -Ne "select transcriptId, source from gencodeTranscriptSource$GENCODE_VERSION" | sort > source.txt
 join -t $'\t'   join2 source.txt > join3
 hgsql $db -Ne "select transcriptId, transcriptType from gencodeAttrs$GENCODE_VERSION" | sort > biotype.txt
 join -t $'\t'   join3 biotype.txt > join4
 hgsql $db -Ne "select transcriptId, tag from gencodeTag$GENCODE_VERSION" | sort | tawk '{if ($1 != last) {print last,buff; buff=$2}else {buff=buff "," $2} last=$1} END {print last,buff}' | tail -n +2 > tags.txt
 join -t $'\t' -a 1  -e"none" -o auto   join4 tags.txt > join5
 hgsql $db -Ne "select transcriptId, level from gencodeAttrs$GENCODE_VERSION" | sort > level.txt
 join -t $'\t'   join5 level.txt > join6
-cut -f 2- -d $'\t' join6 | sort -k1,1 -k2,2n > bgpInput.txt
+grep basic tags.txt | tawk '{print $1, 1, "basic"}' > basic.txt
+tawk '{print $5,0,"canonical"}'  knownCanonical.tab | sort > canonical.txt
+tawk '{print $4,2,"all"}' gencodeAnnot$GENCODE_VERSION.bgpInput | sort > all.txt
+sort -k1,1 -k2,2n basic.txt canonical.txt all.txt | tawk '{if ($1 != last) {print last,buff; buff=$3}else {buff=buff "," $3} last=$1} END {print last,buff}' | tail -n +2  > tier.txt
+join -t $'\t'   join6 tier.txt > join7
+cut -f 2- -d $'\t' join7 | sort -k1,1 -k2,2n > bgpInput.txt
 
-bedToBigBed -type=bed12+15 -tab -as=$HOME/kent/src/hg/lib/gencodeBGP.as bgpInput.txt /cluster/data/$db/chrom.sizes $db.gencode$GENCODE_VERSION.bb
+bedToBigBed -type=bed12+16 -tab -as=$HOME/kent/src/hg/lib/gencodeBGP.as bgpInput.txt /cluster/data/$db/chrom.sizes $db.gencode$GENCODE_VERSION.bb
 
 ln -s `pwd`/$db.gencode$GENCODE_VERSION.bb /gbdb/$db/gencode/gencode$GENCODE_VERSION.bb
 
 hgsql $tempDb -e "select * from knownToMrna" | tail -n +2 | tawk '{if ($1 != last) {print last, count, buffer; count=1; buffer=$2} else {count++;buffer=$2","buffer} last=$1}' | tail -n +2 | sort > tmp1
 hgsql $tempDb  -e "select * from knownToMrnaSingle" | tail -n +2 | sort > tmp2
 join  tmp2 tmp1 > knownGene.ev
 
 txGeneAlias $db $spDb kgXref.tab knownGene.ev oldToNew.tab foo.alias foo.protAlias
 tawk '{split($2,a,"."); for(ii = 1; ii <= a[2]; ii++) print $1,a[1] "." ii }' txToAcc.tab >> foo.alias
 sort foo.alias | uniq > ucscGenes.alias
 sort foo.protAlias | uniq > ucscGenes.protAlias
 rm foo.alias foo.protAlias
 hgLoadSqlTab -notOnServer $tempDb kgAlias $kent/src/hg/lib/kgAlias.sql ucscGenes.alias
 hgLoadSqlTab -notOnServer $tempDb kgProtAlias $kent/src/hg/lib/kgProtAlias.sql ucscGenes.protAlias
 
@@ -145,30 +150,31 @@
 hgMapToGene -tempDb=$tempDb $db affyU133 knownGene knownToU133
 hgMapToGene -tempDb=$tempDb $db affyU95 knownGene knownToU95
 
 hgsql $tempDb -Ne "create view all_mrna as select * from $db.all_mrna"
 hgsql $tempDb -Ne "create view ensGene as select * from $db.ensGene"
 hgsql $tempDb -Ne "create view gtexGene as select * from $db.gtexGene"
 hgsql $tempDb -Ne "create view gencodeAnnotV35 as select * from $db.gencodeAnnotV35"
 hgsql $tempDb -Ne "create view gencodeAttrsV35 as select * from $db.gencodeAttrsV35"
 hgsql $tempDb -Ne "create view gencodeGeneSourceV35 as select * from $db.gencodeGeneSourceV35"
 hgsql $tempDb -Ne "create view gencodeTranscriptSourceV35 as select * from $db.gencodeTranscriptSourceV35"
 hgsql $tempDb -Ne "create view gencodeToPdbV35 as select * from $db.gencodeToPdbV35"
 hgsql $tempDb -Ne "create view gencodeToPubMedV35 as select * from $db.gencodeToPubMedV35"
 hgsql $tempDb -Ne "create view gencodeToRefSeqV35 as select * from $db.gencodeToRefSeqV35"
 hgsql $tempDb -Ne "create view gencodeTagV35 as select * from $db.gencodeTagV35"
 hgsql $tempDb -Ne "create view gencodeTranscriptSupportV35 as select * from $db.gencodeTranscriptSupportV35"
+hgsql $tempDb -Ne "create view gencodeToUniProtV35 as select * from $db.gencodeToUniProtV35"
 
 bioCycDir=/hive/data/outside/bioCyc/190905/download/humancyc/21.0/data
 mkdir $dir/bioCyc
 cd $dir/bioCyc
 
 grep -E -v "^#" $bioCycDir/genes.col  > genes.tab  
 grep -E -v "^#" $bioCycDir/pathways.col  | awk -F'\t' '{if (140 == NF) { printf "%s\t\t\n", $0; } else { print $0}}' > pathways.tab
 
 kgBioCyc1 -noEnsembl genes.tab pathways.tab $tempDb bioCycPathway.tab bioCycMapDesc.tab  
 hgLoadSqlTab $tempDb bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
 hgLoadSqlTab $tempDb bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
 
 mkdir -p $dir/kegg
 cd $dir/kegg