919f6584f7a27525a8c287e10eafc31f1d96987f
braney
  Mon Mar 1 07:39:07 2021 -0800
Create the knownGeneFast short-circuit trix index from knownCanonical,
excluding pseudogenes (ids listed in pseudo.txt)

diff --git src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh
index dfd333a..f284599 100644
--- src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh
+++ src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh
@@ -350,31 +350,31 @@
 
 # Clean up
     rm -r split fold err batch.bak
     cd ../utr5
     rm -r split fold err batch.bak
 
 hgKgGetText $tempDb tempSearch.txt
 sort tempSearch.txt > tempSearch2.txt
 tawk '{split($2,a,"."); printf "%s\t", $1;for(ii = 1; ii <= a[2]; ii++) printf "%s ",a[1] "." ii; printf "\n" }' txToAcc.tab | sort > tempSearch3.txt
 join tempSearch2.txt tempSearch3.txt | sort > knownGene.txt
 ixIxx knownGene.txt knownGene${GENCODE_VERSION}.ix knownGene${GENCODE_VERSION}.ixx
  rm -rf /gbdb/$db/knownGene${GENCODE_VERSION}.ix /gbdb/$db/knownGene${GENCODE_VERSION}.ixx
 ln -s $dir/knownGene${GENCODE_VERSION}.ix  /gbdb/$db/knownGene${GENCODE_VERSION}.ix
 ln -s $dir/knownGene${GENCODE_VERSION}.ixx /gbdb/$db/knownGene${GENCODE_VERSION}.ixx  
 tawk '{print $5}' knownCanonical.tab | sort > knownCanonicalId.txt
-join knownCanonicalId.txt knownGene.txt > knownGeneFast.txt
+join knownCanonicalId.txt knownGene.txt | join -v 1 /dev/stdin pseudo.txt > knownGeneFast.txt
 ixIxx knownGeneFast.txt knownGeneFast${GENCODE_VERSION}.ix knownGeneFast${GENCODE_VERSION}.ixx
  rm -rf /gbdb/$db/knownGeneFast${GENCODE_VERSION}.ix /gbdb/$db/knownGeneFast${GENCODE_VERSION}.ixx
 ln -s $dir/knownGeneFast${GENCODE_VERSION}.ix  /gbdb/$db/knownGeneFast${GENCODE_VERSION}.ix
 ln -s $dir/knownGeneFast${GENCODE_VERSION}.ixx /gbdb/$db/knownGeneFast${GENCODE_VERSION}.ixx  
 
 #zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed
 #jtwoBitToFa -noMask /cluster/data/$db/$db.2bit -bed=ucscGenes.bed stdout | faFilter -uniq stdin  ucscGenes.fa
 #jhgPepPred $tempDb generic knownGeneMrna ucscGenes.fa
 bedToPsl /cluster/data/$db/chrom.sizes ucscGenes.bed ucscGenes.psl
 pslRecalcMatch ucscGenes.psl /cluster/data/$db/$db.2bit ucscGenes.fa kgTargetAli.psl
 # should be zero
 awk '$11 != $1 + $3+$4' kgTargetAli.psl
 hgLoadPsl $tempDb kgTargetAli.psl
 
 cd $dir