919f6584f7a27525a8c287e10eafc31f1d96987f braney Mon Mar 1 07:39:07 2021 -0800 create knownGene short-circuit trix file from knownCanonical without pseudogenes diff --git src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh index dfd333a..f284599 100644 --- src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh +++ src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh @@ -350,31 +350,31 @@ # Clean up rm -r split fold err batch.bak cd ../utr5 rm -r split fold err batch.bak hgKgGetText $tempDb tempSearch.txt sort tempSearch.txt > tempSearch2.txt tawk '{split($2,a,"."); printf "%s\t", $1;for(ii = 1; ii <= a[2]; ii++) printf "%s ",a[1] "." ii; printf "\n" }' txToAcc.tab | sort > tempSearch3.txt join tempSearch2.txt tempSearch3.txt | sort > knownGene.txt ixIxx knownGene.txt knownGene${GENCODE_VERSION}.ix knownGene${GENCODE_VERSION}.ixx rm -rf /gbdb/$db/knownGene${GENCODE_VERSION}.ix /gbdb/$db/knownGene${GENCODE_VERSION}.ixx ln -s $dir/knownGene${GENCODE_VERSION}.ix /gbdb/$db/knownGene${GENCODE_VERSION}.ix ln -s $dir/knownGene${GENCODE_VERSION}.ixx /gbdb/$db/knownGene${GENCODE_VERSION}.ixx tawk '{print $5}' knownCanonical.tab | sort > knownCanonicalId.txt -join knownCanonicalId.txt knownGene.txt > knownGeneFast.txt +join knownCanonicalId.txt knownGene.txt | join -v 1 /dev/stdin pseudo.txt > knownGeneFast.txt ixIxx knownGeneFast.txt knownGeneFast${GENCODE_VERSION}.ix knownGeneFast${GENCODE_VERSION}.ixx rm -rf /gbdb/$db/knownGeneFast${GENCODE_VERSION}.ix /gbdb/$db/knownGeneFast${GENCODE_VERSION}.ixx ln -s $dir/knownGeneFast${GENCODE_VERSION}.ix /gbdb/$db/knownGeneFast${GENCODE_VERSION}.ix ln -s $dir/knownGeneFast${GENCODE_VERSION}.ixx /gbdb/$db/knownGeneFast${GENCODE_VERSION}.ixx #zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed #jtwoBitToFa -noMask /cluster/data/$db/$db.2bit -bed=ucscGenes.bed stdout | faFilter -uniq stdin ucscGenes.fa #jhgPepPred $tempDb generic knownGeneMrna ucscGenes.fa bedToPsl /cluster/data/$db/chrom.sizes ucscGenes.bed ucscGenes.psl pslRecalcMatch ucscGenes.psl /cluster/data/$db/$db.2bit ucscGenes.fa kgTargetAli.psl # should be zero awk '$11 != $1 + $3+$4' kgTargetAli.psl hgLoadPsl $tempDb kgTargetAli.psl cd $dir