0b81078213eff40174d357fbc8632ed7077baec4 braney Fri Aug 21 09:47:38 2020 -0700 changes to support the lack of native knownGene tables in an assembly. diff --git src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh index 28cd813..ccedd69 100644 --- src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh +++ src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh @@ -98,30 +98,31 @@ tawk '{print $4,2,"all"}' gencodeAnnot$GENCODE_VERSION.bgpInput | sort > all.txt sort -k1,1 -k2,2n basic.txt canonical.txt all.txt | tawk '{if ($1 != last) {print last,buff; buff=$3}else {buff=buff "," $3} last=$1} END {print last,buff}' | tail -n +2 > tier.txt join -t $'\t' join6 tier.txt > join7 cut -f 2- -d $'\t' join7 | sort -k1,1 -k2,2n > bgpInput.txt bedToBigBed -extraIndex=name -type=bed12+16 -tab -as=$HOME/kent/src/hg/lib/gencodeBGP.as bgpInput.txt /cluster/data/$db/chrom.sizes $db.gencode$GENCODE_VERSION.bb ln -s `pwd`/$db.gencode$GENCODE_VERSION.bb /gbdb/$db/gencode/gencode$GENCODE_VERSION.bb hgsql $tempDb -e "select * from knownToMrna" | tail -n +2 | tawk '{if ($1 != last) {print last, count, buffer; count=1; buffer=$2} else {count++;buffer=$2","buffer} last=$1}' | tail -n +2 | sort > tmp1 hgsql $tempDb -e "select * from knownToMrnaSingle" | tail -n +2 | sort > tmp2 join tmp2 tmp1 > knownGene.ev txGeneAlias $db $spDb kgXref.tab knownGene.ev oldToNew.tab foo.alias foo.protAlias tawk '{split($2,a,"."); for(ii = 1; ii <= a[2]; ii++) print $1,a[1] "." ii }' txToAcc.tab >> foo.alias +tawk '{split($1,a,"."); for(ii = 1; ii <= a[2] - 1; ii++) print $1,a[1] "." ii }' txToAcc.tab >> foo.alias sort foo.alias | uniq > ucscGenes.alias sort foo.protAlias | uniq > ucscGenes.protAlias rm foo.alias foo.protAlias hgLoadSqlTab -notOnServer $tempDb kgAlias $kent/src/hg/lib/kgAlias.sql ucscGenes.alias hgLoadSqlTab -notOnServer $tempDb kgProtAlias $kent/src/hg/lib/kgProtAlias.sql ucscGenes.protAlias hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from hgFixed.refLink" $db > refToLl.txt hgMapToGene -tempDb=$tempDb $db refGene knownGene knownToLocusLink -lookup=refToLl.txt knownToVisiGene $tempDb -probesDb=$db hgMapToGene $db -tempDb=$tempDb -all -type=genePred gtexGeneModelV6 knownGene knownToGtex awk '{OFS="\t"} {print $4,$4}' ucscGenes.bed | sort | uniq > knownToEnsembl.tab cp knownToEnsembl.tab knownToGencode${GENCODE_VERSION}.tab hgLoadSqlTab -notOnServer $tempDb knownToEnsembl $kent/src/hg/lib/knownTo.sql knownToEnsembl.tab