9f870fc604395b879ccf923b6f1c0a5f291c55ad braney Mon Sep 21 16:06:44 2020 -0700 blastDb's for Gencode V35 diff --git src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh index ccedd69..403ca14 100644 --- src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh +++ src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh @@ -1,25 +1,56 @@ # This doc assumes that the gencode* tables have been built on $db db=hg38 GENCODE_VERSION=V35 dir=/hive/data/genomes/$db/bed/gencode$GENCODE_VERSION/build genomes=/hive/data/genomes tempDb=knownGeneV35 kent=$HOME/kent spDb=sp180404 cpuFarm=ku export curVer=13 +export Db=Hg38 +export xdb=mm10 +export Xdb=Mm10 +export ydb=canFam3 +export zdb=rheMac8 +export ratDb=rn6 +export RatDb=Rn6 +export fishDb=danRer11 +export flyDb=dm6 +export wormDb=ce11 +export yeastDb=sacCer3 +export tempFa=$dir/ucscGenes.faa +export genomes=/hive/data/genomes +export xdbFa=$genomes/$xdb/bed/ucsc.18.1/ucscGenes.faa +export ratFa=$genomes/$ratDb/bed/ensGene.95/ensembl.faa +#export ratFa=$genomes/$ratDb/bed/blastpRgdGene2/rgdGene2Pep.faa +export fishFa=$genomes/$fishDb/bed/ensGene.95/ensembl.faa +#export fishFa=$genomes/$fishDb/bed/blastp/ensembl.faa +export flyFa=$genomes/$flyDb/bed/ensGene.95/ensembl.faa +#export flyFa=$genomes/$flyDb/bed/hgNearBlastp/100806/$flyDb.flyBasePep.faa +export wormFa=$genomes/$wormDb/bed/ws245Genes/ws245Pep.faa +#export wormFa=$genomes/$wormDb/bed/blastp/wormPep190.faa +export yeastFa=$genomes/$yeastDb/bed/sgdAnnotations/blastTab/sacCer3.sgd.faa +export scratchDir=/hive/users/braney/scratch + +export blastTab=hgBlastTab +export xBlastTab=mmBlastTab +export rnBlastTab=rnBlastTab +export dbHost=hgwdev +export ramFarm=ku +export cpuFarm=ku mkdir -p $dir cd $dir echo "create database $tempDb" | hgsql "" echo "create table chromInfo like $db.chromInfo" | hgsql $tempDb echo "insert into chromInfo select * from $db.chromInfo" | hgsql $tempDb hgsql -e "select * from gencodeAnnot$GENCODE_VERSION" --skip-column-names $db | cut -f 2-16 | genePredToBed stdin stdout | sort -k1,1 -k2,2n | gzip -c > gencode${GENCODE_VERSION}.bed.gz touch oldToNew.tab zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed txBedToGraph ucscGenes.bed ucscGenes ucscGenes.txg txgAnalyze ucscGenes.txg $genomes/$db/$db.2bit stdout | sort | uniq | bedClip stdin /cluster/data/hg38/chrom.sizes ucscSplice.bed @@ -318,15 +349,129 @@ hgLoadPsl $tempDb kgTargetAli.psl # 3. Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on # /gbdb/$db/targetDb/${db}KgSeq${curVer}.2bit # 4. On hgwdev, insert new records into blatServers and targetDb, using the # host (field 2) and port (field 3) specified by cluster-admin. Identify the # blatServer by the keyword "$db"Kg with the version number appended # untrans gfServer for hg38KgSeq12 on host blat1b, port 17897 hgsql hgcentraltest -e \ 'INSERT into blatServers values ("hg38KgSeq13", "blat1b", 1909, 0, 1);' hgsql hgcentraltest -e \ 'INSERT into targetDb values("hg38KgSeq13", "GENCODE Genes", \ "hg38", "knownGeneV35.kgTargetAli", "", "", \ "/gbdb/hg38/targetDb/hg38KgSeq13.2bit", 1, now(), "");' + +for i in $tempFa $xdbFa $ratFa $fishFa $flyFa $wormFa $yeastFa +do +if test ! -f $i +then echo $i not found +fi +done + +rm -rf $dir/hgNearBlastp +mkdir $dir/hgNearBlastp +cd $dir/hgNearBlastp +tcsh +cat << _EOF_ > config.ra +# Latest human vs. other Gene Sorter orgs: +# mouse, rat, zebrafish, worm, yeast, fly + +targetGenesetPrefix known +targetDb $tempDb +queryDbs $xdb $ratDb $fishDb $flyDb $wormDb $yeastDb + +${tempDb}Fa $tempFa +${xdb}Fa $xdbFa +${ratDb}Fa $ratFa +${fishDb}Fa $fishFa +${flyDb}Fa $flyFa +${wormDb}Fa $wormFa +${yeastDb}Fa $yeastFa + +buildDir $dir/hgNearBlastp +scratchDir $scratchDir/brHgNearBlastp +_EOF_ + +# exit tcsh + +rm -rf $scratchDir/brHgNearBlastp +doHgNearBlastp.pl -noLoad -clusterHub=ku -distrHost=hgwdev -dbHost=hgwdev -workhorse=hgwdev config.ra > do.log 2>&1 & + +# Load self +cd $dir/hgNearBlastp/run.$tempDb.$tempDb +# builds knownBlastTab +./loadPairwise.csh + +# Load mouse and rat +cd $dir/hgNearBlastp/run.$tempDb.$xdb +hgLoadBlastTab $tempDb $xBlastTab -maxPer=1 out/*.tab +cd $dir/hgNearBlastp/run.$tempDb.$ratDb +hgLoadBlastTab $tempDb $rnBlastTab -maxPer=1 out/*.tab + +# Remove non-syntenic hits for mouse and rat +# Takes a few minutes +mkdir -p /gbdb/$tempDb/liftOver +rm -f /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz +ln -s $genomes/$db/bed/liftOver/${db}To$RatDb.over.chain.gz \ + /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz +ln -s $genomes/$db/bed/liftOver/${db}To${Xdb}.over.chain.gz \ + /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz + +# delete non-syntenic genes from rat and mouse blastp tables +cd $dir/hgNearBlastp +synBlastp.csh $tempDb $xdb +# old number of unique query values: 93277 +# old number of unique target values 27504 +# new number of unique query values: 86329 +# new number of unique target values 26151 + +synBlastp.csh $tempDb $ratDb knownGene ensGene +#old number of unique query values: 92243 +#old number of unique target values 19898 +#new number of unique query values: 85663 +#new number of unique target values 19025 + +# Make reciprocal best subset for the blastp pairs that are too +# Far for synteny to help + +# Us vs. fish +cd $dir/hgNearBlastp +export aToB=run.$db.$fishDb +export bToA=run.$fishDb.$db +cat $aToB/out/*.tab > $aToB/all.tab +cat $bToA/out/*.tab > $bToA/all.tab +blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab +hgLoadBlastTab $tempDb drBlastTab $aToB/recipBest.tab + +# Us vs. fly +cd $dir/hgNearBlastp +export aToB=run.$db.$flyDb +export bToA=run.$flyDb.$db +cat $aToB/out/*.tab > $aToB/all.tab +cat $bToA/out/*.tab > $bToA/all.tab +blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab +hgLoadBlastTab $tempDb dmBlastTab $aToB/recipBest.tab + +# Us vs. worm +cd $dir/hgNearBlastp +export aToB=run.$db.$wormDb +export bToA=run.$wormDb.$db +cat $aToB/out/*.tab > $aToB/all.tab +cat $bToA/out/*.tab > $bToA/all.tab +blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab +hgLoadBlastTab $tempDb ceBlastTab $aToB/recipBest.tab + +# Us vs. yeast +cd $dir/hgNearBlastp +export aToB=run.$db.$yeastDb +export bToA=run.$yeastDb.$db +cat $aToB/out/*.tab > $aToB/all.tab +cat $bToA/out/*.tab > $bToA/all.tab +blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab +hgLoadBlastTab $tempDb scBlastTab $aToB/recipBest.tab + +# Clean up +cd $dir/hgNearBlastp +cat run.$tempDb.$tempDb/out/*.tab | gzip -c > run.$tempDb.$tempDb/all.tab.gz +gzip run.*/all.tab