66b1030888d2fbc504ed737350256b20442367c7 braney Fri Sep 3 10:43:22 2021 -0700 ongoing work on knownGene automation diff --git src/hg/utils/otto/knownGene/buildBlast.sh src/hg/utils/otto/knownGene/buildBlast.sh index e3dafda..f72f856 100755 --- src/hg/utils/otto/knownGene/buildBlast.sh +++ src/hg/utils/otto/knownGene/buildBlast.sh @@ -1,123 +1,122 @@ #!/bin/sh -ex cd $dir { # Run nice Perl script to make all protein blast runs for # Gene Sorter and Known Genes details page. Takes about # 45 minutes to run. rm -rf $dir/hgNearBlastp mkdir $dir/hgNearBlastp cd $dir/hgNearBlastp # make sure all the fasta is there ls -l $tempFa ls -l $xdbFa ls -l $ratFa ls -l $fishFa ls -l $flyFa ls -l $wormFa ls -l $yeastFa ( cat << ThisEnd # Latest human vs. other Gene Sorter orgs: # mouse, rat, zebrafish, worm, yeast, fly targetGenesetPrefix known targetDb $tempDb queryDbs $xdb $ratDb $fishDb $flyDb $wormDb $yeastDb ${tempDb}Fa $tempFa ${xdb}Fa $xdbFa ${ratDb}Fa $ratFa ${fishDb}Fa $fishFa ${flyDb}Fa $flyFa ${wormDb}Fa $wormFa ${yeastDb}Fa $yeastFa buildDir $dir/hgNearBlastp scratchDir $scratchDir/brHgNearBlastp ThisEnd ) > config.ra rm -rf $scratchDir/brHgNearBlastp doHgNearBlastp.pl -noLoad -clusterHub=ku -distrHost=hgwdev -dbHost=hgwdev -workhorse=hgwdev config.ra # Load self cd $dir/hgNearBlastp/run.$tempDb.$tempDb # builds knownBlastTab ./loadPairwise.csh -# Load human and rat cd $dir/hgNearBlastp/run.$tempDb.$xdb hgLoadBlastTab $tempDb $xBlastTab -maxPer=1 out/*.tab cd $dir/hgNearBlastp/run.$tempDb.$ratDb hgLoadBlastTab $tempDb $rnBlastTab -maxPer=1 out/*.tab # Remove non-syntenic hits for human and rat # Takes a few minutes mkdir -p /gbdb/$tempDb/liftOver rm -f /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz ln -s $genomes/$db/bed/liftOver/${db}To$RatDb.over.chain.gz \ /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz ln -s $genomes/$db/bed/liftOver/${db}To${Xdb}.over.chain.gz \ /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz # delete non-syntenic genes from rat and mouse blastp tables cd $dir/hgNearBlastp synBlastp.csh $tempDb $xdb # old number of unique query values: 62743 # old number of unique target values 27998 # new number of unique query values: 54818 # new number of unique target values 26309 synBlastp.csh $tempDb $ratDb knownGene ensGene # old number of unique query values: 63123 # old number of unique target values 21163 # new number of unique query values: 57012 # new number of unique target values 20298 # Make reciprocal best subset for the blastp pairs that are too # Far for synteny to help # Us vs. fish cd $dir/hgNearBlastp export aToB=run.$tempDb.$fishDb export bToA=run.$fishDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb drBlastTab $aToB/recipBest.tab # Us vs. fly cd $dir/hgNearBlastp export aToB=run.$tempDb.$flyDb export bToA=run.$flyDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb dmBlastTab $aToB/recipBest.tab # Us vs. worm cd $dir/hgNearBlastp export aToB=run.$tempDb.$wormDb export bToA=run.$wormDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb ceBlastTab $aToB/recipBest.tab # Us vs. yeast cd $dir/hgNearBlastp export aToB=run.$tempDb.$yeastDb export bToA=run.$yeastDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb scBlastTab $aToB/recipBest.tab # Clean up cd $dir/hgNearBlastp cat run.$tempDb.$tempDb/out/*.tab | gzip -c > run.$tempDb.$tempDb/all.tab.gz gzip run.*/all.tab echo "BuildBlast successfully finished" } > doBlast.log 2>&1