1a36012f9f2eb9d087e503b1fd2c37f3a9adedfd max Fri Nov 26 08:11:01 2021 -0800 A major update for the UniProt otto job, refs #28560 diff --git src/hg/utils/otto/uniprot/makeUniProtPsl.sh src/hg/utils/otto/uniprot/makeUniProtPsl.sh index 204f25e..c216684 100755 --- src/hg/utils/otto/uniprot/makeUniProtPsl.sh +++ src/hg/utils/otto/uniprot/makeUniProtPsl.sh @@ -1,46 +1,43 @@ #!/bin/bash -# create mapping from uniProt protein positions to genome positions +# create mapping from UniProt protein sequences to genome +# input is protein sequence file and transcript table, and optional a mapping proteinId <-> transcriptId # originally inspired/copied from Markd's script LS-SNP pipeline # original: snpProtein/build/Makefile -# for hg38, this should probably use RefSeq, not Ensembl (=Gencode=knownGene) +# for human, this is now using RefSeq, see #27836 # The reason is that for all proteins that are broken in the reference, the positions will be wrong -# for Gencode, but not with RefSeq (for which we have a proper alignment file). -# This also means that for RefSeq, I need to find the real Refseq sequences file, not the our -# reconstructed one with getRnaPred -# pslSelect then probably has some table that it can use... or maybe use refSeq's refSeqId -> uniProt ID mapping? -# or could use UniProt's refseq mapping... +# for Gencode, but not with RefSeq (for which we have a proper PSL alignment). # parameters: # $1 = the fasta file with uniprot sequences -# $2 = the UCSC database name, e.g. hg19 or hg38 or else -# $3 = the gene table -# $4 = the cluster -# $5 = the nucleotide output file - -# Currently: does not work with refGene, finds different query lengths at the pslMap step, when refseq differ from faked Psls +# $2 = the transcript fasta file +# $3 = the transcript->genome PSL file +# $4 = MINALI, the minimum percent ID, e.g. 0.93 +# $5 = the parasol cluster +# $6 = temporary workdir +# $7 = OUTPUT: the nucleotide PSL output file to create +# $8 = optional: tsv table with mapping from uniprot to transcript # Will always rm -rf the work directory, before and after a run if [ "$1" == "" ]; then echo Please specify a uniprotToTab fasta input file and a db, a gene table, a cluster name and the output file name exit 0 fi -MINALI=0.93 # ideally, we have a table of uniprotID <-> gene model ID, so can decide where to map protein sequences that match identically twice. # but often we don't have known genes tables. # to get an idea of the impact, I compared hg38 with and without known gene tables # # 2385 out of 38931 PSLs are multi-mapping # most of them are mapping twice, but some 35 times # 2 ************************************************************ 1477 # 3 ********** 238 # 4 ***** 126 # 5 **** 109 # 6 *** 84 # 7 ****** 142 # 8 *** 82 # 9 10 @@ -59,91 +56,97 @@ # worst ones: #A6NER0 20 0.042% #Q99706-6 20 0.042% #P43629-2 20 0.042% #P43630-2 28 0.059% #P43630-1 28 0.059% #Q99706-3 30 0.063% #Q99706-4 30 0.063% #Q99706-2 30 0.063% #Q99706-1 30 0.063% #Q8N743 32 0.067% # the input fasta file has to be created by the uniprotToTab parser (originally from the publications track code) UNIPROTFAGZ=$1 -DB=$2 -GENETRACK=$3 -CLUSTER=$4 -OUTFNAME=$5 - -if [[ "$DB" == "ci3" ]]; then - MINALI=0.85 -fi +TRANSCRIPTFA=$2 +TRANSCRIPTPSL=$3 +MINALI=$4 +CLUSTER=$5 +WORKDIR=$6 +OUTFNAME=$7 +PAIRNAME=$8 + +#if [[ "$DB" == "ci3" ]]; then + #MINALI=0.85 +#fi # if you change BLASTDIR, also must change the cluster script mapUniprot_doBlast BLASTDIR=/cluster/bin/blast/x86_64/blast-2.2.16/bin # stop on errors set -e # show commands set -x -WORKDIR=makeUniProtPsl-$2-$3-$4.tmp +#WORKDIR=makeUniProtPsl-$2-$3-$4.tmp #rm -rf $WORKDIR mkdir -p $WORKDIR -# get transcript sequences and a (pseudo) alignment from transcript to genome using the exon information cp ${UNIPROTFAGZ} $WORKDIR/uniProt.fa -if [ ! -f $WORKDIR/transcripts.fa ] ; then - hgsql -Ne 'select name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds from '$GENETRACK' where cdsStart < cdsEnd' $DB > $WORKDIR/transcripts.gp - genePredToFakePsl $DB $WORKDIR/transcripts.gp $WORKDIR/transcripts.psl $WORKDIR/transcripts.cds - getRnaPred $DB $WORKDIR/transcripts.gp all $WORKDIR/transcripts.fa -fi - -if [ "$GENETRACK" == "knownGene" ]; then - # our kg system sometimes keeps the variant number, sometimes it doesn't (e.g. mm10 vs hg19) - #hgsql -Ne 'select spID, kgID from kgXref where spID!=""' $DB | awk '{OFS="\t"; print;} /-/ {split($1,a,"-"); print a[1], $2}' > $WORKDIR/ucscUniProt.pairs - hgsql -Ne 'select spID, kgID from kgXref where spID!=""' $DB > $WORKDIR/ucscUniProt.pairs +cp $TRANSCRIPTFA $WORKDIR/transcripts.fa +cp $TRANSCRIPTPSL $WORKDIR/transcripts.psl +if [ "$PAIRNAME" != "" ] ; then + cp $PAIRNAME $WORKDIR/upToTrans.pairs fi # setup blast uniprot -> transcript cDNAs -if [ ! -d $WORKDIR/aligns ] ; then +if [ -f $WORKDIR/bestAln.psl ] ; then + echo WARNING: re-using existing protein-transcript alignments to save time! see $WORKDIR/bestAln.psl +else mkdir $WORKDIR/queries mkdir $WORKDIR/aligns faSplit about $WORKDIR/uniProt.fa 2500 $WORKDIR/queries/ ${BLASTDIR}/formatdb -i $WORKDIR/transcripts.fa -p F # create joblist and run set +x # quiet for now >$WORKDIR/jobList for i in $WORKDIR/queries/*.fa; do echo "mapUniprot_doBlast transcripts.fa queries/`basename $i` {check out exists aligns/`basename $i .fa`.psl}" >> $WORKDIR/jobList done; set -x cp mapUniprot_doBlast $WORKDIR/ ssh $CLUSTER "cd `pwd`/$WORKDIR && para make jobList" + echo Concatenating and filtering protein/transcript alignments + # sort and pick the best alignments for each protein + find $WORKDIR/aligns -name '*.psl' | xargs cat | pslReps -noIntrons -nohead -nearTop=0.01 -minAli=$MINALI stdin stdout /dev/null > $WORKDIR/bestAln.psl fi -# sort and pick the best alignments for each protein -# when we have a knownGene table, just use our existing uniProt mapping from there -if [ -f $WORKDIR/ucscUniProt.pairs ]; then - # this are very special pslSelect options: +# when we have a list of pairs to filter on, then use it +if [ -f $WORKDIR/upToTrans.pairs ]; then + # these are very special pslSelect options: # - pass through all PSLs for queries that do not appear in our tables (=all Trembl and a few swissprot sequences missed by our tables) # - only compare the part before the dot - find $WORKDIR/aligns -name '*.psl' | xargs cat | pslSelect -qPass -qDelim=. -qtPairs=$WORKDIR/ucscUniProt.pairs stdin stdout | pslReps -noIntrons -nohead -nearTop=0.01 -minAli=$MINALI stdin stdout /dev/null > $WORKDIR/uniProtVsTranscripts.psl -#when we have no known gene table, take the top 1% alignments with 93% query coverage. Hopefully that gives similar results. -else + # - use pslSwap as a workaround, as pslSelect doesn't have -tDelim yet and the query IDs don't have dots in them anyways + # too lazy to patch pslSelect now + cat $WORKDIR/upToTrans.pairs | gawk '{OFS="\t"; print $2, $1; }' > $WORKDIR/transToUp.pairs + pslSwap $WORKDIR/bestAln.psl stdout | pslSelect -qPass -qDelim=. -qtPairs=$WORKDIR/transToUp.pairs stdin stdout | pslSwap stdin $WORKDIR/uniProtVsTranscripts.psl + # when we have no known gene table, blat the proteins directly and + # take the top 1% alignments with 93% query coverage. Hopefully that gives similar results... # inspired by kent/src/hg/makeDb/doc/ucscGenes/hg19.ucscGenes13.csh # as requested by Alejo: lower to 93%, which are the pslReps defaults - find $WORKDIR/aligns -name '*.psl' | xargs cat | pslReps -noIntrons -nohead -nearTop=0.01 -minAli=$MINALI stdin stdout /dev/null > $WORKDIR/uniProtVsTranscripts.psl +else + cp $WORKDIR/bestAln.psl $WORKDIR/uniProtVsTranscripts.psl fi # now combine the two alignments with pslMap -pslMap $WORKDIR/uniProtVsTranscripts.psl $WORKDIR/transcripts.psl $WORKDIR/uniProtVsGenome.psl -# 2016: lowering to 99% identity due to hg38 alt loci sucking up our main (and more important) alignments from the +pslMap $WORKDIR/uniProtVsTranscripts.psl $WORKDIR/transcripts.psl $WORKDIR/uniProtVsGenome.psl -mapInfo=$WORKDIR/mapInfo.tab +# 2016: lowering to 95% identity due to hg38 alt loci sucking up our main (and more important) alignments from the +# 2021: using MINALI is more consistent # normal chromosomes -sort -k10,10 $WORKDIR/uniProtVsGenome.psl | pslCDnaFilter stdin -globalNearBest=0.99 -bestOverlap -filterWeirdOverlapped stdout | sort | uniq > $OUTFNAME -rm -rf $WORKDIR +sort -k10,10 $WORKDIR/uniProtVsGenome.psl | pslCDnaFilter stdin -globalNearBest=$MINALI -bestOverlap -filterWeirdOverlapped stdout | sort | uniq > $OUTFNAME +cp $WORKDIR/mapInfo.tab ${OUTFNAME}.mapInfo +#rm -rf $WORKDIR