d0f46600984eb50b3ede404e5fa240093c68ce68 angie Thu Sep 28 10:10:57 2017 -0700 annoStreamDbPslPlus: join the appropriate tables for either refGene or ncbiRefSeq* to get PSL+CDS+seq info. Still need to implement the right-join support in annoStreamDb! diff --git src/hg/lib/annoStreamDbPslPlus.c src/hg/lib/annoStreamDbPslPlus.c new file mode 100644 index 0000000..2e633e4 --- /dev/null +++ src/hg/lib/annoStreamDbPslPlus.c @@ -0,0 +1,185 @@ +/* annoStreamDbPslPlus -- subclass of annoStreamer for joining PSL+CDS+seq database tables */ + +/* Copyright (C) 2017 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + +#include "annoStreamDbPslPlus.h" +#include "annoStreamDb.h" +#include "hdb.h" + +static char *pslPlusAutoSqlString = +"table pslPlus" +"\"transcript PSL, CDS and seq info\"" +" (" +" uint match; \"Number of bases that match that aren't repeats\"" +" uint misMatch; \"Number of bases that don't match\"" +" uint repMatch; \"Number of bases that match but are part of repeats\"" +" uint nCount; \"Number of 'N' bases\"" +" uint qNumInsert; \"Number of inserts in query (transcript)\"" +" int qBaseInsert; \"Number of bases inserted in query (transcript)\"" +" uint tNumInsert; \"Number of inserts in target (chromosome/scaffold)\"" +" int tBaseInsert; \"Number of bases inserted in target (chromosome/scaffold)\"" +" char[2] strand; \"+ or - for query strand (transcript to genome orientation)\"" +" string qName; \"Transcript accession\"" +" uint qSize; \"Transcript sequence size\"" +" uint qStart; \"Alignment start position in query (transcript)\"" +" uint qEnd; \"Alignment end position in query (transcript)\"" +" string tName; \"Target (chromosome/scaffold) name\"" +" uint tSize; \"Target (chromosome/scaffold) size\"" +" uint tStart; \"Alignment start position in target\"" +" uint tEnd; \"Alignment end position in target\"" +" uint blockCount; \"Number of blocks in alignment\"" +" uint[blockCount] blockSizes; \"Size of each block\"" +" uint[blockCount] qStarts; \"Start of each block in query.\"" +" uint[blockCount] tStarts; \"Start of each block in target.\"" +" string cds; \"CDS start and end in transcript (if applicable)\"" +" string protAcc; \"Protein accession (if applicable)\"" +" string name2; \"Gene symbolic name\"" +" string path; \"Path to FASTA file containing transcript sequence\"" +" uint fileOffset; \"Offset of transcript record in FASTA file\"" +" uint fileSize; \"Number of bytes of transcript record in FASTA file\"" +" )"; + +struct annoStreamDbPslPlus + { + struct annoStreamer streamer; // Parent class members & methods (external interface) + // Private members + char *gpTable; // Associated genePred (refGene, ncbiRefSeqCurated etc) + struct annoStreamer *mySource; // Internal source of PSL+CDS+seq info + }; + +// select p.*, c.cds, l.protAcc, l.name, e.path, s.file_offset, s.file_size +// from (((ncbiRefSeqPsl p join ncbiRefSeqCurated n on p.qName = n.name) +// left join ncbiRefSeqCds c on p.qName = c.id) +// join ncbiRefSeqLink l on p.qName = l.mrnaAcc) +// left join (seqNcbiRefSeq s left join extNcbiRefSeq e on s.extFile = e.id) on p.qName = s.acc +// where p.tName = "chr1" +// order by p.tName, p.tStart +// limit 5; + +static char *ncbiRefSeqConfigJsonFormat = + "{ \"naForMissing\": false," + " \"rightJoinTable\": \"%s\"," + " \"relatedTables\": [ { \"table\": \"ncbiRefSeqCds\"," + " \"fields\": [\"cds\"] }," + " { \"table\": \"ncbiRefSeqLink\"," + " \"fields\": [\"protAcc\", \"name\"] }," + " { \"table\": \"extNcbiRefSeq\"," + " \"fields\": [\"path\"] }," + " { \"table\": \"seqNcbiRefSeq\"," + " \"fields\": [\"file_offset\", \"file_size\"] } ] }"; + +//select p.*,c.name,l.protAcc,l.name,e.path,s.file_offset,s.file_size, i.version +//from refSeqAli p +// join (hgFixed.gbCdnaInfo i +// left join hgFixed.cds c on i.cds = c.id) on i.acc = p.qName +// left join (hgFixed.gbSeq s +// join hgFixed.gbExtFile e on e.id = s.gbExtFile) on s.acc = p.qName +// join hgFixed.refLink l on p.qName = l.mrnaAcc +// where p.tName = "chr1" +// order by p.tName, p.tStart +// limit 5; + +static char *refSeqAliConfigJson = + "{ \"naForMissing\": false," + " \"relatedTables\": [ { \"table\": \"hgFixed.cds\"," + " \"fields\": [\"name\"] }," + " { \"table\": \"hgFixed.refLink\"," + " \"fields\": [\"protAcc\", \"name\"] }," + " { \"table\": \"hgFixed.gbExtFile\"," + " \"fields\": [\"path\"] }," + " { \"table\": \"hgFixed.gbSeq\"," + " \"fields\": [\"file_offset\", \"file_size\"] } ] }"; + +struct asObject *annoStreamDbPslPlusAsObj() +/* Return an autoSql object with PSL, gene name, protein acc, CDS and sequence file info fields. */ +{ +return asParseText(pslPlusAutoSqlString); +} + +static void asdppSetRegion(struct annoStreamer *sSelf, char *chrom, uint rStart, uint rEnd) +/* Pass setRegion down to internal source. */ +{ +annoStreamerSetRegion(sSelf, chrom, rStart, rEnd); +struct annoStreamDbPslPlus *self = (struct annoStreamDbPslPlus *)sSelf; +self->mySource->setRegion(self->mySource, chrom, rStart, rEnd); +} + +static struct annoRow *asdppNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd, + struct lm *lm) +/* Return next psl+ row. */ +{ +struct annoStreamDbPslPlus *self = (struct annoStreamDbPslPlus *)sSelf; +char **ppWords; +lmAllocArray(lm, ppWords, PSLPLUS_NUM_COLS); +struct annoRow *ppRow; +boolean rightJoinFail = FALSE; +while ((ppRow = self->mySource->nextRow(self->mySource, minChrom, minEnd, lm)) != NULL) + { + ppWords = ppRow->data; + // If there are filters on experiment attributes, apply them, otherwise just return aRow. + if (sSelf->filters) + { + boolean fails = annoFilterRowFails(sSelf->filters, ppWords, PSLPLUS_NUM_COLS, + &rightJoinFail); + // If this row passes the filter, or fails but is rightJoin, then we're done looking. + if (!fails || rightJoinFail) + break; + } + else + // no filtering to do, just use this row + break; + } +if (ppRow != NULL) + return annoRowFromStringArray(ppRow->chrom, ppRow->start, ppRow->end, rightJoinFail, + ppWords, PSLPLUS_NUM_COLS, lm); +else + return NULL; +} + +static void asdppClose(struct annoStreamer **pSSelf) +/* Free up state. */ +{ +if (pSSelf == NULL) + return; +struct annoStreamDbPslPlus *self = *(struct annoStreamDbPslPlus **)pSSelf; +freez(&self->gpTable); +self->mySource->close(&(self->mySource)); +annoStreamerFree(pSSelf); +} + +struct annoStreamer *annoStreamDbPslPlusNew(struct annoAssembly *aa, char *gpTable, int maxOutRows) +/* Create an annoStreamer (subclass) object that streams PSL, CDS and seqFile info. + * gpTable is a genePred table that has associated PSL, CDS and sequence info + * (i.e. refGene, ncbiRefSeq, ncbiRefSeqCurated or ncbiRefSeqPredicted). */ +{ +char *pslTable = NULL, *configJson = NULL; +if (sameString("refGene", gpTable)) + { + pslTable = "refSeqAli"; + configJson = refSeqAliConfigJson; + } +else if (startsWith("ncbiRefSeq", gpTable)) + { + pslTable = "ncbiRefSeqPsl"; + struct dyString *dy = dyStringCreate(ncbiRefSeqConfigJsonFormat, gpTable); + configJson = dyStringCannibalize(&dy); + } +else + errAbort("annoStreamDbPslPlusNew: unrecognized table \"%s\"", gpTable); +struct annoStreamDbPslPlus *self; +AllocVar(self); +struct annoStreamer *streamer = &(self->streamer); +// Set up external streamer interface +annoStreamerInit(streamer, aa, annoStreamDbPslPlusAsObj(), pslTable); +streamer->rowType = arWords; +self->gpTable = cloneString(gpTable); +// Get internal streamer for joining PSL with other tables. +struct jsonElement *configEl = jsonParse(configJson); +self->mySource = annoStreamDbNew(aa->name, pslTable, aa, maxOutRows, configEl); +// Override methods that need to pass through to internal source: +streamer->setRegion = asdppSetRegion; +streamer->nextRow = asdppNextRow; +streamer->close = asdppClose; +return (struct annoStreamer *)self; +}