3f06ffc10df40e72dfa18a150eeca23fda6b41df angie Thu Feb 8 13:42:04 2018 -0800 annoStreamDbPslPlus internally used a json config to join in CDS and sequence with the underlying PSL -- so annoStreamDbPslPlusNew did not accept an incoming config, so hgVai's RefSeq Status option was ignored. Add jsonObjectMerge so that the config from hgVai can be merged with the internal config, and make sure that the extra columns from the internal annoStreamDb are added to annoStreamDbPslPlusNew's asObj. We can't just use the internal annoStreamDb's asObj because the field names for CDS etc. fields would differ for refSeqAli vs ncbiRefSeq and column-matching to recognize annoStreamDbPslPlus would fail. refs #20948 diff --git src/hg/lib/annoStreamDbPslPlus.c src/hg/lib/annoStreamDbPslPlus.c index 1ce1dba..4e1b9b6 100644 --- src/hg/lib/annoStreamDbPslPlus.c +++ src/hg/lib/annoStreamDbPslPlus.c @@ -1,186 +1,243 @@ /* annoStreamDbPslPlus -- subclass of annoStreamer for joining PSL+CDS+seq database tables */ /* Copyright (C) 2017 The Regents of the University of California * See README in this or parent directory for licensing information. 
*/ #include "annoStreamDbPslPlus.h" #include "annoStreamDb.h" #include "hdb.h" static char *pslPlusAutoSqlString = "table pslPlus" "\"transcript PSL, CDS and seq info\"" " (" " uint match; \"Number of bases that match that aren't repeats\"" " uint misMatch; \"Number of bases that don't match\"" " uint repMatch; \"Number of bases that match but are part of repeats\"" " uint nCount; \"Number of 'N' bases\"" " uint qNumInsert; \"Number of inserts in query (transcript)\"" " int qBaseInsert; \"Number of bases inserted in query (transcript)\"" " uint tNumInsert; \"Number of inserts in target (chromosome/scaffold)\"" " int tBaseInsert; \"Number of bases inserted in target (chromosome/scaffold)\"" " char[2] strand; \"+ or - for query strand (transcript to genome orientation)\"" " string qName; \"Transcript accession\"" " uint qSize; \"Transcript sequence size\"" " uint qStart; \"Alignment start position in query (transcript)\"" " uint qEnd; \"Alignment end position in query (transcript)\"" " string tName; \"Target (chromosome/scaffold) name\"" " uint tSize; \"Target (chromosome/scaffold) size\"" " uint tStart; \"Alignment start position in target\"" " uint tEnd; \"Alignment end position in target\"" " uint blockCount; \"Number of blocks in alignment\"" " uint[blockCount] blockSizes; \"Size of each block\"" " uint[blockCount] qStarts; \"Start of each block in query.\"" " uint[blockCount] tStarts; \"Start of each block in target.\"" " string cds; \"CDS start and end in transcript (if applicable)\"" " string protAcc; \"Protein accession (if applicable)\"" " string name2; \"Gene symbolic name\"" " string path; \"Path to FASTA file containing transcript sequence\"" " uint fileOffset; \"Offset of transcript record in FASTA file\"" " uint fileSize; \"Number of bytes of transcript record in FASTA file\"" " )"; struct annoStreamDbPslPlus { struct annoStreamer streamer; // Parent class members & methods (external interface) // Private members char *gpTable; // Associated genePred 
(refGene, ncbiRefSeqCurated etc) struct annoStreamer *mySource; // Internal source of PSL+CDS+seq info }; // select p.*, c.cds, l.protAcc, l.name, e.path, s.file_offset, s.file_size // from (((ncbiRefSeqPsl p join ncbiRefSeqCurated n on p.qName = n.name) // left join ncbiRefSeqCds c on p.qName = c.id) // join ncbiRefSeqLink l on p.qName = l.mrnaAcc) // left join (seqNcbiRefSeq s left join extNcbiRefSeq e on s.extFile = e.id) on p.qName = s.acc // where p.tName = "chr1" // order by p.tName, p.tStart // limit 5; static char *ncbiRefSeqConfigJsonFormat = "{ \"naForMissing\": false," " \"rightJoinDtf\": \"%s.\%s.name\"," " \"rightJoinMainField\": \"qName\"," " \"relatedTables\": [ { \"table\": \"ncbiRefSeqCds\"," " \"fields\": [\"cds\"] }," " { \"table\": \"ncbiRefSeqLink\"," " \"fields\": [\"protAcc\", \"name\"] }," " { \"table\": \"extNcbiRefSeq\"," " \"fields\": [\"path\"] }," " { \"table\": \"seqNcbiRefSeq\"," " \"fields\": [\"file_offset\", \"file_size\"] } ] }"; //select p.*,c.name,l.protAcc,l.name,e.path,s.file_offset,s.file_size, i.version //from refSeqAli p // join (hgFixed.gbCdnaInfo i // left join hgFixed.cds c on i.cds = c.id) on i.acc = p.qName // left join (hgFixed.gbSeq s // join hgFixed.gbExtFile e on e.id = s.gbExtFile) on s.acc = p.qName // join hgFixed.refLink l on p.qName = l.mrnaAcc // where p.tName = "chr1" // order by p.tName, p.tStart // limit 5; static char *refSeqAliConfigJson = "{ \"naForMissing\": false," " \"relatedTables\": [ { \"table\": \"hgFixed.cds\"," " \"fields\": [\"name\"] }," " { \"table\": \"hgFixed.refLink\"," " \"fields\": [\"protAcc\", \"name\"] }," " { \"table\": \"hgFixed.gbExtFile\"," " \"fields\": [\"path\"] }," " { \"table\": \"hgFixed.gbSeq\"," " \"fields\": [\"file_offset\", \"file_size\"] } ] }"; struct asObject *annoStreamDbPslPlusAsObj() -/* Return an autoSql object with PSL, gene name, protein acc, CDS and sequence file info fields. 
*/ +/* Return an autoSql object with PSL, gene name, protein acc, CDS and sequence file info fields. + * An annoStreamDbPslPlus instance may return additional columns if configured, but + * these columns will always be present. */ { return asParseText(pslPlusAutoSqlString); } static void asdppSetRegion(struct annoStreamer *sSelf, char *chrom, uint rStart, uint rEnd) /* Pass setRegion down to internal source. */ { annoStreamerSetRegion(sSelf, chrom, rStart, rEnd); struct annoStreamDbPslPlus *self = (struct annoStreamDbPslPlus *)sSelf; self->mySource->setRegion(self->mySource, chrom, rStart, rEnd); } static struct annoRow *asdppNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd, struct lm *lm) /* Return next psl+ row. */ { struct annoStreamDbPslPlus *self = (struct annoStreamDbPslPlus *)sSelf; char **ppWords; -lmAllocArray(lm, ppWords, PSLPLUS_NUM_COLS); +lmAllocArray(lm, ppWords, sSelf->numCols); struct annoRow *ppRow; boolean rightJoinFail = FALSE; while ((ppRow = self->mySource->nextRow(self->mySource, minChrom, minEnd, lm)) != NULL) { ppWords = ppRow->data; // If there are filters on experiment attributes, apply them, otherwise just return aRow. if (sSelf->filters) { - boolean fails = annoFilterRowFails(sSelf->filters, ppWords, PSLPLUS_NUM_COLS, + boolean fails = annoFilterRowFails(sSelf->filters, ppWords, sSelf->numCols, &rightJoinFail); // If this row passes the filter, or fails but is rightJoin, then we're done looking. if (!fails || rightJoinFail) break; } else // no filtering to do, just use this row break; } if (ppRow != NULL) return annoRowFromStringArray(ppRow->chrom, ppRow->start, ppRow->end, rightJoinFail, - ppWords, PSLPLUS_NUM_COLS, lm); + ppWords, sSelf->numCols, lm); else return NULL; } static void asdppClose(struct annoStreamer **pSSelf) /* Free up state.
*/ { if (pSSelf == NULL) return; struct annoStreamDbPslPlus *self = *(struct annoStreamDbPslPlus **)pSSelf; freez(&self->gpTable); self->mySource->close(&(self->mySource)); annoStreamerFree(pSSelf); } -struct annoStreamer *annoStreamDbPslPlusNew(struct annoAssembly *aa, char *gpTable, int maxOutRows) +static struct asColumn *asColumnClone(struct asColumn *colIn) +/* Return a full clone of colIn, or NULL if colIn is NULL. */ +{ +if (colIn == NULL) + return NULL; +if (colIn->obType != NULL || colIn->index != NULL) + errAbort("asColumnClone: support for obType and index not implemented"); +struct asColumn *colOut; +AllocVar(colOut); +colOut->name = cloneString(colIn->name); +colOut->comment = cloneString(colIn->comment); +colOut->lowType = colIn->lowType; // static struct in asParse.c +colOut->obName = cloneString(colIn->obName); +colOut->fixedSize = colIn->fixedSize; +colOut->linkedSizeName = cloneString(colIn->linkedSizeName); +colOut->linkedSize = asColumnClone(colIn->linkedSize); +colOut->isSizeLink = colIn->isSizeLink; +colOut->isList = colIn->isList; +colOut->isArray = colIn->isArray; +colOut->autoIncrement = colIn->autoIncrement; +colOut->values = slNameCloneList(colIn->values); +return colOut; +} + +static void asObjAppendExtraColumns(struct asObject *asObjTarget, struct asObject *asObjSource) +/* If asObjSource has more columns than asObjTarget then clone and append those additional columns + * to asObjTarget. 
*/ +{ +int tColCount = slCount(asObjTarget->columnList); +int sColCount = slCount(asObjSource->columnList); +if (tColCount < 1) + errAbort("asObjAppendExtraColumns: support for empty target columnList not implemented"); +if (sColCount > tColCount) + { + struct asColumn *tCol = asObjTarget->columnList, *sCol = asObjSource->columnList; + int i; + for (i = 0; i < tColCount-1; i++) + { + tCol = tCol->next; + sCol = sCol->next; + } + while (sCol->next != NULL) + { + tCol->next = asColumnClone(sCol->next); + tCol = tCol->next; + sCol = sCol->next; + } + } +} + +struct annoStreamer *annoStreamDbPslPlusNew(struct annoAssembly *aa, char *gpTable, int maxOutRows, + struct jsonElement *extraConfig) /* Create an annoStreamer (subclass) object that streams PSL, CDS and seqFile info. * gpTable is a genePred table that has associated PSL, CDS and sequence info * (i.e. refGene, ncbiRefSeq, ncbiRefSeqCurated or ncbiRefSeqPredicted). */ { char *pslTable = NULL, *configJson = NULL; if (sameString("refGene", gpTable)) { pslTable = "refSeqAli"; configJson = refSeqAliConfigJson; } else if (startsWith("ncbiRefSeq", gpTable)) { pslTable = "ncbiRefSeqPsl"; struct dyString *dy = dyStringCreate(ncbiRefSeqConfigJsonFormat, aa->name, gpTable); configJson = dyStringCannibalize(&dy); } else errAbort("annoStreamDbPslPlusNew: unrecognized table \"%s\"", gpTable); struct annoStreamDbPslPlus *self; AllocVar(self); -struct annoStreamer *streamer = &(self->streamer); +// Get internal streamer for joining PSL with other tables. 
+struct jsonElement *config = jsonParse(configJson); +jsonObjectMerge(config, extraConfig); +self->mySource = annoStreamDbNew(aa->name, pslTable, aa, maxOutRows, config); +struct asObject *asObj = annoStreamDbPslPlusAsObj(); +if (extraConfig) + asObjAppendExtraColumns(asObj, self->mySource->asObj); // Set up external streamer interface -annoStreamerInit(streamer, aa, annoStreamDbPslPlusAsObj(), pslTable); +struct annoStreamer *streamer = &(self->streamer); +annoStreamerInit(streamer, aa, asObj, pslTable); streamer->rowType = arWords; self->gpTable = cloneString(gpTable); -// Get internal streamer for joining PSL with other tables. -struct jsonElement *configEl = jsonParse(configJson); -self->mySource = annoStreamDbNew(aa->name, pslTable, aa, maxOutRows, configEl); // Override methods that need to pass through to internal source: streamer->setRegion = asdppSetRegion; streamer->nextRow = asdppNextRow; streamer->close = asdppClose; return (struct annoStreamer *)self; }