13dddf36dc0f43a27a792519213d87939b51d99c angie Wed Jul 25 16:23:50 2018 -0700 Revert "Use the new annoStreamDb params rightJoinDtf and rightJoinMainField to restrict ncbiRefSeqPsl rows to only those that appear in the selected genePred table (e.g. ncbiRefSeqCurated)." This reverts commit dd285f81141e45dabf1caf0f90723810f2c6f8a1. diff --git src/hg/lib/annoStreamDbPslPlus.c src/hg/lib/annoStreamDbPslPlus.c index 0516169..9ec4612 100644 --- src/hg/lib/annoStreamDbPslPlus.c +++ src/hg/lib/annoStreamDbPslPlus.c @@ -1,244 +1,243 @@ /* annoStreamDbPslPlus -- subclass of annoStreamer for joining PSL+CDS+seq database tables */ /* Copyright (C) 2017 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "annoStreamDbPslPlus.h" #include "annoStreamDb.h" #include "hdb.h" static char *pslPlusAutoSqlString = "table pslPlus" "\"transcript PSL, CDS and seq info\"" " (" " uint match; \"Number of bases that match that aren't repeats\"" " uint misMatch; \"Number of bases that don't match\"" " uint repMatch; \"Number of bases that match but are part of repeats\"" " uint nCount; \"Number of 'N' bases\"" " uint qNumInsert; \"Number of inserts in query (transcript)\"" " int qBaseInsert; \"Number of bases inserted in query (transcript)\"" " uint tNumInsert; \"Number of inserts in target (chromosome/scaffold)\"" " int tBaseInsert; \"Number of bases inserted in target (chromosome/scaffold)\"" " char[2] strand; \"+ or - for query strand (transcript to genome orientation)\"" " string qName; \"Transcript accession\"" " uint qSize; \"Transcript sequence size\"" " uint qStart; \"Alignment start position in query (transcript)\"" " uint qEnd; \"Alignment end position in query (transcript)\"" " string tName; \"Target (chromosome/scaffold) name\"" " uint tSize; \"Target (chromosome/scaffold) size\"" " uint tStart; \"Alignment start position in target\"" " uint tEnd; \"Alignment end position in target\"" " uint blockCount; \"Number of blocks in alignment\"" " uint[blockCount] blockSizes; \"Size of each block\"" " uint[blockCount] qStarts; \"Start of each block in query.\"" " uint[blockCount] tStarts; \"Start of each block in target.\"" " string cds; \"CDS start and end in transcript (if applicable)\"" " string protAcc; \"Protein accession (if applicable)\"" " string name2; \"Gene symbolic name\"" " string path; \"Path to FASTA file containing transcript sequence\"" " uint fileOffset; \"Offset of transcript record in FASTA file\"" " uint fileSize; \"Number of bytes of transcript record in FASTA file\"" " )"; struct annoStreamDbPslPlus { struct annoStreamer streamer; // Parent class members & methods (external interface) // Private members char *gpTable; // Associated genePred (refGene, ncbiRefSeqCurated etc) struct annoStreamer *mySource; // Internal source of PSL+CDS+seq info }; // select p.*, c.cds, l.protAcc, l.name, e.path, s.file_offset, s.file_size // from (((ncbiRefSeqPsl p join ncbiRefSeqCurated n on p.qName = n.name) // left join ncbiRefSeqCds c on p.qName = c.id) // join ncbiRefSeqLink l on p.qName = l.mrnaAcc) // left join (seqNcbiRefSeq s left join extNcbiRefSeq e on s.extFile = e.id) on p.qName = s.acc // where p.tName = "chr1" // order by p.tName, p.tStart // limit 5; static char *ncbiRefSeqConfigJsonFormat = "{ \"naForMissing\": false," - " \"rightJoinDtf\": \"%s.\%s.name\"," - " \"rightJoinMainField\": \"qName\"," + " \"rightJoinTable\": \"%s\"," " \"relatedTables\": [ { \"table\": \"ncbiRefSeqCds\"," " \"fields\": [\"cds\"] }," " { \"table\": \"ncbiRefSeqLink\"," " \"fields\": [\"protAcc\", \"name\"] }," " { \"table\": \"extNcbiRefSeq\"," " \"fields\": [\"path\"] }," " { \"table\": \"seqNcbiRefSeq\"," " \"fields\": [\"file_offset\", \"file_size\"] } ] }"; //select p.*,c.name,l.protAcc,l.name,e.path,s.file_offset,s.file_size, i.version //from refSeqAli p // join (hgFixed.gbCdnaInfo i // left join hgFixed.cds c on i.cds = c.id) on i.acc = p.qName // left join (hgFixed.gbSeq s // join hgFixed.gbExtFile e on e.id = s.gbExtFile) on s.acc = p.qName // join hgFixed.refLink l on p.qName = l.mrnaAcc // where p.tName = "chr1" // order by p.tName, p.tStart // limit 5; static char *refSeqAliConfigJson = "{ \"naForMissing\": false," " \"relatedTables\": [ { \"table\": \"hgFixed.cds\"," " \"fields\": [\"name\"] }," " { \"table\": \"hgFixed.refLink\"," " \"fields\": [\"protAcc\", \"name\"] }," " { \"table\": \"hgFixed.gbExtFile\"," " \"fields\": [\"path\"] }," " { \"table\": \"hgFixed.gbSeq\"," " \"fields\": [\"file_offset\", \"file_size\"] } ] }"; struct asObject *annoStreamDbPslPlusAsObj() /* Return an autoSql object with PSL, gene name, protein acc, CDS and sequence file info fields. * An annoStreamDbPslPlus instance may return additional additional columns if configured, but * these columns will always be present. */ { return asParseText(pslPlusAutoSqlString); } static void asdppSetRegion(struct annoStreamer *sSelf, char *chrom, uint rStart, uint rEnd) /* Pass setRegion down to internal source. */ { annoStreamerSetRegion(sSelf, chrom, rStart, rEnd); struct annoStreamDbPslPlus *self = (struct annoStreamDbPslPlus *)sSelf; self->mySource->setRegion(self->mySource, chrom, rStart, rEnd); } static struct annoRow *asdppNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd, struct lm *lm) /* Return next psl+ row. */ { struct annoStreamDbPslPlus *self = (struct annoStreamDbPslPlus *)sSelf; char **ppWords; lmAllocArray(lm, ppWords, sSelf->numCols); struct annoRow *ppRow; boolean rightJoinFail = FALSE; while ((ppRow = self->mySource->nextRow(self->mySource, minChrom, minEnd, lm)) != NULL) { ppWords = ppRow->data; // If there are filters on experiment attributes, apply them, otherwise just return aRow. if (sSelf->filters) { boolean fails = annoFilterRowFails(sSelf->filters, ppWords, sSelf->numCols, &rightJoinFail); // If this row passes the filter, or fails but is rightJoin, then we're done looking. if (!fails || rightJoinFail) break; } else // no filtering to do, just use this row break; } if (ppRow != NULL) return annoRowFromStringArray(ppRow->chrom, ppRow->start, ppRow->end, rightJoinFail, ppWords, sSelf->numCols, lm); else return NULL; } static void asdppClose(struct annoStreamer **pSSelf) /* Free up state. */ { if (pSSelf == NULL) return; struct annoStreamDbPslPlus *self = *(struct annoStreamDbPslPlus **)pSSelf; freez(&self->gpTable); self->mySource->close(&(self->mySource)); annoStreamerFree(pSSelf); } static struct asColumn *asColumnClone(struct asColumn *colIn) /* Return a full clone of colIn, or NULL if colIn is NULL. */ { if (colIn == NULL) return NULL; if (colIn->obType != NULL || colIn->index != NULL) errAbort("asColumnClone: support for obType and index not implemented"); struct asColumn *colOut; AllocVar(colOut); colOut->name = cloneString(colIn->name); colOut->comment = cloneString(colIn->comment); colOut->lowType = colIn->lowType; // static struct in asParse.c colOut->obName = cloneString(colIn->obName); colOut->fixedSize = colIn->fixedSize; colOut->linkedSizeName = cloneString(colIn->linkedSizeName); colOut->linkedSize = asColumnClone(colIn->linkedSize); colOut->isSizeLink = colIn->isSizeLink; colOut->isList = colIn->isList; colOut->isArray = colIn->isArray; colOut->autoIncrement = colIn->autoIncrement; colOut->values = slNameCloneList(colIn->values); return colOut; } static void asObjAppendExtraColumns(struct asObject *asObjTarget, struct asObject *asObjSource) /* If asObjSource has more columns than asObjTarget then clone and append those additional columns * to asObjTarget. */ { int tColCount = slCount(asObjTarget->columnList); int sColCount = slCount(asObjSource->columnList); if (tColCount < 1) errAbort("asObjAppendExtraColumns: support for empty target columnList not implemented"); if (sColCount > tColCount) { struct asColumn *tCol = asObjTarget->columnList, *sCol = asObjSource->columnList; int i; for (i = 0; i < tColCount-1; i++) { tCol = tCol->next; sCol = sCol->next; } while (sCol->next != NULL) { tCol->next = asColumnClone(sCol->next); tCol = tCol->next; sCol = sCol->next; } } } struct annoStreamer *annoStreamDbPslPlusNew(struct annoAssembly *aa, char *gpTable, int maxOutRows, struct jsonElement *extraConfig) /* Create an annoStreamer (subclass) object that streams PSL, CDS and seqFile info. * gpTable is a genePred table that has associated PSL, CDS and sequence info * (i.e. refGene, ncbiRefSeq, ncbiRefSeqCurated or ncbiRefSeqPredicted). */ { char *pslTable = NULL, *configJson = NULL; if (sameString("refGene", gpTable)) { pslTable = "refSeqAli"; configJson = refSeqAliConfigJson; } else if (startsWith("ncbiRefSeq", gpTable)) { pslTable = "ncbiRefSeqPsl"; - struct dyString *dy = dyStringCreate(ncbiRefSeqConfigJsonFormat, aa->name, gpTable); + struct dyString *dy = dyStringCreate(ncbiRefSeqConfigJsonFormat, gpTable); configJson = dyStringCannibalize(&dy); } else errAbort("annoStreamDbPslPlusNew: unrecognized table \"%s\"", gpTable); struct annoStreamDbPslPlus *self; AllocVar(self); // Get internal streamer for joining PSL with other tables. struct jsonElement *config = jsonParse(configJson); if (extraConfig) jsonObjectMerge(config, extraConfig); self->mySource = annoStreamDbNew(aa->name, pslTable, aa, maxOutRows, config); struct asObject *asObj = annoStreamDbPslPlusAsObj(); if (extraConfig) asObjAppendExtraColumns(asObj, self->mySource->asObj); // Set up external streamer interface struct annoStreamer *streamer = &(self->streamer); annoStreamerInit(streamer, aa, asObj, pslTable); streamer->rowType = arWords; self->gpTable = cloneString(gpTable); // Override methods that need to pass through to internal source: streamer->setRegion = asdppSetRegion; streamer->nextRow = asdppNextRow; streamer->close = asdppClose; return (struct annoStreamer *)self; }