61cf5e21f11e752e855b40094d72c0348687c27b angie Wed Jul 25 16:58:53 2018 -0700 Instead of a SQL join to restrict ncbiRefSeqPsl rows to ncbiRefSeqCurated or Predicted subset, use an ID hash because when a transcript has multiple placements on main and _alt chroms, the join was producing redundant results. refs #21770 diff --git src/hg/lib/annoStreamDbPslPlus.c src/hg/lib/annoStreamDbPslPlus.c index 9ec4612..bd73dc8 100644 --- src/hg/lib/annoStreamDbPslPlus.c +++ src/hg/lib/annoStreamDbPslPlus.c @@ -34,44 +34,44 @@ " uint[blockCount] tStarts; \"Start of each block in target.\"" " string cds; \"CDS start and end in transcript (if applicable)\"" " string protAcc; \"Protein accession (if applicable)\"" " string name2; \"Gene symbolic name\"" " string path; \"Path to FASTA file containing transcript sequence\"" " uint fileOffset; \"Offset of transcript record in FASTA file\"" " uint fileSize; \"Number of bytes of transcript record in FASTA file\"" " )"; struct annoStreamDbPslPlus { struct annoStreamer streamer; // Parent class members & methods (external interface) // Private members char *gpTable; // Associated genePred (refGene, ncbiRefSeqCurated etc) struct annoStreamer *mySource; // Internal source of PSL+CDS+seq info + struct hash *idHash; // Used to restrict PSL query result to curated/predicted }; // select p.*, c.cds, l.protAcc, l.name, e.path, s.file_offset, s.file_size -// from (((ncbiRefSeqPsl p join ncbiRefSeqCurated n on p.qName = n.name) +// from (((ncbiRefSeqPsl p // NOT ANYMORE (#21770): join ncbiRefSeqCurated n on p.qName = n.name) // left join ncbiRefSeqCds c on p.qName = c.id) // join ncbiRefSeqLink l on p.qName = l.mrnaAcc) // left join (seqNcbiRefSeq s left join extNcbiRefSeq e on s.extFile = e.id) on p.qName = s.acc // where p.tName = "chr1" // order by p.tName, p.tStart // limit 5; static char *ncbiRefSeqConfigJsonFormat = "{ \"naForMissing\": false," - " \"rightJoinTable\": \"%s\"," " \"relatedTables\": [ { \"table\": \"ncbiRefSeqCds\"," " \"fields\": [\"cds\"] }," " { \"table\": \"ncbiRefSeqLink\"," " \"fields\": [\"protAcc\", \"name\"] }," " { \"table\": \"extNcbiRefSeq\"," " \"fields\": [\"path\"] }," " { \"table\": \"seqNcbiRefSeq\"," " \"fields\": [\"file_offset\", \"file_size\"] } ] }"; //select p.*,c.name,l.protAcc,l.name,e.path,s.file_offset,s.file_size, i.version //from refSeqAli p // join (hgFixed.gbCdnaInfo i // left join hgFixed.cds c on i.cds = c.id) on i.acc = p.qName // left join (hgFixed.gbSeq s // join hgFixed.gbExtFile e on e.id = s.gbExtFile) on s.acc = p.qName @@ -107,31 +107,35 @@ self->mySource->setRegion(self->mySource, chrom, rStart, rEnd); } static struct annoRow *asdppNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd, struct lm *lm) /* Return next psl+ row. */ { struct annoStreamDbPslPlus *self = (struct annoStreamDbPslPlus *)sSelf; char **ppWords; lmAllocArray(lm, ppWords, sSelf->numCols); struct annoRow *ppRow; boolean rightJoinFail = FALSE; while ((ppRow = self->mySource->nextRow(self->mySource, minChrom, minEnd, lm)) != NULL) { ppWords = ppRow->data; - // If there are filters on experiment attributes, apply them, otherwise just return aRow. + // If self->idHash is non-NULL, check PSL qName; skip this row if qName not found. + char *qName = ppWords[9]; + if (self->idHash && ! hashLookup(self->idHash, qName)) + continue; + // If there are filters, apply them, otherwise just return aRow. if (sSelf->filters) { boolean fails = annoFilterRowFails(sSelf->filters, ppWords, sSelf->numCols, &rightJoinFail); // If this row passes the filter, or fails but is rightJoin, then we're done looking. if (!fails || rightJoinFail) break; } else // no filtering to do, just use this row break; } if (ppRow != NULL) return annoRowFromStringArray(ppRow->chrom, ppRow->start, ppRow->end, rightJoinFail, ppWords, sSelf->numCols, lm); @@ -218,26 +222,35 @@ struct dyString *dy = dyStringCreate(ncbiRefSeqConfigJsonFormat, gpTable); configJson = dyStringCannibalize(&dy); } else errAbort("annoStreamDbPslPlusNew: unrecognized table \"%s\"", gpTable); struct annoStreamDbPslPlus *self; AllocVar(self); // Get internal streamer for joining PSL with other tables. struct jsonElement *config = jsonParse(configJson); if (extraConfig) jsonObjectMerge(config, extraConfig); self->mySource = annoStreamDbNew(aa->name, pslTable, aa, maxOutRows, config); struct asObject *asObj = annoStreamDbPslPlusAsObj(); if (extraConfig) asObjAppendExtraColumns(asObj, self->mySource->asObj); +if (sameString("ncbiRefSeqCurated", gpTable) || sameString("ncbiRefSeqPredicted", gpTable)) + { + // Load up an ID hash to restrict PSL query results to the curated/predicted subset: + struct sqlConnection *conn = hAllocConn(aa->name); + char query[1024]; + sqlSafef(query, sizeof(query), "select name, 1 from %s", gpTable); + self->idHash = sqlQuickHash(conn, query); + hFreeConn(&conn); + } // Set up external streamer interface struct annoStreamer *streamer = &(self->streamer); annoStreamerInit(streamer, aa, asObj, pslTable); streamer->rowType = arWords; self->gpTable = cloneString(gpTable); // Override methods that need to pass through to internal source: streamer->setRegion = asdppSetRegion; streamer->nextRow = asdppNextRow; streamer->close = asdppClose; return (struct annoStreamer *)self; }