src/hg/lib/annoStreamDbPslPlus.c 61cf5e21f11e752e855b40094d72c0348687c27b

61cf5e21f11e752e855b40094d72c0348687c27b
angie
  Wed Jul 25 16:58:53 2018 -0700
Instead of a SQL join to restrict ncbiRefSeqPsl rows to ncbiRefSeqCurated or Predicted subset, use an ID hash because when a transcript has multiple placements on main and _alt chroms, the join was producing redundant results.  refs #21770

diff --git src/hg/lib/annoStreamDbPslPlus.c src/hg/lib/annoStreamDbPslPlus.c
index 9ec4612..bd73dc8 100644
--- src/hg/lib/annoStreamDbPslPlus.c
+++ src/hg/lib/annoStreamDbPslPlus.c
@@ -34,44 +34,44 @@
 "    uint[blockCount] tStarts;     \"Start of each block in target.\""
 "    string  cds;          \"CDS start and end in transcript (if applicable)\""
 "    string  protAcc;      \"Protein accession (if applicable)\""
 "    string  name2;        \"Gene symbolic name\""
 "    string  path;         \"Path to FASTA file containing transcript sequence\""
 "    uint    fileOffset;   \"Offset of transcript record in FASTA file\""
 "    uint    fileSize;     \"Number of bytes of transcript record in FASTA file\""
 "   )";
 
 struct annoStreamDbPslPlus
 /* annoStreamer subclass whose rows are PSL fields followed by cds, protAcc, name2,
  * path, fileOffset and fileSize (see the autoSql definition above). */
     {
     // Must remain the first member: asdppNextRow casts the struct annoStreamer * it is
     // given to struct annoStreamDbPslPlus *, and the constructor returns self cast the other way.
     struct annoStreamer streamer;	// Parent class members & methods (external interface)
     // Private members
     // Stored as a cloneString() copy of the gpTable argument passed to the constructor.
     char *gpTable;                      // Associated genePred (refGene, ncbiRefSeqCurated etc)
     // Created with annoStreamDbNew() on the PSL table; setRegion and nextRow pass through to it.
     struct annoStreamer *mySource;	// Internal source of PSL+CDS+seq info
     // Filled from "select name, 1 from <gpTable>" only when gpTable is ncbiRefSeqCurated or
     // ncbiRefSeqPredicted.  When non-NULL, asdppNextRow skips any row whose PSL qName
     // (column 9) is not a key in this hash.
     // NOTE(review): presumably left NULL by AllocVar for other tables -- confirm.
+    struct hash *idHash;		// Used to restrict PSL query result to curated/predicted
     };
 
 // select p.*, c.cds, l.protAcc, l.name, e.path, s.file_offset, s.file_size 
-//   from (((ncbiRefSeqPsl p join ncbiRefSeqCurated n on p.qName = n.name)
+//   from (((ncbiRefSeqPsl p // NOT ANYMORE (#21770): join ncbiRefSeqCurated n on p.qName = n.name)
 //          left join ncbiRefSeqCds c on p.qName = c.id)
 //         join ncbiRefSeqLink l on p.qName = l.mrnaAcc)
 //        left join (seqNcbiRefSeq s left join extNcbiRefSeq e on s.extFile = e.id) on p.qName = s.acc
 //   where p.tName = "chr1"
 //   order by p.tName, p.tStart
 //  limit 5;
 
 // JSON text that annoStreamDbPslPlusNew parses (jsonParse) and hands to annoStreamDbNew as
 // the config for the internal PSL streamer; it names the related tables and the fields
 // taken from each, matching the SQL sketch in the comment above.
 // NOTE(review): once the "rightJoinTable" line is removed, this string contains no %s
 // conversion, yet annoStreamDbPslPlusNew still calls
 // dyStringCreate(ncbiRefSeqConfigJsonFormat, gpTable).  The extra argument is presumably
 // ignored (printf-style formatting) -- confirm, then drop it.
 static char *ncbiRefSeqConfigJsonFormat =
     "{ \"naForMissing\": false,"
-    "  \"rightJoinTable\": \"%s\","
     "  \"relatedTables\": [ { \"table\": \"ncbiRefSeqCds\","
     "                         \"fields\": [\"cds\"] },"
     "                       { \"table\": \"ncbiRefSeqLink\","
     "                         \"fields\": [\"protAcc\", \"name\"] },"
     "                       { \"table\": \"extNcbiRefSeq\","
     "                         \"fields\": [\"path\"] },"
     "                       { \"table\": \"seqNcbiRefSeq\","
     "                         \"fields\": [\"file_offset\", \"file_size\"] } ] }";
 
 //select p.*,c.name,l.protAcc,l.name,e.path,s.file_offset,s.file_size, i.version
 //from refSeqAli p
 //  join (hgFixed.gbCdnaInfo i
 //        left join hgFixed.cds c on i.cds = c.id) on i.acc = p.qName
 //       left join (hgFixed.gbSeq s
 //                  join hgFixed.gbExtFile e on e.id = s.gbExtFile) on s.acc = p.qName
@@ -107,31 +107,35 @@
 self->mySource->setRegion(self->mySource, chrom, rStart, rEnd);
 }
 
 static struct annoRow *asdppNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd,
 				    struct lm *lm)
 /* Return next psl+ row. */
 {
 struct annoStreamDbPslPlus *self = (struct annoStreamDbPslPlus *)sSelf;
 char **ppWords;
 lmAllocArray(lm, ppWords, sSelf->numCols);
 struct annoRow *ppRow;
 boolean rightJoinFail = FALSE;
 while ((ppRow = self->mySource->nextRow(self->mySource, minChrom, minEnd, lm)) != NULL)
     {
     ppWords = ppRow->data;
-    // If there are filters on experiment attributes, apply them, otherwise just return aRow.
+    // If self->idHash is non-NULL, check PSL qName; skip this row if qName not found.
+    char *qName = ppWords[9];
+    if (self->idHash && ! hashLookup(self->idHash, qName))
+        continue;
+    // If there are filters, apply them; otherwise just return this row.
     if (sSelf->filters)
 	{
 	boolean fails = annoFilterRowFails(sSelf->filters, ppWords, sSelf->numCols,
 					   &rightJoinFail);
 	// If this row passes the filter, or fails but is rightJoin, then we're done looking.
 	if (!fails || rightJoinFail)
 	    break;
 	}
     else
 	// no filtering to do, just use this row
 	break;
     }
 if (ppRow != NULL)
     return annoRowFromStringArray(ppRow->chrom, ppRow->start, ppRow->end, rightJoinFail,
 				  ppWords, sSelf->numCols, lm);
@@ -218,26 +222,35 @@
     struct dyString *dy = dyStringCreate(ncbiRefSeqConfigJsonFormat, gpTable);
     configJson = dyStringCannibalize(&dy);
     }
 else
     errAbort("annoStreamDbPslPlusNew: unrecognized table \"%s\"", gpTable);
 struct annoStreamDbPslPlus *self;
 AllocVar(self);
 // Get internal streamer for joining PSL with other tables.
 struct jsonElement *config = jsonParse(configJson);
 if (extraConfig)
     jsonObjectMerge(config, extraConfig);
 self->mySource = annoStreamDbNew(aa->name, pslTable, aa, maxOutRows, config);
 struct asObject *asObj = annoStreamDbPslPlusAsObj();
 if (extraConfig)
     asObjAppendExtraColumns(asObj, self->mySource->asObj);
+if (sameString("ncbiRefSeqCurated", gpTable) || sameString("ncbiRefSeqPredicted", gpTable))
+    {
+    // Load up an ID hash to restrict PSL query results to the curated/predicted subset:
+    struct sqlConnection *conn = hAllocConn(aa->name);
+    char query[1024];
+    sqlSafef(query, sizeof(query), "select name, 1 from %s", gpTable);
+    self->idHash = sqlQuickHash(conn, query);
+    hFreeConn(&conn);
+    }
 // Set up external streamer interface
 struct annoStreamer *streamer = &(self->streamer);
 annoStreamerInit(streamer, aa, asObj, pslTable);
 streamer->rowType = arWords;
 self->gpTable = cloneString(gpTable);
 // Override methods that need to pass through to internal source:
 streamer->setRegion = asdppSetRegion;
 streamer->nextRow = asdppNextRow;
 streamer->close = asdppClose;
 return (struct annoStreamer *)self;
 }