9840af33fea8c07d1b0e017cb9ec77d305535f91
galt
  Tue Feb 3 13:03:09 2026 -0800
Updates support for Multi-region exon mode knownGene processing. It now correctly handles the pseudoGenes checkbox in the track settings, all the trackSets including MANE, and also the comma-separated ID-list filtering. fixes #37060

diff --git src/hg/hgTracks/hgTracks.c src/hg/hgTracks/hgTracks.c
index 41a1357dec7..46f4f094738 100644
--- src/hg/hgTracks/hgTracks.c
+++ src/hg/hgTracks/hgTracks.c
@@ -3740,104 +3740,142 @@
     gene->exonEnds[i] += padding;
     if (gene->exonEnds[i] > chromSize)
 	gene->exonEnds[i] = chromSize;
     }
 }
 
 static void convertGenePredGeneToExon(struct genePred *gene)
 /* convert gene into a gene with just one exon that spans the entire gene */
 {
 if (gene->exonCount < 1)
     errAbort("unexpected input in convertGenePredGeneToExon(), gene->exonCount=%d < 1", gene->exonCount);
 gene->exonEnds[0] = gene->exonEnds[gene->exonCount - 1];
 gene->exonCount = 1;
 }
 
-void initVirtRegionsFromEMGeneTableExons(boolean showNoncoding, char *knownCanonical, char *knownToTag, boolean geneMostly)
+void initVirtRegionsFromEMGeneTableExons(boolean showPseudo, boolean showNoncoding, char *knownCanonical,
+ char *knownToTag, boolean geneMostly, char *kgnf)
 /* Create a regionlist from knownGene exons. */
 // Merge exon regions that overlap.
 
 // DONE Jim indicated that he would prefer it to include all transcripts, not just knownCanonical.
 
 // DONE Jim also suggested that we might want to handle padding right here in this step.
 // After thinking about it, I do not think it would be very hard because we are merging already.
 // Basically, just take the record from the db table row, add padding to start and end,
 // and clip for chromosome size.
 
 // TODO If we keep it at full genome level (instead of single chrom), then there is an apparent
 // sorting issue because although they are sorted on disk, they are usually sorted by chrom alphabetically
 // so that chr11 (not chr2) comes after chr1.  Instead of trying to specify the sort order in the query,
 // which is slow, or trying to read one chrom at a time in the sorted order which is also slow, we can instead
 // just fetch them in their native order, and then create a duplicate array and copy the contents
 // to it in memory, one chunk per chrom, which would be very fast, but temporarily require duplicate vchrom array mem.
 // Not sure what to do about assemblies with many scaffolds.
 //
 // Adding support for extra options from Gencode hg38 so we can filter for
 // comprehensive, splice-variants, non-coding subsets.
+// Added support for the pseudo filter for pseudoGenes, default off.
+// Added support for Track Sets including new MANE and Id-list filter.
 
 {
 struct sqlConnection *conn = hAllocConn(database);
 virtRegionList = NULL;
 struct sqlResult *sr;
 char **row;
 int rowOffset = 0;
 struct dyString *query = NULL;
 int padding = emPadding;
 if (sameString(virtModeType, "geneMostly"))
     padding = gmPadding;
+
+// knownPseudo Hash
+struct hash *kpHash = NULL;
+if (!showPseudo) // filter out pseudo variants
+    {
+    // load up hash of pseudo transcriptIds
+    query = sqlDyStringCreate("select kgId from knownAttrs"
+	" where transcriptClass = 'pseudo'");
+    kpHash = newHash(10);
+    sr = sqlGetResult(conn, dyStringContents(query));
+    while ((row = sqlNextRow(sr)) != NULL)
+	{
+	hashAdd(kpHash, row[0], NULL);
+	}
+    sqlFreeResult(&sr);
+    dyStringFree(&query);
+    }
+
 // knownCanonical Hash
 struct hash *kcHash = NULL;
 if (knownCanonical) // filter out alt splicing variants
     {
     // load up hash of canonical transcriptIds
     query = sqlDyStringCreate("select transcript from %s"
 	//" where chrom not like '%%_hap_%%' and chrom not like '%%_random'"
 	, knownCanonical);
     if (virtualSingleChrom())
 	sqlDyStringPrintf(query, " where chrom='%s'", chromName);
     kcHash = newHash(10);
     sr = sqlGetResult(conn, dyStringContents(query));
     while ((row = sqlNextRow(sr)) != NULL)
 	{
 	hashAdd(kcHash, row[0], NULL);
 	}
     sqlFreeResult(&sr);
     dyStringFree(&query);
     }
 // knownToTag basic hash
 struct hash *ktHash = NULL;
 if (knownToTag) // filter out all but Basic
     {
     // load up hash of canonical transcriptIds
-    query = sqlDyStringCreate("select name from %s where value='basic'", knownToTag);
+    query = sqlDyStringCreate("select name from knownToTag where value='%s'", knownToTag);
     ktHash = newHash(10);
     sr = sqlGetResult(conn, dyStringContents(query));
     while ((row = sqlNextRow(sr)) != NULL)
 	{
 	hashAdd(ktHash, row[0], NULL);
 	}
     sqlFreeResult(&sr);
     dyStringFree(&query);
     }
 setEMGeneTrack();
 if (!emGeneTable)
     errAbort("Unexpected error, emGeneTable=NULL in initVirtRegionsFromEMGeneTableExons");
 if (hIsBinned(database, emGeneTable)) // skip first bin column if any
     ++rowOffset;
 query = sqlDyStringCreate("select * from %s", emGeneTable);
+if (virtualSingleChrom() || kgnf)
+    sqlDyStringPrintf(query, " where");
 if (virtualSingleChrom())
-    sqlDyStringPrintf(query, " where chrom='%s'", chromName);
+    sqlDyStringPrintf(query, " chrom='%s'", chromName);
+if (virtualSingleChrom() && kgnf)
+    sqlDyStringPrintf(query, " and");
+if (kgnf)
+    {
+    sqlDyStringPrintf(query, " name in (");
+    struct slName *id, *list = slNameListFromCommaEscaped(kgnf);
+    for (id=list; id; id=id->next)
+	{
+        if (id != list)
+	    sqlDyStringPrintf(query, ",");
+	sqlDyStringPrintf(query, "'%s'", trimSpaces(id->name));
+	}
+    sqlDyStringPrintf(query, ")");
+    }
+
 // TODO GALT may have to change this to in-memory sorting?
 // refGene is out of order because of genbank continuous loading
 // also, using where chrom= causes it to use indexes which disturb order returned.
 sqlDyStringPrintf(query, " order by chrom, txStart");
 sr = sqlGetResult(conn, dyStringContents(query));
 dyStringFree(&query);
 
 char chrom[256] = "";
 int start = -1;
 int end = -1;
 char lastChrom[256] = "";
 int lastStart = -1;
 int lastEnd = -1;
 int chromSize = -1;
 char lastChromSizeChrom[256] = "";
@@ -3880,30 +3918,36 @@
 	if (gene && !showNoncoding && (gene->cdsStart == gene->cdsEnd))
 	    {
 	    //skip non-coding gene
 	    genePredFree(&gene);
 	    }
 	if (gene && knownCanonical && !hashLookup(kcHash, gene->name))
 	    {
 	    //skip gene not in knownCanonical hash
 	    genePredFree(&gene);
 	    }
 	if (gene && knownToTag && !hashLookup(ktHash, gene->name))
 	    {
 	    // skip gene not in knownToTag Basic hash
 	    genePredFree(&gene);
 	    }
+	if (gene && !showPseudo && hashLookup(kpHash, gene->name))
+	    {
+	    //skip gene in knownPseudo hash
+	    genePredFree(&gene);
+	    }
+
 	boolean transferIt = FALSE;
 	if (gene && !kceList)
 	    {
 	    transferIt = TRUE;
 	    }
 	else if (gene && kceList)
 	    {
 	    // TODO need to check the chrom equality first
 	    int best = findBestKce(kceList, &bestKce, &prevKce);
 	    if (sameString(gene->chrom, chrom))
 		{
 		if (gene->exonStarts[0] < best)
 		    transferIt = TRUE;
 		}
 	    }
@@ -3994,32 +4038,34 @@
 	lastEnd = end;
 	}
 
     ++bestKce->exonNumber;
     if (bestKce->exonNumber >= bestKce->gene->exonCount)
 	{  // remove from kceList
 	genePredFree(&bestKce->gene);
 	if (prevKce)
 	    prevKce->next = bestKce->next;
 	else
 	    kceList = bestKce->next;
 	freeMem(bestKce);
 	}
 
     }
+
 sqlFreeResult(&sr);
 slReverse(&virtRegionList);
+hashFree(&kpHash);
 hashFree(&kcHash);
 hashFree(&ktHash);
 hFreeConn(&conn);
 }
 
 
 void testRegionList()
 /* check if it is ascending non-overlapping regions.
 (this is not always a requirement in the most general case, i.e. user-regions)
 */
 {
 char lastChrom[256];
 int lastEnd = -1;
 struct virtRegion *v;
 for (v=virtRegionList; v; v=v->next)
@@ -4503,78 +4549,92 @@
 
     slAddHead(&virtRegionList, v);
     slReverse(&virtRegionList);
     }
 else if (sameString(virtModeType, "exonMostly")
       || sameString(virtModeType, "geneMostly"))
     {
 
     // Gencode settings: comprehensive, alt-splice, non-coding
 
     char *knownCanonical = NULL;  // show splice-variants, not filtered out via knownCanonical
     boolean showNoncoding = TRUE; // show non-coding where cdsStart==cdsEnd
     char *knownToTag = NULL;      // show comprehensive set not filtered by knownToTag
     char varName[SMALLBUF];
     boolean geneMostly = FALSE;
+    boolean showPseudo = TRUE;
+    char *kgnf = NULL;
 
     lastDbPosSaveCartSetting("emGeneTable");
 
     //DISGUISE makes obsolete dySaveCartSetting(dy, "emGeneTable");
     //DISGUISE makes obsolete dySaveCartSetting(dy, "emPadding");
     if (sameString(virtModeType, "geneMostly"))
 	geneMostly = TRUE;
     if (sameString(emGeneTable, "knownGene"))
 	{
 	// test cart var knownGene.show.noncoding
         // check for alternate table name.
         // if found, set and pass to gene-table reading routine
 
         // Some code borrowed from simpleTracks.c::loadKnownGene()
 
 	safef(varName, sizeof(varName), "%s.show.noncoding", emGeneTable);
 	showNoncoding = cartUsualBoolean(cart, varName, TRUE);
 	//DISGUISE makes obsolete dySaveCartSetting(dy, varName);
 	safef(varName, sizeof(varName), "%s.show.spliceVariants", emGeneTable);
 	boolean showSpliceVariants = cartUsualBoolean(cart, varName, TRUE);
 	//DISGUISE makes obsolete dySaveCartSetting(dy, varName);
 	if (!showSpliceVariants)
 	    {
 	    char *canonicalTable = trackDbSettingOrDefault(emGeneTrack->tdb, "canonicalTable", "knownCanonical");
 	    if (hTableExists(database, canonicalTable))
 		knownCanonical = canonicalTable;
 	    }
+
 	safef(varName, sizeof(varName), "%s.show.comprehensive", emGeneTable);
 	boolean showComprehensive = cartUsualBoolean(cart, varName, FALSE);
 	//DISGUISE makes obsolete dySaveCartSetting(dy, varName);
+	knownToTag = NULL;
 	if (!showComprehensive)
 	    {
 	    if (hTableExists(database, "knownToTag"))
 		{
-		knownToTag = "knownToTag";
+		safef(varName, sizeof(varName), "%s.show.set", emGeneTable);
+		char *setString = cartUsualString(cart, varName, "basic");
+		if (differentString(setString, "all"))
+		   knownToTag = setString;
 		}
 	    }
 
+	safef(varName, sizeof(varName), "%s.show.pseudo", emGeneTable);
+	showPseudo = cartUsualBoolean(cart, varName, FALSE);
+
+	safef(varName, sizeof(varName), "%s.nameFilter", emGeneTable);
+	kgnf = trimSpaces(cartUsualString(cart, varName, NULL));
+	if (sameOk(kgnf, ""))
+	    kgnf = NULL;
 	}
     if (sameString(emGeneTable, "refGene"))
 	{
 	char varName[SMALLBUF];
 	safef(varName, sizeof(varName), "%s.hideNoncoding", emGeneTable);
 	showNoncoding = !cartUsualBoolean(cart, varName, FALSE);
 	//DISGUISE makes obsolete dySaveCartSetting(dy, varName);
 	}
 
-    initVirtRegionsFromEMGeneTableExons(showNoncoding, knownCanonical, knownToTag, geneMostly);
+    initVirtRegionsFromEMGeneTableExons(showPseudo, showNoncoding, knownCanonical, knownToTag, geneMostly, kgnf);
     if (!virtRegionList)
 	{
 	warn("No genes found on chrom %s for track %s, returning to default view", chromName, emGeneTrack->shortLabel);
 	return FALSE;   // regionList is empty, nothing found.
 	}
     if (geneMostly)
 	virtModeShortDescr = "genes";
     else
 	virtModeShortDescr = "exons";
     // DISGUISE makes obsolete dyStringPrintf(dy," %s %s", dy->string, knownCanonical, knownToTag);
     }
 else if (sameString(virtModeType, "kcGenes")) // TODO obsolete
     {
     initVirtRegionsFromKnownCanonicalGenes("knownCanonical");
     virtModeShortDescr = "genes";