9840af33fea8c07d1b0e017cb9ec77d305535f91 galt Tue Feb 3 13:03:09 2026 -0800 Updates support for Multi-region exon mode knownGene processing. It now correctly handles pseudoGenes checkbox in the track settings, all the trackSets including MANE, and also the comma-separated ID-list filtering. fixes #37060 diff --git src/hg/hgTracks/hgTracks.c src/hg/hgTracks/hgTracks.c index 41a1357dec7..46f4f094738 100644 --- src/hg/hgTracks/hgTracks.c +++ src/hg/hgTracks/hgTracks.c @@ -3740,104 +3740,142 @@ gene->exonEnds[i] += padding; if (gene->exonEnds[i] > chromSize) gene->exonEnds[i] = chromSize; } } static void convertGenePredGeneToExon(struct genePred *gene) /* convert gene into a gene with just one exon that spans the entire gene */ { if (gene->exonCount < 1) errAbort("unexpected input in convertGenePredGeneToExon(), gene->exonCount=%d < 1", gene->exonCount); gene->exonEnds[0] = gene->exonEnds[gene->exonCount - 1]; gene->exonCount = 1; } -void initVirtRegionsFromEMGeneTableExons(boolean showNoncoding, char *knownCanonical, char *knownToTag, boolean geneMostly) +void initVirtRegionsFromEMGeneTableExons(boolean showPseudo, boolean showNoncoding, char *knownCanonical, + char *knownToTag, boolean geneMostly, char *kgnf) /* Create a regionlist from knownGene exons. */ // Merge exon regions that overlap. // DONE Jim indicated that he would prefer it to include all transcripts, not just knownCanonical. // DONE Jim also suggested that we might want to handle padding right here in this step. // After thinking about it, I do not think it would be very hard because we are merging already. // Basically, just take the record from the db table row, add padding to start and end, // and clip for chromosome size. // TODO If we keep it at full genome level (instead of single chrom), then there is an apparent // sorting issue because although they are sorted on disk, they are usually sorted by chrom alphabetically // so that chr11 (not chr2) comes after chr1. 
Instead of trying to specify the sort order in the query, // which is slow, or trying to read one chrom at a time in the sorted order which is also slow, we can instead // just fetch them in their native order, and then create a duplicate array and copy the contents // to it in memory, one chunk per chrom, which would be very fast, but temporarily require duplicate vchrom array mem. // Not sure what to do about assemblies with many scaffolds. // // Adding support for extra options from Gencode hg38 so we can filter for // comprehensive, splice-variants, non-coding subsets. +// Added support for pseudo filter for pseudoGenes, default off. +// Added support for Track Sets including new MANE and Id-list filter. { struct sqlConnection *conn = hAllocConn(database); virtRegionList = NULL; struct sqlResult *sr; char **row; int rowOffset = 0; struct dyString *query = NULL; int padding = emPadding; if (sameString(virtModeType, "geneMostly")) padding = gmPadding; + +// knownPseudo Hash +struct hash *kpHash = NULL; +if (!showPseudo) // filter out pseudo variants + { + // load up hash of pseudo transcriptIds + query = sqlDyStringCreate("select kgId from knownAttrs" + " where transcriptClass = 'pseudo'"); + kpHash = newHash(10); + sr = sqlGetResult(conn, dyStringContents(query)); + while ((row = sqlNextRow(sr)) != NULL) + { + hashAdd(kpHash, row[0], NULL); + } + sqlFreeResult(&sr); + dyStringFree(&query); + } + // knownCanonical Hash struct hash *kcHash = NULL; if (knownCanonical) // filter out alt splicing variants { // load up hash of canonical transcriptIds query = sqlDyStringCreate("select transcript from %s" //" where chrom not like '%%_hap_%%' and chrom not like '%%_random'" , knownCanonical); if (virtualSingleChrom()) sqlDyStringPrintf(query, " where chrom='%s'", chromName); kcHash = newHash(10); sr = sqlGetResult(conn, dyStringContents(query)); while ((row = sqlNextRow(sr)) != NULL) { hashAdd(kcHash, row[0], NULL); } sqlFreeResult(&sr); dyStringFree(&query); } // 
knownToTag basic hash struct hash *ktHash = NULL; if (knownToTag) // filter out all but Basic { // load up hash of canonical transcriptIds - query = sqlDyStringCreate("select name from %s where value='basic'", knownToTag); + query = sqlDyStringCreate("select name from knownToTag where value='%s'", knownToTag); ktHash = newHash(10); sr = sqlGetResult(conn, dyStringContents(query)); while ((row = sqlNextRow(sr)) != NULL) { hashAdd(ktHash, row[0], NULL); } sqlFreeResult(&sr); dyStringFree(&query); } setEMGeneTrack(); if (!emGeneTable) errAbort("Unexpected error, emGeneTable=NULL in initVirtRegionsFromEMGeneTableExons"); if (hIsBinned(database, emGeneTable)) // skip first bin column if any ++rowOffset; query = sqlDyStringCreate("select * from %s", emGeneTable); +if (virtualSingleChrom() || kgnf) + sqlDyStringPrintf(query, " where"); if (virtualSingleChrom()) - sqlDyStringPrintf(query, " where chrom='%s'", chromName); + sqlDyStringPrintf(query, " chrom='%s'", chromName); +if (virtualSingleChrom() && kgnf) + sqlDyStringPrintf(query, " and"); +if (kgnf) + { + sqlDyStringPrintf(query, " name in ("); + struct slName *id, *list = slNameListFromCommaEscaped(kgnf); + for (id=list; id; id=id->next) + { + if (id != list) + sqlDyStringPrintf(query, ","); + sqlDyStringPrintf(query, "'%s'", trimSpaces(id->name)); + } + sqlDyStringPrintf(query, ")"); + } + // TODO GALT may have to change this to in-memory sorting? // refGene is out of order because of genbank continuous loading // also, using where chrom= causes it to use indexes which disturb order returned. 
sqlDyStringPrintf(query, " order by chrom, txStart"); sr = sqlGetResult(conn, dyStringContents(query)); dyStringFree(&query); char chrom[256] = ""; int start = -1; int end = -1; char lastChrom[256] = ""; int lastStart = -1; int lastEnd = -1; int chromSize = -1; char lastChromSizeChrom[256] = ""; @@ -3880,30 +3918,36 @@ if (gene && !showNoncoding && (gene->cdsStart == gene->cdsEnd)) { //skip non-coding gene genePredFree(&gene); } if (gene && knownCanonical && !hashLookup(kcHash, gene->name)) { //skip gene not in knownCanonical hash genePredFree(&gene); } if (gene && knownToTag && !hashLookup(ktHash, gene->name)) { // skip gene not in knownToTag Basic hash genePredFree(&gene); } + if (gene && !showPseudo && hashLookup(kpHash, gene->name)) + { + //skip gene in knownPseudo hash + genePredFree(&gene); + } + boolean transferIt = FALSE; if (gene && !kceList) { transferIt = TRUE; } else if (gene && kceList) { // TODO need to check the chrom equality first int best = findBestKce(kceList, &bestKce, &prevKce); if (sameString(gene->chrom, chrom)) { if (gene->exonStarts[0] < best) transferIt = TRUE; } } @@ -3994,32 +4038,34 @@ lastEnd = end; } ++bestKce->exonNumber; if (bestKce->exonNumber >= bestKce->gene->exonCount) { // remove from kceList genePredFree(&bestKce->gene); if (prevKce) prevKce->next = bestKce->next; else kceList = bestKce->next; freeMem(bestKce); } } + sqlFreeResult(&sr); slReverse(&virtRegionList); +hashFree(&kpHash); hashFree(&kcHash); hashFree(&ktHash); hFreeConn(&conn); } void testRegionList() /* check if it is ascending non-overlapping regions. (this is not always a requirement in the most general case, i.e. 
user-regions) */ { char lastChrom[256]; int lastEnd = -1; struct virtRegion *v; for (v=virtRegionList; v; v=v->next) @@ -4503,78 +4549,92 @@ slAddHead(&virtRegionList, v); slReverse(&virtRegionList); } else if (sameString(virtModeType, "exonMostly") || sameString(virtModeType, "geneMostly")) { // Gencode settings: comprehensive, alt-splice, non-coding char *knownCanonical = NULL; // show splice-variants, not filtered out via knownCanonical boolean showNoncoding = TRUE; // show non-coding where cdsStart==cdsEnd char *knownToTag = NULL; // show comprehensive set not filtered by knownToTag char varName[SMALLBUF]; boolean geneMostly = FALSE; + boolean showPseudo = TRUE; + char *kgnf = NULL; lastDbPosSaveCartSetting("emGeneTable"); //DISGUISE makes obsolete dySaveCartSetting(dy, "emGeneTable"); //DISGUISE makes obsolete dySaveCartSetting(dy, "emPadding"); if (sameString(virtModeType, "geneMostly")) geneMostly = TRUE; if (sameString(emGeneTable, "knownGene")) { // test cart var knownGene.show.noncoding // check for alternate table name. 
// if found, set and pass to gene-table reading routine // Some code borrowed from simpleTracks.c::loadKnownGene() safef(varName, sizeof(varName), "%s.show.noncoding", emGeneTable); showNoncoding = cartUsualBoolean(cart, varName, TRUE); //DISGUISE makes obsolete dySaveCartSetting(dy, varName); safef(varName, sizeof(varName), "%s.show.spliceVariants", emGeneTable); boolean showSpliceVariants = cartUsualBoolean(cart, varName, TRUE); //DISGUISE makes obsolete dySaveCartSetting(dy, varName); if (!showSpliceVariants) { char *canonicalTable = trackDbSettingOrDefault(emGeneTrack->tdb, "canonicalTable", "knownCanonical"); if (hTableExists(database, canonicalTable)) knownCanonical = canonicalTable; } + safef(varName, sizeof(varName), "%s.show.comprehensive", emGeneTable); boolean showComprehensive = cartUsualBoolean(cart, varName, FALSE); //DISGUISE makes obsolete dySaveCartSetting(dy, varName); + knownToTag = NULL; if (!showComprehensive) { if (hTableExists(database, "knownToTag")) { - knownToTag = "knownToTag"; + safef(varName, sizeof(varName), "%s.show.set", emGeneTable); + char *setString = cartUsualString(cart, varName, "basic"); + if (differentString(setString, "all")) + knownToTag = setString; } } + safef(varName, sizeof(varName), "%s.show.pseudo", emGeneTable); + showPseudo = cartUsualBoolean(cart, varName, FALSE); + + safef(varName, sizeof(varName), "%s.nameFilter", emGeneTable); + kgnf = trimSpaces(cartUsualString(cart, varName, NULL)); + if (sameOk(kgnf, "")) + kgnf = NULL; } if (sameString(emGeneTable, "refGene")) { char varName[SMALLBUF]; safef(varName, sizeof(varName), "%s.hideNoncoding", emGeneTable); showNoncoding = !cartUsualBoolean(cart, varName, FALSE); //DISGUISE makes obsolete dySaveCartSetting(dy, varName); } - initVirtRegionsFromEMGeneTableExons(showNoncoding, knownCanonical, knownToTag, geneMostly); + initVirtRegionsFromEMGeneTableExons(showPseudo, showNoncoding, knownCanonical, knownToTag, geneMostly, kgnf); if (!virtRegionList) { warn("No genes 
found on chrom %s for track %s, returning to default view", chromName, emGeneTrack->shortLabel); return FALSE; // regionList is empty, nothing found. } if (geneMostly) virtModeShortDescr = "genes"; else virtModeShortDescr = "exons"; // DISGUISE makes obsolete dyStringPrintf(dy," %s %s", dy->string, knownCanonical, knownToTag); } else if (sameString(virtModeType, "kcGenes")) // TODO obsolete { initVirtRegionsFromKnownCanonicalGenes("knownCanonical"); virtModeShortDescr = "genes";