3ef5ed59d49e3654d5de00183150d38ac85476e0
galt
  Wed Dec 11 11:19:19 2013 -0800
fixes 12318. for assemblies like rheMac3 that have many thousands of chroms, it is faster to find out a distinct list of chroms that are actually used by the e.g. refGene table and hash them, and then can quickly skip thousands of regions not in the hash. now it goes from timing out over 3 minutes to returning a good result in a few seconds.

diff --git src/hg/hgTables/gffOut.c src/hg/hgTables/gffOut.c
index eec203f..ef14f76 100644
--- src/hg/hgTables/gffOut.c
+++ src/hg/hgTables/gffOut.c
@@ -368,45 +368,68 @@
           bed->thickStart, bed->thickEnd, bed->blockCount, exonStarts->string, exonEnds->string );
     char *exonFrames = sqlQuickString(conn, sql);
     slNameAddHead(&list, exonFrames);
     dyStringFree(&exonStarts);
     dyStringFree(&exonEnds);
     }
 slReverse(&list);
 return list;
 }
+static struct hash *makeChromHashForTable(struct sqlConnection *conn, char *table)
+/* Get a hash of all the chroms that are actually being used for the table, so
+ * callers can cheaply skip regions on chroms the table never mentions.  This is
+ * helpful for assemblies with huge numbers of chroms.  Entries come from a
+ * two-column query (chrom, constant 'dummyvalue') handed to sqlQuickHash().
+ * The caller owns the result; release it with freeHashAndVals().
+ * NOTE(review): the column name "chrom" is hard-coded - confirm every table
+ * that reaches doOutGff really has a column of that name. */
+{
+char query[1024];
+sqlSafef(query, sizeof query, "select distinct chrom, 'dummyvalue' from %s", table);
+return sqlQuickHash(conn, query);
+}
+
 void doOutGff(char *table, struct sqlConnection *conn, boolean outputGtf)
 /* Save as GFF/GTF. */
 {
 struct hTableInfo *hti = getHti(database, table, conn);
 struct bed *bedList;
 struct slName *exonFramesList = NULL;
 char source[HDB_MAX_TABLE_STRING];
 int itemCount;
 struct region *region, *regionList = getRegions();
 textOpen();
 int efIdx = sqlFieldIndex(conn, table, "exonFrames");
 safef(source, sizeof(source), "%s_%s", database, table);
 itemCount = 0;
+// regionList can have many thousands of items e.g. rheMac3 has 34000 chroms!
+// With more than 400 regions, look up which chroms the table actually uses so
+// the loop below can skip the other regions.
+struct hash *chromHash = NULL;
+if (slCount(regionList) > 400)
+    chromHash = makeChromHashForTable(conn, table);
 for (region = regionList; region != NULL; region = region->next)
     {
+    if (chromHash != NULL && hashFindVal(chromHash, region->chrom) == NULL)
+        continue;
     struct lm *lm = lmInit(64*1024);
     int fieldCount;
     bedList = cookedBedList(conn, table, region, lm, &fieldCount);
     if (efIdx != -1)
         exonFramesList = getExonFrames(table, conn, bedList);
     itemCount += bedToGffLines(bedList, exonFramesList, hti, fieldCount, source, outputGtf);
     lmCleanup(&lm);
     }
+freeHashAndVals(&chromHash);  // release the chrom filter; no-op when it was never built
 if (itemCount == 0)
     hPrintf(NO_RESULTS);
 }