791097ea7589f6fadcf825c5c3a8934054917a28
angie
  Thu Jul 9 14:38:12 2020 -0700
VCF haplotype display: trackDb setting geneTrack adds option to color by function (red for non-synon, green for synon, blue for UTR/NC).  refs #25870

diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c
index d813127..40c96cd 100644
--- src/hg/hgTracks/vcfTrack.c
+++ src/hg/hgTracks/vcfTrack.c
@@ -1,38 +1,45 @@
 /* vcfTrack -- handlers for Variant Call Format data. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "common.h"
 #include "bigWarn.h"
 #include "dystring.h"
 #include "rainbow.h"
 #include "errCatch.h"
+#include "fa.h"
+#include "genePredReader.h"
 #include "hacTree.h"
 #include "hdb.h"
 #include "hgColors.h"
 #include "hgTracks.h"
 #include "pgSnp.h"
 #include "phyloTree.h"
+#include "trackHub.h"
 #include "trashDir.h"
+#include "variantProjector.h"
 #include "vcf.h"
 #include "vcfUi.h"
 #include "knetUdc.h"
 #include "udc.h"
 #include "memgfx.h"
 
+// Russ Corbett-Detig suggested darker shades for coloring non-synonymous variants green
+Color darkerShadesOfGreenOnWhite[EXPR_DATA_SHADES];
+
 static boolean getMinQual(struct trackDb *tdb, double *retMinQual)
 /* Return TRUE and set retMinQual if cart contains minimum QUAL filter */
 {
 if (cartOrTdbBoolean(cart, tdb, VCF_APPLY_MIN_QUAL_VAR, VCF_DEFAULT_APPLY_MIN_QUAL))
     {
     if (retMinQual != NULL)
 	*retMinQual = cartOrTdbDouble(cart, tdb, VCF_MIN_QUAL_VAR, VCF_DEFAULT_MIN_QUAL);
     return TRUE;
     }
 return FALSE;
 }
 
 static boolean minQualFail(struct vcfRecord *record, double minQual)
 /* Return TRUE if record's QUAL column value is non-numeric or is less than minQual. */
 {
@@ -632,93 +639,477 @@
     x1--;
     w = 3;
     }
 mapBoxHgcOrHgGene(hvg, chromStartMap, chromEndMap, x1, yOff, w, tg->height, tg->track,
 		  rec->name, dy->string, NULL, TRUE, NULL);
 }
 
 // These are initialized when we start drawing, then constant.
 static Color purple = 0;
 static Color undefYellow = 0;
 
 enum hapColorMode
     {
     altOnlyMode,
     refAltMode,
-    baseMode
+    baseMode,
+    functionMode
     };
 
-static Color colorByAltOnly(int refs, int alts, int unks)
-/* Coloring alternate alleles only: shade by proportion of alt alleles to refs, unknowns */
+static Color colorByAltShade(int refs, int alts, int unks, Color colorShades[], int colorShadeCount)
+/* Coloring alternate alleles only by shades of color: shade by proportion of alts */
 {
 if (unks > (refs + alts))
     return undefYellow;
-int grayIx = hGrayInRange(alts, 0, alts+refs+unks, maxShade+1) - 1; // undo force to 1
-return shadesOfGray[grayIx];
+int total = refs + alts + unks;
+int shadeIx = (alts * colorShadeCount) / total;
+if (shadeIx == colorShadeCount)
+    shadeIx--;
+return colorShades[shadeIx];
+}
+
+static Color colorByAltOnly(int refs, int alts, int unks)
+/* Coloring alternate alleles only: shade by proportion of alt alleles */
+{
+return colorByAltShade(refs, alts, unks, shadesOfGray, maxShade+1);
 }
 
 static Color colorByRefAlt(int refs, int alts, int unks)
 /* Color blue for reference allele, red for alternate allele, gray for unknown, purple
  * for reasonably mixed. */
 {
 const int fudgeFactor = 4; // Threshold factor for calling one color or the other when mixed
 if (unks > (refs + alts))
     return undefYellow;
 if (alts > fudgeFactor * refs)
     return MG_RED;
 if (refs > fudgeFactor * alts)
     return MG_BLUE;
 return purple;
 }
 
 static Color colorByBase(int refs, int alts, int unks, char *refAl, char *altAl)
 /* Color gray for unknown or mixed, otherwise pgSnpColor of predominant allele. */
 {
 const int fudgeFactor = 4; // Threshold for calling for one color or the other when mixed
 if (unks > (refs + alts))
     return undefYellow;
 if (alts > fudgeFactor * refs)
     return pgSnpColor(altAl);
 if (refs > fudgeFactor * alts)
     return pgSnpColor(refAl);
 return shadesOfGray[5];
 }
 
+static struct dnaSeq *genePredToGenomicSequence(struct genePred *pred, struct seqWindow *gSeqWin)
+/* Return concatenated genomic sequence of exons of pred. */
+{
+int txLen = 0;
+int i;
+for (i=0; i < pred->exonCount; i++)
+    txLen += (pred->exonEnds[i] - pred->exonStarts[i]);
+char *seq = needMem(txLen + 1);
+int offset = 0;
+for (i=0; i < pred->exonCount; i++)
+    {
+    int exonStart = pred->exonStarts[i];
+    int exonEnd = pred->exonEnds[i];
+    int exonLen = exonEnd - exonStart;
+    seqWindowCopy(gSeqWin, exonStart, exonLen, seq+offset, txLen+1-offset);
+    offset += exonLen;
+    }
+if(pred->strand[0] == '-')
+    reverseComplement(seq, txLen);
+struct dnaSeq *txSeq = NULL;
+AllocVar(txSeq);
+txSeq->name = cloneString(pred->name);
+txSeq->dna = seq;
+txSeq->size = txLen;
+return txSeq;
+}
+
+struct txInfo
+/* Transcript sequence and alignment needed for prediction of functional effect/consequences */
+    {
+    struct txInfo *next;
+    struct psl *psl;            // alignment of transcript to genome
+    struct dnaSeq *txSeq;       // transcript sequence
+    struct genbankCds *cds;     // transcript CDS (possible none)
+    struct dnaSeq *protSeq;     // transcript protein sequence (possibly NULL)
+
+    };
+
+static struct txInfo *txInfoFromGenePred(struct genePred *gp, struct seqWindow *gSeqWin)
+/* Use gp and gSeqWin to construct transcript alignment, sequence and protein sequence if applic. */
+{
+struct txInfo *txi;
+AllocVar(txi);
+AllocVar(txi->cds);
+genePredToCds(gp, txi->cds);
+txi->txSeq = genePredToGenomicSequence(gp, gSeqWin);
+int chromSize = 0;  // unused
+txi->psl = genePredToPsl(gp, chromSize, txi->txSeq->size);
+pslRemoveFrameShifts(txi->psl);
+vpExpandIndelGaps(txi->psl, gSeqWin, txi->txSeq);
+txi->protSeq = NULL;
+if (txi->cds->end > txi->cds->start && txi->cds->startComplete)
+    {
+    txi->protSeq = translateSeq(txi->txSeq, txi->cds->start, FALSE);
+    aaSeqZToX(txi->protSeq);
+    }
+return txi;
+}
+
+static struct hash *hashExtNcbiRefSeq(struct sqlConnection *conn)
+/* Despite the seq/extFile structure, there is only one external file and we don't want to
+ * keep opening and closing the file each time.  Just in case there are multiple files someday,
+ * hash extNcbiRefSeq id to open udc file handle(s). */
+{
+struct hash *extNcbi = hashNew(0);
+char query[1024];
+sqlSafef(query, sizeof query, "select id, path from extNcbiRefSeq");
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *extId = row[0];
+    char *path = row[1];
+    struct udcFile *udcF = udcFileOpen(path, NULL);
+    hashAdd(extNcbi, extId, udcF);
+    }
+sqlFreeResult(&sr);
+return extNcbi;
+}
+
+static void freeExtNcbiHash(struct hash **pExtNcbi)
+/* Clean up hash of open udcFiles created by hashExtNcbiRefSeq. */
+{
+if (pExtNcbi && *pExtNcbi)
+    {
+    struct hash *hash = *pExtNcbi;
+    struct hashCookie cookie = hashFirst(hash);
+    struct hashEl *hel;
+    while ((hel = hashNext(&cookie)) != NULL)
+        udcFileClose((struct udcFile **)&hel->val);
+    hashFree(pExtNcbi);
+    }
+}
+
+static struct txInfo *txInfoInitFromPsl(struct sqlConnection *conn, char *pslTable,
+                                        char *extraWhere, struct hash **retTxiHash)
+/* Alloc and return a list of txInfo with only psl populated, from pslTable in the current window.
+ * Also return a hash of transcript ID to txInfo (retTxiHash). */
+{
+struct txInfo *txiList = NULL;
+struct hash *txiHash = hashNew(0);
+int binOffset = 0;
+int start = max(0, winStart - GPRANGE);
+int end = winEnd + GPRANGE;
+struct sqlResult *sr = hRangeQuery(conn, pslTable, chromName, start, end, extraWhere, &binOffset);
+char **row;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    struct txInfo *txi;
+    AllocVar(txi);
+    txi->psl = pslLoad(row+binOffset);
+    slAddHead(&txiList, txi);
+    hashAdd(txiHash, txi->psl->qName, txi);
+    }
+sqlFreeResult(&sr);
+*retTxiHash = txiHash;
+return txiList;
+}
+
+static void txiInfoAppendIdList(struct dyString *query, struct txInfo *txiList)
+/* Append a paren-enclosed list of quoted transcript IDs to query. */
+{
+dyStringAppendC(query, '(');
+struct txInfo *txi;
+for (txi = txiList;  txi != NULL;  txi = txi->next)
+    {
+    if (txi != txiList)
+        dyStringAppend(query, ", ");
+    dyStringPrintf(query, "'%s'", txi->psl->qName);
+    }
+dyStringAppendC(query, ')');
+}
+
+static void txInfoAddCdsFromQuery(struct hash *txiHash, struct sqlConnection *conn, char *query)
+/* Perform query; results have two fields, transcript name (which must be found in txiHash) and
+ * GenBank-formatted CDS string.  For reach row from the query, parse the CDS string into the cds
+ * in the struct txInfo for the appropriate transcript. */
+{
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *name = row[0];
+    char *cdsStr = row[1];
+    struct txInfo *txi = hashMustFindVal(txiHash, name);
+    AllocVar(txi->cds);
+    genbankCdsParse(cdsStr, txi->cds);
+    }
+sqlFreeResult(&sr);
+}
+
+static struct txInfo *txInfoLoadNcbiRefSeq(struct seqWindow *gSeqWin, struct trackDb *gTdb)
+/* Load ncbiRefSeq[Curated] PSL (+ CDS and sequence) in current window and make txInfo for each. */
+{
+struct txInfo *txiList = NULL;
+if (!trackHubDatabase(database) && hDbHasNcbiRefSeq(database))
+    {
+    struct sqlConnection *conn = hAllocConn(database);
+    struct hash *txiHash = hashNew(0);
+    char *extraWhere = NULL;
+    if (sameString(gTdb->track, "ncbiRefSeqCurated"))
+        extraWhere = "qName not like 'X%'";
+    else if (sameString(gTdb->track, "ncbiRefSeqPredicted"))
+        extraWhere = "qName like 'X%'";
+    txiList = txInfoInitFromPsl(conn, "ncbiRefSeqPsl", extraWhere, &txiHash);
+    // Now get CDS for each psl/txi:
+    struct dyString *query = sqlDyStringCreate("select * from ncbiRefSeqCds where id in ");
+    txiInfoAppendIdList(query, txiList);
+    txInfoAddCdsFromQuery(txiHash, conn, query->string);
+    // Now get transcript sequence for each psl/txi:
+    struct hash *extNcbi = hashExtNcbiRefSeq(conn);
+    dyStringClear(query);
+    sqlDyStringPrintf(query, "select acc,extFile,file_offset,file_size from seqNcbiRefSeq "
+                      "where acc in ");
+    txiInfoAppendIdList(query, txiList);
+    struct sqlResult *sr = sqlGetResult(conn, query->string);
+    char **row;
+    while ((row = sqlNextRow(sr)) != NULL)
+        {
+        char *name = row[0];
+        char *extId = row[1];
+        off_t offset = sqlLongLong(row[2]);
+        size_t size = sqlUnsigned(row[3]);
+        struct udcFile *udcF = hashMustFindVal(extNcbi, extId);
+        char *buf = needMem(size+1);
+        udcSeek(udcF, offset);
+        udcRead(udcF, buf, size);
+        struct txInfo *txi = hashMustFindVal(txiHash, name);
+        txi->txSeq = faSeqFromMemText(buf, TRUE);
+        }
+    sqlFreeResult(&sr);
+    freeExtNcbiHash(&extNcbi);
+    // Now get protein sequence (if applicable) for each psl/txi:
+    dyStringClear(query);
+    sqlDyStringPrintf(query, "select id, protAcc, seq from ncbiRefSeqLink nl, ncbiRefSeqPepTable np "
+                      "where nl.protAcc = np.name and id in ");
+    txiInfoAppendIdList(query, txiList);
+    sr = sqlGetResult(conn, query->string);
+    while ((row = sqlNextRow(sr)) != NULL)
+        {
+        char *txId = row[0];
+        char *protId = row[1];
+        char *protSeq = cloneString(row[2]);
+        struct txInfo *txi = hashMustFindVal(txiHash, txId);
+        txi->protSeq = newDnaSeq(protSeq, strlen(protSeq), protId);
+        }
+    sqlFreeResult(&sr);
+    hFreeConn(&conn);
+    }
+return txiList;
+}
+
+static struct txInfo *txInfoLoadRefGene(struct seqWindow *gSeqWin, struct trackDb *gTdb)
+/* Load refSeqAli PSL (+ genbank CDS and sequence) in current window and make txInfo for each. */
+{
+struct txInfo *txiList = NULL;
+if (!trackHubDatabase(database))
+    {
+    initGenbankTableNames(database);
+    struct sqlConnection *conn = hAllocConn(database);
+    struct hash *txiHash = NULL;
+    txiList = txInfoInitFromPsl(conn, "refSeqAli", NULL, &txiHash);
+    // Now get CDS for each psl/txi:
+    struct dyString *query = sqlDyStringCreate("select i.acc, c.name from %s i, %s c "
+                                               "where c.id = i.cds and i.acc in ",
+                                               gbCdnaInfoTable, cdsTable);
+    txiInfoAppendIdList(query, txiList);
+    txInfoAddCdsFromQuery(txiHash, conn, query->string);
+    // Now get transcript and translated sequence for each psl/txi:
+    struct txInfo *txi;
+    for (txi = txiList;  txi != NULL;  txi = txi->next)
+        {
+        txi->txSeq = hGenBankGetMrna(database, txi->psl->qName, NULL);
+        if (txi->cds->end > txi->cds->start && txi->cds->startComplete)
+            {
+            txi->protSeq = translateSeq(txi->txSeq, txi->cds->start, FALSE);
+            aaSeqZToX(txi->protSeq);
+            }
+        }
+    }
+return txiList;
+}
+
+static struct txInfo *txInfoLoadBigGenePred(struct seqWindow *gSeqWin, struct trackDb *gTdb)
+/* Load up bigGenePred items in current window and make txInfo for each. */
+{
+struct txInfo *txiList = NULL;
+char *fileName = cloneString(trackDbSetting(gTdb, "bigDataUrl"));
+if (fileName == NULL)
+    fileName = cloneString(trackDbSetting(gTdb, "bigGeneDataUrl"));
+if (isNotEmpty(fileName))
+    {
+    struct bbiFile *bbi = bigBedFileOpen(fileName);
+    struct lm *lm = lmInit(0);
+    struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chromName, winStart,
+                                                        winEnd, 0, lm);
+    struct bigBedInterval *bb;
+    for (bb = bbList; bb != NULL; bb = bb->next)
+        {
+        struct genePredExt *gp = genePredFromBigGenePred(chromName, bb);
+        struct txInfo *txi = txInfoFromGenePred((struct genePred *)gp, gSeqWin);
+        slAddHead(&txiList, txi);
+        }
+    bbiFileClose(&bbi);
+    lmCleanup(&lm);
+    }
+return txiList;
+}
+
+static struct txInfo *txInfoLoadGenePred(struct seqWindow *gSeqWin, struct trackDb *gTdb)
+/* Load genePreds in current window and make txInfo for each. */
+{
+struct txInfo *txiList = NULL;
+if (! trackHubDatabase(database))
+    {
+    struct sqlConnection *conn = hAllocConn(database);
+    struct genePredReader *gpr = genePredReaderRangeQuery(conn, gTdb->table,
+                                                          chromName, winStart, winEnd, NULL);
+    struct genePred *gp = NULL;
+    while ((gp = genePredReaderNext(gpr)) != NULL)
+        {
+        struct txInfo *txi = txInfoFromGenePred(gp, gSeqWin);
+        slAddHead(&txiList, txi);
+        }
+    genePredReaderFree(&gpr);
+    hFreeConn(&conn);
+    }
+return txiList;
+}
+
+static struct txInfo *txInfoLoad(struct seqWindow *gSeqWin, struct trackDb *tdb)
+/* Return a list of transcript sequences and alignments for all transcripts in the current
+ * window for all enabled gene tracks. */
+{
+struct txInfo *txiList = NULL;
+char *geneTrack = cartOrTdbString(cart, tdb, "geneTrack", NULL);
+struct track *gTrack = hashFindVal(trackHash, geneTrack);
+if (gTrack)
+    {
+    struct trackDb *gTdb = gTrack->tdb;
+    // If independent transcript sequence and PSL are available, use them.
+    if (startsWith("ncbiRefSeq", geneTrack))
+        {
+        txiList = txInfoLoadNcbiRefSeq(gSeqWin, gTdb);
+        }
+    else if (sameString(geneTrack, "refGene"))
+        {
+        txiList = txInfoLoadRefGene(gSeqWin, gTdb);
+        }
+    else if (sameString(gTdb->type, "genePred") || sameString(gTdb->type, "bigGenePred"))
+        {
+        // If the track is visible, then struct genePred items have already been loaded so
+        // just use them.  Otherwise load up.
+        if (gTrack->items)
+            {
+            struct linkedFeatures *lf;
+            for (lf = gTrack->items;  lf != NULL;  lf = lf->next)
+                {
+                struct genePred *gp = lf->original;
+                struct txInfo *txi = txInfoFromGenePred(gp, gSeqWin);
+                slAddHead(&txiList, txi);
+                }
+            }
+        else if (sameString(gTdb->type, "bigGenePred"))
+            txiList = txInfoLoadBigGenePred(gSeqWin, gTdb);
+        else
+            txiList = txInfoLoadGenePred(gSeqWin, gTdb);
+        }
+    else
+        errAbort("VCF txInfoLoad: expecting type 'genePred' or 'bigGenePred' for track '%s' "
+                 "in geneTrack setting, but got type '%s'",
+                 geneTrack, gTdb->type);
+    }
+return txiList;
+}
+
+static enum soTerm functionForRecord(struct vcfRecord *rec, struct seqWindow *gSeqWin,
+                                     struct txInfo *txiList)
+/* Return the most severe functional consequence of rec for any transcript in txiList. */
+{
+struct lm *lm = lmInit(0);
+// Can't use rec->chrom because that might be just "1" instead of "chr1":
+struct bed3 variantBed3 = { NULL, chromName, rec->chromStart, rec->chromEnd };
+enum soTerm maxImpactTerm = soUnknown;
+struct txInfo *txi;
+for (txi = txiList;  txi != NULL;  txi = txi->next)
+    {
+    int alIx;
+    // Sometimes reference allele is actually a change to the transcript
+    for (alIx = 0;  alIx < rec->alleleCount;  alIx++)
+        {
+        char *allele = rec->alleles[alIx];
+        // Watch out for weird symbolic alleles like "<INS:ME:ALU>".
+        if (sameString(allele, "<DEL>"))
+            allele = "";
+        else if (allele[0] == '<')
+            continue;
+        struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &variantBed3, allele,
+                                                  txi->psl, txi->txSeq);
+        struct vpPep *vpPep = vpTranscriptToProtein(vpTx, txi->cds, txi->txSeq, txi->protSeq);
+        struct gpFx *gpFx = vpTranscriptToGpFx(vpTx, txi->psl, txi->cds, txi->txSeq, vpPep,
+                                               txi->protSeq, lm);
+        enum soTerm term = gpFx->soNumber;
+        if (soTermCmp(&term, &maxImpactTerm) < 0)
+            maxImpactTerm = term;
+        vpPepFree(&vpPep);
+        vpTxFree(&vpTx);
+        }
+    }
+lmCleanup(&lm);
+return maxImpactTerm;
+}
+
 // tg->height needs an extra pixel at the bottom; it's eaten by the clipping rectangle:
 #define CLIP_PAD 1
 
 static void drawOneRec(struct vcfRecord *rec, unsigned int *gtHapOrder, unsigned int gtHapCount,
 		       struct track *tg, struct hvGfx *hvg, int xOff, int yOff, int width,
-		       boolean isClustered, boolean isCenter, enum hapColorMode colorMode)
+		       boolean isClustered, boolean isCenter, enum hapColorMode colorMode,
+                       enum soTerm funcTerm)
 /* Draw a stack of genotype bars for this record */
 {
 unsigned int chromStartMap = vcfRecordTrimIndelLeftBase(rec);
 unsigned int chromEndMap = vcfRecordTrimAllelesRight(rec);
 const double scale = scaleForPixels(width);
 int x1 = round((double)(rec->chromStart-winStart)*scale) + xOff;
 int x2 = round((double)(rec->chromEnd-winStart)*scale) + xOff;
 int w = x2-x1;
 if (w <= 1)
     {
     x1--;
     w = 3;
     }
 // When coloring mode is altOnly, we draw one extra pixel row at the top & one at bottom
 // to show the locations of variants, since the reference alleles are invisible:
 int extraPixel = 0;
 int hapHeight = tg->height - CLIP_PAD;
-if (colorMode == altOnlyMode)
+if (colorMode == altOnlyMode || colorMode == functionMode)
     {
     hvGfxLine(hvg, x1, yOff, x2, yOff, (isClustered ? purple : shadesOfGray[5]));
     extraPixel = 1;
     hapHeight -= extraPixel*2;
     }
 double hapsPerPix = (double)gtHapCount / hapHeight;
 int pixIx;
 for (pixIx = 0;  pixIx < hapHeight;  pixIx++)
     {
     int gtHapOrderIxStart = (int)(hapsPerPix * pixIx);
     int gtHapOrderIxEnd = round(hapsPerPix * (pixIx + 1));
     if (gtHapOrderIxEnd == gtHapOrderIxStart)
 	gtHapOrderIxEnd++;
     int unks = 0, refs = 0, alts = 0;
     int gtHapOrderIx;
@@ -735,64 +1126,86 @@
 		unks++;
 	    else if (alIx > 0)
 		alts++;
 	    else
 		refs++;
 	    }
 	else
 	    unks++;
 	}
     int y = yOff + extraPixel + pixIx;
     Color col;
     if (colorMode == baseMode)
 	col = colorByBase(refs, alts, unks, rec->alleles[0], rec->alleles[1]);
     else if (colorMode == refAltMode)
 	col = colorByRefAlt(refs, alts, unks);
+    else if (colorMode == functionMode)
+        {
+        Color *funcShades = shadesOfGray;
+        int funcShadeCount = maxShade+1;
+        Color funcColor = colorFromSoTerm(funcTerm);
+        if (funcColor == MG_RED)
+            {
+            funcShades = shadesOfRedOnWhite;
+            funcShadeCount = ArraySize(shadesOfRedOnWhite);
+            }
+        else if (funcColor == MG_GREEN)
+            {
+            funcShades = darkerShadesOfGreenOnWhite;
+            funcShadeCount = ArraySize(darkerShadesOfGreenOnWhite);
+            }
+        else if (funcColor == MG_BLUE)
+            {
+            funcShades = shadesOfBlueOnWhite;
+            funcShadeCount = ArraySize(shadesOfBlueOnWhite);
+            }
+        col = colorByAltShade(refs, alts, unks, funcShades, funcShadeCount);
+        }
     else
 	col = colorByAltOnly(refs, alts, unks);
     if (col != MG_WHITE)
 	hvGfxLine(hvg, x1, y, x2, y, col);
     }
 int yBot = yOff + tg->height - CLIP_PAD - 1;
 if (isCenter)
     {
-    if (colorMode == altOnlyMode)
+    if (colorMode == altOnlyMode || colorMode == functionMode)
 	{
 	// Colorful outline to distinguish this variant:
 	hvGfxLine(hvg, x1-1, yOff, x1-1, yBot, purple);
 	hvGfxLine(hvg, x2+1, yOff, x2+1, yBot, purple);
 	hvGfxLine(hvg, x1-1, yOff, x2+1, yOff, purple);
 	hvGfxLine(hvg, x1-1, yBot, x2+1, yBot, purple);
 	}
     else
 	{
 	// Thick black lines to distinguish this variant:
 	hvGfxBox(hvg, x1-3, yOff, 3, tg->height, MG_BLACK);
 	hvGfxBox(hvg, x2, yOff, 3, tg->height, MG_BLACK);
 	hvGfxLine(hvg, x1-2, yOff, x2+2, yOff, MG_BLACK);
 	hvGfxLine(hvg, x1-2, yBot, x2+2, yBot, MG_BLACK);
 	}
     // Mouseover was handled already by mapBoxForCenterVariant
     }
 else
     {
     struct dyString *dy = dyStringNew(0);
     gtSummaryString(rec, dy);
     mapBoxHgcOrHgGene(hvg, chromStartMap, chromEndMap, x1, yOff, w, tg->height, tg->track,
 		      rec->name, dy->string, NULL, TRUE, NULL);
     }
-if (colorMode == altOnlyMode)
+if (colorMode == altOnlyMode || colorMode == functionMode)
     hvGfxLine(hvg, x1, yBot, x2, yBot, (isClustered ? purple : shadesOfGray[5]));
 }
 
 static int getCenterVariantIx(struct track *tg, int seqStart, int seqEnd,
 			      struct vcfRecord *records)
 // If the user hasn't specified a local variant/position to use as center,
 // just use the median variant in window.
 {
 int defaultIx = (slCount(records)-1) / 2;
 char *centerChrom = cartOptionalStringClosestToHome(cart, tg->tdb, FALSE, "centerVariantChrom");
 if (centerChrom != NULL && sameString(chromName, centerChrom))
     {
     int centerPos = cartUsualIntClosestToHome(cart, tg->tdb, FALSE, "centerVariantPos", -1);
     int winSize = seqEnd - seqStart;
     if (centerPos > (seqStart - winSize) && centerPos < (seqEnd + winSize))
@@ -1059,59 +1472,96 @@
  * are usually hundreds more just like it. */
 {
 }
 
 static enum hapColorMode getColorMode(struct trackDb *tdb)
 /* Get the hap-cluster coloring mode from cart & tdb. */
 {
 enum hapColorMode colorMode = altOnlyMode;
 char *colorBy = cartOrTdbString(cart, tdb, VCF_HAP_COLORBY_VAR, VCF_DEFAULT_HAP_COLORBY);
 if (sameString(colorBy, VCF_HAP_COLORBY_ALTONLY))
     colorMode = altOnlyMode;
 else if (sameString(colorBy, VCF_HAP_COLORBY_REFALT))
     colorMode = refAltMode;
 else if (sameString(colorBy, VCF_HAP_COLORBY_BASE))
     colorMode = baseMode;
+else
+    {
+    char *geneTrack = cartOrTdbString(cart, tdb, "geneTrack", NULL);
+    if (isNotEmpty(geneTrack) && sameString(colorBy, VCF_HAP_COLORBY_FUNCTION))
+        colorMode = functionMode;
+    }
 return colorMode;
 }
 
+#define GENE_SIZE_FUDGE 2500000
+
 static boolean vcfHapClusterDrawInit(struct track *tg, struct vcfFile *vcff, struct hvGfx *hvg,
-                                     enum hapColorMode *retHapColorMode)
+                                     enum hapColorMode *retHapColorMode,
+                                     struct seqWindow **retGSeqWin, struct txInfo **retTxiList)
 /* Parse vcff's genotypes and get ready to draw haplotypes.  Return FALSE if nothing to draw. */
 {
 if (vcff->records == NULL)
     return FALSE;
 undefYellow = hvGfxFindRgb(hvg, &undefinedYellowColor);
 if (retHapColorMode)
     *retHapColorMode = getColorMode(tg->tdb);
 pushWarnHandler(ignoreEm);
 struct vcfRecord *rec;
 for (rec = vcff->records;  rec != NULL;  rec = rec->next)
     vcfParseGenotypes(rec);
 popWarnHandler();
+if (*retHapColorMode == functionMode)
+    {
+    if (!exprBedColorsMade)
+        {
+        makeRedGreenShades(hvg);
+        // Make darkerShadesOfGreenOnWhite for local use
+        static struct rgbColor white  = {255, 255, 255};
+        static struct rgbColor darkerGreen  = {0, 210, 0};
+        hvGfxMakeColorGradient(hvg, &white, &darkerGreen,  EXPR_DATA_SHADES,
+                               darkerShadesOfGreenOnWhite);
+        }
+    int gStart = winStart - GENE_SIZE_FUDGE;
+    if (gStart < 0)
+        gStart = 0;
+    int gEnd = winEnd + GENE_SIZE_FUDGE;
+    int chromSize = hChromSize(database, chromName);
+    if (gEnd > chromSize)
+        gEnd = chromSize;
+    *retGSeqWin = chromSeqWindowNew(database, chromName, gStart, gEnd);
+    *retTxiList = txInfoLoad(*retGSeqWin, tg->tdb);
+    }
+else
+    {
+    *retGSeqWin = NULL;
+    *retTxiList = NULL;
+    }
 return TRUE;
 }
 
 static void vcfHapClusterDraw(struct track *tg, int seqStart, int seqEnd,
 			      struct hvGfx *hvg, int xOff, int yOff, int width,
 			      MgFont *font, Color color, enum trackVisibility vis)
 /* Split samples' chromosomes (haplotypes), cluster them by center-weighted
  * alpha similarity, and draw in the order determined by clustering. */
 {
 struct vcfFile *vcff = tg->extraUiData;
 enum hapColorMode colorMode;
-if (!vcfHapClusterDrawInit(tg, vcff, hvg, &colorMode))
+struct seqWindow *gSeqWin;
+struct txInfo *txiList;
+if (!vcfHapClusterDrawInit(tg, vcff, hvg, &colorMode, &gSeqWin, &txiList))
     return;
 purple = hvGfxFindColorIx(hvg, 0x99, 0x00, 0xcc);
 unsigned int gtHapCount = 0;
 int nRecords = slCount(vcff->records);
 int centerIx = getCenterVariantIx(tg, seqStart, seqEnd, vcff->records);
 // Limit the number of variants that we compare, to keep from timing out:
 // (really what we should limit is the number of distinct haplo's passed to hacTree!)
 // In the meantime, this should at least be a cart var...
 int maxVariantsPerSide = 50;
 int startIx = max(0, centerIx - maxVariantsPerSide);
 int endIx = min(nRecords, centerIx+1 + maxVariantsPerSide);
 struct hacTree *ht = NULL;
 unsigned int *gtHapOrder = clusterHaps(vcff, centerIx, startIx, endIx, &gtHapCount, &ht);
 struct vcfRecord *centerRec = NULL;
 struct vcfRecord *rec;
@@ -1119,38 +1569,46 @@
 // Unlike drawing order (last drawn is on top), the first mapBox gets priority,
 // so map center variant before drawing & mapping other variants!
 for (rec = vcff->records, ix=0;  rec != NULL;  rec = rec->next, ix++)
     {
     if (ix == centerIx)
 	{
 	centerRec = rec;
 	mapBoxForCenterVariant(rec, hvg, tg, xOff, yOff, width);
 	break;
 	}
     }
 for (rec = vcff->records, ix=0;  rec != NULL;  rec = rec->next, ix++)
     {
     boolean isClustered = (ix >= startIx && ix < endIx);
     if (ix != centerIx)
+        {
+        enum soTerm funcTerm = soUnknown;
+        if (colorMode == functionMode)
+            funcTerm = functionForRecord(rec, gSeqWin, txiList);
 	drawOneRec(rec, gtHapOrder, gtHapCount, tg, hvg, xOff, yOff, width, isClustered, FALSE,
-		   colorMode);
+		   colorMode, funcTerm);
+        }
     }
 // Draw the center rec on top, outlined with black lines, to make sure it is very visible:
+enum soTerm funcTerm = soUnknown;
+if (colorMode == functionMode)
+    funcTerm = functionForRecord(centerRec, gSeqWin, txiList);
 drawOneRec(centerRec, gtHapOrder, gtHapCount, tg, hvg, xOff, yOff, width, TRUE, TRUE,
-	   colorMode);
+	   colorMode, funcTerm);
 // Draw as much of the tree as can fit in the left label area:
-int extraPixel = (colorMode == altOnlyMode) ? 1 : 0;
+int extraPixel = (colorMode == altOnlyMode || colorMode == functionMode) ? 1 : 0;
 int hapHeight = tg->height- CLIP_PAD - 2*extraPixel;
 struct yFromNodeHelper yHelper = {0, NULL, NULL};
 initYFromNodeHelper(&yHelper, yOff+extraPixel, hapHeight, gtHapCount, gtHapOrder,
 		    vcff->genotypeCount);
 struct titleHelper titleHelper = { NULL, 0, 0, 0, 0, NULL, NULL };
 initTitleHelper(&titleHelper, tg->track, startIx, centerIx, endIx, nRecords, vcff);
 char *treeAngle = cartOrTdbString(cart, tg->tdb, VCF_HAP_TREEANGLE_VAR, VCF_DEFAULT_HAP_TREEANGLE);
 boolean drawRectangle = sameString(treeAngle, VCF_HAP_TREEANGLE_RECTANGLE);
 drawTreeInLabelArea(ht, hvg, yOff+extraPixel, hapHeight+CLIP_PAD, &yHelper, &titleHelper,
 		    drawRectangle);
 }
 
 static void drawSampleLabels(struct vcfFile *vcff, struct hvGfx *hvg,
                              boolean isAllDiploid, int yStart, int height,
                              unsigned int *gtHapOrder, int gtHapCount, MgFont *font, Color color,
@@ -1262,42 +1720,49 @@
 if (retIsAllDiploid)
     *retIsAllDiploid = isAllDiploid;
 if (retGtHapCount)
     *retGtHapCount = orderIx;
 return gtHapOrder;
 }
 
 static void vcfGtHapFileOrderDraw(struct track *tg, int seqStart, int seqEnd,
                                   struct hvGfx *hvg, int xOff, int yOff, int width,
                                   MgFont *font, Color color, enum trackVisibility vis)
 /* Draw rows in the same fashion as vcfHapClusterDraw, but instead of clustering, use the
  * order in which samples appear in the VCF file. */
 {
 struct vcfFile *vcff = tg->extraUiData;
 enum hapColorMode colorMode;
-if (!vcfHapClusterDrawInit(tg, vcff, hvg, &colorMode))
+struct seqWindow *gSeqWin;
+struct txInfo *txiList;
+if (!vcfHapClusterDrawInit(tg, vcff, hvg, &colorMode, &gSeqWin, &txiList))
     return;
 boolean isAllDiploid;
 int gtHapCount;
 unsigned int *gtHapOrder = gtHapOrderFromGtOrder(vcff, &isAllDiploid, &gtHapCount);
 struct vcfRecord *rec;
 for (rec = vcff->records;  rec != NULL;  rec = rec->next)
+    {
+    enum soTerm funcTerm = soUnknown;
+    if (colorMode == functionMode)
+        funcTerm = functionForRecord(rec, gSeqWin, txiList);
     drawOneRec(rec, gtHapOrder, gtHapCount, tg, hvg, xOff, yOff, width, FALSE, FALSE,
-               colorMode);
+               colorMode, funcTerm);
+    }
 // If height is sufficient, draw sample names as left labels; otherwise make mouseover titles
 // with sample names for each pixel y offset.
-int extraPixel = (colorMode == altOnlyMode) ? 1 : 0;
+int extraPixel = (colorMode == altOnlyMode || colorMode == functionMode) ? 1 : 0;
 int hapHeight = tg->height - CLIP_PAD - 2*extraPixel;
 int minHeightForLabels;
 if (isAllDiploid)
     minHeightForLabels = vcff->genotypeCount * (tl.fontHeight + 1);
 else
     minHeightForLabels = gtHapCount * (tl.fontHeight + 1);
 if (hapHeight >= minHeightForLabels)
     drawSampleLabels(vcff, hvg, isAllDiploid, yOff+extraPixel, hapHeight, gtHapOrder, gtHapCount,
                      font, color, tg->track);
 else
     drawSampleTitles(vcff, yOff+extraPixel, hapHeight, gtHapOrder, gtHapCount, tg->track);
 }
 
 static struct phyloTree *getTreeFromFile(struct trackDb *tdb)
 /* Get the filename that follows trackDb setting 'hapClusterMethod treeFile ' and read it in
@@ -1502,68 +1967,76 @@
 double pxPerHap = (double)clipHeight / gtHapCount;
 rDrawPhyloTreeInLabelArea(tree, hvgLL, x, yOff, pxPerHap, font, drawRectangle);
 // Restore the prior clipping:
 hvGfxUnclip(hvgLL);
 hvGfxSetClip(hvgLL, clipXBak, clipYBak, clipWidthBak, clipHeightBak);
 }
 
 static void vcfGtHapTreeFileDraw(struct track *tg, int seqStart, int seqEnd,
                                  struct hvGfx *hvg, int xOff, int yOff, int width,
                                  MgFont *font, Color color, enum trackVisibility vis)
 /* Draw rows in the same fashion as vcfHapClusterDraw, but instead of clustering, use the
  * order in which samples appear in the VCF file. */
 {
 struct vcfFile *vcff = tg->extraUiData;
 enum hapColorMode colorMode;
-if (!vcfHapClusterDrawInit(tg, vcff, hvg, &colorMode))
+struct seqWindow *gSeqWin;
+struct txInfo *txiList;
+if (!vcfHapClusterDrawInit(tg, vcff, hvg, &colorMode, &gSeqWin, &txiList))
     return;
 struct phyloTree *tree = getTreeFromFile(tg->tdb);
 int gtHapCount;
 unsigned int *leafOrderToHapOrderStart, *leafOrderToHapOrderEnd;
 unsigned int *gtHapOrder = gtHapOrderFromTree(vcff, tree,
                                                 &leafOrderToHapOrderStart, &leafOrderToHapOrderEnd,
                                                 &gtHapCount);
 struct vcfRecord *rec;
 for (rec = vcff->records;  rec != NULL;  rec = rec->next)
+    {
+    enum soTerm funcTerm = soUnknown;
+    if (colorMode == functionMode)
+        funcTerm = functionForRecord(rec, gSeqWin, txiList);
     drawOneRec(rec, gtHapOrder, gtHapCount, tg, hvg, xOff, yOff, width, FALSE, FALSE,
-               colorMode);
-int extraPixel = (colorMode == altOnlyMode) ? 1 : 0;
+               colorMode, funcTerm);
+    }
+int extraPixel = (colorMode == altOnlyMode || colorMode == functionMode) ? 1 : 0;
 int hapHeight = tg->height - CLIP_PAD - 2*extraPixel;
 char *treeAngle = cartOrTdbString(cart, tg->tdb, VCF_HAP_TREEANGLE_VAR, VCF_DEFAULT_HAP_TREEANGLE);
 boolean drawRectangle = sameString(treeAngle, VCF_HAP_TREEANGLE_RECTANGLE);
 drawPhyloTreeInLabelArea(tree, hvg, yOff+extraPixel, hapHeight, gtHapCount, font,
                          drawRectangle, leafOrderToHapOrderStart, leafOrderToHapOrderEnd);
 drawSampleTitles(vcff, yOff+extraPixel, hapHeight, gtHapOrder, gtHapCount, tg->track);
 }
 
 static int vcfHapClusterTotalHeight(struct track *tg, enum trackVisibility vis)
 /* Return height of haplotype graph (2 * #samples * lineHeight);
  * 2 because we're assuming diploid genomes here, no XXY, tetraploid etc. */
 {
 const struct vcfFile *vcff = tg->extraUiData;
 if (vcff->records == NULL)
     return 0;
 int ploidy = sameString(chromName, "chrY") ? 1 : 2;
 int simpleHeight = ploidy * vcff->genotypeCount * tg->lineHeight;
 int defaultHeight = min(simpleHeight, VCF_DEFAULT_HAP_HEIGHT);
 char *tdbHeight = trackDbSettingOrDefault(tg->tdb, VCF_HAP_HEIGHT_VAR, NULL);
 if (isNotEmpty(tdbHeight))
     defaultHeight = atoi(tdbHeight);
 int cartHeight = cartOrTdbInt(cart, tg->tdb, VCF_HAP_HEIGHT_VAR, defaultHeight);
 if (tg->visibility == tvSquish)
     cartHeight /= 2;
-int extraPixel = (getColorMode(tg->tdb) == altOnlyMode) ? 1 : 0;
+enum hapColorMode colorMode = getColorMode(tg->tdb);
+int extraPixel = (colorMode == altOnlyMode || colorMode == functionMode) ? 1 : 0;
 int totalHeight = cartHeight + CLIP_PAD + 2*extraPixel;
 tg->height = min(totalHeight, maximumTrackHeight(tg));
 return tg->height;
 }
 
 static char *vcfHapClusterTrackName(struct track *tg, void *item)
 /* If someone asks for itemName/mapItemName, just send name of track like wiggle. */
 {
 return tg->track;
 }
 
 static void vcfHapClusterOverloadMethods(struct track *tg, struct vcfFile *vcff)
 /* If we confirm at load time that we can draw a haplotype graph, use
  * this to overwrite the methods for the rest of execution: */
 {