41723d134f8b0c52c78705c2e5da97f8875e3cf6
angie
  Wed Feb 15 11:44:39 2017 -0800
Added HGVS terms as variant input option in hgVai.  refs #11460

diff --git src/hg/hgVai/hgVai.c src/hg/hgVai/hgVai.c
index 8297ffa..868a394 100644
--- src/hg/hgVai/hgVai.c
+++ src/hg/hgVai/hgVai.c
@@ -2,30 +2,31 @@
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "htmshell.h"
 #include "web.h"
 #include "cheapcgi.h"
 #include "cart.h"
 #include "cartTrackDb.h"
 #include "genbank.h"
 #include "hgConfig.h"
+#include "hgHgvs.h"
 #include "hui.h"
 #include "grp.h"
 #include "hCommon.h"
 #include "hgFind.h"
 #include "hPrint.h"
 #include "jsHelper.h"
 #include "memalloc.h"
 #include "textOut.h"
 #include "trackHub.h"
 #include "hubConnect.h"
 #include "twoBit.h"
 #include "gpFx.h"
 #include "bigGenePred.h"
 #include "udc.h"
 #include "knetUdc.h"
@@ -53,40 +54,49 @@
 char *regionType = NULL;	/* genome, ENCODE pilot regions, or specific position range. */
 struct grp *fullGroupList = NULL;	/* List of all groups. */
 struct trackDb *fullTrackList = NULL;	/* List of all tracks in database. */
 static struct pipeline *compressPipeline = (struct pipeline *)NULL;
 
 
 // Null terminated list of CGI Variables we don't want to save permanently:
 char *excludeVars[] = {"Submit", "submit", "hgva_startQuery", NULL,};
 
 #define hgvaRange "position"
 #define hgvaRegionType "hgva_regionType"
 #define hgvaRegionTypeEncode "encode"
 #define hgvaRegionTypeGenome "genome"
 #define hgvaRegionTypeRange "range"
 #define hgvaPositionContainer "positionContainer"
+#define hgvsTrashSubDir "hgv"
+// auto-generated variants as input
 #define hgvaSampleVariants "hgva_internally_generated_sample_variants"
 #define hgvaSampleVariantsLabel "Artificial Example Variants"
+// dbSNP rsIDs as variant inputs:
 #define hgvaUseVariantIds "hgva_useVariantIds"
 #define hgvaVariantIdsLabel "Variant Identifiers"
 #define hgvaVariantIds "hgva_variantIds"
 #define hgvaUseVariantFileOrUrl "hgva_useVariantFileOrUrl"
-#define hgvaVariantFileOrUrlLabel "from file"
+#define hgvaVariantPasteContainer "variantPasteContainer"
+// A local file or URL as variant input:
+#define hgvaVariantFileOrUrlLabel "from file or URL"
 #define hgvaVariantFileOrUrl "hgva_variantFileOrUrl"
 #define hgvaVariantFileOrUrlType "hgva_variantFileOrUrlType"
-#define hgvaVariantPasteContainer "variantPasteContainer"
+// HGVS terms as variant inputs:
+#define hgvaUseHgvs "hgva_useHgvs"
+#define hgvaHgvsLabel "HGVS terms"
+#define hgvaHgvs "hgva_hgvs"
+#define hgvaHgvsPasteContainer "hgvsPasteContainer"
 
 void addSomeCss()
 /*#*** This should go in a .css file of course. */
 {
 printf("<style>\n"
 	"div.sectionLite { border-width: 1px; border-color: #"HG_COL_BORDER"; border-style: solid;"
 	"  background-color: #"HG_COL_INSIDE"; padding-left: 10px; padding-right: 10px; "
 	"  padding-top: 8px; padding-bottom: 5px; margin-top: 5px; margin-bottom: 5px }\n"
 	".sectionLiteHeader { font-weight: bold; font-size:larger; color:#000000;"
 	"  text-align:left; vertical-align:bottom; white-space:nowrap; }\n"
 	"div.sectionLiteHeader.noReorderRemove { padding-bottom:5px; }\n"
 	"div.sourceFilter { padding-top: 5px; padding-bottom: 5px }\n"
 	"</style>\n");
 }
 
@@ -471,35 +481,44 @@
     {
     printf("If you have more than one custom track or hub track in "
            PGSNP_OR_VCF " format, please select the one you wish to annotate:<BR>\n");
     }
 printf("<B>variants: </B>");
 printf("<SELECT ID='hgva_variantTrack' NAME='hgva_variantTrack'>\n");
 jsOnEventById("change", "hgva_variantTrack", "hgva.changeVariantSource();");
 char *selected = cartUsualString(cart, "hgva_variantTrack", "");
 struct slRef *ref;
 for (ref = varTrackList;  ref != NULL;  ref = ref->next)
     {
     struct trackDb *tdb = ref->val;
     printOption(tdb->track, selected, tdb->longLabel);
     }
 printOption(hgvaSampleVariants, selected, hgvaSampleVariantsLabel);
+printOption(hgvaUseHgvs, selected, hgvaHgvsLabel);
 boolean hasSnps = (hFindLatestSnpTable(database, NULL) != NULL);
 if (hasSnps)
     printOption(hgvaUseVariantIds, selected, hgvaVariantIdsLabel);
 printf("</SELECT><BR>\n");
 
+printf("<div id='"hgvaHgvsPasteContainer"'%s>\n",
+       differentString(selected, hgvaUseHgvs) ? " style='display: none;'" : "");
+printf("Enter HGVS terms: one term per line; blank lines and comment lines beginning with '#' "
+       "are ignored.<BR>\n");
+char *oldPasted = cartUsualString(cart, hgvaHgvs, "");
+cgiMakeTextArea(hgvaHgvs, oldPasted, 10, 70);
+puts("</div>");
+
 if (hasSnps)
     {
     printf("<div id='"hgvaVariantPasteContainer"'%s>\n",
 	   differentString(selected, hgvaUseVariantIds) ? " style='display: none;'" : "");
     printf("Enter dbSNP rs# identifiers separated by whitespace or commas:<BR>\n");
     char *oldPasted = cartUsualString(cart, hgvaVariantIds, "");
     cgiMakeTextArea(hgvaVariantIds, oldPasted, 10, 70);
     puts("</div>");
     }
 
 printf("<B>maximum number of variants to be processed:</B>\n");
 char *curLimit = cartUsualString(cart, "hgva_variantLimit", "10000");
 char *limitLabels[] = { "10", "100", "1,000", "10,000", "100,000" };
 char *limitValues[] = { "10", "100", "1000", "10000", "100000" };
 cgiMakeDropListWithVals("hgva_variantLimit", limitLabels, limitValues, ArraySize(limitLabels),
@@ -1747,44 +1766,56 @@
 static void getCartPosOrDie(char **retChrom, uint *retStart, uint *retEnd)
 /* Get chrom:start-end from cart, errAbort if any problems. */
 {
 char *position = cartString(cart, hgvaRange);
 if (! parsePosition(position, retChrom, retStart, retEnd))
     errAbort("Expected position to be chrom:start-end but got '%s'", position);
 }
 
 static char *sampleVariantsPath(struct annoAssembly *assembly, char *geneTrack)
 /* Construct a path for a trash file of contrived example variants for this assembly
  * and gene track. */
 {
 char *chrom = NULL;
 uint start = 0, end = 0;
 getCartPosOrDie(&chrom, &start, &end);
-char *subDir = "hgv";
-mkdirTrashDirectory(subDir);
+mkdirTrashDirectory(hgvsTrashSubDir);
 struct dyString *dy = dyStringCreate("%s/%s/%s_%s_%s_%u-%u.vcf",
-				     trashDir(), subDir, assembly->name, geneTrack,
+				     trashDir(), hgvsTrashSubDir, assembly->name, geneTrack,
 				     chrom, start, end);
 return dyStringCannibalize(&dy);
 }
 
-static void writeMinimalVcfHeader(FILE *f, char *db)
-/* Write header for VCF with no meaningful qual, filter, info or genotype data. */
+static char *makeMinimalVcfHeader(char *db, char *headerDefs)
+/* Return a string containing a simple no-genotypes VCF header.  db must be non-NULL.
+ * headerDefs can be NULL or empty or line(s) starting with '##' and ending w/'\n' */
 {
-fputs("##fileformat=VCFv4.1\n", f);
-fprintf(f, "##reference=%s\n", db);
-fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n", f);
+struct dyString *dy = dyStringCreate("##fileformat=VCFv4.2\n"
+                                     "##reference=%s\n"
+                                     "%s"
+                                     "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
+                                     db, (headerDefs ? headerDefs : ""));
+return dyStringCannibalize(&dy);
+}
+
+static void writeMinimalVcfHeader(FILE *f, char *db, char *headerDefs)
+/* Write header for VCF with no genotype data. db must be non-NULL.
+ * headerDefs can be NULL or empty or line(s) starting with '##' and ending w/'\n' */
+{
+char *headerString = makeMinimalVcfHeader(db, headerDefs);
+fputs(headerString, f);
+freeMem(headerString);
 }
 
 static void mutateBase(char *pBase)
 /* Change *pBase into a different base. */
 {
 static char *bases = "ACGT";
 char c;
 // In real life the probabilities aren't even, but this is easier.
 while ((c = bases[(uint)(floor(drand48() * 4.0))]) == *pBase)
     ;
 *pBase = c;
 }
 
 static void writeMinimalVcfRow(FILE *f, struct vcfRecord *rec)
 /* Write a minimalist VCF row (coords, name, alleles and placeholders). */
@@ -1976,31 +2007,31 @@
 static struct annoStreamer *makeSampleVariantsStreamer(struct annoAssembly *assembly,
 						       struct trackDb *geneTdb, int maxOutRows)
 /* Construct a VCF file of example variants for db (unless it already exists)
  * and return an annoStreamer for that file.  If possible, make the variants hit a gene. */
 {
 char *sampleFile = sampleVariantsPath(assembly, geneTdb->track);
 boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildSampleVariants", FALSE);
 if (! fileExists(sampleFile) || forceRebuild)
     {
     struct annoStreamer *geneStream = hAnnoStreamerFromTrackDb(assembly, geneTdb->table, geneTdb,
                                                                NULL, ANNO_NO_LIMIT, NULL);
     boolean isBig = sameString(geneTdb->type, "bigGenePred");
     boolean gotCoding = FALSE, gotNonCoding = FALSE;
     struct genePred *gpList = genesFromPosition(geneStream, isBig, &gotCoding, &gotNonCoding);
     FILE *f = mustOpen(sampleFile, "w");
-    writeMinimalVcfHeader(f, assembly->name);
+    writeMinimalVcfHeader(f, assembly->name, NULL);
     if (gpList == NULL)
 	{
 	warn("Unable to find any gene transcripts in '%s' (%s)",
 	     geneTdb->shortLabel, geneTdb->track);
 	writeArbitraryVariants(f, assembly);
 	}
     else
 	{
 	struct bed4 *bedList = NULL;
 	uint chromSize = annoAssemblySeqSize(assembly, gpList->chrom);
 	uint minSeqStart = chromSize, maxSeqEnd = 0;
 	struct genePred *gp;
 	for (gp = gpList;  gp != NULL;  gp = gp->next)
 	    {
 	    int txSeqStart = gp->txStart - (UPSTREAMFUDGE + IGFUDGE);
@@ -2022,38 +2053,37 @@
 						      minSeqStart, maxSeqEnd);
 	struct bed4 *bed;
 	for (bed = bedList;  bed != NULL;  bed = bed->next)
 	    {
 	    writeMinimalVcfRowFromBed(f, bed, txSeq, minSeqStart);
 	    }
 	dnaSeqFree(&txSeq);
 	bed4FreeList(&bedList);
 	}
     geneStream->close(&geneStream);
     carefulClose(&f);
     }
 return annoStreamVcfNew(sampleFile, NULL, FALSE, assembly, maxOutRows);
 }
 
-static char *variantIdPath(struct annoAssembly *assembly, char *variantIds)
-/* Use the md5sum of the user's pasted/uploaded variants to make a hopefully
- * unique trash filename. */
+static char *md5TrashPath(struct annoAssembly *assembly, char *string)
+/* Use the md5sum of a string to make a hopefully unique trash filename. */
 {
-char *md5sum = md5HexForString(variantIds);
-char *subDir = "hgv";
-mkdirTrashDirectory(subDir);
-struct dyString *dy = dyStringCreate("%s/%s/%s_%s.vcf", trashDir(), subDir, assembly->name, md5sum);
+char *md5sum = md5HexForString(string);
+mkdirTrashDirectory(hgvsTrashSubDir);
+struct dyString *dy = dyStringCreate("%s/%s/%s_%s.vcf", trashDir(), hgvsTrashSubDir,
+                                     assembly->name, md5sum);
 return dyStringCannibalize(&dy);
 }
 
 static struct slName *hashListNames(struct hash *hash)
 /* Return a list of all element names in the hash (if any). */
 {
 struct slName *list = NULL;
 struct hashCookie cookie = hashFirst(hash);
 struct hashEl *hel;
 while ((hel = hashNext(&cookie)) != NULL)
     slAddHead(&list, slNameNew(hel->name));
 return list;
 }
 
 static char *encloseInAngleBracketsDbSnp(char *stringIn)
@@ -2337,47 +2367,126 @@
 	if (*pEnd < last->chromStart)
 	    *pEnd = last->chromStart;
 	}
     }
 }
 
 static struct annoStreamer *makeVariantIdStreamer(struct annoAssembly *assembly, int maxOutRows,
 						  char **pChrom, uint *pStart, uint *pEnd,
 						  struct slName **pCommentList)
 /* Translate user's pasted/uploaded variant IDs into minimal VCF if possible.
  * Return a VCF streamer for those, and if the current search position is too narrow
  * to include all of the variants, widen it as necessary. */
 {
 // Hash variant text to get trash filename.  Use if exists, otherwise build it.
 char *variantIds = cartString(cart, hgvaVariantIds);
-char *varFile = variantIdPath(assembly, variantIds);
+char *varFile = md5TrashPath(assembly, variantIds);
 boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildVariantIds", FALSE);
 if (! fileExists(varFile) || forceRebuild)
     {
     struct vcfRecord *varList = parseVariantIds(assembly, variantIds, pCommentList);
 //#*** If no variants were recognized, we should probably show main page with a warning.
     adjustRangeForVariants(varList, pChrom, pStart, pEnd);
     FILE *f = mustOpen(varFile, "w");
-    writeMinimalVcfHeader(f, assembly->name);
+    writeMinimalVcfHeader(f, assembly->name, NULL);
     struct vcfRecord *var;
     for (var = varList;  var != NULL;  var = var->next)
 	writeMinimalVcfRow(f, var);
     carefulClose(&f);
     }
 return annoStreamVcfNew(varFile, NULL, FALSE, assembly, maxOutRows);
 }
 
+static struct vcfRecord *parseHgvs(struct vcfFile *vcff, struct annoAssembly *assembly,
+                                   char *hgvsTerms, struct slName **pCommentList)
+/* Return a sorted list of vcfRecords . */
+{
+struct vcfRecord *recList = NULL;
+struct slName *failedTerms = NULL;
+struct dyString *dyError = dyStringNew(0);
+struct lineFile *lf = lineFileOnString("user-provided HGVS terms", TRUE, cloneString(hgvsTerms));
+char *term = NULL;
+while (lineFileNextReal(lf, &term))
+    {
+    eraseTrailingSpaces(term);
+    dyStringClear(dyError);
+    struct vcfRow *vcfRow = hgvsToVcfRow(assembly->name, term, FALSE, dyError);
+    if (vcfRow)
+        {
+        char *row[8];
+        row[0] = vcfRow->chrom;
+        char posStr[64];
+        safef(posStr, sizeof(posStr), "%d", vcfRow->pos);
+        row[1] = posStr;
+        row[2] = vcfRow->id;
+        row[3] = vcfRow->ref;
+        row[4] = vcfRow->alt;
+        row[5] = ".";
+        row[6] = vcfRow->filter;
+        row[7] = vcfRow->info;
+        slAddHead(&recList, vcfRecordFromRow(vcff, row));
+        }
+    else
+	slNameAddHead(&failedTerms, dyStringContents(dyError));
+    }
+if (failedTerms != NULL)
+    {
+    slReverse(&failedTerms);
+    char *firstUnknownIds = firstNCommaSep(failedTerms, 5);
+    struct dyString *dy = dyStringCreate("%d HGVS terms could not be parsed and/or mapped to %s, "
+                                         "e.g. %s",
+					 slCount(failedTerms), assembly->name, firstUnknownIds);
+    slAddTail(pCommentList, slNameNew(dy->string));
+    freeMem(firstUnknownIds);
+    dyStringFree(&dy);
+    }
+slSort(&recList, vcfRecordCmp);
+slNameFreeList(&failedTerms);
+dyStringFree(&dyError);
+lineFileClose(&lf);
+return recList;
+}
+
+static struct annoStreamer *makeHgvsStreamer(struct annoAssembly *assembly, int maxOutRows,
+                                             char **pChrom, uint *pStart, uint *pEnd,
+                                             struct slName **pCommentList)
+/* Translate user's pasted/uploaded variant IDs into minimal VCF if possible.
+ * Return a VCF streamer for those., and if the current search position is too narrow
+ * to include all of the variants, widen it as necessary. */
+{
+// Hash HGVS input to get trash filename.  Use if exists, otherwise build it.
+char *hgvsTerms = cartString(cart, hgvaHgvs);
+char *varFile = md5TrashPath(assembly, hgvsTerms);
+boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildHgvs", FALSE);
+if (! fileExists(varFile) || forceRebuild)
+    {
+    char *headerString = makeMinimalVcfHeader(assembly->name, HGVS_VCF_HEADER_DEFS);
+    struct vcfFile *vcff = vcfFileFromHeader("hgVaiHgvs", headerString, VCF_IGNORE_ERRS);
+    struct vcfRecord *varList = parseHgvs(vcff, assembly, hgvsTerms, pCommentList);
+//#*** If no HGVS terms could be mapped, we should probably show main page with a warning.
+    adjustRangeForVariants(varList, pChrom, pStart, pEnd);
+    FILE *f = mustOpen(varFile, "w");
+    writeMinimalVcfHeader(f, assembly->name, HGVS_VCF_HEADER_DEFS);
+    struct vcfRecord *var;
+    for (var = varList;  var != NULL;  var = var->next)
+	writeMinimalVcfRow(f, var);
+    carefulClose(&f);
+    vcfFileFree(&vcff);
+    }
+return annoStreamVcfNew(varFile, NULL, FALSE, assembly, maxOutRows);
+}
+
 static struct trackDb *getVariantTrackDb(char *variantTrack)
 /* Get a tdb for the user's selected variant track, or warn and return NULL
  * (main page will be displayed) */
 {
 struct trackDb *varTdb = tdbForTrack(database, variantTrack, &fullTrackList);
 if (varTdb == NULL)
     {
     if (isHubTrack(variantTrack))
 	warn("Can't find hub track '%s'", variantTrack);
     else
 	warn("Can't find tdb for variant track '%s'", variantTrack);
     }
 else
     checkVariantTrack(varTdb);
 return varTdb;
@@ -2600,30 +2709,35 @@
     {
     primary = makeSampleVariantsStreamer(assembly, geneTdb, maxVarRows);
     primaryLongLabel = hgvaSampleVariantsLabel;
     // Sample variants can't always be made within the currently selected position range,
     // so just for these, force search to be genome-wide.
     chrom = NULL;
     start = 0;
     end = 0;
     }
 else if (sameString(variantTrack, hgvaUseVariantIds))
     {
     // Override search position if cart position doesn't include all variants:
     primary = makeVariantIdStreamer(assembly, maxVarRows, &chrom, &start, &end, &commentList);
     primaryLongLabel = hgvaVariantIdsLabel;
     }
+else if (sameString(variantTrack, hgvaUseHgvs))
+    {
+    primary = makeHgvsStreamer(assembly, maxVarRows, &chrom, &start, &end, &commentList);
+    primaryLongLabel = hgvaHgvsLabel;
+    }
 else if (sameString(variantTrack, hgvaUseVariantFileOrUrl))
     {
     char *fileOrUrl = cartString(cart, hgvaVariantFileOrUrl);
     char *type = cartOptionalString(cart, hgvaVariantFileOrUrlType);
     primary = hAnnoStreamerFromBigFileUrl(fileOrUrl, NULL, assembly, maxVarRows, type);
     primaryLongLabel = hgvaVariantFileOrUrlLabel;
     }
 else
     {
     struct trackDb *varTdb = getVariantTrackDb(variantTrack);
     if (varTdb == NULL)
 	{
         if (! isCommandLine)
             doUi();
 	return;