193eba89efc0c2cb8e61e58f75855cff4cae408b
angie
  Wed Apr 11 17:44:38 2018 -0700
Filter symbolic alleles out of dbSNP observed alleles and warn user.  Make warnings reappear on subsequent queries even when we don't rebuild the VCF file by saving the warnings in a file too.

variantProjector doesn't accept symbolic alleles, and dbSNP has all kinds of cruft in its observed alleles.  One kind that occurs frequently is "lengthTooLong" -- sometimes in that case the alleles can be recovered from the allele frequency columns.

Also ignore lines starting with "#" in rsId input using lineFileNextReal .

refs #21142 note-24 -- thanks Jairo for finding the 'lengthTooLong' problem!

diff --git src/hg/hgVai/hgVai.c src/hg/hgVai/hgVai.c
index 7a024dc..41a6287 100644
--- src/hg/hgVai/hgVai.c
+++ src/hg/hgVai/hgVai.c
@@ -27,30 +27,31 @@
 #include "hubConnect.h"
 #include "twoBit.h"
 #include "gpFx.h"
 #include "bigGenePred.h"
 #include "udc.h"
 #include "knetUdc.h"
 #include "md5.h"
 #include "regexHelper.h"
 #include "hAnno.h"
 #include "annoGratorQuery.h"
 #include "annoGratorGpVar.h"
 #include "annoFormatVep.h"
 #include "annoStreamBigBed.h"
 #include "annoStreamDb.h"
 #include "windowsToAscii.h"
+#include "obscure.h"
 
 #include "libifyMe.h"
 
 #define GENCODE_TAG_DOC_URL "\"http://www.gencodegenes.org/gencode_tags.html\""
 #define GENCODE_BASIC_DOC_URL "\"http://www.gencodegenes.org/faq.html\""
 #define REFSEQ_STATUS_DOC_URL "\"https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_status_codes\""
 #define APPRIS_DOC_URL "\"http://appris.bioinfo.cnio.es/#/help/database\""
 
 #define HGVS_MUST_USE_ACC "Note: HGVS terms must use versioned transcript or genomic accessions " \
     "(e.g. NM_000023.3, NC_000012.11, ENST00000000233.9), not gene symbols."
 
 /* Global Variables */
 struct cart *cart;		/* CGI and other variables */
 struct hash *oldVars = NULL;	/* The cart before new cgi stuff added. */
 char *genome = NULL;		/* Name of genome - mouse, human, etc. */
@@ -2178,43 +2179,40 @@
 static char *encloseInAngleBracketsDbSnp(char *stringIn)
 /* Return a string that has <dbSNP: stringIn>, with spaces replaced by '_'s. */
 {
 int stringInLen = strlen(stringIn);
 int stringOutLen = stringInLen + strlen("<dbSNP:>") + 1;
 char *stringOut = needMem(stringOutLen);
 safef(stringOut, stringOutLen, "<dbSNP:%s>", stringIn);
 subChar(stringOut, ' ', '_');
 return stringOut;
 }
 
 // dbSNP named alleles have many ways to describe a deletion from the reference,
 // for example "LARGEDELETION", "LARGE DELETION", "... DELETED", "... DEL":
 static const char *dbSnpDelRegex = "^\\(.*(DELET.*| DEL)\\)$";
 
-static char **parseDbSnpAltAlleles(char *refAl, char *obsAls, boolean minusStrand,
+static char **altAlsFromObserved(char *refAl, int obsCount, char **obsWords, boolean minusStrand,
                                  int *retAltAlCount, boolean *retNeedLeftBase)
-/* Given a non-symbolic reference allele and slash-sep observed alleles from dbSNP,
+/* Given a non-symbolic reference allele and an array of observed alleles,
  * return an array of +-strand alleles that are not the same as the reference.
  * If any allele is "-" (deleted, zero-length), then set retNeedLeftBase to TRUE
  * because in this case VCF requires that the reference base to the left of the indel
  * must be added to all alleles, and the start coord also moves one base to the left.
  * Also, if any alt allele is symbolic, padding is required.
  * Note: this trashes obsAls.  Resulting array can be freed but not its contents. */
 {
-int obsCount = countChars(obsAls, '/') + 1;
-char *obsWords[obsCount];
-chopByChar(obsAls, '/', obsWords, obsCount);
 boolean obsHasDeletion = FALSE;
 int i;
 for (i = 0;  i < obsCount;  i++)
     if (sameString(obsWords[i], "-"))
 	{
 	obsHasDeletion = TRUE;
 	break;
 	}
 char **altAls;
 AllocArray(altAls, obsCount);
 int altCount = 0;
 boolean needLeftBase = isEmpty(refAl) || sameString(refAl, "-");
 for (i = 0;  i < obsCount;  i++)
     {
     char *altAl = obsWords[i];
@@ -2247,62 +2245,96 @@
 		    continue;
 		needLeftBase = TRUE;
 		altAl = encloseInAngleBracketsDbSnp(altAl);
 		}
 	    altAls[altCount] = altAl;
 	    }
         altAls[altCount] = altAl;
 	altCount++;
 	}
     }
 *retAltAlCount = altCount;
 *retNeedLeftBase = needLeftBase;
 return altAls;
 }
 
+static char **parseDbSnpAltAlleles(char *refAl, char *obsAls, int alleleFreqCount, char *alleles,
+				   boolean minusStrand,
+                                   int *retAltAlCount, boolean *retNeedLeftBase)
+/* Given a non-symbolic reference allele and slash-sep observed alleles from dbSNP,
+ * return an array of +-strand alleles that are not the same as the reference.
+ * If obsAls is "lengthTooLong" but alleleFreqCount is at least 2 and alleles is a comma-sep
+ * list of nucleotide bases, use alleles instead.
+ * If any allele is "-" (deleted, zero-length), then set retNeedLeftBase to TRUE
+ * because in this case VCF requires that the reference base to the left of the indel
+ * must be added to all alleles, and the start coord also moves one base to the left.
+ * Also, if any alt allele is symbolic, padding is required.
+ * Note: this trashes obsAls.  Resulting array can be freed but not its contents. */
+{
+int obsCount = 0;
+int maxWords = 1000;
+char *obsWords[maxWords];
+if (sameString(obsAls, "lengthTooLong") && alleleFreqCount > 1)
+    {
+    obsCount = countChars(alleles, ',');
+    if (obsCount > maxWords)
+        errAbort("parseDbSnpAltAlleles: too many alleles (%d) in observed column", obsCount);
+    chopByChar(alleles, ',', obsWords, obsCount);
+    }
+else
+    {
+    obsCount = countChars(obsAls, '/') + 1;
+    if (obsCount > maxWords)
+        errAbort("parseDbSnpAltAlleles: too many alleles (%d) in observed column", obsCount);
+    chopByChar(obsAls, '/', obsWords, obsCount);
+    }
+return altAlsFromObserved(refAl, obsCount, obsWords, minusStrand, retAltAlCount, retNeedLeftBase);
+}
+
 char *firstNCommaSep(struct slName *nameList, int n)
 /* Return a comma-separated string with the first n names in nameList. */
 {
 struct dyString *dy = dyStringNew(0);
 int i;
 struct slName *el;
 for (i=0, el=nameList;  i < 5 && el != NULL;  i++, el = el->next)
     {
     if (i > 0)
 	dyStringAppend(dy, ", ");
     dyStringPrintf(dy, "'%s'", el->name);
     }
 return dyStringCannibalize(&dy);
 }
 
 //#*** Variant ID-matching should be metadata-driven too.  termRegex -> data source.
 static const char *rsIdRegex = "^rs[0-9]+$";
 
 static void rsIdsToVcfRecords(struct annoAssembly *assembly, struct slName *rsIds,
 			      struct vcfFile *vcff, struct vcfRecord **pRecList,
-			      struct slName **pCommentList)
+                              FILE *warnF)
 /* If possible, look up coordinates and alleles of dbSNP rs IDs. */
 {
 if (rsIds == NULL)
     return;
 struct trackDb *tdb = hFindLatestSnpTrack(database, NULL, &fullTrackList);
 if (tdb == NULL)
     return;
 struct sqlConnection *conn = hAllocConn(assembly->name);
 // Build a 'name in (...)' query, and build a hash of IDs so we can test whether all were found
 struct dyString *dq = sqlDyStringCreate("select chrom, chromStart, chromEnd, name, strand, "
-					"refUCSC, observed from %s where name in (",
+					"refUCSC, observed, alleleFreqCount, alleles "
+                                        "from %s where name in (",
 					tdb->table);
 struct hash *idHash = hashNew(0);
 struct slName *id;
 for (id = rsIds;  id != NULL;  id = id->next)
     {
     tolowers(id->name);
     dyStringPrintf(dq, "%s'%s'", (id != rsIds ? "," : ""), id->name);
     hashStoreName(idHash, id->name);
     }
 dyStringAppend(dq, ");");
 struct sqlResult *sr = sqlGetResult(conn, dq->string);
 // Construct a minimal VCF row to make a vcfRecord for each variant.
 char *vcfRow[9];
 vcfRow[5] = vcfRow[6] = vcfRow[7] = "."; // placeholder for qual, filter, info
 // It would be cool to someday add snpNNN's exceptions column to the filter or info.
@@ -2319,125 +2351,151 @@
     char refAl[max(strlen(row[5]), chromEnd-chromStart) + 2];
     safecpy(refAl, sizeof(refAl), row[5]);
     if (sameString(refAl, "-"))
 	refAl[0] = '\0';
     else if (! isAllNt(refAl, strlen(refAl)))
 	{
 	// refUCSC is symbolic like "( 2548bp insertion )" -- just use the actual reference
 	// sequence for now, to keep things simple downstream.
 	//#*** Need something more efficient, like a sequence cache inside assembly!
 	struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom, chromStart, chromEnd);
 	safecpy(refAl, sizeof(refAl), seq->dna);
 	toUpperN(refAl, seq->size);
 	dnaSeqFree(&seq);
 	}
     char *obsAls = row[6];
+    int alFCount = atoi(row[7]);
+    char *alFAls = row[8];
     int altAlCount = 0;
     boolean needLeftBase = FALSE;
-    char **altAls = parseDbSnpAltAlleles(refAl, obsAls, sameString(strand, "-"),
+    char **altAls = parseDbSnpAltAlleles(refAl, obsAls, alFCount, alFAls, sameString(strand, "-"),
 					 &altAlCount, &needLeftBase);
     needLeftBase |= (chromStart == chromEnd);  // should be redundant, but just in case.
+    // variantProjector doesn't accept symbolic alleles (aside from VCF <DEL>),
+    // so filter those out and warn:
+    int j = 0;
+    struct dyString *noSym = NULL;
+    char *filteredAltAls[altAlCount];
+    for (i = 0;  i < altAlCount;  i++)
+        {
+        if (!isAllNt(altAls[i], strlen(altAls[i])))
+            {
+            if (noSym == NULL)
+                noSym = dyStringCreate("%s has symbolic allele(s) e.g. '%s', ignoring.  ",
+                                       name, altAls[i]);
+            }
+        else
+            filteredAltAls[j++] = altAls[i];
+        }
+    if (noSym != NULL)
+        {
+        fputs(noSym->string, warnF);
+        dyStringFree(&noSym);
+        }
+    altAlCount = j;
+    altAls = filteredAltAls;
     hashRemove(idHash, name);
     uint vcfStart = chromStart + 1;
     dyStringClear(dyAltAlStr);
     // If this is a non-symbolic indel, we'll have to move start one base to the left
     // and add the reference base to the left of the actual alleles
     if (needLeftBase)
 	{
 	vcfStart--;
 	//#*** Need something more efficient, like a sequence cache inside assembly!
 	struct dnaSeq *seq = twoBitReadSeqFragLower(assembly->tbf, chrom,
 						    chromStart-1, chromStart);
 	toUpperN(seq->dna, 1);
 	char leftBase = seq->dna[0];
 	dnaSeqFree(&seq);
 	char refAlPlus[strlen(refAl)+2];
 	safef(refAlPlus, sizeof(refAlPlus), "%c%s", leftBase, refAl);
 	safecpy(refAl, sizeof(refAl), refAlPlus);
 	for (i = 0;  i < altAlCount;  i++)
 	    {
 	    if (i > 0)
 		dyStringAppendC(dyAltAlStr, ',');
-	    if (isAllNt(altAls[i], strlen(altAls[i])))
             dyStringPrintf(dyAltAlStr, "%c%s", leftBase, altAls[i]);
-	    else
-		dyStringAppend(dyAltAlStr, altAls[i]);
 	    }
 	}
     else
 	{
 	// Not an indel, just make comma-sep string of alt alleles.
 	for (i = 0;  i < altAlCount;  i++)
 	    {
 	    if (i > 0)
 		dyStringAppendC(dyAltAlStr, ',');
 	    dyStringAppend(dyAltAlStr, altAls[i]);
 	    }
 	}
+    if (altAlCount == 0)
+        dyStringAppendC(dyAltAlStr, '.');
     char vcfStartStr[64];
     safef(vcfStartStr, sizeof(vcfStartStr), "%d", vcfStart);
     vcfRow[0] = chrom;
     vcfRow[1] = vcfStartStr;
     vcfRow[2] = name;
     vcfRow[3] = refAl;
     vcfRow[4] = dyAltAlStr->string;
     struct vcfRecord *rec = vcfRecordFromRow(vcff, vcfRow);
     slAddHead(pRecList, rec);
     }
 dyStringFree(&dyAltAlStr);
 // Check for IDs not found
 struct slName *notFoundIds = hashListNames(idHash);
 if (notFoundIds != NULL)
     {
     char *namesNotFound = firstNCommaSep(notFoundIds, 5);
-    struct dyString *dy = dyStringCreate("%d rs# IDs not found, e.g. %s",
+    fprintf(warnF, "%d rs# IDs not found, e.g. %s",
             slCount(notFoundIds), namesNotFound);
-    slAddTail(pCommentList, slNameNew(dy->string));
     freeMem(namesNotFound);
-    dyStringFree(&dy);
     }
 slNameFreeList(&notFoundIds);
 }
 
 static struct vcfRecord *parseVariantIds(struct annoAssembly *assembly, char *variantIdText,
-					 struct slName **pCommentList)
+					 FILE *warnF)
 /* Return a sorted list of minimal vcfRecords (coords, name, ref and alt alleles)
  * corresponding to each pasted variant. */
 {
 struct vcfRecord *recList = NULL;
 char *p = cloneString(variantIdText), *id;
 struct slName *rsIds = NULL, *unknownIds = NULL;
 subChar(p, ',', ' ');
-while ((id = nextWord(&p)) != NULL)
+struct lineFile *lf = lineFileOnString("rs# IDs", TRUE, p);
+char *line = NULL;
+while (lineFileNextReal(lf, &line))
+    {
+    while ((id = nextWord(&line)) != NULL)
         {
         if (regexMatchNoCase(id, rsIdRegex))
             slNameAddHead(&rsIds, id);
         else
             slNameAddHead(&unknownIds, id);
         }
+    }
+lineFileClose(&lf);
 if (unknownIds != NULL)
     {
     slReverse(&unknownIds);
     char *firstUnknownIds = firstNCommaSep(unknownIds, 5);
-    struct dyString *dy = dyStringCreate("%d variant identifiers are unrecognized, e.g. %s",
+    fprintf(warnF, "%d variant identifiers are unrecognized, e.g. %s.  ",
             slCount(unknownIds), firstUnknownIds);
-    slAddTail(pCommentList, slNameNew(dy->string));
     freeMem(firstUnknownIds);
-    dyStringFree(&dy);
     }
 struct vcfFile *vcff = vcfFileNew();
-rsIdsToVcfRecords(assembly, rsIds, vcff, &recList, pCommentList);
+rsIdsToVcfRecords(assembly, rsIds, vcff, &recList, warnF);
 slSort(&recList, vcfRecordCmp);
 slNameFreeList(&rsIds);
 slNameFreeList(&unknownIds);
 return recList;
 }
 
 static void adjustRangeForVariants(struct vcfRecord *recList, char **pChrom,
 				   uint *pStart, uint *pEnd)
 /* Given a *sorted* list of VCF records, look at the first and last records and if
  * they fall outside of the range, expand the range to include them. */
 {
 if (pChrom != NULL && *pChrom != NULL && recList != NULL)
     {
     // We have a non-empty sorted list of records, and search is not genome-wide;
     // look at first and last variants to see if they're not already included in search range.
@@ -2449,58 +2507,75 @@
 	// Variants span multiple chroms; force genome-wide search.
 	*pChrom = NULL;
 	*pStart = *pEnd = 0;
 	}
     else
 	{
 	// Variant(s) are on the same chrom as search range; check starts and ends.
 	if (*pStart > first->chromEnd)
 	    *pStart = first->chromEnd;
 	if (*pEnd < last->chromStart)
 	    *pEnd = last->chromStart;
 	}
     }
 }
 
+static void slAddFileContentsTail(struct slName **pSlNameList, char *filename)
+/* If filename exists and is nonempty, add its contents as a new slName at end of list. */
+{
+char *contents = NULL;
+size_t contentSize = 0;
+if (fileExists(filename))
+    readInGulp(filename, &contents, &contentSize);
+if (isNotEmpty(contents))
+    slAddTail(pSlNameList, slNameNew(contents));
+freeMem(contents);
+}
+
 static struct annoStreamer *makeVariantIdStreamer(struct annoAssembly *assembly, int maxOutRows,
 						  char **pChrom, uint *pStart, uint *pEnd,
 						  struct slName **pCommentList)
 /* Translate user's pasted/uploaded variant IDs into minimal VCF if possible.
  * Return a VCF streamer for those, and if the current search position is too narrow
  * to include all of the variants, widen it as necessary. */
 {
 // Hash variant text to get trash filename.  Use if exists, otherwise build it.
 char *variantIds = cartString(cart, hgvaVariantIds);
 char *varFile = md5TrashPath(assembly, variantIds);
+char warnFile[strlen(varFile)+6];
+safef(warnFile, sizeof(warnFile), "%s.warn", varFile);
 boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildVariantIds", FALSE);
 if (! fileExists(varFile) || forceRebuild)
     {
-    struct vcfRecord *varList = parseVariantIds(assembly, variantIds, pCommentList);
+    FILE *warnF = mustOpen(warnFile, "w");
+    struct vcfRecord *varList = parseVariantIds(assembly, variantIds, warnF);
 //#*** If no variants were recognized, we should probably show main page with a warning.
     adjustRangeForVariants(varList, pChrom, pStart, pEnd);
     FILE *f = mustOpen(varFile, "w");
     writeMinimalVcfHeader(f, assembly->name, NULL);
     struct vcfRecord *var;
     for (var = varList;  var != NULL;  var = var->next)
 	writeMinimalVcfRow(f, var);
     carefulClose(&f);
+    carefulClose(&warnF);
     }
+slAddFileContentsTail(pCommentList, warnFile);
 return annoStreamVcfNew(varFile, NULL, FALSE, assembly, maxOutRows);
 }
 
 static struct vcfRecord *parseHgvs(struct vcfFile *vcff, struct annoAssembly *assembly,
-                                   char *hgvsTerms, struct slName **pCommentList)
+                                   char *hgvsTerms, FILE *warnF)
 /* Return a sorted list of vcfRecords . */
 {
 struct vcfRecord *recList = NULL;
 struct slName *failedTerms = NULL;
 struct dyString *dyError = dyStringNew(0);
 struct lineFile *lf = lineFileOnString("user-provided HGVS terms", TRUE, cloneString(hgvsTerms));
 char *term = NULL;
 boolean notVersionedAcc = FALSE;
 while (lineFileNextReal(lf, &term))
     {
     eraseTrailingSpaces(term);
     dyStringClear(dyError);
     struct vcfRow *vcfRow = hgvsToVcfRow(assembly->name, term, FALSE, dyError);
     if (vcfRow)
         {
@@ -2516,72 +2591,74 @@
         row[6] = vcfRow->filter;
         row[7] = vcfRow->info;
         slAddHead(&recList, vcfRecordFromRow(vcff, row));
         }
     else
         {
         if (!regexMatch(term, "^[A-Z_0-9]+\\.[0-9]+[: ]"))
             notVersionedAcc = TRUE;
 	slNameAddHead(&failedTerms, dyStringContents(dyError));
         }
     }
 if (failedTerms != NULL)
     {
     slReverse(&failedTerms);
     char *firstUnknownIds = firstNCommaSep(failedTerms, 5);
-    struct dyString *dy = dyStringCreate("%d HGVS terms could not be parsed and/or mapped to %s, "
-                                         "e.g. %s",
+    fprintf(warnF, "%d HGVS terms could not be parsed and/or mapped to %s, e.g. %s.  ",
             slCount(failedTerms), assembly->name, firstUnknownIds);
     if (notVersionedAcc)
-        dyStringAppend(dy, ".  " HGVS_MUST_USE_ACC);
-    slAddTail(pCommentList, slNameNew(dy->string));
+        fputs(HGVS_MUST_USE_ACC, warnF);
     freeMem(firstUnknownIds);
-    dyStringFree(&dy);
     }
 slSort(&recList, vcfRecordCmp);
 slNameFreeList(&failedTerms);
 dyStringFree(&dyError);
 lineFileClose(&lf);
 return recList;
 }
 
 static struct annoStreamer *makeHgvsStreamer(struct annoAssembly *assembly, int maxOutRows,
                                              char **pChrom, uint *pStart, uint *pEnd,
                                              struct slName **pCommentList)
 /* Translate user's pasted/uploaded variant IDs into minimal VCF if possible.
  * Return a VCF streamer for those., and if the current search position is too narrow
  * to include all of the variants, widen it as necessary. */
 {
 // Hash HGVS input to get trash filename.  Use if exists, otherwise build it.
 char *hgvsTerms = cartString(cart, hgvaHgvs);
 char *varFile = md5TrashPath(assembly, hgvsTerms);
+char warnFile[strlen(varFile)+6];
+safef(warnFile, sizeof(warnFile), "%s.warn", varFile);
 boolean forceRebuild = cartUsualBoolean(cart, "hgva_rebuildHgvs", FALSE);
 if (! fileExists(varFile) || forceRebuild)
     {
     char *headerString = makeMinimalVcfHeader(assembly->name, HGVS_VCF_HEADER_DEFS);
     struct vcfFile *vcff = vcfFileFromHeader("hgVaiHgvs", headerString, VCF_IGNORE_ERRS);
-    struct vcfRecord *varList = parseHgvs(vcff, assembly, hgvsTerms, pCommentList);
+    FILE *warnF = mustOpen(warnFile, "w");
+    struct vcfRecord *varList = parseHgvs(vcff, assembly, hgvsTerms, warnF);
 //#*** If no HGVS terms could be mapped, we should probably show main page with a warning.
     adjustRangeForVariants(varList, pChrom, pStart, pEnd);
     FILE *f = mustOpen(varFile, "w");
     writeMinimalVcfHeader(f, assembly->name, HGVS_VCF_HEADER_DEFS);
     struct vcfRecord *var;
     for (var = varList;  var != NULL;  var = var->next)
 	writeMinimalVcfRow(f, var);
     carefulClose(&f);
+    carefulClose(&warnF);
     vcfFileFree(&vcff);
     }
+slAddFileContentsTail(pCommentList, warnFile);
 return annoStreamVcfNew(varFile, NULL, FALSE, assembly, maxOutRows);
 }
 
 static struct trackDb *getVariantTrackDb(char *variantTrack)
 /* Get a tdb for the user's selected variant track, or warn and return NULL
  * (main page will be displayed) */
 {
 struct trackDb *varTdb = tdbForTrack(database, variantTrack, &fullTrackList);
 if (varTdb == NULL)
     {
     if (isHubTrack(variantTrack))
 	warn("Can't find hub track '%s'", variantTrack);
     else
 	warn("Can't find tdb for variant track '%s'", variantTrack);
     }