2c0f74b4fbfefc533d27bf3ccdc18e4ecfdecae7
angie
  Fri Oct 5 17:07:18 2012 -0700
Follow-up to ce70491d:1. Move hgTracks left-base-of-indel trimming code up to vcf.[ch] for
sharing w/hgc.
2. Correct chromStart in hgTracks mapBox links to hgc when we have
trimmed a left base.
3. In hgc, abbreviate long sequences (e.g. 40kb deletion) and show
trimmed left base in parentheses for consistency with VCF file (and
sometimes INFO fields that use left-inclusive coords/seqs).
4. In pgSnpFromVcfRecord, don't truncate long alleles because hgTracks
and hgc do their own abbreviating.

diff --git src/hg/hgc/vcfClick.c src/hg/hgc/vcfClick.c
index 6ee70bc..753368d 100644
--- src/hg/hgc/vcfClick.c
+++ src/hg/hgc/vcfClick.c
@@ -1,351 +1,399 @@
 /* vcfTrack -- handlers for Variant Call Format data. */
 
 #ifdef USE_TABIX
 
 #include "common.h"
 #include "dystring.h"
 #include "errCatch.h"
 #include "hCommon.h"
 #include "hdb.h"
 #include "hgc.h"
 #include "htmshell.h"
 #include "jsHelper.h"
 #if (defined USE_TABIX && defined KNETFILE_HOOKS)
 #include "knetUdc.h"
 #include "udc.h"
 #endif//def USE_TABIX && KNETFILE_HOOKS
 #include "pgSnp.h"
 #include "trashDir.h"
 #include "vcf.h"
 #include "vcfUi.h"
 
 #define NA "<em>n/a</em>"
 
 static void printKeysWithDescriptions(struct vcfFile *vcff, int wordCount, char **words,
 				      struct vcfInfoDef *infoDefs)
 /* Given an array of keys, print out a list of values with
  * descriptions if descriptions are available. */
 {
 int i;
 for (i = 0;  i < wordCount; i++)
     {
     if (i > 0)
 	printf(", ");
     char *key = words[i];
     const struct vcfInfoDef *def = vcfInfoDefForKey(vcff, key);
     if (def != NULL)
 	printf("%s (%s)", htmlEncode(key), def->description);
     else
 	printf("%s", htmlEncode(key));
     }
 printf("<BR>\n");
 }
 
-static void vcfAltAlleleDetails(struct vcfRecord *rec)
+static void vcfAltAlleleDetails(struct vcfRecord *rec, char **displayAls)
 /* If VCF header specifies any symbolic alternate alleles, pull in descriptions. */
 {
 printf("<B>Alternate allele(s):</B> ");
 if (rec->alleleCount < 2 || sameString(rec->alleles[1], "."))
     {
     printf(NA"<BR>\n");
     return;
     }
 struct vcfFile *vcff = rec->file;
-printKeysWithDescriptions(vcff, rec->alleleCount-1, &(rec->alleles[1]), vcff->altDefs);
+printKeysWithDescriptions(vcff, rec->alleleCount-1, &(displayAls[1]), vcff->altDefs);
 }
 
 static void vcfQualDetails(struct vcfRecord *rec)
 /* If VCF header specifies a quality/confidence score (not "."), print it out. */
 {
 printf("<B>Quality/confidence score:</B> %s<BR>\n", sameString(rec->qual, ".") ? NA : rec->qual);
 }
 
 static void vcfFilterDetails(struct vcfRecord *rec)
 /* If VCF header specifies any filters, pull in descriptions. */
 {
 if (rec->filterCount == 0 || sameString(rec->filters[0], "."))
     printf("<B>Filter:</B> "NA"<BR>\n");
 else if (rec->filterCount == 1 && sameString(rec->filters[0], "PASS"))
     printf("<B>Filter:</B> PASS<BR>\n");
 else
     {
     printf("<B>Filter failures:</B> ");
     printf("<font style='font-weight: bold; color: #FF0000;'>\n");
     struct vcfFile *vcff = rec->file;
     printKeysWithDescriptions(vcff, rec->filterCount, rec->filters, vcff->filterDefs);
     printf("</font>\n");
     }
 }
 
 static void vcfInfoDetails(struct vcfRecord *rec)
 /* Expand info keys to descriptions, then print out keys and values. */
 {
 if (rec->infoCount == 0)
     return;
 struct vcfFile *vcff = rec->file;
 puts("<B>INFO column annotations:</B><BR>");
 puts("<TABLE border=0 cellspacing=0 cellpadding=0>");
 int i;
 for (i = 0;  i < rec->infoCount;  i++)
     {
     struct vcfInfoElement *el = &(rec->infoElements[i]);
     const struct vcfInfoDef *def = vcfInfoDefForKey(vcff, el->key);
     if (def == NULL)
 	continue;
     printf("<TR><TD align=\"right\"><B>%s:</B></TD><TD>&nbsp;", el->key);
     int j;
     enum vcfInfoType type = def->type;
     if (type == vcfInfoFlag && el->count == 0)
 	printf("Yes"); // no values, so we can't call vcfPrintDatum...
     // However, if this is older VCF, type vcfInfoFlag might have a value.
     for (j = 0;  j < el->count;  j++)
 	{
 	if (j > 0)
 	    printf(", ");
 	if (el->missingData[j])
 	    printf(".");
 	else
 	    vcfPrintDatum(stdout, el->values[j], type);
 	}
     if (def != NULL)
 	printf("</TD><TD>&nbsp;%s", def->description);
     else
 	printf("</TD><TD>");
     printf("</TD></TR>\n");
     }
 puts("</TABLE>");
 }
 
+static void vcfGenotypeTable(struct vcfRecord *rec, char *track, char **displayAls)
+/* Put the table containing details about each genotype into a collapsible section. */
+{
+static struct dyString *tmp1 = NULL;
+if (tmp1 == NULL)
+    tmp1 = dyStringNew(0);
+jsBeginCollapsibleSection(cart, track, "genotypes", "Detailed genotypes", FALSE);
+dyStringClear(tmp1);
+dyStringAppend(tmp1, rec->format);
+struct vcfFile *vcff = rec->file;
+enum vcfInfoType formatTypes[256];
+char *formatKeys[256];
+int formatCount = chopString(tmp1->string, ":", formatKeys, ArraySize(formatKeys));
+puts("<B>Genotype info key:</B><BR>");
+int i;
+for (i = 0;  i < formatCount;  i++)
+    {
+    if (sameString(formatKeys[i], vcfGtGenotype))
+	continue;
+    const struct vcfInfoDef *def = vcfInfoDefForGtKey(vcff, formatKeys[i]);
+    char *desc = def ? def->description : "<em>not described in VCF header</em>";
+    printf("&nbsp;&nbsp;<B>%s:</B> %s<BR>\n", formatKeys[i], desc);
+    formatTypes[i] = def->type;
+    }
+hTableStart();
+puts("<TR><TH>Sample ID</TH><TH>Genotype</TH><TH>Phased?</TH>");
+for (i = 0;  i < formatCount;  i++)
+    {
+    if (sameString(formatKeys[i], vcfGtGenotype))
+	continue;
+    printf("<TH>%s</TH>", formatKeys[i]);
+    }
+puts("</TR>\n");
+for (i = 0;  i < vcff->genotypeCount;  i++)
+    {
+    struct vcfGenotype *gt = &(rec->genotypes[i]);
+    char *hapA = ".", *hapB = ".";
+    if (gt->hapIxA >= 0)
+	hapA = displayAls[(unsigned char)gt->hapIxA];
+    if (gt->isHaploid)
+	hapB = "";
+    else if (gt->hapIxB >= 0)
+	hapB = displayAls[(unsigned char)gt->hapIxB];
+    char sep = gt->isHaploid ? ' ' : gt->isPhased ? '|' : '/';
+    char *phasing = gt->isHaploid ? NA : gt->isPhased ? "Y" : "n";
+    printf("<TR><TD>%s</TD><TD>%s%c%s</TD><TD>%s</TD>", vcff->genotypeIds[i],
+	   hapA, sep, hapB, phasing);
+    int j;
+    for (j = 0;  j < gt->infoCount;  j++)
+	{
+	if (sameString(formatKeys[j], vcfGtGenotype))
+	    continue;
+	printf("<TD>");
+	struct vcfInfoElement *el = &(gt->infoElements[j]);
+	int k;
+	for (k = 0;  k < el->count;  k++)
+	    {
+	    if (k > 0)
+		printf(", ");
+	    if (el->missingData[k])
+		printf(".");
+	    else
+		vcfPrintDatum(stdout, el->values[k], formatTypes[j]);
+	    }
+	printf("</TD>");
+	}
+    puts("</TR>");
+    }
+hTableEnd();
+jsEndCollapsibleSection();
+}
+
 static void ignoreEm(char *format, va_list args)
 /* Ignore warnings from genotype parsing -- when there's one, there
  * are usually hundreds more just like it. */
 {
 }
 
-static void vcfGenotypesDetails(struct vcfRecord *rec, char *track)
-/* Print genotypes in some kind of table... */
+static void vcfGenotypesDetails(struct vcfRecord *rec, char *track, char **displayAls)
+/* Print summary of allele and genotype frequency, plus collapsible section
+ * with table of genotype details. */
 {
 struct vcfFile *vcff = rec->file;
 if (vcff->genotypeCount == 0)
     return;
-static struct dyString *tmp1 = NULL;
-if (tmp1 == NULL)
-    tmp1 = dyStringNew(0);
+// Wrapper table for collapsible section:
+puts("<TABLE>");
 pushWarnHandler(ignoreEm);
 vcfParseGenotypes(rec);
 popWarnHandler();
 // Tally genotypes and alleles for summary:
 int refs = 0, alts = 0, unks = 0;
 int refRefs = 0, refAlts = 0, altAlts = 0, gtUnk = 0, gtOther = 0, phasedGts = 0;
 int i;
 for (i = 0;  i < vcff->genotypeCount;  i++)
     {
     struct vcfGenotype *gt = &(rec->genotypes[i]);
     if (gt->isPhased)
 	phasedGts++;
     if (gt->hapIxA == 0)
 	refs++;
     else if (gt->hapIxA > 0)
 	alts++;
     else
 	unks++;
     if (!gt->isHaploid)
 	{
 	if (gt->hapIxB == 0)
 	    refs++;
 	else if (gt->hapIxB > 0)
 	    alts++;
 	else
 	    unks++;
 	if (gt->hapIxA == 0 && gt->hapIxB == 0)
 	    refRefs++;
 	else if (gt->hapIxA == 1 && gt->hapIxB == 1)
 	    altAlts++;
 	else if ((gt->hapIxA == 1 && gt->hapIxB == 0) ||
 		 (gt->hapIxA == 0 && gt->hapIxB == 1))
 	    refAlts++;
 	else if (gt->hapIxA < 0 || gt->hapIxB < 0)
 	    gtUnk++;
 	else
 	    gtOther++;
 	}
     }
 printf("<B>Genotype count:</B> %d", vcff->genotypeCount);
 if (differentString(seqName, "chrY"))
     printf(" (%d phased)", phasedGts);
 else
     printf(" (haploid)");
 puts("<BR>");
 int totalAlleles = refs + alts + unks;
 double refAf = (double)refs/totalAlleles;
 double altAf = (double)alts/totalAlleles;
 printf("<B>Alleles:</B> %s: %d (%.3f%%); %s: %d (%.3f%%)",
-       rec->alleles[0], refs, 100*refAf,  rec->alleles[1], alts, 100*altAf);
+       displayAls[0], refs, 100*refAf, displayAls[1], alts, 100*altAf);
 if (unks > 0)
     printf("; unknown: %d (%.3f%%)", unks, 100 * (double)unks/totalAlleles);
 puts("<BR>");
 // Should be a better way to detect haploid chromosomes than comparison with "chrY":
 if (vcff->genotypeCount > 1 && differentString(seqName, "chrY"))
     {
     printf("<B>Genotypes:</B> %s/%s: %d (%.3f%%); %s/%s: %d (%.3f%%); %s/%s: %d (%.3f%%)",
-	   rec->alleles[0], rec->alleles[0], refRefs, 100*(double)refRefs/vcff->genotypeCount,
-	   rec->alleles[0], rec->alleles[1], refAlts, 100*(double)refAlts/vcff->genotypeCount,
-	   rec->alleles[1], rec->alleles[1], altAlts, 100*(double)altAlts/vcff->genotypeCount);
+	   displayAls[0], displayAls[0], refRefs, 100*(double)refRefs/vcff->genotypeCount,
+	   displayAls[0], displayAls[1], refAlts, 100*(double)refAlts/vcff->genotypeCount,
+	   displayAls[1], displayAls[1], altAlts, 100*(double)altAlts/vcff->genotypeCount);
     if (gtUnk > 0)
 	printf("; unknown: %d (%.3f%%)", gtUnk, 100*(double)gtUnk/vcff->genotypeCount);
     if (gtOther > 0)
 	printf("; other: %d (%.3f%%)", gtOther, 100*(double)gtOther/vcff->genotypeCount);
     printf("<BR>\n");
     if (rec->alleleCount == 2)
 	printf("<B>Hardy-Weinberg equilibrium:</B> "
 	       "P(%s/%s) = %.3f%%; P(%s/%s) = %.3f%%; P(%s/%s) = %.3f%%<BR>",
-	       rec->alleles[0], rec->alleles[0], 100*refAf*refAf,
-	       rec->alleles[0], rec->alleles[1], 100*2*refAf*altAf,
-	       rec->alleles[1], rec->alleles[1], 100*altAf*altAf);
+	       displayAls[0], displayAls[0], 100*refAf*refAf,
+	       displayAls[0], displayAls[1], 100*2*refAf*altAf,
+	       displayAls[1], displayAls[1], 100*altAf*altAf);
     }
-jsBeginCollapsibleSection(cart, track, "genotypes", "Detailed genotypes", FALSE);
-dyStringClear(tmp1);
-dyStringAppend(tmp1, rec->format);
-enum vcfInfoType formatTypes[256];
-char *formatKeys[256];
-int formatCount = chopString(tmp1->string, ":", formatKeys, ArraySize(formatKeys));
-puts("<B>Genotype info key:</B><BR>");
-for (i = 0;  i < formatCount;  i++)
-    {
-    if (sameString(formatKeys[i], vcfGtGenotype))
-	continue;
-    const struct vcfInfoDef *def = vcfInfoDefForGtKey(vcff, formatKeys[i]);
-    char *desc = def ? def->description : "<em>not described in VCF header</em>";
-    printf("&nbsp;&nbsp;<B>%s:</B> %s<BR>\n", formatKeys[i], desc);
-    formatTypes[i] = def->type;
-    }
-hTableStart();
-puts("<TR><TH>Sample ID</TH><TH>Genotype</TH><TH>Phased?</TH>");
-for (i = 0;  i < formatCount;  i++)
-    {
-    if (sameString(formatKeys[i], vcfGtGenotype))
-	continue;
-    printf("<TH>%s</TH>", formatKeys[i]);
-    }
-puts("</TR>\n");
-for (i = 0;  i < vcff->genotypeCount;  i++)
-    {
-    struct vcfGenotype *gt = &(rec->genotypes[i]);
-    char *hapA = ".", *hapB = ".";
-    if (gt->hapIxA >= 0)
-	hapA = rec->alleles[(unsigned char)gt->hapIxA];
-    if (gt->isHaploid)
-	hapB = "";
-    else if (gt->hapIxB >= 0)
-	hapB = rec->alleles[(unsigned char)gt->hapIxB];
-    char sep = gt->isHaploid ? ' ' : gt->isPhased ? '|' : '/';
-    char *phasing = gt->isHaploid ? NA : gt->isPhased ? "Y" : "n";
-    printf("<TR><TD>%s</TD><TD>%s%c%s</TD><TD>%s</TD>", vcff->genotypeIds[i],
-	   hapA, sep, hapB, phasing);
-    int j;
-    for (j = 0;  j < gt->infoCount;  j++)
-	{
-	if (sameString(formatKeys[j], vcfGtGenotype))
-	    continue;
-	printf("<TD>");
-	struct vcfInfoElement *el = &(gt->infoElements[j]);
-	int k;
-	for (k = 0;  k < el->count;  k++)
-	    {
-	    if (k > 0)
-		printf(", ");
-	    if (el->missingData[k])
-		printf(".");
-	    else
-		vcfPrintDatum(stdout, el->values[k], formatTypes[j]);
-	    }
-	printf("</TD>");
-	}
-    puts("</TR>");
-    }
-hTableEnd();
-jsEndCollapsibleSection();
+vcfGenotypeTable(rec, track, displayAls);
+puts("</TABLE>");
 }
 
 static void pgSnpCodingDetail(struct vcfRecord *rec)
 /* Translate rec into pgSnp (with proper chrom name) and call Belinda's
  * coding effect predictor from pgSnp details. */
 {
 char *genePredTable = "knownGene";
 if (hTableExists(database, genePredTable))
     {
     struct pgSnp *pgs = pgSnpFromVcfRecord(rec);
     if (!sameString(rec->chrom, seqName))
 	// rec->chrom might be missing "chr" prefix:
 	pgs->chrom = seqName;
     printSeqCodDisplay(database, pgs, genePredTable);
     }
 }
 
+static void abbreviateLongSeq(char *seqIn, int endLength, struct dyString *dy)
+/* If seqIn is longer than 2*endLength plus abbreviation fudge, abbreviate it
+ * to its first endLength bases, ellipsis that says how many bases are skipped,
+ * and its last endLength bases; add result to dy. */
+{
+int threshold = 2*endLength + 30;
+int seqInLen = strlen(seqIn);
+if (seqInLen > threshold)
+    {
+    dyStringAppendN(dy, seqIn, endLength);
+    int skippedLen = seqInLen-2*endLength;
+    dyStringPrintf(dy, "...&lt;%d bases&gt;...%s",
+		   skippedLen, seqIn+seqInLen-endLength);
+    }
+else
+    dyStringAppend(dy, seqIn);
+}
+
+static void makeDisplayAlleles(struct vcfRecord *rec, boolean showLeftBase, char leftBase,
+			       int endLength, char **displayAls)
+/* If necessary, show the left base that we trimmed and/or abbreviate long sequences. */
+{
+int i;
+for (i = 0;  i < rec->alleleCount; i++)
+    {
+    struct dyString *dy = dyStringNew(128);
+    if (showLeftBase)
+	dyStringPrintf(dy, "(%c)", leftBase);
+    abbreviateLongSeq(rec->alleles[i], endLength, dy);
+    displayAls[i] = dy->string; // leak some mem
+    }
+}
+
 static void vcfRecordDetails(struct trackDb *tdb, struct vcfRecord *rec)
 /* Display the contents of a single line of VCF, assumed to be from seqName
  * (using seqName instead of rec->chrom because rec->chrom might lack "chr"). */
 {
 printf("<B>Name:</B> %s<BR>\n", rec->name);
 printCustomUrl(tdb, rec->name, TRUE);
 static char *formName = "vcfCfgHapCenter";
 printf("<FORM NAME=\"%s\" ACTION=\"%s\">\n", formName, hgTracksName());
 cartSaveSession(cart);
 vcfCfgHaplotypeCenter(cart, tdb, tdb->track, FALSE, rec->file, rec->name,
 		      seqName, rec->chromStart, formName);
 printf("</FORM>\n");
+char leftBase = rec->alleles[0][0];
+unsigned int vcfStart = vcfRecordTrimIndelLeftBase(rec);
+boolean showLeftBase = (rec->chromStart == vcfStart+1);
+char *displayAls[rec->alleleCount];
+makeDisplayAlleles(rec, showLeftBase, leftBase, 20, displayAls);
 printPosOnChrom(seqName, rec->chromStart, rec->chromEnd, NULL, FALSE, rec->name);
-printf("<B>Reference allele:</B> %s<BR>\n", rec->alleles[0]);
-vcfAltAlleleDetails(rec);
+printf("<B>Reference allele:</B> %s<BR>\n", displayAls[0]);
+vcfAltAlleleDetails(rec, displayAls);
 vcfQualDetails(rec);
 vcfFilterDetails(rec);
 vcfInfoDetails(rec);
 pgSnpCodingDetail(rec);
-// Wrapper table for collapsible section:
-puts("<TABLE>");
-vcfGenotypesDetails(rec, tdb->track);
-puts("</TABLE>");
+makeDisplayAlleles(rec, showLeftBase, leftBase, 5, displayAls);
+vcfGenotypesDetails(rec, tdb->track, displayAls);
 }
 
 void doVcfTabixDetails(struct trackDb *tdb, char *item)
 /* Show details of an alignment from a VCF file compressed and indexed by tabix. */
 {
 #if (defined USE_TABIX && defined KNETFILE_HOOKS)
 knetUdcInstall();
 if (udcCacheTimeout() < 300)
     udcSetCacheTimeout(300);
 #endif//def USE_TABIX && KNETFILE_HOOKS
 int start = cartInt(cart, "o");
 int end = cartInt(cart, "t");
 struct sqlConnection *conn = hAllocConnTrack(database, tdb);
 // TODO: will need to handle per-chrom files like bam, maybe fold bamFileNameFromTable into this::
 char *fileOrUrl = bbiNameFromSettingOrTableChrom(tdb, conn, tdb->table, seqName);
 hFreeConn(&conn);
 int vcfMaxErr = -1;
 struct vcfFile *vcff = NULL;
 /* protect against temporary network error */
 struct errCatch *errCatch = errCatchNew();
 if (errCatchStart(errCatch))
     {
     vcff = vcfTabixFileMayOpen(fileOrUrl, seqName, start, end, vcfMaxErr, -1);
     }
 errCatchEnd(errCatch);
 if (errCatch->gotError)
     {
     if (isNotEmpty(errCatch->message->string))
 	warn("%s", errCatch->message->string);
     }
 errCatchFree(&errCatch);
 if (vcff != NULL)
     {
     struct vcfRecord *rec;
     for (rec = vcff->records;  rec != NULL;  rec = rec->next)
 	if (rec->chromStart == start && rec->chromEnd == end) // in pgSnp mode, don't get name
 	    vcfRecordDetails(tdb, rec);
     }
 else
     printf("Sorry, unable to open %s<BR>\n", fileOrUrl);
 }
 
 
 #endif // no USE_TABIX