f78d6e76501646c8b8cc966103bafca896322509
angie
  Mon Oct 7 13:54:42 2013 -0700
Work in progress for #11460 (paste/upload variant input options...):adding an option for user to paste/upload variant identifiers which
will be translated into a sorted list of vcfRecords.

Currently we recognize only rs# IDs.  I was considering adding
dbVar IDs, but those could come from multiple sources (DGV, ClinVar,
ISCA) so I'm not sure.

Treating all symbolic/named alleles as deletions... non-ideal,
but fortunately those are a small minority in dbSNP.

Next: recognize HGVS IDs.

The grander vision of #11460 includes accepting VEP input format
and VCF, but I think those should be new SELECT options so we don't
get into quagmire of guessing format.

diff --git src/lib/vcf.c src/lib/vcf.c
index d017d1e..59c941b 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -398,31 +398,31 @@
 expectColumnName(vcff, "FILTER", words, 6);
 expectColumnName(vcff, "INFO", words, 7);
 if (wordCount > 8)
     {
     expectColumnName(vcff, "FORMAT", words, 8);
     if (wordCount < 10)
 	vcfFileErr(vcff, "FORMAT column is given, but no sample IDs for genotype columns...?");
     vcff->genotypeCount = (wordCount - 9);
     vcff->genotypeIds = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *));
     int i;
     for (i = 9;  i < wordCount;  i++)
 	vcff->genotypeIds[i-9] = vcfFileCloneStr(vcff, words[i]);
     }
 }
 
-static struct vcfFile *vcfFileNew()
+struct vcfFile *vcfFileNew()
 /* Return a new, empty vcfFile object. */
 {
 struct vcfFile *vcff = NULL;
 AllocVar(vcff);
 vcff->pool = hashNew(0);
 vcff->reusePool = NULL;  // Must explicitly request a separate record pool
 return vcff;
 }
 
 void vcfFileMakeReusePool(struct vcfFile *vcff, int initialSize)
 // Creates a separate memory pool for records.  Establishing this pool allows
 // using vcfFileFlushRecords to abandon previously read records and free
 // the associated memory. Very useful when reading an entire file in batches.
 {
 assert(vcff->reusePool == NULL); // don't duplicate this
@@ -688,58 +688,75 @@
 {
 char *words[VCF_MAX_COLUMNS];
 int wordCount;
 if ((wordCount = lineFileChop(vcff->lf, words)) <= 0)
     return NULL;
 int expected = 8;
 if (vcff->genotypeCount > 0)
     expected = 9 + vcff->genotypeCount;
 lineFileExpectWords(vcff->lf, expected, wordCount);
 return vcfRecordFromRow(vcff, words);
 }
 
 unsigned int vcfRecordTrimIndelLeftBase(struct vcfRecord *rec)
 /* For indels, VCF includes the left neighboring base; for example, if the alleles are
  * AA/- following a G base, then the VCF record will start one base to the left and have
- * "GAA" and "G" as the alleles.  That is not nice for display for two reasons:
+ * "GAA" and "G" as the alleles.  Also, if any alt allele is symbolic (e.g. <DEL>) then
+ * the ref allele must have a padding base.
+ * That is not nice for display for two reasons:
  * 1. Indels appear one base wider than their dbSNP entries.
  * 2. In pgSnp display mode, the two alleles are always the same color.
  * However, for hgTracks' mapBox we need the correct chromStart for identifying the
  * record in hgc -- so return the original chromStart. */
 {
 unsigned int chromStartOrig = rec->chromStart;
 struct vcfFile *vcff = rec->file;
 if (rec->alleleCount > 1)
     {
-    boolean allSameFirstBase = TRUE;
-    char firstBase = rec->alleles[0][0];
+    boolean needPaddingBase = TRUE;
+    char firstBase = '\0';
+    if (isAllNt(rec->alleles[0], strlen(rec->alleles[0])))
+	firstBase = rec->alleles[0][0];
     int i;
     for (i = 1;  i < rec->alleleCount;  i++)
+	{
+	if (isAllNt(rec->alleles[i], strlen(rec->alleles[i])))
+	    {
+	    if (firstBase == '\0')
+		firstBase = rec->alleles[i][0];
 	    if (rec->alleles[i][0] != firstBase)
+		// Different first base implies unpadded alleles.
+		needPaddingBase = FALSE;
+	    }
+	else
 	    {
-	    allSameFirstBase = FALSE;
+	    // Symbolic ALT allele: REF must have the padding base.
+	    needPaddingBase = TRUE;
 	    break;
 	    }
-    if (allSameFirstBase)
+	}
+    if (needPaddingBase)
 	{
 	rec->chromStart++;
 	for (i = 0;  i < rec->alleleCount;  i++)
 	    {
 	    if (rec->alleles[i][1] == '\0')
 		rec->alleles[i] = vcfFilePooledStr(vcff, "-");
-	    else
+	    else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i])))
 		rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]+1);
+	    else // don't trim first character of symbolic allele
+		rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]);
 	    }
 	}
     }
 return chromStartOrig;
 }
 
 static struct vcfRecord *vcfParseData(struct vcfFile *vcff, int maxRecords)
 // Given a vcfFile into which the header has been parsed, and whose
 // lineFile is positioned at the beginning of a data row, parse and
 // return all data rows from lineFile.
 {
 if (vcff == NULL)
     return NULL;
 int recCount = 0;
 struct vcfRecord *records = NULL;