de29d745d32fb04a5187fb5998cc94bbb9bf41a4
angie
  Wed Aug 9 13:39:59 2017 -0700
Add optional HGVS output to annoGratorGpVar and hgVai. Since annoGratorGpVar is genePred-based, it can't yet take advantage of variantProjector's full PSL+CDS+sequence support, so when transcripts don't align cleanly to genome, HGVS c./n./p. output may be incorrect. refs #19968

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index c76b795..77fc0eb 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -1,25 +1,26 @@
 /* hgHgvs.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */
 /* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */
 
 /* Copyright (C) 2016 The Regents of the University of California
  * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "hgHgvs.h"
 #include "chromInfo.h"
 #include "genbank.h"
 #include "hdb.h"
+#include "indelShift.h"
 #include "pslTransMap.h"
 #include "regexHelper.h"
 #include "trackHub.h"
 
 void hgvsVariantFree(struct hgvsVariant **pHgvs)
 // Free *pHgvs and its contents, and set *pHgvs to NULL.
 {
 if (pHgvs && *pHgvs)
     {
     struct hgvsVariant *hgvs = *pHgvs;
     freez(&hgvs->seqAcc);
     freez(&hgvs->seqGeneSymbol);
     freez(&hgvs->changes);
     freez(pHgvs);
     }
@@ -474,96 +475,85 @@
 
 struct hgvsVariant *hgvsParseTerm(char *term)
 /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL.
  * This does not check validity of accessions, coordinates or alleles. */
 {
 struct hgvsVariant *hgvs = hgvsParseCNDotPos(term);
 if (hgvs == NULL)
     hgvs = hgvsParsePDotSubst(term);
 if (hgvs == NULL)
     hgvs = hgvsParsePDotRange(term);
 if (hgvs == NULL)
     hgvs = hgvsParseGDotPos(term);
 return hgvs;
 }
 
-static boolean dbHasNcbiRefSeq(char *db)
-/* Test whether NCBI's RefSeq alignments are available in db. */
-{
-// hTableExists() caches results so this shouldn't make for loads of new SQL queries if called
-// more than once.
-return (hTableExists(db, "ncbiRefSeq") && hTableExists(db, "ncbiRefSeqPsl") &&
-        hTableExists(db, "ncbiRefSeqCds") && hTableExists(db, "ncbiRefSeqLink") &&
-        hTableExists(db, "ncbiRefSeqPepTable") &&
-        hTableExists(db, "seqNcbiRefSeq") && hTableExists(db, "extNcbiRefSeq"));
-}
-
 static char *npForGeneSymbol(char *db, char *geneSymbol)
 /* Given a gene symbol, look up and return its NP_ accession; if not found return NULL. */
 {
 char query[2048];
-if (dbHasNcbiRefSeq(db))
+if (hDbHasNcbiRefSeq(db))
     {
     sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where name = '%s' "
              "and protAcc != 'n/a' and protAcc != '' "
              "order by length(protAcc), protAcc",
              geneSymbol);
     }
 else if (hTableExists(db, "refGene"))
     {
     // Join with refGene to make sure it's an NP for this species.
     sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r "
              "where l.name = '%s' and l.mrnaAcc = r.name "
              "and l.protAcc != '' order by length(l.protAcc), l.protAcc"
              , refLinkTable, geneSymbol);
     }
 else
     return NULL;
 struct sqlConnection *conn = hAllocConn(db);
 char *npAcc = sqlQuickString(conn, query);
 hFreeConn(&conn);
 return npAcc;
 }
 
 static char *nmForGeneSymbol(char *db, char *geneSymbol)
 /* Given a gene symbol, look up and return its NM_ accession; if not found return NULL. */
 {
 if (trackHubDatabase(db))
     return NULL;
 char query[2048];
 char *geneTable = NULL;
-if (dbHasNcbiRefSeq(db))
+if (hDbHasNcbiRefSeq(db))
     geneTable = "ncbiRefSeq";
 else if (hTableExists(db, "refGene"))
     geneTable = "refGene";
 if (geneTable == NULL)
     return NULL;
 sqlSafef(query, sizeof(query), "select name from %s where name2 = '%s' "
          "order by length(name), name", geneTable, geneSymbol);
 struct sqlConnection *conn = hAllocConn(db);
 char *nmAcc = sqlQuickString(conn, query);
 hFreeConn(&conn);
 return nmAcc;
 }
 
 static char *npForNm(char *db, char *nmAcc)
 /* Given an NM_ accession, look up and return its NP_ accession; if not found return NULL. */
 {
 if (trackHubDatabase(db))
     return NULL;
 char query[2048];
-if (dbHasNcbiRefSeq(db))
+if (hDbHasNcbiRefSeq(db))
     {
     // ncbiRefSeq tables use versioned NM_ accs, but the user might have passed in a
     // versionless NM_, so adjust query accordingly:
     if (strchr(nmAcc, '.'))
         sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id = '%s'", nmAcc);
     else
         sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id like '%s.%%'",
                  nmAcc);
     }
 else if (hTableExists(db, "refGene"))
     {
     // Trim .version if present since our genbank tables don't use versioned names.
     char *trimmed = cloneFirstWordByDelimiter(nmAcc, '.');
     sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r "
              "where r.name = '%s' and l.mrnaAcc = r.name "
@@ -584,31 +574,31 @@
     return NULL;
 char *seq = NULL;
 if (startsWith("LRG_", acc))
     {
     if (hTableExists(db, "lrgPep"))
         {
         char query[2048];
         sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         }
     }
 else
     {
-    if (dbHasNcbiRefSeq(db))
+    if (hDbHasNcbiRefSeq(db))
         {
         char query[2048];
         sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTable "
                  "where name = '%s'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         }
     else
         {
         aaSeq *aaSeq = hGenBankGetPep(db, acc, NULL);
         if (aaSeq)
             seq = aaSeq->dna;
         }
     }
@@ -807,85 +797,71 @@
         }
     // Since we found hgvs->seqAcc, split it into versionless and version parts.
     if (retFoundAcc)
         *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
     if (retFoundVersion)
         {
         char *p = strchr(hgvs->seqAcc, '.');
         if (p)
             *retFoundVersion = atoi(p+1);
         }
     // Don't free chromInfo, it's cached!
     }
 return coordsOK;
 }
 
-static char *dnaSeqCannibalize(struct dnaSeq **pDnaSeq)
-/* Return the already-allocated dna string and free the dnaSeq container. */
-{
-char *seq = NULL;
-if (pDnaSeq && *pDnaSeq)
-    {
-    struct dnaSeq *dnaSeq = *pDnaSeq;
-    seq = dnaSeq->dna;
-    dnaSeq->dna = NULL;
-    dnaSeqFree(pDnaSeq);
-    }
-return seq;
-}
-
 static char *getCdnaSeq(char *db, char *acc)
 /* Return cdna sequence for acc, or NULL if not found. */
 {
 if (trackHubDatabase(db))
     return NULL;
 char *seq = NULL;
 if (startsWith("LRG_", acc))
     {
     if (hTableExists(db, "lrgCdna"))
         {
         char query[2048];
         sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         }
     }
 else
     {
     struct dnaSeq *cdnaSeq = NULL;
-    if (dbHasNcbiRefSeq(db))
+    if (hDbHasNcbiRefSeq(db))
         cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeq", "extNcbiRefSeq");
     else
         cdnaSeq = hGenBankGetMrna(db, acc, NULL);
     if (cdnaSeq)
         seq = dnaSeqCannibalize(&cdnaSeq);
     }
 return seq;
 }
 
 static boolean getCds(char *db, char *acc, struct genbankCds *retCds)
 /* Get the CDS info for genbank or LRG acc; return FALSE if not found or not applicable. */
 {
 if (trackHubDatabase(db))
     return FALSE;
 boolean gotCds = FALSE;
 char query[1024];
 if (startsWith("LRG_", acc))
     sqlSafef(query, sizeof(query),
              "select cds from lrgCds where id = '%s'", acc);
-else if (dbHasNcbiRefSeq(db) &&
+else if (hDbHasNcbiRefSeq(db) &&
          // This is a hack to allow us to fall back on refSeqAli if ncbiRefSeqPsl is incomplete:
          strchr(acc, '.'))
     sqlSafef(query, sizeof(query),
              "select cds from ncbiRefSeqCds where id = '%s'", acc);
 else
     sqlSafef(query, sizeof(query),
              "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND "
              "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc);
 struct sqlConnection *conn = hAllocConn(db);
 char cdsBuf[2048];
 char *cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf));
 hFreeConn(&conn);
 if (isNotEmpty(cdsStr))
     gotCds = (genbankCdsParse(cdsStr, retCds) &&
               retCds->startComplete && retCds->start != retCds->end);
@@ -966,31 +942,31 @@
 /* LRG accessions don't have versions, so just return the same acc and set *retFoundVersion to 0.
  * The user may give us a RefSeq accession with or without a .version.
  * If ncbiRefSeq tables are present, return acc with version, looking up version if necessary.
  * If we look it up and can't find it, this returns NULL.
  * If instead we're using genbank tables, return acc with no version.
  * That ensures that acc will be found in our local tables. */
 {
 if (trackHubDatabase(db))
     return NULL;
 char *normalizedAcc = NULL;
 int foundVersion = 0;
 if (startsWith("LRG_", acc))
     {
     normalizedAcc = cloneString(acc);
     }
-else if (dbHasNcbiRefSeq(db))
+else if (hDbHasNcbiRefSeq(db))
     {
     // ncbiRefSeq tables need versioned accessions.
     if (strchr(acc, '.'))
         normalizedAcc = cloneString(acc);
     else
         {
         char query[2048];
         sqlSafef(query, sizeof(query), "select name from ncbiRefSeq where name like '%s.%%'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         normalizedAcc = sqlQuickString(conn, query);
         hFreeConn(&conn);
         }
     if (isNotEmpty(normalizedAcc))
         {
         char *p = strchr(normalizedAcc, '.');
@@ -1351,31 +1327,31 @@
 sqlFreeResult(&sr);
 hFreeConn(&conn);
 return mappedToGenome;
 }
 
 static char *pslTableForAcc(char *db, char *acc)
 /* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL table where
  * acc should be found (table may or may not exist).  Don't free the returned string. */
 {
 char *pslTable = NULL;
 if (startsWith("LRG_", acc))
     pslTable = "lrgTranscriptAli";
 else if (startsWith("NM_", acc) || startsWith("NR_", acc))
     {
     // Use NCBI's alignments if they are available
-    if (dbHasNcbiRefSeq(db))
+    if (hDbHasNcbiRefSeq(db))
         pslTable = "ncbiRefSeqPsl";
     else
         pslTable = "refSeqAli";
     }
 return pslTable;
 }
 
 #define limitToRange(val, min, max) { if (val < min) { val = min; }  \
                                       if (val > max) { val = max; } }
 
 static struct bed *pslAndFriendsToRegion(struct psl *psl, struct hgvsVariant *hgvs,
                                           int upstream, int downstream)
 /* If hgvs has any intron offsets and/or upstream/downstream offsets, add those to the
  * anchor coords in psl and return the variant's region of the genome. */
 {
@@ -1458,31 +1434,31 @@
     }
 return region;
 }
 
 static struct bed *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
 /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
 struct bed *region = NULL;
 char *acc = normalizeVersion(db, hgvs->seqAcc, NULL);
 if (acc && startsWith("NP_", acc))
     {
     // Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome.
     struct sqlConnection *conn = hAllocConn(db);
     char query[2048];
-    if (dbHasNcbiRefSeq(db))
+    if (hDbHasNcbiRefSeq(db))
         sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'",
                  acc);
     else if (hTableExists(db, "refGene"))
         sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r "
                  "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc);
     else
         return NULL;
     char *nmAcc = sqlQuickString(conn, query);
     hFreeConn(&conn);
     if (nmAcc)
         {
         struct hgvsVariant cDot;
         zeroBytes(&cDot, sizeof(cDot));
         cDot.type = hgvstCoding;
         cDot.seqAcc = nmAcc;
@@ -1800,159 +1776,30 @@
         mapping->chromEnd = mapping->chromStart;
         }
     else
         {
         // '+' strand and extra seq after dup ==> insertion at chromEnd, update leftBase & offset
         *pLeftShiftOffset += refLen;
         *pLeftBase = genomicRef[refLen-1];
         mapping->chromStart = mapping->chromEnd;
         }
     memmove(hgvsSeqAlt, hgvsSeqAlt+refLen, altLen-refLen+1);
     genomicRef[0] = genomicRefFwd[0] = hgvsSeqRef[0] = '\0';
     }
 return dupToIns;
 }
 
-static void updateSeq(struct dnaSeq *seq, char *db, char *chrom, int start, int end)
-/* Free and reallocate seq's contents with the new sequence range. */
-// BEWARE: newDnaSeq does not clone dna so original seq->dna was allocated elsewhere!
-// In this case I'm assuming that seq comes from a prior call to hChromSeq, which
-// passes control of seq over to the caller (there's no stale pointer to cause trouble).
-{
-struct dnaSeq *newSeq = hChromSeq(db, chrom, start, end);
-touppers(newSeq->dna);
-freeMem(seq->dna);
-freeMem(seq->name);
-memcpy(seq, newSeq, sizeof(*seq));
-freeMem(newSeq);
-}
-
-static boolean shiftAndFetchMoreSeq(char *db, int altLen, struct dnaSeq *genomicSeq,
-                                    int *pLeftShiftOffset, struct bed *mapping,
-                                    int *pTotalBasesShifted, int *pStart)
-/* We need to fetch more genomic sequence to the left; update genomicSeq and remaining args
- * with the distance that we have shifted so far, and fetch a larger block of sequence.
- * Return TRUE if we have shifted all the way to the beginning of the chromosome. */
-{
-// Adjust mapping->chrom{Start,End} and totalBasesShifted
-mapping->chromStart -= *pLeftShiftOffset;
-mapping->chromEnd -= *pLeftShiftOffset;
-*pTotalBasesShifted += *pLeftShiftOffset;
-// Double the size of our left-shifting buffer (but don't underflow chrom)
-*pLeftShiftOffset = min(mapping->chromStart, *pLeftShiftOffset * 2);
-// Update genomicSeq and start offset
-int genomicSeqStart = mapping->chromStart - *pLeftShiftOffset;
-int genomicSeqEnd = mapping->chromEnd + altLen;
-if (genomicSeqEnd > genomicSeqStart)
-    updateSeq(genomicSeq, db, mapping->chrom, genomicSeqStart, genomicSeqEnd);
-*pStart = *pLeftShiftOffset;
-return (mapping->chromStart == 0);
-}
-
-static int leftShift(boolean isRc, char *db, struct dnaSeq *genomicSeq, int *pLeftShiftOffset,
-                     char *genomicRef, char *genomicRefFwd, char *hgvsSeqRef, char *hgvsSeqAlt,
-                     char pLeftBase[1], struct bed *mapping)
-/* HGVS requires right-shifting ambiguous alignments while VCF requires left-shifting.
- * If it's possible to left-shift this ref/alt, then change pLeftBase and mapping.
- * Return */
-{
-int totalBasesShifted = 0;
-if (sameString(genomicRef, hgvsSeqRef))
-    {
-    int refLen = mapping->chromEnd - mapping->chromStart;
-    int altLen = strlen(hgvsSeqAlt);
-    if ((refLen == 0 && altLen > 0) ||
-        (refLen > 0 && altLen == 0))
-        {
-        char *genomeChunk = genomicSeq->dna;
-        int start = *pLeftShiftOffset;
-        boolean done = FALSE;
-        // If insertion, first compare inserted seq to genomic seq to the left of insertion point.
-        char hgvsSeqAltFwd[altLen+1];
-        copyMaybeRc(hgvsSeqAltFwd, sizeof(hgvsSeqAltFwd), hgvsSeqAlt, isRc);
-        int ix;
-        for (ix = altLen-1;  !done && ix >= 0 && start > 0;  ix--)
-            {
-            if (genomeChunk[start-1] == hgvsSeqAltFwd[ix])
-                {
-                start--;
-                if (start == 0)
-                    {
-                    // We have hit the beginning of genomeChunk and are not done; fetch more seq.
-                    done = shiftAndFetchMoreSeq(db, altLen, genomicSeq, pLeftShiftOffset,
-                                                mapping, &totalBasesShifted, &start);
-                    genomeChunk = genomicSeq->dna;
-                    }
-                }
-            else
-                done = TRUE;
-            }
-        // If we're not done, keep trying to shift left.
-        while (!done && start > 0)
-            {
-            if (genomeChunk[start-1] == genomeChunk[start-1+refLen+altLen])
-                {
-                start--;
-                if (start == 0)
-                    {
-                    // We have hit the beginning of genomeChunk and are not done; fetch more seq.
-                    done = shiftAndFetchMoreSeq(db, altLen, genomicSeq, pLeftShiftOffset,
-                                                mapping, &totalBasesShifted, &start);
-                    genomeChunk = genomicSeq->dna;
-                    }
-                }
-            else
-                done = TRUE;
-            }
-        // Done shifting; update mapping coords and ref/alt sequences if we shifted.
-        int basesShifted = (*pLeftShiftOffset - start);
-        totalBasesShifted += basesShifted;
-        if (totalBasesShifted > 0)
-            {
-            mapping->chromStart -= basesShifted;
-            mapping->chromEnd -= basesShifted;
-            if (altLen > 0)
-                {
-                // Insertion: ref is still "", update hgvsSeqAlt.
-                if (totalBasesShifted < altLen)
-                    {
-                    // The beginning of hgvsSeqAlt is from genomic sequence. The remainder is a
-                    // shifted portion of the original.  Do the shifting first, then the copying
-                    // from genome.
-                    memmove(hgvsSeqAltFwd+totalBasesShifted, hgvsSeqAltFwd,
-                            (altLen - totalBasesShifted));
-                    memcpy(hgvsSeqAltFwd, genomeChunk+start, totalBasesShifted);
-                    }
-                else
-                    safencpy(hgvsSeqAltFwd, sizeof(hgvsSeqAltFwd), genomeChunk+start, altLen);
-                copyMaybeRc(hgvsSeqAlt, altLen+1, hgvsSeqAltFwd, isRc);
-                *pLeftBase = (start > 0) ? genomeChunk[start-1] : genomeChunk[0];
-                }
-            if (refLen > 0)
-                {
-                // Deletion: update ref from genome, alt is still "".
-                safencpy(genomicRefFwd, refLen+1, genomeChunk+start, refLen);
-                copyMaybeRc(genomicRef, refLen+1, genomicRefFwd, isRc);
-                safecpy(hgvsSeqRef, refLen+1, genomicRef);
-                *pLeftBase = (start > 0) ? genomeChunk[start-1] : genomeChunk[0];
-                }
-            }
-        }
-    }
-return totalBasesShifted;
-}
-
 static char *makeVcfAlt(char *genomicRef, char *hgvsSeqRef, char *hgvsSeqAlt, struct bed *mapping,
                         boolean isRc, char leftBase, boolean leftBaseRight, boolean *retIsIndel)
 /* Based on comparing the three possibly differing alleles (genomic, hgvs ref, hgvs alt),
  * determine which sequence(s) will go in alt.  Determine whether this variant is an indel
  * (alleles not all the same length), and if so prepend leftBase to alt allele(s). */
 {
 int hgvsSeqRefLen = strlen(hgvsSeqRef);
 int hgvsSeqAltLen = strlen(hgvsSeqAlt);
 // VCF alleles are always on '+' (Fwd) strand of genome.
 char hgvsSeqRefFwd[hgvsSeqRefLen+1];
 copyMaybeRc(hgvsSeqRefFwd, sizeof(hgvsSeqRefFwd), hgvsSeqRef, isRc);
 char hgvsSeqAltFwd[hgvsSeqAltLen+1];
 copyMaybeRc(hgvsSeqAltFwd, sizeof(hgvsSeqAltFwd), hgvsSeqAlt, isRc);
 int genomicRefLen = strlen(genomicRef);
 // The alt allele string will contain 0 to 2 sequences, maybe a comma, and 0 to 2 leftBases:
@@ -2088,69 +1935,87 @@
 return dyStringCannibalize(&filter);
 }
 
 static struct vcfRow *vcfFromHgvs(char *db, char *term, struct bed *mapping,
                                   struct hgvsVariant *hgvs, struct hgvsChange *changeList,
                                   boolean doLeftShift, struct dyString *dyError)
 /* Determine ref and alt sequence, move start left one base if indel, and return vcfRow.
  * If doLeftShift, also shift items with ambiguous start as far left on the genome as
  * possible.  That is the VCF convention but it is inconvenient for variant annotation;
  * the HGVS standard is to shift right as far as possible on the tx strand, so consequences
  * are reported at the point where the sequence has changed. */
 {
 // Include some genomic sequence to the left (if possible) in case we need to left-shift
 // an ambiguous mapping and/or include the base to the left of an indel.
 int genomicRefLen = mapping->chromEnd - mapping->chromStart;
-int leftShiftOffset = min(mapping->chromStart,
-                          max(128, 4*genomicRefLen));
-int genomicSeqStart = mapping->chromStart - leftShiftOffset;
-struct dnaSeq *genomicSeq = hChromSeq(db, mapping->chrom, genomicSeqStart, mapping->chromEnd);
-touppers(genomicSeq->dna);
+uint winStart, winEnd;
+indelShiftRangeForVariant(mapping->chromStart, genomicRefLen, strlen(hgvs->changes),
+                          &winStart, &winEnd);
+struct seqWindow *seqWin = chromSeqWindowNew(db, mapping->chrom, winStart, winEnd);
+int varWinStart = mapping->chromStart - seqWin->start;
 int indelOffset = (mapping->chromStart == 0) ? 0 : 1;
-char leftBase = genomicSeq->dna[leftShiftOffset-indelOffset];
+char leftBase = seqWin->seq[varWinStart-indelOffset];
 char genomicRefFwd[genomicRefLen+1];
-safencpy(genomicRefFwd, sizeof(genomicRefFwd), genomicSeq->dna + leftShiftOffset, genomicRefLen);
+safencpy(genomicRefFwd, sizeof(genomicRefFwd), seqWin->seq + varWinStart, genomicRefLen);
 // VCF is always on '+' (Fwd) strand of reference.  HGVS sequences may be on the '-' strand.
 // Make a version of the genome assembly allele for comparison to HGVS sequences.
 boolean isRc = (mapping->strand[0] == '-');
 char genomicRef[genomicRefLen+1];
 copyMaybeRc(genomicRef, sizeof(genomicRef), genomicRefFwd, isRc);
 // Get HGVS ref and alt alleles
 char *hgvsSeqRef = hgvsGetRef(hgvs, db);
 if (hgvsSeqRef == NULL)
     // hgvsSeqRef can be NULL when sequence coords are out of transcript bounds e.g. intron.
     // Use genomic reference sequence (rev-complemented if mapped to '-' strand)
     hgvsSeqRef = cloneString(genomicRef);
 char *hgvsTermRef = hgvsChangeGetAssertedRef(changeList);
 char *hgvsSeqAlt = hgvsChangeGetAlt(changeList, hgvsSeqRef, db);
 if (hgvsIsInsertion(hgvs))
     // HGVS insertions are two bases (before and after), but ours are zero-length points,
     // so adjust hgvsSeqRef accordingly.
     hgvsSeqRef[0] = '\0';
 if (hgvsSeqAlt == NULL)
     {
     dyStringAppend(dyError, "Unable to get alternate sequence");
     return NULL;
     }
 char *filter = makeHgvsToVcfFilter(genomicRef, hgvsSeqRef, hgvsTermRef);
-// These two VCF-normalization functions may modify the contents of their arguments:
-boolean dupToIns = doDupToIns(isRc, &leftShiftOffset, genomicRef, genomicRefFwd,
+// doDupToIns and indelShift may modify the contents of their arguments:
+boolean dupToIns = doDupToIns(isRc, &varWinStart, genomicRef, genomicRefFwd,
                               hgvsSeqRef, hgvsSeqAlt, &leftBase, mapping);
+if (dupToIns)
+    genomicRefLen = strlen(genomicRef);
 int basesShifted = 0;
-if (doLeftShift)
-    basesShifted = leftShift(isRc, db, genomicSeq, &leftShiftOffset, genomicRef, genomicRefFwd,
-                             hgvsSeqRef, hgvsSeqAlt, &leftBase, mapping);
+if (doLeftShift &&
+    sameString(genomicRef, hgvsSeqRef))
+    {
+    int altLen = strlen(hgvsSeqAlt);
+    char hgvsSeqAltFwd[altLen+1];
+    copyMaybeRc(hgvsSeqAltFwd, sizeof(hgvsSeqAltFwd), hgvsSeqAlt, isRc);
+    basesShifted = indelShift(seqWin, &mapping->chromStart, &mapping->chromEnd,
+                              hgvsSeqAltFwd, INDEL_SHIFT_NO_MAX, isdLeft);
+    if (basesShifted)
+        {
+        varWinStart = mapping->chromStart - seqWin->start;
+        int leftBaseOffset = max(0, varWinStart - 1);
+        leftBase = seqWin->seq[leftBaseOffset];
+        copyMaybeRc(hgvsSeqAlt, altLen+1, hgvsSeqAltFwd, isRc);
+        safencpy(genomicRefFwd, sizeof(genomicRefFwd), seqWin->seq+varWinStart, genomicRefLen);
+        copyMaybeRc(genomicRef, sizeof(genomicRef), genomicRefFwd, isRc);
+        safecpy(hgvsSeqRef, genomicRefLen+1, genomicRef);
+        }
+    }
 // VCF special case for indel at beginning of chromosome: there is no left base, so instead
 // put the first base of the chromosome to the right!
 // See https://samtools.github.io/hts-specs/VCFv4.2.pdf 1.4.1.4 REF and discussion at
 // https://sourceforge.net/p/vcftools/mailman/vcftools-spec/thread/508CAD45.8010301%40broadinstitute.org/
 boolean leftBaseRight = (mapping->chromStart == 0);
 // Make REF and ALT strings ('+' strand, leftBase for indels) and write VCF
 boolean isIndel = FALSE;
 char *vcfAlt = makeVcfAlt(genomicRef, hgvsSeqRef, hgvsSeqAlt, mapping, isRc,
                           leftBase, leftBaseRight,  &isIndel);
 int pos = mapping->chromStart + 1;
 if (isIndel && !leftBaseRight)
     pos--;
 char vcfRef[genomicRefLen+1+1];
 if (isIndel)
     {
@@ -2161,31 +2026,31 @@
     }
 else
     safecpy(vcfRef, sizeof(vcfRef), genomicRefFwd);
 char *info = makeHgvsToVcfInfo(dupToIns, basesShifted, isIndel, mapping->chromEnd);
 struct vcfRow *row;
 AllocVar(row);
 row->chrom = cloneString(mapping->chrom);
 row->pos = pos;
 row->id = cloneString(term);
 row->ref = cloneString(vcfRef);
 row->alt = vcfAlt;
 row->filter = filter;
 row->info = info;
 freeMem(hgvsSeqRef);
 freeMem(hgvsSeqAlt);
-dnaSeqFree(&genomicSeq);
+chromSeqWindowFree(&seqWin);
 return row;
 }
 
 struct vcfRow *hgvsToVcfRow(char *db, char *term, boolean doLeftShift, struct dyString *dyError)
 /* Convert HGVS to a row of VCF suitable for sorting & printing.  If unable, return NULL and
  * put the reason in dyError.  Protein terms are ambiguous at the nucleotide level so they are
  * not supported at this point. */
 {
 struct vcfRow *row = NULL;
 struct hgvsVariant *hgvs = hgvsParseTerm(term);
 if (!hgvs)
     {
     dyStringPrintf(dyError, "Unable to parse '%s' as HGVS", term);
     return NULL;
     }
@@ -2234,15 +2099,512 @@
         }
     else
         {
         row = vcfFromHgvs(db, term, mapping, hgvs, changeList, doLeftShift, dyWarn);
         if (dyStringIsNotEmpty(dyWarn))
             {
             dyStringPrintf(dyError, "Can't convert '%s' to VCF: %s",
                            term, dyStringContents(dyWarn));
             }
         }
     }
 bedFree(&mapping);
 dyStringFree(&dyWarn);
 return row;
 }
+
+static int allNCount(char *seq)
+/* If seq is entirely N's, return its length, otherwise 0. */
+{
+int nLen = 0;
+for (nLen = 0;  seq[nLen] != '\0';  nLen++)
+    if (seq[nLen] != 'N')
+        return 0;
+return nLen;
+}
+
+// HGVS allows terms to specify deleted or duplicated bases if there are "several bases".
+// Search for "NOTE: it is allowed" here:
+//   http://varnomen.hgvs.org/recommendations/DNA/variant/deletion/
+//   http://varnomen.hgvs.org/recommendations/DNA/variant/duplication/
+// It does not say how many "several" can be.  It doesn't specify for inv, but common
+// practice is to include bases and I don't see why it should be different from del or dup.
+// If somebody complains about hardcoding then I guess we could make it a parameter of
+// hgvsAppendChangesFromNucRefAlt.
+#define HGVS_SEVERAL 30
+
+static void hgvsAppendChangesFromNucRefAlt(struct dyString *dy, char *ref, char *alt, int dupLen,
+                                           boolean breakDelIns)
+/* Translate reference allele and alternate allele into an HGVS change description, append to dy.
+ * If breakDelIns, then show deleted bases (e.g. show 'delAGinsTT' instead of 'delinsTT').
+ * Note: this forces ref and alt to uppercase.
+ * No support for con or repeats at this point, just {=, >, del, dup, ins, inv}. */
+{
+touppers(ref);
+touppers(alt);
+if (sameString(ref, alt))
+    dyStringPrintf(dy, "%s=", ref);
+else
+    {
+    int refLen = strlen(ref);
+    int altLen = strlen(alt);
+    char refInv[refLen+1];
+    safencpy(refInv, sizeof(refInv), ref, refLen);
+    reverseComplement(refInv, refLen);
+    if (refLen == 1 && altLen == 1)
+        dyStringPrintf(dy, "%c>%c", ref[0], alt[0]);
+    else if (dupLen > 0)
+        {
+        dyStringAppend(dy, "dup");
+        if (dupLen <= HGVS_SEVERAL)
+            dyStringAppendN(dy, alt, dupLen);
+        // Could be a pure duplication followed by insertion:
+        if (altLen > dupLen)
+            dyStringPrintf(dy, "ins%s", alt+dupLen);
+        }
+    else if (refLen == 0)
+        {
+        int allNLen = allNCount(alt);
+        if (allNLen)
+            dyStringPrintf(dy, "ins%d", allNLen);
+        else
+            dyStringPrintf(dy, "ins%s", alt);
+        }
+    else if (altLen == 0)
+        {
+        dyStringAppend(dy, "del");
+        if (refLen <= HGVS_SEVERAL)
+            dyStringAppendN(dy, ref, refLen);
+        }
+    else if (refLen == altLen && refLen > 1 && sameString(refInv, alt))
+        {
+        dyStringAppend(dy, "inv");
+        if (refLen <= HGVS_SEVERAL)
+            dyStringAppendN(dy, ref, refLen);
+        }
+    else
+        {
+        dyStringAppend(dy, "del");
+        if (breakDelIns && refLen <= HGVS_SEVERAL)
+            dyStringAppendN(dy, ref, refLen);
+        int allNLen = allNCount(alt);
+        if (allNLen)
+            dyStringPrintf(dy, "ins%d", allNLen);
+        else
+            dyStringPrintf(dy, "ins%s", alt);
+        }
+    }
+}
+
+void hgvsAppendChangesFromPepRefAlt(struct dyString *dy, char *ref, char *alt, int dupLen)
+/* Translate reference allele and alternate allele into an HGVS change description, append to dy.
+ * ref and alt must be sequences of single-letter IUPAC amino acid codes.
+ * Note: this forces ref and alt to uppercase. */
+{
+touppers(ref);
+touppers(alt);
+if (sameString(ref, alt))
+    dyStringAppendC(dy, '=');
+else
+    {
+    int refLen = strlen(ref);
+    int altLen = strlen(alt);
+    char altAbbr[3*altLen+1];
+    int ix;
+    for (ix = 0;  ix < altLen;  ix++)
+        aaToAbbr(alt[ix], altAbbr+3*ix, sizeof(altAbbr)-3*ix);
+    if (refLen == 1 && altLen == 1)
+        // Simple substitution
+        dyStringAppend(dy, altAbbr);
+    else if (dupLen > 0)
+        {
+        dyStringAppend(dy, "dup");
+        // Could be a pure duplication followed by insertion:
+        if (altLen > dupLen)
+            dyStringPrintf(dy, "ins%s", altAbbr+(dupLen*3));
+        }
+    else if (refLen == 0)
+        dyStringPrintf(dy, "ins%s", altAbbr);
+    else if (altLen == 0)
+        {
+        dyStringAppend(dy, "del");
+        }
+    else
+        {
+        dyStringPrintf(dy, "delins%s", altAbbr);
+        }
+    }
+}
+
+char *hgvsGFromVariant(struct seqWindow *gSeqWin, struct bed3 *variantBed, char *alt, char *acc,
+                       boolean breakDelIns)
+/* Return an HGVS g. string representing the genomic variant at the position of variantBed with
+ * reference allele from gSeqWin and alternate allele alt.  3'-shift indels if applicable.
+ * If acc is non-NULL it is used instead of variantBed->chrom.
+ * If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */
+{
+struct dyString *dy = dyStringCreate("%s:g.", acc ? acc : variantBed->chrom);
+uint vStart = variantBed->chromStart, vEnd = variantBed->chromEnd;
+gSeqWin->fetch(gSeqWin, variantBed->chrom, vStart, vEnd);
+int refLen = vEnd - vStart;
+char ref[refLen+1];
+seqWindowCopy(gSeqWin, vStart, refLen, ref, sizeof(ref));
+int altLen = strlen(alt);
+char altCpy[altLen+1];
+safecpy(altCpy, sizeof(altCpy), alt);
+if (differentString(ref, altCpy))
+    // Trim identical bases from start and end -- unless this is an assertion that there is
+    // no change, in which case it's good to keep the range on which that assertion was made.
+    trimRefAlt(ref, altCpy, &vStart, &vEnd, &refLen, &altLen);
+if (indelShiftIsApplicable(refLen, altLen) &&
+    indelShift(gSeqWin, &vStart, &vEnd, altCpy, INDEL_SHIFT_NO_MAX, isdRight))
+    // update ref
+    seqWindowCopy(gSeqWin, vStart, refLen, ref, sizeof(ref));
+int dupLen = 0;
+if (refLen == 1)
+    // Single base: single 1-based coordinate
+    dyStringPrintf(dy, "%d", vStart+1);
+else if (refLen == 0)
+    {
+    // Insertion or duplication
+    if (altLen > 0 && altLen <= vStart)
+        {
+        // Detect duplicated sequence
+        uint seqStart = vStart - altLen;
+        char precedingRef[altLen+1];
+        seqWindowCopy(gSeqWin, seqStart, altLen, precedingRef, sizeof(precedingRef));
+        if (sameString(altCpy, precedingRef))
+            dupLen = altLen;
+        }
+    if (dupLen > 0)
+        {
+        if (dupLen == 1)
+            // Single-base duplication
+            dyStringPrintf(dy, "%d", vStart); // - dupLen + 1 cancel each other out
+        else
+            // Multi-base duplication: range is the dupLen bases preceding vStart
+            dyStringPrintf(dy, "%d_%d", vStart - dupLen + 1, vStart);
+        }
+    else
+        // Insertion: two-base range enclosing zero-base insertion point
+        dyStringPrintf(dy, "%d_%d", vStart, vEnd+1);
+    }
+else
+    // Deletion or MNV
+    dyStringPrintf(dy, "%d_%d", vStart+1, vEnd);
+hgvsAppendChangesFromNucRefAlt(dy, ref, altCpy, dupLen, breakDelIns);
+return dyStringCannibalize(&dy);
+}
+
+static int findDup(char *alt, struct seqWindow *seqWin, uint refPoint, boolean isRc)
+/* Given that the variant is an insertion of alt at refPoint (minding isRc), if alt looks
+ * like a duplicate of the sequence before refPoint, return the length of duplicated sequence. */
+{
+int altLen = strlen(alt);
+// Don't underflow the sequence:
+if (!isRc && altLen > refPoint)
+    return 0;
+uint seqStart = refPoint - (isRc ? 0 : altLen);
+uint minEnd = seqStart + altLen;
+if (seqWin->start > seqStart || seqWin->end < minEnd)
+    seqWin->fetch(seqWin, seqWin->seqName,
+                  min(seqStart, seqWin->start),
+                  max(seqWin->end, minEnd));
+// Get sequence, reverse-complement if necessary
+char precedingRef[altLen+1];
+seqWindowCopy(seqWin, seqStart, altLen, precedingRef, sizeof(precedingRef));
+if (isRc)
+    reverseComplement(precedingRef, altLen);
+if (sameString(alt, precedingRef))
+    return altLen;
+else
+    {
+    // Detect an insertion plus a few slop bases at the end, like "dupinsTAT" where
+    // alt starts with precedingRef[3..] followed by new "TAT".  Then dupLen is altLen-3.
+    // Unfortunately, indelShift can actually shift far enough to prevent a perfect match. :(
+    // Example: ClinVar NM_000179.2(MSH6):c.1573_3439-429dupinsTAT -- the insertion is right-
+    // shifted by 1 and then there's no longer a perfect dup. :b  We would have to do dup-aware
+    // indel shifting!!
+    int searchLimit = 5;
+    int offset;
+    for (offset = 1;  offset < searchLimit;  offset++)
+        {
+        if (sameStringN(alt, precedingRef+offset, altLen-offset))
+            return altLen-offset;
+        }
+    }
+return 0;
+}
+
+static int tweakInsDup(struct vpTxPosition *startPos, struct vpTxPosition *endPos, char *alt,
+                       struct seqWindow *gSeqWin, struct psl *txAli, struct dnaSeq *txSeq)
+/* If this variant is an insertion, HGVS needs the 2-base range surrounding the insertion point.
+ * However, if an inserted sequence happens to be identical to the preceding sequence then
+ * HGVS calls it a dup on the range of preceding sequence.  If this is a dup, return the length of
+ * duplicated sequence (else return 0).
+ * This modifies startPos and endPos if the variant is an insertion/dup.
+ * For non-exonic insertions, the preceding reference sequence is from the genome not tx.
+ * Note: this doesn't yet detect "dupins", where most of the inserted sequence (starting from the
+ * beginning of the inserted sequence) matches the previous ref sequence */
+{
+int dupLen = 0;
+int isIns = vpTxPosIsInsertion(startPos, endPos);
+if (isIns)
+    {
+    // Yes, this is a zero-length insertion point -- see if it happens to be a duplication.
+    // "insTCA" is preferable to "dupTinsCA"... where to draw the line for when dup is worth it??
+    // For now, just say half of altLen... but an argument could be made for some small constant too
+    int minDup = strlen(alt) / 2;
+    // Fetch preceding sequence from tx if exonic, otherwise from genome.
+    if (startPos->region == vpExon && endPos->region == vpExon)
+        {
+        struct seqWindow *txSeqWin = memSeqWindowNew(txSeq->name, txSeq->dna);
+        dupLen = findDup(alt, txSeqWin, startPos->txOffset, FALSE);
+        if (dupLen > minDup)
+            {
+            // Since txSeqWin was used to find dup, the new startPos must also be exonic.
+            // In case startPos was looking forward from the boundary between an exon and an intron
+            // or downstream, adjust its other fields to be exonic now:
+            startPos->region = vpExon;
+            startPos->txOffset -= dupLen;
+            startPos->gDistance = startPos->intron3TxOffset = startPos->intron3Distance = 0;
+            }
+        else
+            dupLen = 0;
+        memSeqWindowFree(&txSeqWin);
+        }
+    else
+        {
+        boolean isRc = (pslQStrand(txAli) == '-');
+        dupLen = findDup(alt, gSeqWin, startPos->gOffset, isRc);
+        if (dupLen > minDup)
+            {
+            uint newGOffset = startPos->gOffset + (isRc ? dupLen : -dupLen);
+            vpPosGenoToTx(newGOffset, txAli, startPos, FALSE);
+            }
+        else
+            dupLen = 0;
+        }
+    }
+if (dupLen == 0 && isIns)
+    {
+    // Expand insertion point to 2-base region around insertion point for HGVS.
+    // startPos = base to left of endPos whose region looks left/5';
+    // endPos = base to right of startPos whose region looks right/3'.
+    struct vpTxPosition newStart = *endPos;
+    vpTxPosSlideInSameRegion(&newStart, -1);
+    struct vpTxPosition newEnd = *startPos;
+    vpTxPosSlideInSameRegion(&newEnd, 1);
+    *startPos = newStart;
+    *endPos = newEnd;
+    }
+return dupLen;
+}
+
+static uint hgvsTxToCds(uint txOffset, struct genbankCds *cds, boolean isStart, char pPrefix[2])
+/* Return the cds-relative HGVS coord and prefix corresponding to 0-based txOffset & cds. */
+{
+// Open end adjustment for determining CDS/UTR region:
+int endCmp = isStart ? 0 : 1;
+// For adjusting non-negative coords to HGVS's positive 1-based closed:
+int oneBased = isStart ? 1 : 0;
+// For adjusting negative coords to HGVS's negative 0-based closed:
+int closedEnd = isStart ? 0 : 1;
+pPrefix[1] = '\0';
+uint cdsCoord = 0;
+if (txOffset - endCmp < cds->start)
+    {
+    // 5'UTR: coord is negative distance from cdsStart
+    pPrefix[0] = '-';
+    cdsCoord = cds->start - txOffset + closedEnd;
+    }
+else if (txOffset - endCmp < cds->end)
+    {
+    // CDS: coord is positive distance from cdsStart
+    pPrefix[0] = '\0';
+    cdsCoord = txOffset - cds->start + oneBased;
+    }
+else
+    {
+    // 3'UTR: coord is positive distance from cdsEnd
+    pPrefix[0] = '*';
+    cdsCoord = txOffset - cds->end + oneBased;
+    }
+return cdsCoord;
+}
+
+static void appendHgvsNucPos(struct dyString *dy, struct vpTxPosition *txPos, boolean isStart,
+                             struct genbankCds *cds)
+/* Translate txPos (start or end) into an HGVS position (coding if cds is non-NULL). */
+{
+// Open end adjustment for determining region:
+int endCmp = isStart ? 0 : 1;
+// For adjusting non-negative coords to HGVS's positive 1-based closed:
+int oneBased = isStart ? 1 : 0;
+// For adjusting negative coords to HGVS's negative 0-based closed:
+int closedEnd = isStart ? 0 : 1;
+if (txPos->region == vpUpstream)
+    {
+    uint distance = txPos->gDistance;
+    if (cds)
+        distance += cds->start;
+    dyStringPrintf(dy, "-%u", distance + closedEnd);
+    }
+else if (txPos->region == vpDownstream)
+    {
+    uint distance = txPos->txOffset + txPos->gDistance;
+    if (cds)
+        dyStringPrintf(dy, "*%u", distance - cds->end + oneBased);
+    else
+        dyStringPrintf(dy, "%u", distance + oneBased);
+    }
+else if (txPos->region == vpExon)
+    {
+    char cdsPrefix[2];
+    cdsPrefix[0] = '\0';
+    uint hgvsCoord = cds ? hgvsTxToCds(txPos->txOffset, cds, isStart, cdsPrefix) :
+                           txPos->txOffset + oneBased;
+    dyStringPrintf(dy, "%s%u", cdsPrefix, hgvsCoord);
+    }
+else if (txPos->region == vpIntron)
+    {
+    // If intron length is odd, bias toward picking the 5' exon (middle base has positive offset).
+    char cdsPrefix[2];
+    cdsPrefix[0] = '\0';
+    char direction;
+    uint exonAnchor, intronOffset;
+    boolean anchorIsStart;
+    if (txPos->gDistance - endCmp < txPos->intron3Distance)
+        {
+        exonAnchor = txPos->txOffset;
+        direction = '+';
+        intronOffset = txPos->gDistance + oneBased;
+        anchorIsStart = FALSE;
+        }
+    else
+        {
+        exonAnchor = txPos->intron3TxOffset;
+        direction = '-';
+        intronOffset = txPos->intron3Distance + closedEnd;
+        anchorIsStart = TRUE;
+        }
+    exonAnchor = cds ? hgvsTxToCds(exonAnchor, cds, anchorIsStart, cdsPrefix) :
+                       exonAnchor + (anchorIsStart ? oneBased : 0);
+    dyStringPrintf(dy, "%s%u%c%u", cdsPrefix, exonAnchor, direction, intronOffset);
+    }
+else
+    errAbort("appendHgvsNucPos: unrecognized vpTxRegion value %d", txPos->region);
+}
+
+char *hgvsNFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli,
+                    struct dnaSeq *txSeq, boolean breakDelIns)
+/* Return an HGVS n. (noncoding transcript) term for a variant projected onto a transcript.
+ * gSeqWin must already have at least the correct seqName if not the surrounding sequence.
+ * If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */
+{
+struct dyString *dy = dyStringCreate("%s:n.", vpTx->txName);
+// Make local copies of vpTx->{start,end} -- we may need to modify them for HGVS ins/dup.
+struct vpTxPosition startPos = vpTx->start, endPos = vpTx->end;
+int dupLen = tweakInsDup(&startPos, &endPos, vpTx->txAlt, gSeqWin, txAli, txSeq);
+appendHgvsNucPos(dy, &startPos, TRUE, NULL);
+if (!vpTxPosRangeIsSingleBase(&startPos, &endPos))
+    {
+    dyStringAppendC(dy, '_');
+    appendHgvsNucPos(dy, &endPos, FALSE, NULL);
+    }
+char *ref = vpTx->txRef ? vpTx->txRef : vpTx->gRef;
+hgvsAppendChangesFromNucRefAlt(dy, ref, vpTx->txAlt, dupLen, breakDelIns);
+return dyStringCannibalize(&dy);
+}
+
+
+char *hgvsCFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli,
+                    struct genbankCds *cds,  struct dnaSeq *txSeq, boolean breakDelIns)
+/* Return an HGVS c. (coding transcript) term for a variant projected onto a transcript w/cds.
+ * gSeqWin must already have at least the correct seqName if not the surrounding sequence.
+ * If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */
+{
+struct dyString *dy = dyStringCreate("%s:c.", vpTx->txName);
+// Make local copies of vpTx->{start,end} -- we may need to modify them for HGVS ins/dup.
+struct vpTxPosition startPos = vpTx->start, endPos = vpTx->end;
+int dupLen = tweakInsDup(&startPos, &endPos, vpTx->txAlt, gSeqWin, txAli, txSeq);
+appendHgvsNucPos(dy, &startPos, TRUE, cds);
+if (!vpTxPosRangeIsSingleBase(&startPos, &endPos))
+    {
+    dyStringAppendC(dy, '_');
+    appendHgvsNucPos(dy, &endPos, FALSE, cds);
+    }
+char *ref = vpTx->txRef ? vpTx->txRef : vpTx->gRef;
+hgvsAppendChangesFromNucRefAlt(dy, ref, vpTx->txAlt, dupLen, breakDelIns);
+return dyStringCannibalize(&dy);
+}
+
+char *hgvsPFromVpPep(struct vpPep *vpPep, struct dnaSeq *protSeq, boolean addParens)
+/* Return an HGVS p. (protein) term for a variant projected into protein space.
+ * Strict HGVS compliance requires parentheses around predicted protein changes, but
+ * nobody seems to do that in practice.
+ * Return NULL if an input is NULL. */
+{
+if (vpPep == NULL || protSeq == NULL)
+    return NULL;
+struct dyString *dy = dyStringCreate("%s:p.", vpPep->name);
+if (addParens)
+    dyStringAppendC(dy, '(');
+int refLen = vpPep->end - vpPep->start;
+int altLen = vpPep->alt ? strlen(vpPep->alt) : 0;
+char *pSeq = protSeq->dna;
+char refStartAbbr[4];
+aaToAbbr(pSeq[vpPep->start], refStartAbbr, sizeof(refStartAbbr));
+if (vpPep->likelyNoChange)
+    dyStringAppend(dy, "=");
+else if (vpPep->cantPredict || vpPep->spansUtrCds)
+    dyStringAppend(dy, "?");
+else if (vpPep->frameshift)
+    {
+    if (altLen == 1)
+        dyStringPrintf(dy, "%s%dTer", refStartAbbr, vpPep->start+1);
+    else
+        {
+        char altStartAbbr[4];
+        aaToAbbr(vpPep->alt[0], altStartAbbr, sizeof(altStartAbbr));
+        dyStringPrintf(dy, "%s%d%sfsTer%d", refStartAbbr, vpPep->start+1, altStartAbbr, altLen);
+        }
+    }
+else
+    {
+    int dupLen = 0;
+    if (refLen == 0 && altLen > 0)
+        {
+        // It's an insertion; is it a duplication?
+        struct seqWindow *pSeqWin = memSeqWindowNew(protSeq->name, protSeq->dna);
+        dupLen = findDup(vpPep->alt, pSeqWin, vpPep->start, FALSE);
+        memSeqWindowFree(&pSeqWin);
+        }
+    if (refLen == 1)
+        dyStringPrintf(dy, "%s%d", refStartAbbr, vpPep->start+1);
+    else
+        {
+        int rangeStart = vpPep->start, rangeEnd = vpPep->end;
+        if (dupLen > 0)
+            {
+            // Duplication; position range changes to preceding bases.
+            rangeEnd = rangeStart;
+            rangeStart -= dupLen;
+            aaToAbbr(pSeq[rangeStart], refStartAbbr, sizeof(refStartAbbr));
+            }
+        else if (refLen == 0)
+            {
+            // Insertion; expand to two-AA range around insertion point
+            rangeStart--;
+            aaToAbbr(pSeq[rangeStart], refStartAbbr, sizeof(refStartAbbr));
+            rangeEnd++;
+            }
+        char refLastAbbr[4];
+        aaToAbbr(pSeq[rangeEnd-1], refLastAbbr, sizeof(refLastAbbr));
+        dyStringPrintf(dy, "%s%d_%s%d", refStartAbbr, rangeStart+1, refLastAbbr, rangeEnd);
+        }
+    hgvsAppendChangesFromPepRefAlt(dy, vpPep->ref, vpPep->alt, dupLen);
+    }
+if (addParens)
+    dyStringAppendC(dy, ')');
+return dyStringCannibalize(&dy);
+}