0d3b8feb6074f6caccc2d4aadd020961fbd7eb82
chmalee
  Wed Jun 10 14:39:38 2020 -0700
Add drag reorder of VCF samples to VCF trio display. Also fixes bug in haplotype sort that I didn't discover until adding drag reorder. Sometimes when sorting haplotypes it is the case that the initial best match to a child allele is the same for each parent. Previously I would just advance the first drawn parent to the next best match but now I also check whether advancing the first drawn is actually the best idea and potentially advance the other parent instead, refs #25582

diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c
index d513525..e2d02f6 100644
--- src/hg/hgTracks/vcfTrack.c
+++ src/hg/hgTracks/vcfTrack.c
@@ -157,31 +157,32 @@
     {
     double majorAlFreq = max(refFreq, maxAltFreq);
     if (majorAlFreq > (1.0 - minFreq))
 	return TRUE;
     }
 return FALSE;
 }
 
 static void filterRefOnlyAlleles(struct vcfFile *vcff, struct trackDb *tdb)
 /* Drop items from VCF that don't differ from the reference for any of the
  * samples specified in trackDb */
 {
 struct vcfRecord *rec, *nextRecord, *retList = NULL;
 const struct vcfGenotype *gt;
 
-struct slPair *sample, *sampleOrder = vcfPhasedGetSampleOrder(cart, tdb, FALSE);
+boolean hideOtherSamples = cartUsualBooleanClosestToHome(cart, tdb, FALSE, VCF_PHASED_HIDE_OTHER_VAR, FALSE);
+struct slPair *sample, *sampleOrder = vcfPhasedGetSampleOrder(cart, tdb, FALSE, hideOtherSamples);
 for (rec = vcff->records; rec != NULL; rec = nextRecord)
     {
     nextRecord = rec->next;
     boolean allRef = TRUE;
     for (sample = sampleOrder; sample != NULL; sample = sample->next)
         {
         gt = vcfRecordFindGenotype(rec, sample->name);
         if (gt && !(gt->hapIxA == 0 && gt->hapIxB == 0) )
             allRef = FALSE;
         }
     if (!allRef)
         slAddHead(&retList, rec);
     }
 slReverse(&retList);
 vcff->records = retList;
@@ -220,31 +221,34 @@
     unsigned int vcfStart;
     unsigned int vcfEnd;
 };
 
 static struct pgSnp *vcfFileToPgSnp(struct vcfFile *vcff, struct trackDb *tdb)
 /* Convert vcff's records to pgSnp; don't free vcff until you're done with pgSnp
  * because it contains pointers into vcff's records' chrom. If the trackDb setting
  * sampleName is present, then check whether all the records are phased or not */
 {
 struct pgSnp *pgsList = NULL;
 struct vcfRecord *rec;
 int maxLen = 33;
 int maxAlCount = 5;
 struct slPair *sample = NULL, *phasedSamples = NULL;
 if (sameString(tdb->type, "vcfPhasedTrio"))
-    phasedSamples = vcfPhasedGetSampleOrder(cart, tdb, FALSE);
+    {
+    boolean hideOtherSamples = cartUsualBooleanClosestToHome(cart, tdb, FALSE, VCF_PHASED_HIDE_OTHER_VAR, FALSE);
+    phasedSamples = vcfPhasedGetSampleOrder(cart, tdb, FALSE, hideOtherSamples);
+    }
 
 vcff->allPhased = TRUE;
 for (rec = vcff->records;  rec != NULL;  rec = rec->next)
     {
     struct pgSnpVcfStartEnd *psvs = needMem(sizeof(*psvs));
     psvs->vcfStart = vcfRecordTrimIndelLeftBase(rec);
     psvs->vcfEnd = vcfRecordTrimAllelesRight(rec);
     for (sample = phasedSamples; sample != NULL; sample = sample->next)
         {
         const struct vcfGenotype *gt = vcfRecordFindGenotype(rec, sample->name);
         if (!gt->isPhased)
             vcff->allPhased = FALSE;
         }
     struct pgSnp *pgs = pgSnpFromVcfRecord(rec);
     memcpy(&(psvs->pgs), pgs, sizeof(*pgs));
@@ -1711,108 +1715,134 @@
             tmp->dist = cell->dist;
             slAddHead(&cellList, tmp);
             }
         }
     }
 slSort(&cellList, hapDistanceMatrixCellCmp);
 return cellList;
 }
 
 static unsigned int toggleInt(unsigned int s)
 /* Add or subtract one */
 {
 return s & 1 ? s - 1 : s + 1;
 }
 
-static void fillOutHapOrder(unsigned int *hapOrder, unsigned int hapCount, struct hapDistanceMatrixCell *c1, struct hapDistanceMatrixCell *c2)
+static void fillOutHapOrder(unsigned int  *hapOrder, unsigned int hapCount, struct hapDistanceMatrixCell *c1, struct hapDistanceMatrixCell *c2, char **sampleDrawOrder)
 /* Assign indices to hapOrder in the order we should draw the alleles. Allows for the second parent
  * cell to be NULL */
 {
 if (c1 == NULL)
     errAbort("Error: fillOutHapOrder passed NULL parent");
 int numSamplesToDraw = hapCount / 2;
+char *childSampleName = c1->otherId;
+int childIx = stringArrayIx(childSampleName, sampleDrawOrder, numSamplesToDraw);
 int i;
 for (i = 0; i < numSamplesToDraw; i++)
     {
-    int hapIx = 2*i;
-    if (i == 0) // top set of lines
+    char *thisSample = sampleDrawOrder[i];
+    struct hapDistanceMatrixCell *thisCell = c2 != NULL && sameString(c2->sampleId,thisSample) ? c2 : c1;
+    short hapIx = 2*i;
+    if (i != childIx) // fill out the parent indices, which may be above or below the child
         {
-        hapOrder[hapIx] = toggleInt(c1->alleleIx);
-        hapOrder[hapIx+1] = c1->alleleIx;
-        }
-    else if (i + 1 == numSamplesToDraw) // last set of lines
-        {
-        if (c2)
+        if (i < childIx)
             {
-            hapOrder[hapIx] = c2->alleleIx;
-            hapOrder[hapIx+1] = toggleInt(c2->alleleIx);
+            hapOrder[hapIx] = toggleInt(thisCell->alleleIx);
+            hapOrder[hapIx+1] = thisCell->alleleIx;
             }
-        else
+        else // below the child
             {
-            hapOrder[hapIx] = c1->otherAlleleIx;
-            hapOrder[hapIx+1] = toggleInt(c1->otherAlleleIx);
+            hapOrder[hapIx] = thisCell->alleleIx;
+            hapOrder[hapIx+1] = toggleInt(thisCell->alleleIx);
             }
         }
-    else // orient based on above and below
+    else // fill out the child indices
         {
         hapOrder[hapIx] = c1->otherAlleleIx;
+        if (c2)
             hapOrder[hapIx+1] = c2->otherAlleleIx;
+        else
+            hapOrder[hapIx+1] = toggleInt(c1->otherAlleleIx);
+        }
+    }
 }
+
+static void maybeRollBackCell(struct hapDistanceMatrixCell *c1, struct hapDistanceMatrixCell *c2,
+                              struct hapDistanceMatrixCell *c1Copy, struct hapDistanceMatrixCell *c2Copy)
+/* At this point c1->otherAlleleIx != c2->otherAlleleIx, however, it may be the case that c2 has a
+ * better scoring match to c1->otherAlleleIx, so try to find it and reset the values */
+{
+struct hapDistanceMatrixCell *tmp = c2;
+while (tmp->otherAlleleIx != c1->otherAlleleIx)
+    tmp = tmp->next;
+if (tmp->dist < c1->dist)
+    {
+    *c2 = *tmp;
+    *c1 = *c1Copy;
     }
 }
 
 static void setHapOrderFromMatrix(unsigned int *hapOrder, unsigned int hapCount,
                                 struct hapDistanceMatrix *matrix, struct hapCluster **hapArray,
                                 struct vcfFile *vcff, char *childName, char **sampleDrawOrder)
 /* Given a matrix where each row is an allele of the child, and each column
  * is the distance from the child allele to a parent allele, fill out the order
  * in which the haplotypes will be drawn. The order are the indexes into rec->genotypes. */
 {
 char *parentNames[(hapCount/2) - 1];
 struct hapDistanceMatrixCell *c1 = NULL;
 struct hapDistanceMatrixCell *c2 = NULL;
 int i, ix = 0;
 if (hapCount > 2) // is there a trio or at least part of one?
     {
     for (i = 0; i < hapCount/2 ; i++)
         if (!sameString(sampleDrawOrder[i], childName))
             parentNames[ix++] = sampleDrawOrder[i];
 
     // For a parent/child combo, find the most likely set of transmitted
     // alleles. Depending on the number of variants in the window and the
     // haplotypes in question, both parents can tie as the person who
     // contributed a given allele
     c1 = findClosestChildAlleleToParent(parentNames[0], childName, matrix);
     c2 = findClosestChildAlleleToParent(parentNames[1], childName, matrix); // NULL if only one parent
     if (c1 && c2)
         {
         if (c1->otherAlleleIx == c2->otherAlleleIx)
             {
-            // first choose the lowest score, if a tie then just advance c1
+            struct hapDistanceMatrixCell *c1Copy = c1;
+            struct hapDistanceMatrixCell *c2Copy = c2;
             if (c1->dist >= c2->dist)
                 {
                 while (c1->dist >= c2->dist && c1->otherAlleleIx == c2->otherAlleleIx)
                     c1 = c1->next;
+                // at this point c1->otherAlleleIx != c2>otherAlleleIx, BUT c2 may have a better
+                // scoring match to this allele and we shouldn't have advanced c1 to begin with!
+                // This case is exercised at the following location (chr1:53896598-53897933) in
+                // the following file: /gbdb/hg38/1000Genomes/trio/NA12878_1463_CEU/NA12878_1463_CEUTrio.chr1.vcf.gz
+                // where the child's haplotypes are identical to both of the mother's haplotypes!
+                maybeRollBackCell(c1, c2, c1Copy, c2Copy);
                 }
             else
                 {
                 while (c1->dist < c2->dist && c1->otherAlleleIx == c2->otherAlleleIx)
                     c2 = c2->next;
+                // similar to above case but swap c1 and c2
+                maybeRollBackCell(c2, c1, c2Copy, c1Copy);
                 }
             }
         }
-    fillOutHapOrder(hapOrder, hapCount, c1, c2);
+    fillOutHapOrder(hapOrder, hapCount, c1, c2, sampleDrawOrder);
     }
 else
     {
     hapOrder[0] = matrix->alleleIx;
     hapOrder[1] = matrix->next->alleleIx;
     }
 }
 
 static void assignHapArrayIx(int *ret, struct hapCluster **hapArray, struct vcfFile *vcff, char *sample, boolean doChild)
 {
 int i;
 struct vcfRecord *rec = vcff->records;
 int ix = 0; // index into ret
 for (i = 0; i < slCount(hapArray); i++)
     {
@@ -2104,72 +2134,78 @@
     vcfPhasedAddLabel(track, hvg, label->string, yOff, round(((y1 + y2) / 2) - (track->lineHeight / 2)), font, MG_BLACK);
     }
 }
 
 static void vcfPhasedDrawItems(struct track *track, int seqStart, int seqEnd,
 			      struct hvGfx *hvg, int xOff, int yOff, int width,
 			      MgFont *font, Color color, enum trackVisibility vis)
 /* Split samples' chromosomes (haplotypes), cluster them by parents, and
  * draw them all along a line representing each chromosome*/
 {
 struct vcfFile *vcff = track->extraUiData;
 if (vcff->records == NULL)
     return;
 
 const double scale = scaleForPixels(width);
-struct slPair *pair, *sampleNames = vcfPhasedGetSampleOrder(cart, track->tdb, FALSE);
+boolean hideOtherSamples = cartUsualBooleanClosestToHome(cart, track->tdb, FALSE, VCF_PHASED_HIDE_OTHER_VAR, FALSE);
+struct slPair *pair, *sampleNames = vcfPhasedGetSampleOrder(cart, track->tdb, FALSE, hideOtherSamples);
 int gtCount = slCount(sampleNames);
 int yOffsets[gtCount * 2]; // y offsets of each haplotype line
 char *sampleOrder[gtCount]; // order of sampleName lines
 int i;
 for (pair = sampleNames, i = 0; pair != NULL && i < gtCount; pair = pair->next, i++)
     sampleOrder[i] = pair->name;
-// child sample is required to be in the middle/bottom of the trio
-char *childSample = gtCount > 2 ? sampleOrder[1] : sampleOrder[0];
+
+char *childSample = cloneString(trackDbSetting(track->tdb, VCF_PHASED_CHILD_SAMPLE_SETTING));
+char *pt = strchr(childSample, '|');
+if (pt != NULL)
+    *pt = '\0';
 
 // set up the "haplotype" lines and the transparent yellow box to delineate samples
 vcfPhasedSetupHaplotypesLines(track, hvg, xOff, yOff, width, yOffsets, sampleNames, font);
 
-// sort the variants by haplotype then draw ticks
+// maybe sort the variants by haplotype then draw ticks
+unsigned int *hapOrder = needMem(sizeof(short) * gtCount * 2);
 int nRecords = slCount(vcff->records);
 int centerIx = getCenterVariantIx(track, seqStart, seqEnd, vcff->records);
 int startIx = 0;
 int endIx = nRecords;
-unsigned int *hapOrder = computeHapDist(vcff, centerIx, startIx, endIx, childSample, gtCount, sampleOrder);
+hapOrder = computeHapDist(vcff, centerIx, startIx, endIx, childSample, gtCount, sampleOrder);
 boolean highlightChildDiffs = cartUsualBooleanClosestToHome(cart, track->tdb, FALSE, VCF_PHASED_HIGHLIGHT_INCONSISTENT, FALSE);
 struct vcfRecord *rec = NULL;
 struct slList *item = NULL;
 for (rec = vcff->records, item = track->items; rec != NULL && item != NULL; rec = rec->next, item = item->next)
     {
     vcfPhasedDrawOneRecord(track, hvg, rec, item, hapOrder, gtCount * 2, xOff, yOffsets, sampleOrder, childSample, highlightChildDiffs, scale);
     }
 }
 
 static int vcfPhasedItemHeight(struct track *tg, void *item)
 {
 return tg->heightPer;
 }
 
 int vcfPhasedTrackHeight(struct track *tg, enum trackVisibility vis)
 {
 const struct vcfFile *vcff = tg->extraUiData;
 // when doing composite track height, vcfPhasedLoadItems won't have been called yet!
 if (vis == tvDense)
     return pgSnpHeight(tg, vis);
 if (!vcff || vcff->records == NULL)
     return 0;
-int totalSamples = slCount(vcfPhasedGetSampleOrder(cart, tg->tdb, FALSE));
+boolean hideOtherSamples = cartUsualBooleanClosestToHome(cart, tg->tdb, FALSE, VCF_PHASED_HIDE_OTHER_VAR, FALSE);
+int totalSamples = slCount(vcfPhasedGetSampleOrder(cart, tg->tdb, FALSE, hideOtherSamples));
 tg->lineHeight = tl.fontHeight + 1;
 tg->heightPer = tl.fontHeight;
 // if all variants in view are phased, then 3 lines per sample,
 // else 4 lines. The extra 2 is for clear separation
 int heightPerSample;
 if (vcff->allPhased)
     heightPerSample = (3 * tg->lineHeight) + 2;
 else
     heightPerSample = (4 * tg->lineHeight) + 2;
 tg->height = totalSamples * heightPerSample;
 tg->itemHeight = vcfPhasedItemHeight;
 // custom int is reserved for doing pgSnp coloring but as far as I can tell is
 // never actually used?
 tg->customInt = totalSamples;
 return tg->height;