39b3ad00ec43efa3dbd5a8b0836d7a6a00776528
angie
  Wed Jul 29 13:28:23 2020 -0700
Added vcfParseGenotypesGtOnly to speed up parsing when we have tens of thousands of genotype columns and just want the genotypes not any associated details.  refs #25943

diff --git src/lib/vcf.c src/lib/vcf.c
index 2e1c746..b38cbc6 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -1446,30 +1446,61 @@
 		       gt->id, VCF_MAX_INFO);
 	}
     if (!foundGT)   //try SGT
         {
         parseSgtAsGt(record, gt);
         if (gt->hapIxA != -1)
             foundGT = TRUE;
         }
     if (i == 0 && !foundGT)
         vcfFileErr(vcff,
                    "Genotype FORMAT column includes neither GT nor PL; unable to parse genotypes.");
     }
 record->genotypeUnparsedStrings = NULL;
 }
 
+void vcfParseGenotypesGtOnly(struct vcfRecord *record)
+/* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[], but ignore
+ * genotype info elements, IDs, etc; parse only the genotypes (e.g. for quick display in hgTracks).
+ * This destroys genotypesUnparsedStrings. */
+{
+if (record->genotypeUnparsedStrings == NULL)
+    return;
+struct vcfFile *vcff = record->file;
+record->genotypes = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(struct vcfGenotype));
+int i;
+for (i = 0;  i < vcff->genotypeCount;  i++)
+    {
+    // Parse (.|[0-9])([/|](.|[0-9]))? in first one to four characters
+    char *string = record->genotypeUnparsedStrings[i];
+    struct vcfGenotype *gt = &(record->genotypes[i]);
+    gt->hapIxA = gt->hapIxB = -1;
+    if (string[0] != '.' && isdigit(string[0]))
+        gt->hapIxA = atoi(string);
+    if (string[1] == '/' || string[1] == '|')
+        {
+        if (isdigit(string[2]))
+            gt->hapIxB = atoi(string+2);
+        if (string[1] == '|')
+            gt->isPhased = TRUE;
+        }
+    else
+        gt->isHaploid = TRUE;
+    }
+record->genotypeUnparsedStrings = NULL;
+}
+
 const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId)
 /* Find the genotype and associated info for the individual, or return NULL.
  * This calls vcfParseGenotypes if it has not already been called. */
 {
 struct vcfFile *vcff = record->file;
 if (sampleId == NULL || vcff->genotypeCount == 0)
     return NULL;
 vcfParseGenotypes(record);
 int ix = stringArrayIx(sampleId, vcff->genotypeIds, vcff->genotypeCount);
 if (ix >= 0)
     return &(record->genotypes[ix]);
 return NULL;
 }
 
 const struct vcfInfoElement *vcfGenotypeFindInfo(const struct vcfGenotype *gt, char *key)