533112afe2a2005e80cdb1f82904ea65032d4302
braney
  Sat Oct 2 11:37:34 2021 -0700
split hg/lib into two separate libaries, one only used by the cgis

diff --git src/hg/cgilib/variant.c src/hg/cgilib/variant.c
new file mode 100644
index 0000000..b6f9f71
--- /dev/null
+++ src/hg/cgilib/variant.c
@@ -0,0 +1,212 @@
+/* variant.c -- routines to convert other variant formats to a generic
+ *              variant structure */
+
+/* Copyright (C) 2014 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
+
+#include "common.h"
+#include "annoRow.h"
+#include "variant.h"
+
+struct allele  *alleleClip(struct allele *allele, int sx, int ex, struct lm *lm)
+/* Return new allele pointing to new variant, both clipped to region defined by [sx,ex). */
+{
+struct variant *oldVariant = allele->variant;
+int start = oldVariant->chromStart;
+int end = oldVariant->chromEnd;
+int delFront = 0;
+int delRear = 0;
+
+if (start < sx)
+    {
+    delFront = min(sx - start, allele->length);
+    start = sx;
+    }
+
+if (end > ex)
+    {
+    delRear = min(end - ex, allele->length - delFront);
+    end = ex;
+    }
+
+struct variant *newVariant;
+lmAllocVar(lm, newVariant);
+newVariant->chrom = lmCloneString(lm, oldVariant->chrom);
+newVariant->chromStart = start;
+newVariant->chromEnd = end;
+newVariant->numAlleles = 1;
+
+struct allele *newAllele;
+lmAllocVar(lm, newAllele);
+newVariant->alleles = newAllele;
+newAllele->variant = newVariant;
+newAllele->length = allele->length - delRear - delFront;
+assert(newAllele->length >= 0);
+newAllele->sequence = lmCloneString(lm, &allele->sequence[delFront]);
+newAllele->sequence[newAllele->length] = 0;   // cut off delRear part
+
+return newAllele;
+}
+
+static boolean isDash(char *string)
+/* Return TRUE if the only char in string is '-'
+ * (possibly repeated like the darn pgVenter alleles). */
+{
+char *p;
+for (p = string;  p != NULL && *p != '\0';  p++)
+    if (*p != '-')
+	return FALSE;
+return TRUE;
+}
+
+struct variant *variantNew(char *chrom, unsigned start, unsigned end, unsigned numAlleles,
+			   char *slashSepAlleles, char *refAllele, struct lm *lm)
+/* Create a variant from basic information that is easy to extract from most other variant
+ * formats: coords, allele count, string of slash-separated alleles and reference allele. */
+{
+struct variant *variant;
+
+// We have a new variant!
+lmAllocVar(lm, variant);
+variant->chrom = lmCloneString(lm, chrom);
+variant->chromStart = start;
+variant->chromEnd = end;
+variant->numAlleles = numAlleles;
+
+// get the alleles.
+char *nextAlleleString = lmCloneString(lm, slashSepAlleles);
+int alleleNumber = 0;
+for( ; alleleNumber < numAlleles; alleleNumber++)
+    {
+    if (nextAlleleString == NULL)
+	errAbort("number of alleles in /-separated string doesn't match numAlleles");
+    
+    char *thisAlleleString = nextAlleleString;
+
+    // advance pointer to next variant string
+    // probably there's some kent routine to do this behind the curtain
+    nextAlleleString = strchr(thisAlleleString, '/');
+    if (nextAlleleString)	 // null out '/' and move to next char
+	{
+	*nextAlleleString = 0;
+	nextAlleleString++;
+	}
+
+    boolean isRefAllele = (sameWord(thisAlleleString, refAllele) ||
+			   (isEmpty(refAllele) && sameString(thisAlleleString, "-")) ||
+			   sameString(thisAlleleString, "<X>") || // samtools mpileup no variation
+			   sameString(thisAlleleString, "<*>"));  // gVCF no variation
+    int alleleStringLength = strlen(thisAlleleString);
+    if (isDash(thisAlleleString))
+	{
+	alleleStringLength = 0;
+	thisAlleleString[0] = '\0';
+	}
+
+    // we have a new allele!
+    struct allele *allele;
+    lmAllocVar(lm, allele);
+    slAddHead(&variant->alleles, allele);
+    allele->variant = variant;
+    allele->length = alleleStringLength; 
+    allele->sequence = lmCloneString(lm, thisAlleleString);
+    allele->isReference = isRefAllele;
+    }
+
+slReverse(&variant->alleles);
+
+return variant;
+}
+
+struct variant *variantFromPgSnpAnnoRow(struct annoRow *row, char *refAllele, boolean hasBin,
+                                        struct lm *lm)
+/* Translate pgSnp annoRow into variant (allocated by lm). */
+{
+struct pgSnp pgSnp;
+char **words = row->data;
+char *wordsWithFakeBin[PGSNP_NUM_COLS];
+if (! hasBin)
+    {
+    // pgSnp file input doesn't have a bin column, but the pgSnp code expects one --
+    // so make a fake bin column to ignore.
+    wordsWithFakeBin[0] = "1";
+    int i;
+    for (i = 1;  i < PGSNP_NUM_COLS;  i++)
+        wordsWithFakeBin[i] = words[i-1];
+    words = wordsWithFakeBin;
+    }
+pgSnpStaticLoad(words, &pgSnp);
+return variantNew(pgSnp.chrom, pgSnp.chromStart, pgSnp.chromEnd, pgSnp.alleleCount,
+		  pgSnp.name, refAllele, lm);
+}
+
+struct variant *variantFromVcfAnnoRow(struct annoRow *row, char *refAllele, struct lm *lm,
+				      struct dyString *dyScratch)
+/* Translate vcf array of words into variant (allocated by lm, overwriting dyScratch
+ * as temporary scratch string). */
+{
+char **words = row->data;
+char *alStr = vcfGetSlashSepAllelesFromWords(words, dyScratch);
+// The reference allele is the first allele in alStr -- and it may be trimmed on both ends with
+// respect to the raw VCF ref allele in words[3], so copy vcfRefAllele back out of alStr.
+// That ensures that variantNew will get the reference allele that matches the slash-separated
+// allele string.
+int refLen = strlen(alStr);
+char *p = strchr(alStr, '/');
+if (p)
+    refLen = p - alStr;
+char vcfRefAllele[refLen + 1];
+safencpy(vcfRefAllele, sizeof(vcfRefAllele), alStr, refLen);
+unsigned alCount = countChars(alStr, '/') + 1;
+return variantNew(row->chrom, row->start, row->end, alCount, alStr, vcfRefAllele, lm);
+}
+
+static char *findRefAllele(struct variant *variant)
+/* Find the reference allele (preferably not symbolic); return NULL if variant doesn't have one.
+ * Don't free result. */
+{
+char *refAllele = NULL;
+struct allele *allele;
+for (allele = variant->alleles;  allele != NULL;  allele = allele->next)
+    {
+    if (allele->isReference)
+        {
+        refAllele = allele->sequence;
+        if (isAllNt(refAllele, strlen(refAllele)))
+            break;
+        }
+    }
+return refAllele;
+}
+
+struct variant *splitAndTrimVariants(struct variant *variantIn, struct lm *lm)
+/* Split variantIn into a list of single-allele variants with redundant ref/alt bases trimmed. */
+{
+struct variant *variantList = NULL;
+struct variant *variant;
+for (variant = variantIn; variant != NULL;  variant = variant->next)
+    {
+    char *refAllele = findRefAllele(variant);
+    if (refAllele == NULL || !isAllNt(refAllele, strlen(refAllele)))
+        slAddHead(&variantList, variant);
+    else
+        {
+        struct allele *allele;
+        for (allele = variant->alleles;  allele != NULL;  allele = allele->next)
+            {
+            if (! allele->isReference)
+                {
+                int refLen = strlen(refAllele), altLen = strlen(allele->sequence);
+                char ref[refLen+1], alt[altLen+1];
+                safecpy(ref, sizeof(ref), refAllele);
+                safecpy(alt, sizeof(alt), allele->sequence);
+                uint start = variant->chromStart, end = variant->chromEnd;
+                trimRefAlt(ref, alt, &start, &end, &refLen, &altLen);
+                slAddHead(&variantList, variantNew(variant->chrom, start, end, 1, alt, ref, lm));
+                }
+            }
+        }
+    }
+return variantList;
+}
+