bc21bd3d27fe3d29971231955b3fc544fa1c3d1e
angie
  Wed Oct 16 11:51:39 2013 -0700
Two new tracks for Locus Reference Genomic (LRG) (#11863) with customhandlers: LRG Regions and LRG Transcripts.
LRGs are frozen reference sequences for a particular gene plus some
upstream and downstream sequence.  They are intended to provide a
stable coordinate system for gene annotations that won't change
with every new genome assembly, but can be mapped to each genome
assembly.  Since there is a lot of metadata associated with each
region, I made LRG Regions a bigBed 12 + with fields describing
mismatches and indels, so that PSL can be derived from the bigBed
and the original LRG sequence can be reconstructed using genome
assembly sequence and the mismatch/indel info.  hgTracks shows
differences and LRG insertions into the reference assembly using
the cds.c baseColor code.  (LRG deletions from the reference appear
as gaps, which we get for free with bed12 info).
For LRG Transcripts, I found the genePred codon-coloring code
inadequate for showing an insertion into hg19 (or even mismatches),
so instead of genePred I ended up using PSL + sequence, more like
the mRNA track representation and display.

diff --git src/hg/lib/lrg.c src/hg/lib/lrg.c
new file mode 100644
index 0000000..3de7405
--- /dev/null
+++ src/hg/lib/lrg.c
@@ -0,0 +1,480 @@
+/* lrg.c was originally generated by the autoSql program, which also 
+ * generated lrg.h and lrg.sql.  This module links the database and
+ * the RAM representation of objects. */
+
+#include "common.h"
+#include "linefile.h"
+#include "dystring.h"
+#include "jksql.h"
+#include "lrg.h"
+
+
+
+char *lrgCommaSepFieldNames = "chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,reserved,blockCount,blockSizes,chromStarts,mismatches,indels,lrgSize,hgncId,hgncSymbol,ncbiAcc,lrgSource,lrgSourceUrl,creationDate";
+
+struct lrg *lrgLoad(char **row)
+/* Load a lrg from row fetched with select * from lrg
+ * from database.  Dispose of this with lrgFree(). */
+{
+struct lrg *ret;
+
+AllocVar(ret);
+ret->blockCount = sqlSigned(row[9]);
+ret->chrom = cloneString(row[0]);
+ret->chromStart = sqlUnsigned(row[1]);
+ret->chromEnd = sqlUnsigned(row[2]);
+ret->name = cloneString(row[3]);
+ret->score = sqlUnsigned(row[4]);
+safecpy(ret->strand, sizeof(ret->strand), row[5]);
+ret->thickStart = sqlUnsigned(row[6]);
+ret->thickEnd = sqlUnsigned(row[7]);
+ret->reserved = sqlUnsigned(row[8]);
+{
+int sizeOne;
+sqlSignedDynamicArray(row[10], &ret->blockSizes, &sizeOne);
+assert(sizeOne == ret->blockCount);
+}
+{
+int sizeOne;
+sqlSignedDynamicArray(row[11], &ret->chromStarts, &sizeOne);
+assert(sizeOne == ret->blockCount);
+}
+ret->mismatches = cloneString(row[12]);
+ret->indels = cloneString(row[13]);
+ret->lrgSize = sqlUnsigned(row[14]);
+ret->hgncId = sqlSigned(row[15]);
+ret->hgncSymbol = cloneString(row[16]);
+ret->ncbiAcc = cloneString(row[17]);
+ret->lrgSource = cloneString(row[18]);
+ret->lrgSourceUrl = cloneString(row[19]);
+ret->creationDate = cloneString(row[20]);
+return ret;
+}
+
+struct lrg *lrgLoadAll(char *fileName) 
+/* Load all lrg from a whitespace-separated file.
+ * Dispose of this with lrgFreeList(). */
+{
+struct lrg *list = NULL, *el;
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *row[21];
+
+while (lineFileRow(lf, row))
+    {
+    el = lrgLoad(row);
+    slAddHead(&list, el);
+    }
+lineFileClose(&lf);
+slReverse(&list);
+return list;
+}
+
+struct lrg *lrgLoadAllByChar(char *fileName, char chopper) 
+/* Load all lrg from a chopper separated file.
+ * Dispose of this with lrgFreeList(). */
+{
+struct lrg *list = NULL, *el;
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *row[21];
+
+while (lineFileNextCharRow(lf, chopper, row, ArraySize(row)))
+    {
+    el = lrgLoad(row);
+    slAddHead(&list, el);
+    }
+lineFileClose(&lf);
+slReverse(&list);
+return list;
+}
+
+struct lrg *lrgCommaIn(char **pS, struct lrg *ret)
+/* Create a lrg out of a comma separated string. 
+ * This will fill in ret if non-null, otherwise will
+ * return a new lrg */
+{
+char *s = *pS;
+
+if (ret == NULL)
+    AllocVar(ret);
+ret->chrom = sqlStringComma(&s);
+ret->chromStart = sqlUnsignedComma(&s);
+ret->chromEnd = sqlUnsignedComma(&s);
+ret->name = sqlStringComma(&s);
+ret->score = sqlUnsignedComma(&s);
+sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand));
+ret->thickStart = sqlUnsignedComma(&s);
+ret->thickEnd = sqlUnsignedComma(&s);
+ret->reserved = sqlUnsignedComma(&s);
+ret->blockCount = sqlSignedComma(&s);
+{
+int i;
+s = sqlEatChar(s, '{');
+AllocArray(ret->blockSizes, ret->blockCount);
+for (i=0; i<ret->blockCount; ++i)
+    {
+    ret->blockSizes[i] = sqlSignedComma(&s);
+    }
+s = sqlEatChar(s, '}');
+s = sqlEatChar(s, ',');
+}
+{
+int i;
+s = sqlEatChar(s, '{');
+AllocArray(ret->chromStarts, ret->blockCount);
+for (i=0; i<ret->blockCount; ++i)
+    {
+    ret->chromStarts[i] = sqlSignedComma(&s);
+    }
+s = sqlEatChar(s, '}');
+s = sqlEatChar(s, ',');
+}
+ret->mismatches = sqlStringComma(&s);
+ret->indels = sqlStringComma(&s);
+ret->lrgSize = sqlUnsignedComma(&s);
+ret->hgncId = sqlSignedComma(&s);
+ret->hgncSymbol = sqlStringComma(&s);
+ret->ncbiAcc = sqlStringComma(&s);
+ret->lrgSource = sqlStringComma(&s);
+ret->lrgSourceUrl = sqlStringComma(&s);
+ret->creationDate = sqlStringComma(&s);
+*pS = s;
+return ret;
+}
+
+void lrgFree(struct lrg **pEl)
+/* Free a single dynamically allocated lrg such as created
+ * with lrgLoad(). */
+{
+struct lrg *el;
+
+if ((el = *pEl) == NULL) return;
+freeMem(el->chrom);
+freeMem(el->name);
+freeMem(el->blockSizes);
+freeMem(el->chromStarts);
+freeMem(el->mismatches);
+freeMem(el->indels);
+freeMem(el->hgncSymbol);
+freeMem(el->ncbiAcc);
+freeMem(el->lrgSource);
+freeMem(el->lrgSourceUrl);
+freeMem(el->creationDate);
+freez(pEl);
+}
+
+void lrgFreeList(struct lrg **pList)
+/* Free a list of dynamically allocated lrg's */
+{
+struct lrg *el, *next;
+
+for (el = *pList; el != NULL; el = next)
+    {
+    next = el->next;
+    lrgFree(&el);
+    }
+*pList = NULL;
+}
+
+void lrgOutput(struct lrg *el, FILE *f, char sep, char lastSep) 
+/* Print out lrg.  Separate fields with sep. Follow last field with lastSep. */
+{
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->chrom);
+if (sep == ',') fputc('"',f);
+fputc(sep,f);
+fprintf(f, "%u", el->chromStart);
+fputc(sep,f);
+fprintf(f, "%u", el->chromEnd);
+fputc(sep,f);
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->name);
+if (sep == ',') fputc('"',f);
+fputc(sep,f);
+fprintf(f, "%u", el->score);
+fputc(sep,f);
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->strand);
+if (sep == ',') fputc('"',f);
+fputc(sep,f);
+fprintf(f, "%u", el->thickStart);
+fputc(sep,f);
+fprintf(f, "%u", el->thickEnd);
+fputc(sep,f);
+fprintf(f, "%u", el->reserved);
+fputc(sep,f);
+fprintf(f, "%d", el->blockCount);
+fputc(sep,f);
+{
+int i;
+if (sep == ',') fputc('{',f);
+for (i=0; i<el->blockCount; ++i)
+    {
+    fprintf(f, "%d", el->blockSizes[i]);
+    fputc(',', f);
+    }
+if (sep == ',') fputc('}',f);
+}
+fputc(sep,f);
+{
+int i;
+if (sep == ',') fputc('{',f);
+for (i=0; i<el->blockCount; ++i)
+    {
+    fprintf(f, "%d", el->chromStarts[i]);
+    fputc(',', f);
+    }
+if (sep == ',') fputc('}',f);
+}
+fputc(sep,f);
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->mismatches);
+if (sep == ',') fputc('"',f);
+fputc(sep,f);
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->indels);
+if (sep == ',') fputc('"',f);
+fputc(sep,f);
+fprintf(f, "%u", el->lrgSize);
+fputc(sep,f);
+fprintf(f, "%d", el->hgncId);
+fputc(sep,f);
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->hgncSymbol);
+if (sep == ',') fputc('"',f);
+fputc(sep,f);
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->ncbiAcc);
+if (sep == ',') fputc('"',f);
+fputc(sep,f);
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->lrgSource);
+if (sep == ',') fputc('"',f);
+fputc(sep,f);
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->lrgSourceUrl);
+if (sep == ',') fputc('"',f);
+fputc(sep,f);
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->creationDate);
+if (sep == ',') fputc('"',f);
+fputc(lastSep,f);
+}
+
+/* -------------------------------- End autoSql Generated Code -------------------------------- */
+
+#include "psl.h"
+#include "hdb.h"
+
+static struct lrgDiff *lrgDiffNew(uint chromStart, uint chromEnd, char *chromSeq,
+				  uint lrgStart, uint lrgEnd, char *lrgSeq)
+/* Alloc, populate and return a new lrgDiff. */
+{
+struct lrgDiff *diff;
+AllocVar(diff);
+diff->chromStart = chromStart;
+diff->chromEnd = chromEnd;
+diff->chromSeq = chromSeq;
+diff->lrgStart = lrgStart;
+diff->lrgEnd = lrgEnd;
+diff->lrgSeq = lrgSeq;
+return diff;
+}
+
+static struct lrgDiff *lrgParseDiffs(struct lrg *lrg, char *diffStr)
+/* Parse diffStr, i.e. a comma-separated sequence of colon-separated elements.
+ * Each colon-separated element has chromStart, lrgStart, chromSeq and lrgSeq.
+ * Return a list of lrgDiffs. */
+{
+struct lrgDiff *diffList = NULL;
+struct slName *diffEl, *diffStrList = slNameListFromString(diffStr, ',');
+for (diffEl = diffStrList;  diffEl != NULL;  diffEl = diffEl->next)
+    {
+    int expectedCount = 4;
+    char *words[expectedCount+1];
+    int wordCount = chopByChar(cloneString(diffEl->name), ':', words, ArraySize(words));
+    if (wordCount != expectedCount)
+	errAbort("lrgParseMismatches: expected %d :-sep words, got %d", expectedCount, wordCount);
+    uint chromStart = atoll(words[0]) + lrg->chromStart;
+    uint lrgStart = atoll(words[1]);
+    char *chromSeq = words[2];
+    char *lrgSeq = words[3];
+    uint chromEnd = sameString(chromSeq, "-") ? chromStart : chromStart + strlen(chromSeq);
+    uint lrgEnd = sameString(lrgSeq, "-") ? lrgStart : lrgStart + strlen(lrgSeq);
+    struct lrgDiff *diff = lrgDiffNew(chromStart, chromEnd, chromSeq, lrgStart, lrgEnd, lrgSeq);
+    slAddHead(&diffList, diff);
+    }
+slNameFreeList(&diffStrList);
+slReverse(&diffList);
+return diffList;
+}
+
+struct lrgDiff *lrgParseMismatches(struct lrg *lrg)
+/* Parse lrg->mismatches and return a list of lrgDiffs. */
+{
+return lrgParseDiffs(lrg, lrg->mismatches);
+}
+
+struct lrgDiff *lrgParseIndels(struct lrg *lrg)
+/* Parse lrg->indels and return a list of lrgDiffs. */
+{
+return lrgParseDiffs(lrg, lrg->indels);
+}
+
+void lrgDiffFreeList(struct lrgDiff **pDiff)
+/* Free up a list of parsed diffs. */
+{
+if (pDiff == NULL || *pDiff == NULL)
+    return;
+struct lrgDiff *diff;
+for (diff = *pDiff;  diff != NULL;  diff = diff->next)
+    {
+    freeMem(diff->chromSeq);
+    freeMem(diff->lrgSeq);
+    }
+slFreeList(pDiff);
+}
+
+static int calcBlockSize(uint nextTStart, uint nextQStart, struct psl *psl, int blockIx)
+{
+int tLen = nextTStart - psl->tStarts[blockIx];
+int qLen = nextQStart - psl->qStarts[blockIx];
+return min(tLen, qLen);
+}
+
+struct psl *lrgToPsl(struct lrg *lrg, uint chromSize)
+/* Use lrg's mismatches and indels to make a PSL. */
+{
+struct psl *psl;
+AllocVar(psl);
+struct lrgDiff *mismatches = lrgParseMismatches(lrg), *indels = lrgParseIndels(lrg);
+psl->misMatch = slCount(mismatches);
+psl->repMatch = 0;
+psl->nCount = 0;
+psl->qNumInsert = 0;
+psl->qBaseInsert = 0;
+psl->tNumInsert = 0;
+psl->tBaseInsert = 0;
+psl->strand[0] = lrg->strand[0];
+psl->qName = cloneString(lrg->name);
+psl->qSize = lrg->lrgSize;
+psl->qStart = 0;
+psl->qEnd = lrg->lrgSize;
+psl->tName = cloneString(lrg->chrom);
+psl->tSize = chromSize;
+psl->tStart = lrg->chromStart;
+psl->tEnd = lrg->chromEnd;
+psl->blockCount = slCount(indels) + 1;
+AllocArray(psl->blockSizes, psl->blockCount);
+AllocArray(psl->qStarts, psl->blockCount);
+AllocArray(psl->tStarts, psl->blockCount);
+boolean isRc = (lrg->strand[0] == '-');
+// Translate gap coords from indels into block coords:
+psl->qStarts[0] = 0;
+psl->tStarts[0] = lrg->chromStart;
+int alignedBaseCount = 0;
+int blockIx = 1;
+struct lrgDiff *diff;
+for (diff = indels;  diff != NULL;  diff = diff->next)
+    {
+    // now we know the size of the previous block:
+    int lrgStartPosStrand = isRc ? (lrg->lrgSize - diff->lrgEnd) : diff->lrgStart;
+    psl->blockSizes[blockIx-1] = calcBlockSize(diff->chromStart, lrgStartPosStrand, psl, blockIx-1);
+    alignedBaseCount += psl->blockSizes[blockIx-1];
+    // Insertion on t or q?
+    int lrgLen = diff->lrgEnd - diff->lrgStart;
+    int refLen = diff->chromEnd - diff->chromStart;
+    if (lrgLen > refLen)
+	{
+	psl->qNumInsert++;
+	psl->qBaseInsert += (lrgLen - refLen);
+	}
+    else if (refLen > lrgLen)
+	{
+	psl->tNumInsert++;
+	psl->tBaseInsert += (refLen - lrgLen);
+	}
+    psl->qStarts[blockIx] = lrgStartPosStrand + lrgLen;
+    psl->tStarts[blockIx] = diff->chromEnd;
+    blockIx++;
+    }
+// size of last block:
+psl->blockSizes[blockIx-1] = calcBlockSize(lrg->chromEnd, lrg->lrgSize, psl, blockIx-1);
+alignedBaseCount += psl->blockSizes[blockIx-1];
+if (blockIx != psl->blockCount)
+    errAbort("lrgToPsl: expected %d blocks but somehow ended up with %d", psl->blockCount,
+	     blockIx);
+psl->match = alignedBaseCount - psl->misMatch;
+return psl;
+}
+
+static int sumInsertions(struct lrgDiff *indels)
+/* Return the sum of insertions of lrg into reference sequence, ignoring deletions.
+ * That is the max headroom that we will need while shifting bases around. */
+{
+int sumIns = 0;
+struct lrgDiff *diff;
+for (diff = indels;  diff != NULL;  diff = diff->next)
+    sumIns += (diff->lrgEnd - diff->lrgStart);
+return sumIns;
+}
+
+struct dnaSeq *lrgReconstructSequence(struct lrg *lrg, char *db)
+/* Use genomic sequence, lrg->mismatches and lrg->indels to reconstruct LRG sequence */
+{
+struct lrgDiff *diff, *indels = lrgParseIndels(lrg);
+int refSize = lrg->chromEnd - lrg->chromStart;
+// Fetch genomic sequence plus extra headroom for insertions (if any):
+int seqSize = refSize + sumInsertions(indels);
+struct dnaSeq *lrgSeq = hDnaFromSeq(db, lrg->chrom, lrg->chromStart, lrg->chromStart+seqSize,
+				    dnaUpper);
+char *lrgSeqDna = lrgSeq->dna;
+if (seqSize > refSize)
+    zeroBytes(lrgSeqDna+refSize, seqSize-refSize);
+boolean isRc = (lrg->strand[0] == '-');
+if (isRc)
+    reverseComplement(lrgSeqDna, refSize);
+// Go through indels backwards w.r.t. genome assembly so we move sequence to the right
+// while not changing coords to the left:
+if (!isRc)
+    slReverse(&indels);
+for (diff = indels;  diff != NULL;  diff = diff->next)
+    {
+    int lrgLen = diff->lrgEnd - diff->lrgStart;
+    int refLen = diff->chromEnd - diff->chromStart;
+    int refStart = diff->chromStart - lrg->chromStart;
+    if (isRc)
+	refStart = refSize - (refStart + refLen);
+    if (lrgLen > refLen)
+	{
+	// LRG inserts sequence: shift the rest of the sequence to the right
+	int insSize = lrgLen - refLen;
+	int moveSize = seqSize - refStart - insSize;
+	memmove(lrgSeqDna+refStart+insSize, lrgSeqDna+refStart, moveSize);
+	}
+    else
+	{
+	// LRG deletes sequence: shift the rest of the sequence to the left
+	int delSize = refLen - lrgLen;
+	int moveSize = seqSize - refStart - delSize + 1; // '\0' at end too
+	memmove(lrgSeqDna+refStart, lrgSeqDna+refStart+delSize, moveSize);
+	}
+    // If there is LRG sequence, copy it in.
+    if (lrgLen > 0)
+	memcpy(lrgSeqDna+refStart, diff->lrgSeq, lrgLen);
+    }
+// Now that indels have been resolved, use LRG coords to substitute mismatches:
+struct lrgDiff *mismatches = lrgParseMismatches(lrg);
+for (diff = mismatches;  diff != NULL;  diff = diff->next)
+    {
+    if (lrgSeqDna[diff->lrgStart] != diff->chromSeq[0])
+	errAbort("Mismatch with unexpected assembly sequence: expected '%c' on %c strand, "
+		 "got '%c'", diff->chromSeq[0], lrg->strand[0], lrgSeqDna[diff->lrgStart]);
+    int size = diff->lrgEnd - diff->lrgStart;
+    memcpy(lrgSeqDna+diff->lrgStart, diff->lrgSeq, size);
+    }
+if (strlen(lrgSeqDna) != lrg->lrgSize)
+    errAbort("maybeGetSeqUpper: Error applying LRG indels for '%s'", lrg->name);
+lrgSeq->size = lrg->lrgSize;
+return lrgSeq;
+}
+