bc21bd3d27fe3d29971231955b3fc544fa1c3d1e
angie
Wed Oct 16 11:51:39 2013 -0700
Two new tracks for Locus Reference Genomic (LRG) (#11863) with customhandlers: LRG Regions and LRG Transcripts.
LRGs are frozen reference sequences for a particular gene plus some
upstream and downstream sequence. They are intended to provide a
stable coordinate system for gene annotations that won't change
with every new genome assembly, but can be mapped to each genome
assembly. Since there is a lot of metadata associated with each
region, I made LRG Regions a bigBed 12 + with fields describing
mismatches and indels, so that PSL can be derived from the bigBed
and the original LRG sequence can be reconstructed using genome
assembly sequence and the mismatch/indel info. hgTracks shows
differences and LRG insertions into the reference assembly using
the cds.c baseColor code. (LRG deletions from the reference appear
as gaps, which we get for free with bed12 info).
For LRG Transcripts, I found the genePred codon-coloring code
inadequate for showing an insertion into hg19 (or even mismatches),
so instead of genePred I ended up using PSL + sequence, more like
the mRNA track representation and display.
diff --git src/hg/hgc/lrgClick.c src/hg/hgc/lrgClick.c
new file mode 100644
index 0000000..d6728ad
--- /dev/null
+++ src/hg/hgc/lrgClick.c
@@ -0,0 +1,248 @@
+/* lrgClick.c - Locus Reference Genomic (LRG) sequences mapped to genome assembly */
+
+#include "common.h"
+#include "hgc.h"
+#include "bigBed.h"
+#include "fa.h"
+#include "genbank.h"
+#include "hdb.h"
+#include "trackHub.h"
+#include "lrg.h"
+
+INLINE void printStartAndMaybeEnd(uint start, uint end)
+{
+if (end == start+1)
+ printf("%d", end);
+else
+ printf("%d-%d", start+1, end);
+}
+
+static void printLrgDiffs(struct lrg *lrg, struct lrgDiff *diffList)
+/* Diffstr is a comma-separated list of colon-separated blkStart, lrgStart, assemblySeq, lrgSeq.
+ * Make it more readable as a table with links to position ranges. */
+{
+printf("
\n"
+ "%s location | %s sequence | "
+ "%s location | %s sequence",
+ lrg->name, lrg->name, database, database);
+if (lrg->strand[0] == '-')
+ printf(" (on - strand)");
+printf(" |
");
+struct lrgDiff *diff;
+for (diff = diffList; diff != NULL; diff = diff->next)
+ {
+ printf("");
+ printStartAndMaybeEnd(diff->lrgStart, diff->lrgEnd);
+ printf(" | %s | ", diff->lrgSeq);
+ // Make position links that are zoomed out a tiny bit from the actual positions:
+ const int rangeFudge = 10;
+ uint rangeStart = (diff->chromStart <= rangeFudge) ? 0 : diff->chromStart - rangeFudge;
+ uint rangeEnd = diff->chromEnd + rangeFudge;
+ printf("%s:",
+ hgTracksPathAndSettings(), database, lrg->chrom, rangeStart+1, rangeEnd, lrg->chrom);
+ printStartAndMaybeEnd(diff->chromStart, diff->chromEnd);
+ printf(" | %s |
\n", diff->chromSeq);
+ }
+printf("
\n");
+slNameFreeList(&diffList);
+}
+
+void doLrg(struct trackDb *tdb, char *item)
+/* Locus Reference Genomic (LRG) track info. */
+{
+char *chrom = cartString(cart, "c");
+int start = cgiInt("o");
+int end = cgiInt("t");
+struct sqlConnection *conn = NULL ;
+if (!trackHubDatabase(database))
+ conn = hAllocConnTrack(database, tdb);
+char *fileName = bbiNameFromSettingOrTable(tdb, conn, tdb->table);
+hFreeConn(&conn);
+if (isEmpty(fileName))
+ {
+ errAbort("doLrg: missing bigBed fileName for track '%s'", tdb->track);
+ }
+
+genericHeader(tdb, item);
+
+// Get column urls from trackDb:
+char *urlsStr = trackDbSetting(tdb, "urls");
+struct hash *columnUrls = hashFromString(urlsStr);
+
+// Open BigWig file and get interval list.
+struct bbiFile *bbi = bigBedFileOpen(fileName);
+int bedSize = bbi->definedFieldCount;
+int fieldCount = bbi->fieldCount;
+struct lm *lm = lmInit(0);
+struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm);
+boolean found = FALSE;
+struct bigBedInterval *bb;
+for (bb = bbList; bb != NULL; bb = bb->next)
+ {
+ if (!(bb->start == start && bb->end == end))
+ continue;
+ char *fields[fieldCount];
+ char startBuf[16], endBuf[16];
+ int bbFieldCount = bigBedIntervalToRow(bb, chrom, startBuf, endBuf, fields, fieldCount);
+ if (bbFieldCount != fieldCount)
+ errAbort("doLrg: inconsistent fieldCount (bbi has %d, row has %d)",
+ fieldCount, bbFieldCount);
+ struct lrg *lrg = lrgLoad(fields);
+ if (differentString(lrg->name, item))
+ continue;
+ found = TRUE;
+ printCustomUrl(tdb, lrg->name, TRUE);
+ bedPrintPos((struct bed *)lrg, bedSize, tdb);
+ char *url = hashFindVal(columnUrls, "hgncId");
+ printf("HGNC Gene Symbol: ");
+ if (isNotEmpty(url))
+ {
+ char hgncIdStr[32];
+ safef(hgncIdStr, sizeof(hgncIdStr), "%d", lrg->hgncId);
+ char *idUrl = replaceInUrl(tdb, url, hgncIdStr, TRUE);
+ printf("%s
\n", idUrl, lrg->hgncSymbol);
+ }
+ else
+ printf("%s
\n", lrg->hgncSymbol);
+ printf("Source of LRG sequence: "
+ "%s
\n", lrg->lrgSourceUrl, lrg->lrgSource);
+ printf("LRG Sequence at NCBI: ");
+ url = hashFindVal(columnUrls, "ncbiAcc");
+ if (isNotEmpty(url))
+ {
+ char *idUrl = replaceInUrl(tdb, url, lrg->ncbiAcc, TRUE);
+ printf("%s
\n", idUrl, lrg->ncbiAcc);
+ }
+ else
+ printf("%s
\n", lrg->ncbiAcc);
+ printf("Creation Date: %s
\n", lrg->creationDate);
+ char *assemblyDate = hFreezeDate(database);
+ if (isNotEmpty(lrg->mismatches))
+ {
+ printf("
Mismatches between %s and %s assembly sequence:
\n",
+ lrg->name, assemblyDate);
+ struct lrgDiff *mismatches = lrgParseMismatches(lrg);
+ printLrgDiffs(lrg, mismatches);
+ }
+ if (isNotEmpty(lrg->indels))
+ {
+ printf("
Insertions/deletions between %s and %s assembly sequence:
\n",
+ lrg->name, assemblyDate);
+ struct lrgDiff *indels = lrgParseIndels(lrg);
+ printLrgDiffs(lrg, indels);
+ }
+ }
+if (!found)
+ {
+ printf("No item '%s' starting at %d\n", emptyForNull(item), start);
+ }
+lmCleanup(&lm);
+printTrackHtml(tdb);
+}
+
+static struct genbankCds *getLrgCds(struct sqlConnection *conn, struct trackDb *tdb,
+ struct psl *psl)
+{
+char *cdsTable = trackDbSetting(tdb, "cdsTable");
+if (isEmpty(cdsTable) || !hTableExists(database, cdsTable))
+ return NULL;
+struct dyString *dy = sqlDyStringCreate("select cds from %s where id = '%s'",
+ cdsTable, psl->qName);
+char *cdsString = sqlQuickString(conn, dy->string);
+if (cdsString != NULL)
+ {
+ struct genbankCds *cds;
+ AllocVar(cds);
+ // Trash cdsString while parsing out start..end:
+ char *startString = cdsString, *endString = strchr(cdsString, '.');
+ if (endString == NULL || endString[1] != '.' || !isdigit(endString[2]))
+ errAbort("getLrgCds: can't parse cdsString '%s' for %s", cdsString, psl->qName);
+ *endString++ = '\0';
+ endString++;
+ cds->start = sqlUnsigned(startString) - 1;
+ cds->end = sqlUnsigned(endString);
+ cds->startComplete = TRUE;
+ cds->endComplete = TRUE;
+ cds->complement = (psl->strand[0] == '-');
+ return cds;
+ }
+return NULL;
+}
+
+void htcLrgCdna(char *item)
+/* Serve up LRG transcript cdna seq */
+{
+cartHtmlStart("LRG Transcript Sequence");
+char *pslTable = cartString(cart, "table");
+struct trackDb *tdb = hashFindVal(trackHash, pslTable);
+char *cdnaTable = trackDbSetting(tdb, "cdnaTable");
+if (isEmpty(cdnaTable) || !hTableExists(database, cdnaTable))
+ errAbort("htcLrgCdna: cdnaTable not found for '%s'", pslTable);
+
+struct sqlConnection *conn = hAllocConn(database);
+struct psl *psl = getAlignments(conn, pslTable, item);
+struct genbankCds *cds = getLrgCds(conn, tdb, psl);
+hFreeConn(&conn);
+
+struct dnaSeq *seq = hGenBankGetMrna(database, item, cdnaTable);
+if (seq == NULL)
+ errAbort("htcLrgCdna: no sequence for '%s' in cdnaTable '%s'", item, cdnaTable);
+tolowers(seq->dna);
+if (cds != NULL)
+ toUpperN(seq->dna + cds->start, cds->end - cds->start);
+
+printf("");
+printf(">%s\n", item);
+faWriteNext(stdout, NULL, seq->dna, seq->size);
+printf("
");
+
+hFreeConn(&conn);
+}
+
+void doLrgTranscriptPsl(struct trackDb *tdb, char *item)
+/* Locus Reference Genomic (LRG) transcript mapping and sequences. */
+{
+struct sqlConnection *conn = hAllocConn(database);
+struct psl *psl = getAlignments(conn, tdb->table, item);
+struct genbankCds *cds = getLrgCds(conn, tdb, psl);
+hFreeConn(&conn);
+
+genericHeader(tdb, item);
+char *url = trackDbSetting(tdb, "url");
+if (isNotEmpty(url))
+ {
+ char *lrgName = cloneString(item);
+ // Truncate the "t1" part to get the LRG ID for link:
+ char *p = strchr(lrgName, 't');
+ if (p)
+ *p = '\0';
+ char *urlLabel = trackDbSettingOrDefault(tdb, "urlLabel", "LRG Transcript link");
+ printf("%s ", urlLabel);
+ char *lrgTUrl = replaceInUrl(tdb, url, lrgName, TRUE);
+ printf("%s
\n", lrgTUrl, item);
+ }
+
+struct genePred *gp = genePredFromPsl3(psl, cds, genePredAllFlds, genePredPslCdsMod3, -1, -1);
+printPos(gp->chrom, gp->txStart, gp->txEnd, gp->strand, FALSE, NULL);
+printf("Links to %s sequence:
\n", item);
+printf("\n");
+char *pepTable = trackDbSetting(tdb, "pepTable");
+if ((pepTable != NULL) && hGenBankHaveSeq(database, item, pepTable))
+ {
+ puts("- \n");
+ hgcAnchorSomewhere("htcTranslatedProtein", item, pepTable, seqName);
+ printf("Translated Protein \n");
+ puts("
\n");
+ }
+puts("- \n");
+hgcAnchorSomewhere("htcLrgCdna", item, tdb->table, seqName);
+printf("Transcript mRNA (may be different from the genomic sequence)\n");
+puts("
\n");
+printf("
\n");
+
+printf("Alignment of %s to genome assembly:
", item);
+int start = cartInt(cart, "o");
+printAlignments(psl, start, "htcCdnaAli", tdb->table, item);
+
+printTrackHtml(tdb);
+}