c7723adc566681d4f69232076df356c36aa5a1cf
max
  Fri Sep 9 16:03:22 2016 -0700
adding first versin of pipeline for crispr tracks and trackDb statements supporting external extra fields, refs #17235

diff --git src/hg/hgc/bigBedClick.c src/hg/hgc/bigBedClick.c
index 174a478..7d12190 100644
--- src/hg/hgc/bigBedClick.c
+++ src/hg/hgc/bigBedClick.c
@@ -1,51 +1,310 @@
 /* Handle details pages for wiggle tracks. */
 
 /* Copyright (C) 2013 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "common.h"
 #include "wiggle.h"
 #include "cart.h"
 #include "hgc.h"
 #include "hCommon.h"
 #include "hgColors.h"
 #include "bigBed.h"
 #include "hui.h"
+#include "subText.h"
 
 static void bigGenePredLinks(char *track, char *item)
 /* output links to genePred driven sequence dumps */
 {
 printf("<H3>Links to sequence:</H3>\n");
 printf("<UL>\n");
 puts("<LI>\n");
 hgcAnchorSomewhere("htcTranslatedPredMRna", item, "translate", seqName);
 printf("Translated Protein</A> from genomic DNA\n");
 puts("</LI>\n");
 
 puts("<LI>\n");
 hgcAnchorSomewhere("htcGeneMrna", item, track, seqName);
 printf("Predicted mRNA</A> \n");
 puts("</LI>\n");
 
 puts("<LI>\n");
 hgcAnchorSomewhere("htcGeneInGenome", item, track, seqName);
 printf("Genomic Sequence</A> from assembly\n");
 puts("</LI>\n");
 printf("</UL>\n");
 }
 
+void printMismatchString(char *a, char *b) 
+/* given two strings of same length, print . for every match and for mismatches, the letter of b */
+{
+int i = 0;
+while (TRUE)
+    {
+    if (a[i]=='\0' || b[i]=='\0')
+        break;
+    if (a[i]==b[i])
+        printf(".");
+    else
+        printf("%c", (b[i]));
+    i++;
+    }
+}
+
+static void detailsTabPrintSpecial(char *name, char *val, struct slPair *extraFields)
+/* some extra fields require special printing code, they all start with '_'  */
+{
+// crispr track: number of mismatches. A comma-sep string of integers
+if (sameWord(name, "_mismatchCounts"))
+    {
+    printf("<tr><td>Number of potential off-targets</td>\n");
+    printf("<td>\n");
+    char *words[255];
+    int wordCount = chopByChar(val, ',', words, ArraySize(words));
+    int i;
+    printf("<table style='border-style: hidden'><tr>\n");
+    for (i=0; i<wordCount; i++)
+        printf("<td style='border:1px solid #CCCCCC; font-weight: normal; width:auto'><b>%d mismatches:</b><br>%s off-targets</td>", i, words[i]);
+    printf("</tr></table>\n");
+    }
+
+// crispr track: locations of off-targets. A |-separated string of coords, including strand and 
+// a score
+// e.g. chr15;63615585-;71|chr16;8835640+;70
+if (sameWord(name, "_crisprOfftargets"))
+    {
+    printf("<tr><td>Potential Off-targets</td>\n");
+
+    printf("<td>\n");
+    char *coords[65536];
+    int coordCount = chopByChar(val, '|', coords, ArraySize(coords));
+    int i;
+
+    struct subText *subList = NULL;
+    slSafeAddHead(&subList, subTextNew("ig:", "intergenic "));
+    slSafeAddHead(&subList, subTextNew("ex:", "exon "));
+    slSafeAddHead(&subList, subTextNew("in:", "intron "));
+    slSafeAddHead(&subList, subTextNew("|", "-"));
+
+    struct sqlConnection *conn = hAllocConn(database);
+
+    boolean hasLocus = sqlTableExists(conn, "locusName");
+
+    printf("<table style='border-collapse:collapse; font-size:12px; table-layout:fixed'>\n");
+    printf("<tr>\n"
+           "<th style='width:26em'>Mismatched nucleotides</th>\n"
+           "<th style='width:9em'>CFD Score</th>\n");
+    if (hasLocus)
+           printf("<th style='width:40em'>Locus</th>\n");
+    printf("<th style='width:30em'>Position</th></tr>\n");
+
+    boolean collapsed = FALSE;
+    for (i=0; i<coordCount; i++)
+        {
+        if (i>10)
+            {
+            collapsed = TRUE;
+            printf("<tr class='crisprLinkHidden' style='display:none'>\n");
+            }
+        else
+            printf("<tr>\n");
+
+        // parse single coordinate string
+        // chr15;63615585-;71 = chrom;startPosStrand;scoreAsInt
+        char *parts[3];
+        chopByChar(coords[i], ';', parts, 3);
+        char* chrom = parts[0];
+        char* posStrand = parts[1];
+        char* scoreStr = parts[2];
+
+        // get score and strand
+        char strand = *(posStrand+strlen(posStrand)-1);
+        int pos = atol(posStrand);
+        int scoreInt = atoi(scoreStr);
+        float score = (float)scoreInt/1000;
+
+        // get the DNA sequence - this is slow! twoBit currently does not cache
+        // if the input is not sorted and this list is sorted by off-target score (CFD)
+        struct dnaSeq *seq = hDnaFromSeq(database, chrom, pos, pos+23, dnaUpper);
+        if (strand=='-')
+            reverseComplement(seq->dna, seq->size);
+        char *guideSeq = (char*)slPairFindVal(extraFields, "guideSeq");
+        // PAM = the last three chars of the off-target
+        char *pam = seq->dna+20;
+            
+        // print sequence + PAM
+        printf("<td><tt>");
+        printMismatchString(guideSeq, seq->dna);
+        printf("&nbsp;%s", pam);
+        printf("</tt></td>\n");
+
+        // print score of off-target
+        printf("<td>%0.3f</td>", score);
+
+        // print name of this locus
+        if (hasLocus)
+            {
+            struct sqlResult *sr = hRangeQuery(conn, "locusName", chrom, pos, pos+23, NULL, 0);
+            char **row;
+            row = sqlNextRow(sr);
+            if (row != NULL)
+                {
+                char *desc = row[4];
+                char *descLong = subTextString(subList, desc);
+                printf("<td>%s</td>", descLong);
+                freeMem(descLong);
+                }
+            sqlFreeResult(&sr);
+            }
+        
+        // print link to location
+        printf("<td><a href='%s&db=%s&position=%s%%3A%d-%d'>%s:%d (%c)</a></td>\n", 
+            hgTracksPathAndSettings(), database,
+            chrom, pos+1, pos+23, chrom, pos+1, strand);
+
+        printf("</tr>\n");
+        }
+    hFreeConn(&conn);
+    printf("<tr>\n");
+    printf("</table>\n");
+    if (collapsed)
+        {
+        printf("<p><a id='crisprShowAllLink' href='javascript:crisprShowAll()'>"
+            "Show all %d off-targets...</a>\n", coordCount);
+        // inline .js is bad style but why pollute our global .js files for such a rare
+        // case? Maybe we should have a generic "collapsible" class, like bootstrap?
+        printf("<script>\n");
+        printf("function crisprShowAll() {\n");
+        printf("    $('#crisprShowAllLink').hide();\n");
+        printf("    $('.crisprLinkHidden').show();\n");
+        printf("    return false;\n");
+        printf("}\n");
+        printf("</script>\n");
+        }
+    }
+}
+
+static void seekAndPrintTable(char *url, off_t offset, struct slPair *extraFields)
+/* seek to 0 at url, get headers, then seek to offset, read tab-sep fields and output 
+ * (extraFields are needed for some special field handlers) */
+{
+char *detailsUrl = replaceChars(url, "$db", database);
+
+// open the URL
+struct lineFile *lf = lineFileUdcMayOpen(detailsUrl, FALSE);
+if (lf==NULL)
+    {
+    printf("Error: Could not open the URL referenced in detailsTabUrls, %s", detailsUrl);
+    return;
+    }
+
+// get the headers
+char *headLine;
+int lineSize;
+lineFileNext(lf, &headLine, &lineSize);
+char *headers[1024];
+int headerCount = chopTabs(headLine, headers);
+
+lineFileSeek(lf, offset, SEEK_SET);
+
+// read a line
+char *detailsLine;
+lineFileNext(lf, &detailsLine, &lineSize);
+if (!detailsLine || isEmpty(detailsLine))
+    return;
+char *fields[1024];
+int fieldCount = chopTabs(detailsLine, fields);
+
+if (fieldCount!=headerCount)
+    {
+    printf("Error encountered when reading %s:<br>", detailsUrl);
+    printf("The header line of the tab-sep file has a different number of fields compared ");
+    printf("with the line pointed to by offset %lld in the bigBed file.<br>", (long long int)offset);
+    printf("Number of headers: %d", headerCount);
+    printf("Number of fields at offset: %d", fieldCount);
+    return;
+    }
+
+// print the table for all external extra fields 
+printf("<br><table class='bedExtraTbl'>\n");
+fieldCount = min(fieldCount, headerCount);
+int i;
+for (i=0; i<fieldCount; i++)
+{
+    char *name = headers[i];
+    char *val  = fields[i];
+    
+    if (startsWith("_", name))
+        detailsTabPrintSpecial(name, val, extraFields);
+    else
+        {
+        printf("<tr><td>%s</td>\n", name);
+        printf("<td>%s</td></tr>\n", val);
+        }
+}
+printf("</table>\n");
+
+lineFileClose(&lf);
+freez(&detailsUrl);
+}
+
+static void printAllExternalExtraFields(struct trackDb *tdb, struct slPair *extraFields)
+/* handle the "detailsTabUrls" trackDb setting: 
+ * For each field, print a separate html table with all field names and values
+ * from the external tab-sep file */
+
+{
+char *detailsUrlsStr = trackDbSetting(tdb, "detailsTabUrls");
+if (!detailsUrlsStr)
+    return;
+
+struct slPair *detailsUrls = slPairListFromString(detailsUrlsStr, TRUE);
+if (!detailsUrls)
+    {
+    printf("Problem when parsing trackDb setting detailsTabUrls<br>\n");
+    printf("Expected: a space-separated key=val list, like 'fieldName1=URL1 fieldName2=URL2'<br>\n");
+    printf("But got: '%s'<br>", detailsUrlsStr);
+    return;
+    }
+
+struct slPair *pair;
+for (pair = detailsUrls; pair != NULL; pair = pair->next)
+    {
+    char *fieldName = pair->name;
+    char *detailsUrl = pair->val;
+
+    // get extra bigBed field (=the offset) and seek to it
+    void *p = slPairFindVal(extraFields, fieldName);
+    if (p==NULL)
+        {
+        printf("Error when parsing trackDb detailsTabUrls statement:<br>\n");
+        printf("Cannot find extra bigBed field with name %s\n", fieldName);
+        return;
+        }
+    char *offsetStr = (char*)p;
+
+    if (offsetStr==NULL || sameWord(offsetStr, "0"))
+        // empty or "0" value in bigBed means that the lookup should not be performed
+        continue;
+    off_t offset = atoll(offsetStr);
+
+    seekAndPrintTable(detailsUrl, offset, extraFields);
+    }
+slPairFreeValsAndList(&detailsUrls);
+}
+
 static void bigBedClick(char *fileName, struct trackDb *tdb,
                      char *item, int start, int end, int bedSize)
 /* Handle click in generic bigBed track. */
 {
 boolean showUrl = FALSE;
 char *chrom = cartString(cart, "c");
 
 /* Open BigWig file and get interval list. */
 struct bbiFile *bbi = bigBedFileOpen(fileName);
 struct lm *lm = lmInit(0);
 int ivStart = start, ivEnd = end;
 if (start == end)
     {
     // item is an insertion; expand the search range from 0 bases to 2 so we catch it:
     ivStart = max(0, start-1);
@@ -107,45 +366,54 @@
     if (showUrl && (bedSize >= 4))
         printCustomUrl(tdb, item, TRUE);
     bedPrintPos(bed, bedSize, tdb);
 
     // display seq1 and seq2
     if (seq1Seq2 && bedSize+seq1Seq2Fields == 8)
         printf("<table><tr><th>Sequence 1</th><th>Sequence 2</th></tr>"
 	       "<tr><td> %s </td><td> %s </td></tr></table>", fields[6], fields[7]);
     else if (isNotEmpty(rest))
 	{
 	char *restFields[256];
 	int restCount = chopTabs(rest, restFields);
 	int restBedFields = bedSize - 3;
 	if (restCount > restBedFields)
 	    {
-            if (0 == extraFieldsPrint(tdb,NULL,restFields + restBedFields,restCount - restBedFields))
+            char **extraFields = (restFields + restBedFields);
+            int extraFieldCount = restCount - restBedFields;
+            int printCount = extraFieldsPrint(tdb,NULL,extraFields, extraFieldCount);
+
+            struct slPair* extraFieldPairs = getExtraFields(tdb, extraFields, extraFieldCount);
+            printAllExternalExtraFields(tdb, extraFieldPairs);
+
+            if (printCount == 0)
                 {
                 int i;
                 char label[20];
                 safef(label, sizeof(label), "nonBedFieldsLabel");
                 printf("<B>%s&nbsp;</B>",
                        trackDbSettingOrDefault(tdb, label, "Non-BED fields:"));
                 for (i = restBedFields;  i < restCount;  i++)
                     printf("%s%s", (i > 0 ? "\t" : ""), restFields[i]);
                 printf("<BR>\n");
                 }
 	    }
 	if (sameString(tdb->type, "bigGenePred"))
 	    bigGenePredLinks(tdb->track, item);
 	}
+
+
     if (isCustomTrack(tdb->track))
 	{
 	time_t timep = bbiUpdateTime(bbi);
 	printBbiUpdateTime(&timep);
 	}
 
     }
 
 if (!found)
     {
     printf("No item %s starting at %d\n", emptyForNull(item), start);
     }
 
 lmCleanup(&lm);
 bbiFileClose(&bbi);