15973bdabacf5ddf5dd9289d05e83a7cd4080dbb braney Tue Nov 9 09:12:48 2021 -0800 support for bigRmsk from Robert Hubley robert.hubley@isbscience.org diff --git src/hg/hgc/bigRmskClick.c src/hg/hgc/bigRmskClick.c new file mode 100644 index 0000000..9ef3f1d --- /dev/null +++ src/hg/hgc/bigRmskClick.c @@ -0,0 +1,368 @@ +/* bigRmskClick - Click handler for bigRmskTrack. This is based + * loosely on the original rmsk click handler inside + * of hgc.c. + * + * Written by Robert Hubley 10/2021 + */ + +/* Copyright (C) 2021 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + +#include "common.h" +#include "hgc.h" +#include "rmskAlign.h" +#include "hCommon.h" + + +void printAlignmentBR (struct rmskAlign *ra) +/* + * Print RepeatMasker alignment data stored in RM's cAlign format. + * The format is basically a lightly compressed diff format where + * the query and subject are merged into one squence line. The + * runs of exact matching sequences are interrupted by either + * single base substitutions annotated as queryBase "/" subjectBase, + * insertions in the subject annotated as "+ACTAT+", or deletions + * in the query annotated as "-ACTTTG-". + */ +{ +int alignLength = 80; +char querySeq[alignLength + 1]; +char diffSeq[alignLength + 1]; +char subjSeq[alignLength + 1]; +int inSub = 0; +int inDel = 0; +int inIns = 0; + +int aIdx = 0; +int sIdx = 0; +int qCnt = 0; +int sCnt = 0; +int qStart = ra->genoStart; +int sStart = ra->repStart; +if (ra->strand[0] == '-') + sStart = ra->repEnd; + +int maxNameLen = + (strlen (ra->genoName) > + strlen (ra->repName) ? strlen (ra->genoName) : strlen (ra->repName)); + +while (ra->alignment[aIdx] != '\0') + { + if (ra->alignment[aIdx] == '/') + inSub = 1; + else if (ra->alignment[aIdx] == '-') + inDel ^= 1; + else if (ra->alignment[aIdx] == '+') + inIns ^= 1; + else + { + if (inSub) + { + subjSeq[sIdx - 1] = ra->alignment[aIdx]; + if ((querySeq[sIdx - 1] == 'C' && + subjSeq[sIdx - 1] == 'T') || + (querySeq[sIdx - 1] == 'T' && + subjSeq[sIdx - 1] == 'C') || + (querySeq[sIdx - 1] == 'A' && + subjSeq[sIdx - 1] == 'G') || + (querySeq[sIdx - 1] == 'G' && subjSeq[sIdx - 1] == 'A')) + diffSeq[sIdx - 1] = 'i'; + else if ((index ("BDHVRYKMSWNX", querySeq[sIdx - 1]) != NULL) || + (index ("BDHVRYKMSWNX", subjSeq[sIdx - 1]) != NULL)) + diffSeq[sIdx - 1] = '?'; + else + diffSeq[sIdx - 1] = 'v'; + inSub = 0; + } + else if (inDel) + { + querySeq[sIdx] = ra->alignment[aIdx]; + subjSeq[sIdx] = '-'; + diffSeq[sIdx] = '-'; + qCnt++; + sIdx++; + } + else if (inIns) + { + querySeq[sIdx] = '-'; + subjSeq[sIdx] = ra->alignment[aIdx]; + diffSeq[sIdx] = '-'; + sCnt++; + sIdx++; + } + else + { + diffSeq[sIdx] = ' '; + querySeq[sIdx] = ra->alignment[aIdx]; + subjSeq[sIdx] = ra->alignment[aIdx]; + sCnt++; + qCnt++; + sIdx++; + } + if (sIdx == alignLength) + { + querySeq[sIdx] = '\0'; + diffSeq[sIdx] = '\0'; + subjSeq[sIdx] = '\0'; + printf ("%*s %10d %s %d\n", maxNameLen, ra->genoName, qStart, + querySeq, (qStart + qCnt - 1)); + printf ("%*s %s\n", maxNameLen, " ", diffSeq); + if (ra->strand[0] == '+') + printf ("%*s %10d %s %d\n", maxNameLen, ra->repName, sStart, + subjSeq, (sStart + sCnt - 1)); + else + printf ("%*s %10d %s %d\n", maxNameLen, ra->repName, sStart, + subjSeq, (sStart - sCnt + 1)); + printf ("\n"); + qStart += qCnt; + if (ra->strand[0] == '+') + sStart += sCnt; + else + sStart -= sCnt; + qCnt = 0; + sCnt = 0; + sIdx = 0; + } + } + aIdx++; + } +if (sIdx) + { + querySeq[sIdx] = '\0'; + diffSeq[sIdx] = '\0'; + subjSeq[sIdx] = '\0'; + printf ("%*s %10d %s %d\n", maxNameLen, ra->genoName, qStart, querySeq, + (qStart + qCnt - 1)); + printf ("%*s %s\n", maxNameLen, " ", diffSeq); + if (ra->strand[0] == '+') + printf ("%*s %10d %s %d\n", maxNameLen, ra->repName, sStart, subjSeq, + (sStart + sCnt - 1)); + else + printf ("%*s %10d %s %d\n", maxNameLen, ra->repName, sStart, subjSeq, + (sStart - sCnt + 1)); + ; + } +} + +void printOutTableHeaderBR (char strand) +/* Print the appropriate HTML table header + * given the strand of the element. + */ +{ +const char *hd1_style = + "style=\"background-color:#C2C9E0; font-size:14px; padding:4px 14px;\""; +const char *hd1span_style = + "style=\"background-color:#6678B1; font-size:14px; padding:4px 14px;\""; +const char *hd2_style = + "style=\"background-color:#C2C9E0; font-size:14px; padding:4px 14px; " + "border-bottom: 2px solid #6678B1;\""; +printf (" <thead>\n"); +printf (" <tr>\n"); +printf (" <th %s>bit/sw</th>\n", hd1_style); +printf (" <th %s>perc</th>\n", hd1_style); +printf (" <th %s>perc</th>\n", hd1_style); +printf (" <th %s>perc</th>\n", hd1_style); +printf (" <th colspan=\"4\" %s>query</th>\n", hd1span_style); +printf (" <th %s></th>\n", hd1_style); +printf (" <th colspan=\"5\" %s>matching repeat</th>\n", hd1span_style); +printf (" <th %s></th>\n", hd1_style); +printf (" </tr>\n"); +printf (" <tr>\n"); +printf (" <th %s>score</th>\n", hd2_style); +printf (" <th %s>div.</th>\n", hd2_style); +printf (" <th %s>del.</th>\n", hd2_style); +printf (" <th %s>ins.</th>\n", hd2_style); +printf (" <th %s>sequence</th>\n", hd2_style); +printf (" <th %s>begin</th>\n", hd2_style); +printf (" <th %s>end</th>\n", hd2_style); +printf (" <th %s>remaining</th>\n", hd2_style); +printf (" <th %s>orient.</th>\n", hd2_style); +printf (" <th %s>name</th>\n", hd2_style); +printf (" <th %s>class/family</th>\n", hd2_style); +if (strand == 'C') + { + printf (" <th %s>remaining</th>\n", hd2_style); + printf (" <th %s>end</th>\n", hd2_style); + printf (" <th %s>begin</th>\n", hd2_style); + } +else + { + printf (" <th %s>begin</th>\n", hd2_style); + printf (" <th %s>end</th>\n", hd2_style); + printf (" <th %s>remaining</th>\n", hd2_style); + } +printf (" <th %s>id</th>\n", hd2_style); +printf (" </tr>\n"); +printf (" </thead>\n"); +} + + +void doBigRmskRepeat (struct trackDb *tdb, char *item) +/* Main entry point */ +{ +cartWebStart (cart, database, "%s", tdb->longLabel); +char *chrom = cartString(cart, "c"); +int start = cartInt(cart, "o"); +int end = cartInt(cart, "t"); +char *fileName = trackDbSetting(tdb, "bigDataUrl"); + +/* Open BigWig file and get interval list. */ +struct bbiFile *bbi = bigBedFileOpen(fileName); +struct lm *lm = lmInit(0); +struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm); + +/* Find particular item in list - matching start, and item if possible. */ +boolean found = FALSE; +struct bigBedInterval *bb; +const char *data_style = "style=\"padding:0px 6px;\""; +int bedSize = bbi->fieldCount; +char *fields[bedSize]; +char startBuf[16], endBuf[16]; +for (bb = bbList; bb != NULL; bb = bb->next) + { + bigBedIntervalToRow(bb, chrom, startBuf, endBuf, fields, + bedSize); + if ( sameOk(fields[12], item) ) + { + found = TRUE; + break; + } + } + +if ( found ) + { + char class[32]; + class[0] = '\0'; + char subclass[32]; + subclass[0] = '\0'; + char *poundPtr = index (fields[3], '#'); + if (poundPtr) + { + // Terminate name string properly + safecpy (class, sizeof (class), poundPtr + 1); + *poundPtr = '\0'; + char *slashPtr = index (class, '/'); + if (slashPtr) + { + // Terminate class string properly + safecpy (subclass, sizeof (subclass), slashPtr + 1); + *slashPtr = '\0'; + } + } + printf ("<b>Repeat:</b> %s<br>\n", fields[3]); + printf ("<b>Class:</b> %s<br>\n", class); + printf ("<b>Subclass:</b> %s<br>\n", subclass); + printf ("<b>Orientation:</b> %s<br>\n", fields[5]); + //printf ("<b>Joined Element Genomic Range:</b> %s:%d-%d<br>\n", + //rmJoin->chrom, rmJoin->alignStart+1, rmJoin->alignEnd); + printf ("<br><br>\n"); + + printf ("<h4>RepeatMasker Joined Annotations:</h4>\n"); + printf + ("The RepeatMasker annotation line(s) for this element. " + "If the element is fragmented the output will contain one " + "line per joined fragment.<p>\n"); + + printf ("<table cellspacing=\"0\">\n"); + char *description = fields[13]; + printOutTableHeaderBR(*fields[5]); + char *outLine; + char *outLineTok = description; + while ((outLine = strsep(&outLineTok,",")) != NULL) + { + char *outField; + char *outFieldTok = outLine; + printf (" <tr>\n"); + while ((outField = strsep(&outFieldTok," ")) != NULL) + { + printf (" <td %s>%s</td>\n", data_style, outField); + } + printf (" <td>\n"); + printf ("<A HREF=\"%s&db=%s&position=%s%%3A%d-%s\">Browser</A>", + hgTracksPathAndSettings (), database, fields[0], + atoi(fields[6]) + 1, fields[7]); + char *tbl = cgiUsualString ("table", cgiString ("g")); + printf (" - <A HREF=\"%s&o=%s&g=getDna&i=%s&c=%s&l=%s&r=%s&" + "strand=%s&table=%s\">" + "DNA</A>\n", hgcPathAndSettings (), fields[6], + (fields[3] != NULL ? cgiEncode (fields[3]) : ""), + fields[0], fields[6], fields[7], cgiEncode (fields[5]), + tbl); + printf (" </td>\n"); + printf (" </tr>\n"); + } + printf("</table>\n"); + } +else + { + printf("No item %s starting at %d\n", emptyForNull(item), start); + } +lmCleanup(&lm); +bbiFileClose(&bbi); + + /* + * Locate *.align data for this element + */ +/* + * DISABLED FOR NOW, this might be part of a future enhancement if we create a auxilary bigBed file to + * contain this data + * + if (hTableExists (database, alignTable)) + { + struct rmskAlign *ro; + if (!hFindSplitTable (database, seqName, alignTable, qTable, sizeof(qTable), &hasBin)) + errAbort("track %s not found", alignTable); + sqlSafef (query, sizeof (query), + "select * from %s where genoName = '%s' and genoStart >= %d" + " and id = %s", qTable, seqName, start-1, repeat); + sr2 = sqlGetResult (conn2, query); + printf ("<h4>RepeatMasker Alignments:</h4>\n"); + printf + ("The raw alignment data used by RepeatMasker to generate " + "the final annotation call for this element. NOTE: The " + "aligned sequence names and consensus positions may differ " + "from the final annotation.<p>\n"); + printf ("<table>\n"); + while ((row = sqlNextRow (sr2)) != NULL) + { + ro = rmskAlignLoad (row + hasBin); + printf (" <tr>\n"); + printf (" <td>%d</td>\n", ro->swScore); + printf (" <td>%3.2f</td>\n", + (double) ro->milliDiv * (double) 0.01); + printf (" <td>%3.2f</td>\n", + (double) ro->milliDel * (double) 0.01); + printf (" <td>%3.2f</td>\n", + (double) ro->milliIns * (double) 0.01); + printf (" <td>%s</td>\n", ro->genoName); + printf (" <td>%d</td>\n", ro->genoStart + 1); + printf (" <td>%d</td>\n", ro->genoEnd); + printf (" <td>(%d)</td>\n", ro->genoLeft); + printf (" <td>%s</td>\n", ro->strand); + printf (" <td>%s</td>\n", ro->repName); + printf (" <td>%s/%s</td>\n", ro->repClass, ro->repFamily); + if (ro->strand[0] == '-') + { + printf (" <td>(%d)</td>\n", ro->repLeft); + printf (" <td>%d</td>\n", ro->repEnd); + printf (" <td>%d</td>\n", ro->repStart); + } + else + { + printf (" <td>%d</td>\n", ro->repStart); + printf (" <td>%d</td>\n", ro->repEnd); + printf (" <td>(%d)</td>\n", ro->repLeft); + } + printf (" <td>%d</td>\n", ro->id); + printf (" </tr><tr><td colspan=\"15\"><pre>\n"); + printAlignment (ro); + printf (" </pre><td></tr>\n"); + } + sqlFreeResult (&sr2); + printf ("</table>\n"); + } + hFreeConn (&conn2); + } + */ +printTrackHtml (tdb); +}