18b8815d717cb178232321e32c31dba40fd5e72d markd Wed Jul 13 17:12:25 2022 -0700 integrate bigRmsk track updates from Robert Hubley diff --git src/hg/hgc/bigRmskClick.c src/hg/hgc/bigRmskClick.c index 8527233..e7c1718 100644 --- src/hg/hgc/bigRmskClick.c +++ src/hg/hgc/bigRmskClick.c @@ -3,157 +3,180 @@ * of hgc.c. * * Written by Robert Hubley 10/2021 */ /* Copyright (C) 2021 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "hgc.h" #include "rmskAlign.h" #include "hCommon.h" #include "chromAlias.h" -void printAlignmentBR (struct rmskAlign *ra) +struct bigRmskAlignRecord + { + char *chrom; /*Reference sequence chromosome or scaffold*/ + unsigned chromStart; /*Start position of alignment on chromosome*/ + unsigned chromEnd; /*End position of alignment on chromosome*/ + unsigned chromRemain; /*Remaining bp in the chromosome or scaffold*/ + float score; /*alignment score (sw, bits or evalue)*/ + float percSubst; /*Base substitution percentage*/ + float percDel; /*Base deletion percentage*/ + float percIns; /*Bases insertion percentage*/ + char strand[2]; /*Strand - either + or -*/ + char *repName; /*Name of repeat*/ + char *repType; /*Type of repeat*/ + char *repSubtype; /*Subtype of repeat*/ + unsigned repStart; /*Start in repeat sequence*/ + unsigned repEnd; /*End in repeat sequence*/ + unsigned repRemain; /*Remaining unaligned bp in the repeat sequence*/ + unsigned id; /*The ID of the hit. Used to link related fragments*/ + char *calignData; /*The alignment data stored as a single string*/ + }; + + +void printAlignmentBR (char strand, char * genoName, uint genoStart, uint genoEnd, + char *repName, uint repStart, uint repEnd, + char *calignData ) /* * Print RepeatMasker alignment data stored in RM's cAlign format. * The format is basically a lightly compressed diff format where * the query and subject are merged into one squence line. The * runs of exact matching sequences are interrupted by either * single base substitutions annotated as queryBase "/" subjectBase, * insertions in the subject annotated as "+ACTAT+", or deletions * in the query annotated as "-ACTTTG-". */ { int alignLength = 80; char querySeq[alignLength + 1]; char diffSeq[alignLength + 1]; char subjSeq[alignLength + 1]; int inSub = 0; int inDel = 0; int inIns = 0; int aIdx = 0; int sIdx = 0; int qCnt = 0; int sCnt = 0; -int qStart = ra->genoStart; -int sStart = ra->repStart; -if (ra->strand[0] == '-') - sStart = ra->repEnd; +uint qStart = genoStart; +uint sStart = repStart; +if (strand == '-') + sStart = repEnd; int maxNameLen = - (strlen (ra->genoName) > - strlen (ra->repName) ? strlen (ra->genoName) : strlen (ra->repName)); + (strlen(genoName) > + strlen(repName) ? strlen(genoName) : strlen(repName)); -while (ra->alignment[aIdx] != '\0') +while (calignData[aIdx] != '\0') { - if (ra->alignment[aIdx] == '/') + if (calignData[aIdx] == '/') inSub = 1; - else if (ra->alignment[aIdx] == '-') + else if (calignData[aIdx] == '-') inDel ^= 1; - else if (ra->alignment[aIdx] == '+') + else if (calignData[aIdx] == '+') inIns ^= 1; else { if (inSub) { - subjSeq[sIdx - 1] = ra->alignment[aIdx]; + subjSeq[sIdx - 1] = calignData[aIdx]; if ((querySeq[sIdx - 1] == 'C' && subjSeq[sIdx - 1] == 'T') || (querySeq[sIdx - 1] == 'T' && subjSeq[sIdx - 1] == 'C') || (querySeq[sIdx - 1] == 'A' && subjSeq[sIdx - 1] == 'G') || (querySeq[sIdx - 1] == 'G' && subjSeq[sIdx - 1] == 'A')) diffSeq[sIdx - 1] = 'i'; else if ((index ("BDHVRYKMSWNX", querySeq[sIdx - 1]) != NULL) || (index ("BDHVRYKMSWNX", subjSeq[sIdx - 1]) != NULL)) diffSeq[sIdx - 1] = '?'; else diffSeq[sIdx - 1] = 'v'; inSub = 0; } else if (inDel) { - querySeq[sIdx] = ra->alignment[aIdx]; + querySeq[sIdx] = calignData[aIdx]; subjSeq[sIdx] = '-'; diffSeq[sIdx] = '-'; qCnt++; sIdx++; } else if (inIns) { querySeq[sIdx] = '-'; - subjSeq[sIdx] = ra->alignment[aIdx]; + subjSeq[sIdx] = calignData[aIdx]; diffSeq[sIdx] = '-'; sCnt++; sIdx++; } else { diffSeq[sIdx] = ' '; - querySeq[sIdx] = ra->alignment[aIdx]; - subjSeq[sIdx] = ra->alignment[aIdx]; + querySeq[sIdx] = calignData[aIdx]; + subjSeq[sIdx] = calignData[aIdx]; sCnt++; qCnt++; sIdx++; } if (sIdx == alignLength) { querySeq[sIdx] = '\0'; diffSeq[sIdx] = '\0'; subjSeq[sIdx] = '\0'; - printf ("%*s %10d %s %d\n", maxNameLen, ra->genoName, qStart, + printf ("%*s %10d %s %d\n", maxNameLen, genoName, qStart, querySeq, (qStart + qCnt - 1)); printf ("%*s %s\n", maxNameLen, " ", diffSeq); - if (ra->strand[0] == '+') - printf ("%*s %10d %s %d\n", maxNameLen, ra->repName, sStart, + if (strand == '+') + printf ("%*s %10d %s %d\n", maxNameLen, repName, sStart, subjSeq, (sStart + sCnt - 1)); else - printf ("%*s %10d %s %d\n", maxNameLen, ra->repName, sStart, + printf ("%*s %10d %s %d\n", maxNameLen, repName, sStart, subjSeq, (sStart - sCnt + 1)); printf ("\n"); qStart += qCnt; - if (ra->strand[0] == '+') + if (strand == '+') sStart += sCnt; else sStart -= sCnt; qCnt = 0; sCnt = 0; sIdx = 0; } } aIdx++; } if (sIdx) { querySeq[sIdx] = '\0'; diffSeq[sIdx] = '\0'; subjSeq[sIdx] = '\0'; - printf ("%*s %10d %s %d\n", maxNameLen, ra->genoName, qStart, querySeq, + printf ("%*s %10d %s %d\n", maxNameLen, genoName, qStart, querySeq, (qStart + qCnt - 1)); printf ("%*s %s\n", maxNameLen, " ", diffSeq); - if (ra->strand[0] == '+') - printf ("%*s %10d %s %d\n", maxNameLen, ra->repName, sStart, subjSeq, + if (strand == '+') + printf ("%*s %10d %s %d\n", maxNameLen, repName, sStart, subjSeq, (sStart + sCnt - 1)); else - printf ("%*s %10d %s %d\n", maxNameLen, ra->repName, sStart, subjSeq, + printf ("%*s %10d %s %d\n", maxNameLen, repName, sStart, subjSeq, (sStart - sCnt + 1)); - ; } } void printOutTableHeaderBR (char strand) /* Print the appropriate HTML table header * given the strand of the element. */ { const char *hd1_style = "style=\"background-color:#C2C9E0; font-size:14px; padding:4px 14px;\""; const char *hd1span_style = "style=\"background-color:#6678B1; font-size:14px; padding:4px 14px;\""; const char *hd2_style = "style=\"background-color:#C2C9E0; font-size:14px; padding:4px 14px; " "border-bottom: 2px solid #6678B1;\""; @@ -193,44 +216,45 @@ printf (" <th %s>remaining</th>\n", hd2_style); } printf (" <th %s>id</th>\n", hd2_style); printf (" </tr>\n"); printf (" </thead>\n"); } void doBigRmskRepeat (struct trackDb *tdb, char *item) /* Main entry point */ { cartWebStart (cart, database, "%s", tdb->longLabel); char *chrom = cartString(cart, "c"); int start = cartInt(cart, "o"); int end = cartInt(cart, "t"); -char *fileName = trackDbSetting(tdb, "bigDataUrl"); -/* Open BigWig file and get interval list. */ + +/* Open bigBed file and get interval list. */ +char *fileName = trackDbSetting(tdb, "bigDataUrl"); struct bbiFile *bbi = bigBedFileOpenAlias(fileName,chromAliasFindAliases); struct lm *lm = lmInit(0); struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm); /* Find particular item in list - matching start, and item if possible. */ boolean found = FALSE; struct bigBedInterval *bb; const char *data_style = "style=\"padding:0px 6px;\""; int bedSize = bbi->fieldCount; char *fields[bedSize]; -char startBuf[16], endBuf[16]; +char startBuf[17], endBuf[17]; for (bb = bbList; bb != NULL; bb = bb->next) { bigBedIntervalToRow(bb, chrom, startBuf, endBuf, fields, bedSize); if ( sameOk(fields[12], item) ) { found = TRUE; break; } } if ( found ) { char class[32]; class[0] = '\0'; @@ -292,78 +316,96 @@ printf (" </td>\n"); printf (" </tr>\n"); } printf("</table>\n"); } else { printf("No item %s starting at %d\n", emptyForNull(item), start); } lmCleanup(&lm); bbiFileClose(&bbi); /* * Locate *.align data for this element */ -/* - * DISABLED FOR NOW, this might be part of a future enhancement if we create a auxilary bigBed file to - * contain this data - * - if (hTableExists (database, alignTable)) +char *xrefDataUrl = trackDbSetting(tdb, "xrefDataUrl"); +if ( xrefDataUrl ) { - struct rmskAlign *ro; - if (!hFindSplitTable (database, seqName, alignTable, qTable, sizeof(qTable), &hasBin)) - errAbort("track %s not found", alignTable); - sqlSafef (query, sizeof (query), - "select * from %s where genoName = '%s' and genoStart >= %d" - " and id = %s", qTable, seqName, start-1, repeat); - sr2 = sqlGetResult (conn2, query); + //struct bbiFile *abbi = bigBedFileOpen(xrefDataUrl); + struct bbiFile *abbi = bigBedFileOpenAlias(xrefDataUrl,chromAliasFindAliases); + struct lm *alm = lmInit(0); + struct bigBedInterval *abbList = bigBedIntervalQuery(abbi, chrom, start, end, 0, alm); + printf ("<h4>RepeatMasker Alignments:</h4>\n"); - printf - ("The raw alignment data used by RepeatMasker to generate " + printf ("The raw alignment data used by RepeatMasker to generate " "the final annotation call for this element. NOTE: The " "aligned sequence names and consensus positions may differ " "from the final annotation.<p>\n"); printf ("<table>\n"); - while ((row = sqlNextRow (sr2)) != NULL) + + struct bigRmskAlignRecord *arec = NULL; + AllocVar(arec); + for (bb = abbList; bb != NULL; bb = bb->next) + { + char *afields[abbi->fieldCount]; + bigBedIntervalToRow(bb, chrom, startBuf, endBuf, afields, + abbi->fieldCount); + if ( sameOk(afields[15], item) ) { - ro = rmskAlignLoad (row + hasBin); + arec->chrom = afields[0]; + arec->chromStart = sqlUnsigned(afields[1]); + arec->chromEnd = sqlUnsigned(afields[2]); + arec->chromRemain = sqlUnsigned(afields[3]); + arec->score = sqlFloat(afields[4]); + arec->percSubst = sqlFloat(afields[5]); + arec->percDel = sqlFloat(afields[6]); + arec->percIns = sqlFloat(afields[7]); + safef(arec->strand, sizeof(arec->strand), "%s", afields[8]); + arec->repName = afields[9]; + arec->repType = afields[10]; + arec->repSubtype = afields[11]; + arec->repStart = sqlUnsigned(afields[12]); + arec->repEnd = sqlUnsigned(afields[13]); + arec->repRemain = sqlUnsigned(afields[14]); + arec->id = sqlUnsigned(afields[15]); + arec->calignData = afields[16]; + printf (" <tr>\n"); - printf (" <td>%d</td>\n", ro->swScore); - printf (" <td>%3.2f</td>\n", - (double) ro->milliDiv * (double) 0.01); - printf (" <td>%3.2f</td>\n", - (double) ro->milliDel * (double) 0.01); - printf (" <td>%3.2f</td>\n", - (double) ro->milliIns * (double) 0.01); - printf (" <td>%s</td>\n", ro->genoName); - printf (" <td>%d</td>\n", ro->genoStart + 1); - printf (" <td>%d</td>\n", ro->genoEnd); - printf (" <td>(%d)</td>\n", ro->genoLeft); - printf (" <td>%s</td>\n", ro->strand); - printf (" <td>%s</td>\n", ro->repName); - printf (" <td>%s/%s</td>\n", ro->repClass, ro->repFamily); - if (ro->strand[0] == '-') + printf (" <td>%.2f</td>\n", arec->score); + printf (" <td>%3.2f</td>\n", arec->percSubst); + printf (" <td>%3.2f</td>\n", arec->percDel); + printf (" <td>%3.2f</td>\n", arec->percIns); + printf (" <td>%s</td>\n", arec->chrom); + printf (" <td>%d</td>\n", arec->chromStart + 1); + printf (" <td>%d</td>\n", arec->chromEnd); + printf (" <td>(%d)</td>\n", arec->chromRemain); + printf (" <td>%s</td>\n", arec->strand); + printf (" <td>%s</td>\n", arec->repName); + printf (" <td>%s/%s</td>\n", arec->repType, arec->repSubtype); + if (arec->strand[0] == '-') { - printf (" <td>(%d)</td>\n", ro->repLeft); - printf (" <td>%d</td>\n", ro->repEnd); - printf (" <td>%d</td>\n", ro->repStart); + printf (" <td>(%d)</td>\n", arec->repRemain); + printf (" <td>%d</td>\n", arec->repEnd); + printf (" <td>%d</td>\n", arec->repStart); } else { - printf (" <td>%d</td>\n", ro->repStart); - printf (" <td>%d</td>\n", ro->repEnd); - printf (" <td>(%d)</td>\n", ro->repLeft); + printf (" <td>%d</td>\n", arec->repStart); + printf (" <td>%d</td>\n", arec->repEnd); + printf (" <td>(%d)</td>\n", arec->repRemain); } - printf (" <td>%d</td>\n", ro->id); + printf (" <td>%d</td>\n", arec->id); printf (" </tr><tr><td colspan=\"15\"><pre>\n"); - printAlignment (ro); + printAlignmentBR ( + arec->strand[0], arec->chrom, arec->chromStart, arec->chromEnd, + arec->repName, arec->repStart, arec->repEnd, + arec->calignData ); printf (" </pre><td></tr>\n"); } - sqlFreeResult (&sr2); + } //for(bb = abblist.... printf ("</table>\n"); - } - hFreeConn (&conn2); - } - */ + lmCleanup(&alm); + bbiFileClose(&abbi); + } // if ( xref... printTrackHtml(tdb); }