18b8815d717cb178232321e32c31dba40fd5e72d markd Wed Jul 13 17:12:25 2022 -0700 integrate bigRmsk track updates from Robert Hubley diff --git src/hg/hgc/bigRmskClick.c src/hg/hgc/bigRmskClick.c index 8527233..e7c1718 100644 --- src/hg/hgc/bigRmskClick.c +++ src/hg/hgc/bigRmskClick.c @@ -1,369 +1,411 @@ /* bigRmskClick - Click handler for bigRmskTrack. This is based * loosely on the original rmsk click handler inside * of hgc.c. * * Written by Robert Hubley 10/2021 */ /* Copyright (C) 2021 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "hgc.h" #include "rmskAlign.h" #include "hCommon.h" #include "chromAlias.h" -void printAlignmentBR (struct rmskAlign *ra) +struct bigRmskAlignRecord + { + char *chrom; /*Reference sequence chromosome or scaffold*/ + unsigned chromStart; /*Start position of alignment on chromosome*/ + unsigned chromEnd; /*End position of alignment on chromosome*/ + unsigned chromRemain; /*Remaining bp in the chromosome or scaffold*/ + float score; /*alignment score (sw, bits or evalue)*/ + float percSubst; /*Base substitution percentage*/ + float percDel; /*Base deletion percentage*/ + float percIns; /*Bases insertion percentage*/ + char strand[2]; /*Strand - either + or -*/ + char *repName; /*Name of repeat*/ + char *repType; /*Type of repeat*/ + char *repSubtype; /*Subtype of repeat*/ + unsigned repStart; /*Start in repeat sequence*/ + unsigned repEnd; /*End in repeat sequence*/ + unsigned repRemain; /*Remaining unaligned bp in the repeat sequence*/ + unsigned id; /*The ID of the hit. Used to link related fragments*/ + char *calignData; /*The alignment data stored as a single string*/ + }; + + +void printAlignmentBR (char strand, char * genoName, uint genoStart, uint genoEnd, + char *repName, uint repStart, uint repEnd, + char *calignData ) /* * Print RepeatMasker alignment data stored in RM's cAlign format. * The format is basically a lightly compressed diff format where * the query and subject are merged into one squence line. The * runs of exact matching sequences are interrupted by either * single base substitutions annotated as queryBase "/" subjectBase, * insertions in the subject annotated as "+ACTAT+", or deletions * in the query annotated as "-ACTTTG-". */ { int alignLength = 80; char querySeq[alignLength + 1]; char diffSeq[alignLength + 1]; char subjSeq[alignLength + 1]; int inSub = 0; int inDel = 0; int inIns = 0; int aIdx = 0; int sIdx = 0; int qCnt = 0; int sCnt = 0; -int qStart = ra->genoStart; -int sStart = ra->repStart; -if (ra->strand[0] == '-') - sStart = ra->repEnd; +uint qStart = genoStart; +uint sStart = repStart; +if (strand == '-') + sStart = repEnd; int maxNameLen = - (strlen (ra->genoName) > - strlen (ra->repName) ? strlen (ra->genoName) : strlen (ra->repName)); + (strlen(genoName) > + strlen(repName) ? strlen(genoName) : strlen(repName)); -while (ra->alignment[aIdx] != '\0') +while (calignData[aIdx] != '\0') { - if (ra->alignment[aIdx] == '/') + if (calignData[aIdx] == '/') inSub = 1; - else if (ra->alignment[aIdx] == '-') + else if (calignData[aIdx] == '-') inDel ^= 1; - else if (ra->alignment[aIdx] == '+') + else if (calignData[aIdx] == '+') inIns ^= 1; else { if (inSub) { - subjSeq[sIdx - 1] = ra->alignment[aIdx]; + subjSeq[sIdx - 1] = calignData[aIdx]; if ((querySeq[sIdx - 1] == 'C' && subjSeq[sIdx - 1] == 'T') || (querySeq[sIdx - 1] == 'T' && subjSeq[sIdx - 1] == 'C') || (querySeq[sIdx - 1] == 'A' && subjSeq[sIdx - 1] == 'G') || (querySeq[sIdx - 1] == 'G' && subjSeq[sIdx - 1] == 'A')) diffSeq[sIdx - 1] = 'i'; else if ((index ("BDHVRYKMSWNX", querySeq[sIdx - 1]) != NULL) || (index ("BDHVRYKMSWNX", subjSeq[sIdx - 1]) != NULL)) diffSeq[sIdx - 1] = '?'; else diffSeq[sIdx - 1] = 'v'; inSub = 0; } else if (inDel) { - querySeq[sIdx] = ra->alignment[aIdx]; + querySeq[sIdx] = calignData[aIdx]; subjSeq[sIdx] = '-'; diffSeq[sIdx] = '-'; qCnt++; sIdx++; } else if (inIns) { querySeq[sIdx] = '-'; - subjSeq[sIdx] = ra->alignment[aIdx]; + subjSeq[sIdx] = calignData[aIdx]; diffSeq[sIdx] = '-'; sCnt++; sIdx++; } else { diffSeq[sIdx] = ' '; - querySeq[sIdx] = ra->alignment[aIdx]; - subjSeq[sIdx] = ra->alignment[aIdx]; + querySeq[sIdx] = calignData[aIdx]; + subjSeq[sIdx] = calignData[aIdx]; sCnt++; qCnt++; sIdx++; } if (sIdx == alignLength) { querySeq[sIdx] = '\0'; diffSeq[sIdx] = '\0'; subjSeq[sIdx] = '\0'; - printf ("%*s %10d %s %d\n", maxNameLen, ra->genoName, qStart, + printf ("%*s %10d %s %d\n", maxNameLen, genoName, qStart, querySeq, (qStart + qCnt - 1)); printf ("%*s %s\n", maxNameLen, " ", diffSeq); - if (ra->strand[0] == '+') - printf ("%*s %10d %s %d\n", maxNameLen, ra->repName, sStart, + if (strand == '+') + printf ("%*s %10d %s %d\n", maxNameLen, repName, sStart, subjSeq, (sStart + sCnt - 1)); else - printf ("%*s %10d %s %d\n", maxNameLen, ra->repName, sStart, + printf ("%*s %10d %s %d\n", maxNameLen, repName, sStart, subjSeq, (sStart - sCnt + 1)); printf ("\n"); qStart += qCnt; - if (ra->strand[0] == '+') + if (strand == '+') sStart += sCnt; else sStart -= sCnt; qCnt = 0; sCnt = 0; sIdx = 0; } } aIdx++; } if (sIdx) { querySeq[sIdx] = '\0'; diffSeq[sIdx] = '\0'; subjSeq[sIdx] = '\0'; - printf ("%*s %10d %s %d\n", maxNameLen, ra->genoName, qStart, querySeq, + printf ("%*s %10d %s %d\n", maxNameLen, genoName, qStart, querySeq, (qStart + qCnt - 1)); printf ("%*s %s\n", maxNameLen, " ", diffSeq); - if (ra->strand[0] == '+') - printf ("%*s %10d %s %d\n", maxNameLen, ra->repName, sStart, subjSeq, + if (strand == '+') + printf ("%*s %10d %s %d\n", maxNameLen, repName, sStart, subjSeq, (sStart + sCnt - 1)); else - printf ("%*s %10d %s %d\n", maxNameLen, ra->repName, sStart, subjSeq, + printf ("%*s %10d %s %d\n", maxNameLen, repName, sStart, subjSeq, (sStart - sCnt + 1)); - ; } } void printOutTableHeaderBR (char strand) /* Print the appropriate HTML table header * given the strand of the element. */ { const char *hd1_style = "style=\"background-color:#C2C9E0; font-size:14px; padding:4px 14px;\""; const char *hd1span_style = "style=\"background-color:#6678B1; font-size:14px; padding:4px 14px;\""; const char *hd2_style = "style=\"background-color:#C2C9E0; font-size:14px; padding:4px 14px; " "border-bottom: 2px solid #6678B1;\""; printf (" \n"); printf (" \n"); printf (" bit/sw\n", hd1_style); printf (" perc\n", hd1_style); printf (" perc\n", hd1_style); printf (" perc\n", hd1_style); printf (" query\n", hd1span_style); printf (" \n", hd1_style); printf (" matching repeat\n", hd1span_style); printf (" \n", hd1_style); printf (" \n"); printf (" \n"); printf (" score\n", hd2_style); printf (" div.\n", hd2_style); printf (" del.\n", hd2_style); printf (" ins.\n", hd2_style); printf (" sequence\n", hd2_style); printf (" begin\n", hd2_style); printf (" end\n", hd2_style); printf (" remaining\n", hd2_style); printf (" orient.\n", hd2_style); printf (" name\n", hd2_style); printf (" class/family\n", hd2_style); if (strand == 'C') { printf (" remaining\n", hd2_style); printf (" end\n", hd2_style); printf (" begin\n", hd2_style); } else { printf (" begin\n", hd2_style); printf (" end\n", hd2_style); printf (" remaining\n", hd2_style); } printf (" id\n", hd2_style); printf (" \n"); printf (" \n"); } void doBigRmskRepeat (struct trackDb *tdb, char *item) /* Main entry point */ { cartWebStart (cart, database, "%s", tdb->longLabel); char *chrom = cartString(cart, "c"); int start = cartInt(cart, "o"); int end = cartInt(cart, "t"); -char *fileName = trackDbSetting(tdb, "bigDataUrl"); -/* Open BigWig file and get interval list. */ + +/* Open bigBed file and get interval list. */ +char *fileName = trackDbSetting(tdb, "bigDataUrl"); struct bbiFile *bbi = bigBedFileOpenAlias(fileName,chromAliasFindAliases); struct lm *lm = lmInit(0); struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm); /* Find particular item in list - matching start, and item if possible. */ boolean found = FALSE; struct bigBedInterval *bb; const char *data_style = "style=\"padding:0px 6px;\""; int bedSize = bbi->fieldCount; char *fields[bedSize]; -char startBuf[16], endBuf[16]; +char startBuf[17], endBuf[17]; for (bb = bbList; bb != NULL; bb = bb->next) { bigBedIntervalToRow(bb, chrom, startBuf, endBuf, fields, bedSize); if ( sameOk(fields[12], item) ) { found = TRUE; break; } } if ( found ) { char class[32]; class[0] = '\0'; char subclass[32]; subclass[0] = '\0'; char *poundPtr = index (fields[3], '#'); if (poundPtr) { // Terminate name string properly safecpy (class, sizeof (class), poundPtr + 1); *poundPtr = '\0'; char *slashPtr = index (class, '/'); if (slashPtr) { // Terminate class string properly safecpy (subclass, sizeof (subclass), slashPtr + 1); *slashPtr = '\0'; } } printf ("Repeat: %s
\n", fields[3]); printf ("Class: %s
\n", class); printf ("Subclass: %s
\n", subclass); printf ("Orientation: %s
\n", fields[5]); //printf ("Joined Element Genomic Range: %s:%d-%d
\n", //rmJoin->chrom, rmJoin->alignStart+1, rmJoin->alignEnd); printf ("

\n"); printf ("

RepeatMasker Joined Annotations:

\n"); printf ("The RepeatMasker annotation line(s) for this element. " "If the element is fragmented the output will contain one " "line per joined fragment.

\n"); printf ("\n"); char *description = fields[13]; printOutTableHeaderBR(*fields[5]); char *outLine; char *outLineTok = description; while ((outLine = strsep(&outLineTok,",")) != NULL) { char *outField; char *outFieldTok = outLine; printf (" \n"); while ((outField = strsep(&outFieldTok," ")) != NULL) { printf (" \n", data_style, outField); } printf (" \n"); printf (" \n"); } printf("
%s\n"); printf ("Browser", hgTracksPathAndSettings (), database, fields[0], atoi(fields[6]) + 1, fields[7]); char *tbl = cgiUsualString ("table", cgiString ("g")); printf (" - " "DNA\n", hgcPathAndSettings (), fields[6], (fields[3] != NULL ? cgiEncode (fields[3]) : ""), fields[0], fields[6], fields[7], cgiEncode (fields[5]), tbl); printf ("
\n"); } else { printf("No item %s starting at %d\n", emptyForNull(item), start); } lmCleanup(&lm); bbiFileClose(&bbi); /* * Locate *.align data for this element */ -/* - * DISABLED FOR NOW, this might be part of a future enhancement if we create a auxilary bigBed file to - * contain this data - * - if (hTableExists (database, alignTable)) +char *xrefDataUrl = trackDbSetting(tdb, "xrefDataUrl"); +if ( xrefDataUrl ) { - struct rmskAlign *ro; - if (!hFindSplitTable (database, seqName, alignTable, qTable, sizeof(qTable), &hasBin)) - errAbort("track %s not found", alignTable); - sqlSafef (query, sizeof (query), - "select * from %s where genoName = '%s' and genoStart >= %d" - " and id = %s", qTable, seqName, start-1, repeat); - sr2 = sqlGetResult (conn2, query); + //struct bbiFile *abbi = bigBedFileOpen(xrefDataUrl); + struct bbiFile *abbi = bigBedFileOpenAlias(xrefDataUrl,chromAliasFindAliases); + struct lm *alm = lmInit(0); + struct bigBedInterval *abbList = bigBedIntervalQuery(abbi, chrom, start, end, 0, alm); + printf ("

RepeatMasker Alignments:

\n"); - printf - ("The raw alignment data used by RepeatMasker to generate " + printf ("The raw alignment data used by RepeatMasker to generate " "the final annotation call for this element. NOTE: The " "aligned sequence names and consensus positions may differ " "from the final annotation.

\n"); printf ("\n"); - while ((row = sqlNextRow (sr2)) != NULL) + + struct bigRmskAlignRecord *arec = NULL; + AllocVar(arec); + for (bb = abbList; bb != NULL; bb = bb->next) + { + char *afields[abbi->fieldCount]; + bigBedIntervalToRow(bb, chrom, startBuf, endBuf, afields, + abbi->fieldCount); + if ( sameOk(afields[15], item) ) { - ro = rmskAlignLoad (row + hasBin); + arec->chrom = afields[0]; + arec->chromStart = sqlUnsigned(afields[1]); + arec->chromEnd = sqlUnsigned(afields[2]); + arec->chromRemain = sqlUnsigned(afields[3]); + arec->score = sqlFloat(afields[4]); + arec->percSubst = sqlFloat(afields[5]); + arec->percDel = sqlFloat(afields[6]); + arec->percIns = sqlFloat(afields[7]); + safef(arec->strand, sizeof(arec->strand), "%s", afields[8]); + arec->repName = afields[9]; + arec->repType = afields[10]; + arec->repSubtype = afields[11]; + arec->repStart = sqlUnsigned(afields[12]); + arec->repEnd = sqlUnsigned(afields[13]); + arec->repRemain = sqlUnsigned(afields[14]); + arec->id = sqlUnsigned(afields[15]); + arec->calignData = afields[16]; + printf (" \n"); - printf (" \n", ro->swScore); - printf (" \n", - (double) ro->milliDiv * (double) 0.01); - printf (" \n", - (double) ro->milliDel * (double) 0.01); - printf (" \n", - (double) ro->milliIns * (double) 0.01); - printf (" \n", ro->genoName); - printf (" \n", ro->genoStart + 1); - printf (" \n", ro->genoEnd); - printf (" \n", ro->genoLeft); - printf (" \n", ro->strand); - printf (" \n", ro->repName); - printf (" \n", ro->repClass, ro->repFamily); - if (ro->strand[0] == '-') + printf (" \n", arec->score); + printf (" \n", arec->percSubst); + printf (" \n", arec->percDel); + printf (" \n", arec->percIns); + printf (" \n", arec->chrom); + printf (" \n", arec->chromStart + 1); + printf (" \n", arec->chromEnd); + printf (" \n", arec->chromRemain); + printf (" \n", arec->strand); + printf (" \n", arec->repName); + printf (" \n", arec->repType, arec->repSubtype); + if (arec->strand[0] == '-') { - printf (" \n", ro->repLeft); - printf (" \n", ro->repEnd); - printf (" \n", ro->repStart); + printf (" \n", arec->repRemain); + printf (" \n", arec->repEnd); + printf (" \n", arec->repStart); } else { - printf (" \n", ro->repStart); - printf (" \n", ro->repEnd); - printf (" \n", ro->repLeft); + printf (" \n", arec->repStart); + printf (" \n", arec->repEnd); + printf (" \n", arec->repRemain); } - printf (" \n", ro->id); + printf (" \n", arec->id); printf (" \n"); } - sqlFreeResult (&sr2); + } //for(bb = abblist.... printf ("
%d%3.2f%3.2f%3.2f%s%d%d(%d)%s%s%s/%s%.2f%3.2f%3.2f%3.2f%s%d%d(%d)%s%s%s/%s(%d)%d%d(%d)%d%d%d%d(%d)%d%d(%d)%d%d
\n");
-	    printAlignment (ro);
+            printAlignmentBR ( 
+                       arec->strand[0], arec->chrom, arec->chromStart, arec->chromEnd, 
+                       arec->repName, arec->repStart, arec->repEnd,
+                       arec->calignData );
 	    printf ("    
\n"); - } - hFreeConn (&conn2); - } - */ + lmCleanup(&alm); + bbiFileClose(&abbi); + } // if ( xref... printTrackHtml(tdb); }