44ccfacbe3a3d4b300f80d48651c77837a4b571e galt Tue Apr 26 11:12:02 2022 -0700 SQL INJECTION Prevention Version 2 - this improves our methods by making subclauses of SQL that get passed around be both easy and correct to use. The way that was achieved was by getting rid of the obscure and not well used functions sqlSafefFrag and sqlDyStringPrintfFrag and replacing them with the plain versions of those functions, since these are not needed anymore. The new version checks for NOSQLINJ in unquoted %-s which is used to include SQL clauses, and will give an error the NOSQLINJ clause is not present, and this will automatically require the correct behavior by developers. sqlDyStringPrint is a very useful function, however because it was not enforced, users could use various other dyString functions and they operated without any awareness or checking for SQL correct use. Now those dyString functions are prohibited and it will produce an error if you try to use a dyString function on a SQL string, which is simply detected by the presence of the NOSQLINJ prefix. diff --git src/hg/lib/pgSnp.c src/hg/lib/pgSnp.c index c5c2295..c383e19 100644 --- src/hg/lib/pgSnp.c +++ src/hg/lib/pgSnp.c @@ -1,1011 +1,1011 @@ /* pgSnp.c was originally generated by the autoSql program, which also * generated pgSnp.h and pgSnp.sql. This module links the database and * the RAM representation of objects. */ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "dystring.h" #include "jksql.h" #include "pgSnp.h" #include "hdb.h" #include "dnaseq.h" #include "pgPhenoAssoc.h" #include "regexHelper.h" void pgSnpStaticLoad(char **row, struct pgSnp *ret) /* Load a row from pgSnp table into ret. The contents of ret will * be replaced at the next call to this function. */ { ret->bin = sqlUnsigned(row[0]); ret->chrom = row[1]; ret->chromStart = sqlUnsigned(row[2]); ret->chromEnd = sqlUnsigned(row[3]); ret->name = row[4]; ret->alleleCount = sqlSigned(row[5]); ret->alleleFreq = row[6]; ret->alleleScores = row[7]; } struct pgSnp *pgSnpLoad(char **row) /* Load a pgSnp from row fetched with select * from pgSnp * from database. Dispose of this with pgSnpFree(). */ { struct pgSnp *ret; AllocVar(ret); ret->bin = sqlUnsigned(row[0]); ret->chrom = cloneString(row[1]); ret->chromStart = sqlUnsigned(row[2]); ret->chromEnd = sqlUnsigned(row[3]); ret->name = cloneString(row[4]); ret->alleleCount = sqlSigned(row[5]); ret->alleleFreq = cloneString(row[6]); ret->alleleScores = cloneString(row[7]); return ret; } struct pgSnp *pgSnpLoadAll(char *fileName) /* Load all pgSnp from a whitespace-separated file. * Dispose of this with pgSnpFreeList(). */ { struct pgSnp *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[8]; while (lineFileRow(lf, row)) { el = pgSnpLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct pgSnp *pgSnpLoadAllByChar(char *fileName, char chopper) /* Load all pgSnp from a chopper separated file. * Dispose of this with pgSnpFreeList(). */ { struct pgSnp *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[8]; while (lineFileNextCharRow(lf, chopper, row, ArraySize(row))) { el = pgSnpLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct pgSnp *pgSnpLoadByQuery(struct sqlConnection *conn, char *query) /* Load all pgSnp from table that satisfy the query given. * Where query is of the form 'select * from example where something=something' * or 'select example.* from example, anotherTable where example.something = * anotherTable.something'. * Dispose of this with pgSnpFreeList(). */ { struct pgSnp *list = NULL, *el; struct sqlResult *sr; char **row; sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { el = pgSnpLoad(row); slAddHead(&list, el); } slReverse(&list); sqlFreeResult(&sr); return list; } void pgSnpSaveToDb(struct sqlConnection *conn, struct pgSnp *el, char *tableName, int updateSize) /* Save pgSnp as a row to the table specified by tableName. * As blob fields may be arbitrary size updateSize specifies the approx size * of a string that would contain the entire query. Arrays of native types are * converted to comma separated strings and loaded as such, User defined types are * inserted as NULL. Strings are automatically escaped to allow insertion into the database. */ { -struct dyString *update = newDyString(updateSize); +struct dyString *update = dyStringNew(updateSize); sqlDyStringPrintf(update, "insert into %s values ( %u,'%s',%u,%u,'%s',%d,'%s','%s')", tableName, el->bin, el->chrom, el->chromStart, el->chromEnd, el->name, el->alleleCount, el->alleleFreq, el->alleleScores); sqlUpdate(conn, update->string); -freeDyString(&update); +dyStringFree(&update); } struct pgSnp *pgSnpCommaIn(char **pS, struct pgSnp *ret) /* Create a pgSnp out of a comma separated string. * This will fill in ret if non-null, otherwise will * return a new pgSnp */ { char *s = *pS; if (ret == NULL) AllocVar(ret); ret->bin = sqlUnsignedComma(&s); ret->chrom = sqlStringComma(&s); ret->chromStart = sqlUnsignedComma(&s); ret->chromEnd = sqlUnsignedComma(&s); ret->name = sqlStringComma(&s); ret->alleleCount = sqlSignedComma(&s); ret->alleleFreq = sqlStringComma(&s); ret->alleleScores = sqlStringComma(&s); *pS = s; return ret; } void pgSnpFree(struct pgSnp **pEl) /* Free a single dynamically allocated pgSnp such as created * with pgSnpLoad(). */ { struct pgSnp *el; if ((el = *pEl) == NULL) return; freeMem(el->chrom); freeMem(el->name); freeMem(el->alleleFreq); freeMem(el->alleleScores); freez(pEl); } void pgSnpFreeList(struct pgSnp **pList) /* Free a list of dynamically allocated pgSnp's */ { struct pgSnp *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; pgSnpFree(&el); } *pList = NULL; } void pgSnpOutput(struct pgSnp *el, FILE *f, char sep, char lastSep) /* Print out pgSnp. Separate fields with sep. Follow last field with lastSep. */ { fprintf(f, "%s", el->chrom); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->chromStart); fputc(sep,f); fprintf(f, "%u", el->chromEnd); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->name); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%d", el->alleleCount); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->alleleFreq); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->alleleScores); if (sep == ',') fputc('"',f); fputc(lastSep,f); } /* -------------------------------- End autoSql Generated Code -------------------------------- */ struct pgCodon *fetchCodons(char *db, struct bed *gene, unsigned chrStart, unsigned chrEnd) /* find codons containing region, return sequence and positions */ /* gene should have coding sequence only, bedThickOnly */ { int codNum = 0; int cPos = 0; struct pgCodon *rv = NULL; AllocVar(rv); int frame = 0; /* num of nts needed to complete codon in next exon */ int i0, iN, iInc, i; boolean posStrand; if (startsWith("+", gene->strand)) //positive strand { i0 = 0; iN = gene->blockCount; iInc = 1; posStrand = TRUE; } else { i0 = gene->blockCount-1; iN=-1; iInc = -1; posStrand = FALSE; } for (i=i0; (iInc*i)<(iInc*iN); i=i+iInc) { //printf("TESTING cPos=%d i=%d frame=%d\n", cPos, i, frame); if (chrStart >= (gene->chromStarts[i] + gene->chromStart) && chrEnd <= gene->chromStarts[i] + gene->blockSizes[i] + gene->chromStart) { unsigned d = 0; int codStart = cPos + 1; unsigned cStart = 0; unsigned cEnd = 0; if (posStrand) d = chrStart - (gene->chromStarts[i] + gene->chromStart); else d = (gene->chromStarts[i] + gene->chromStart + gene->blockSizes[i]-1) - (chrEnd - 1); cPos += d; if (d > frame) codNum += trunc((d - frame)/3) + 1; if (frame > 0) { codNum++; } rv->firstCodon = codNum; if (d >= frame) frame = (d - frame) % 3; else frame = 3 - frame + d; /* start of codon */ rv->cdStart = cPos - frame; rv->cdEnd = rv->cdStart + 3; /* end of first codon */ rv->regStart = cPos; rv->regEnd = cPos + (chrEnd - chrStart); //printf("TESTING d = %d cdSt = %d cdEnd = %d reg %d - %d frame=%d\n", d, rv->cdStart, rv->cdEnd, rv->regStart, rv->regEnd, frame); /* more than 1 codon? */ while (rv->regEnd > rv->cdEnd) rv->cdEnd += 3; - struct dyString *seq = newDyString(1024); + struct dyString *seq = dyStringNew(1024); /* check prev exon, chrom order not cds order */ if (rv->cdStart < codStart && posStrand) { int st = rv->cdStart; int end = codStart - 1; cStart = gene->chromStart + gene->chromStarts[i-1] + gene->blockSizes[i-1] - (end - st); cEnd = gene->chromStart + gene->chromStarts[i-1] + gene->blockSizes[i-1]; if (cStart < cEnd) { struct dnaSeq *s = hDnaFromSeq(db, gene->chrom, cStart, cEnd, dnaUpper); dyStringPrintf(seq, "%s", s->dna); //freeDnaSeq(&s); } } else if (!posStrand && rv->cdEnd >= (codStart + gene->blockSizes[i])) { int st = codStart + gene->blockSizes[i] - 1; int end = rv->cdEnd; cEnd = gene->chromStart + gene->chromStarts[i-1] + gene->blockSizes[i-1]; cStart = gene->chromStart + gene->chromStarts[i-1] + gene->blockSizes[i-1] - (end - st); if (cStart < cEnd) { struct dnaSeq *s = hDnaFromSeq(db, gene->chrom, cStart, cEnd, dnaUpper); dyStringPrintf(seq, "%s", s->dna); //error here? //printf("TESTING got seq=%s<br>\n", s->dna); //freeDnaSeq(&s); } } /* get sequence needed from this exon */ int st = rv->cdStart; if (rv->cdStart < codStart) st = codStart - 1; int end = rv->cdEnd; if (rv->cdEnd > (codStart + gene->blockSizes[i] - 1)) end = codStart + gene->blockSizes[i] - 1; if (posStrand) { cStart = gene->chromStart + gene->chromStarts[i] + (st - codStart + 1); cEnd = gene->chromStart + gene->chromStarts[i] + (end - codStart + 1); } else { /* minus strand coding start == chromEnd */ cEnd = gene->chromStart + gene->chromStarts[i] + gene->blockSizes[i] - (st - codStart + 1); cStart = gene->chromStart + gene->chromStarts[i] + gene->blockSizes[i] - (end - codStart + 1); } //printf("TESTING fetching sequence for %s:%d-%d\n", gene->chrom, cStart, cEnd); if (cStart < cEnd) { struct dnaSeq *s = hDnaFromSeq(db, gene->chrom, cStart, cEnd, dnaUpper); dyStringPrintf(seq, "%s", s->dna); //printf("TESTING got seq=%s<br>\n", s->dna); //freeDnaSeq(&s); } /* check following exons, chrom order */ if (i+1 < gene->blockCount) { if (posStrand && rv->cdEnd >= codStart + gene->blockSizes[i]) { int st = codStart + gene->blockSizes[i] + 1; int end = rv->cdEnd; cStart = gene->chromStart + gene->chromStarts[i+1] - 1; cEnd = gene->chromStart + gene->chromStarts[i+1] + (end - st + 1); if (cStart < cEnd) { struct dnaSeq *s = hDnaFromSeq(db, gene->chrom, cStart, cEnd, dnaUpper); dyStringPrintf(seq, "%s", s->dna); //freeDnaSeq(&s); } } else if (!posStrand && rv->cdStart < (codStart - 1)) { int st = rv->cdStart; int end = codStart - 1; cStart = gene->chromStart + gene->chromStarts[i+1]; cEnd = gene->chromStart + gene->chromStarts[i+1] + (end - st); //printf("TESTING fetching sequence for %s:%d-%d\n", gene->chrom, cStart, cEnd); if (cStart < cEnd) { struct dnaSeq *s = hDnaFromSeq(db, gene->chrom, cStart, cEnd, dnaUpper); dyStringPrintf(seq, "%s", s->dna); //freeDnaSeq(&s); //printf("TESTING got seq=%s<br>\n", s->dna); } } } rv->seq = dyStringCannibalize(&seq); break; } /* increment values past this exon */ codNum += trunc((gene->blockSizes[i] - frame) / 3); if (frame > 0) codNum++; cPos += gene->blockSizes[i]; frame = 3 - ((gene->blockSizes[i] - frame) % 3); } if (rv->seq == NULL) return NULL; /* not found */ return rv; } char *replaceString(char *old, int from, int to, char *rep) /* replace part of string based on character positions */ { char *result = NULL; char *resultPtr = NULL; char *ptr = old+from; int strLen = strlen(old) + strlen(rep); int oldLen = to - from; result = needMem(strLen); resultPtr = result; strLen = from; //beg strcpy(resultPtr, old); old = ptr + oldLen; resultPtr += strLen; strcpy(resultPtr, rep); //mid resultPtr += strlen(rep); strcpy(resultPtr, old); //end return result; } void aaProperties (char *aa1, char *aa2); void printSeqCodDisplay(char *db, struct pgSnp *item, char *genePredTable) /* print the display of sequence changes for a coding variant */ { struct bed *list = NULL, *el, *th = NULL; struct sqlResult *sr; char **row; char query[512]; struct sqlConnection *conn = hAllocConn(db); if (!sqlTableExists(conn, genePredTable)) { hFreeConn(&conn); return; } sqlSafef(query, sizeof(query), "select chrom, txStart, txEnd, name, 0, strand, cdsStart, cdsEnd, " "0, exonCount, exonEnds, exonStarts from %s " "where chrom = '%s' and cdsStart <= %d and cdsEnd >= %d", genePredTable, item->chrom, item->chromStart, item->chromEnd); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { el = bedLoad12(row); /* adjust exonStarts and exonEnds to actual bed values */ int i; for (i=0;i<el->blockCount;i++) { el->blockSizes[i] = el->blockSizes[i] - el->chromStarts[i]; el->chromStarts[i] = el->chromStarts[i] - el->chromStart; } slAddHead(&list, el); } slReverse(&list); sqlFreeResult(&sr); hFreeConn(&conn); int found = 0; th = bedThickOnlyList(list); for (el = th; el != NULL; el = el->next) { struct pgCodon *cod = fetchCodons(db, el, item->chromStart, item->chromEnd); if (cod == NULL) continue; /* not in exon */ if (found == 0) printf("\n<BR>Coding sequence changes are relative to strand of transcript:<BR>\n"); found++; if (sameString(el->strand, "-")) reverseComplement(cod->seq, strlen(cod->seq)); /* bold changing seq */ char old[512]; strncpy(old, cod->seq+(cod->regStart - cod->cdStart), (cod->regEnd - cod->regStart)); old[cod->regEnd - cod->regStart] = '\0'; char b[512]; safef(b, sizeof(b), "<B>%s</B>", isEmpty(old) ? "-" : old); char *bold = replaceString(cod->seq, (cod->regStart - cod->cdStart), (cod->regEnd - cod->cdStart), b); char *nameCopy = cloneString(item->name); char *allele[8]; (void) chopByChar(nameCopy, '/', allele, item->alleleCount); int i; printf("%s:starting positions codon %d cds %d<BR>\n", el->name, cod->firstCodon, (cod->cdStart+1)); for (i=0;i<item->alleleCount;i++) { char a[512]; if (sameString(el->strand, "-")) reverseComplement(allele[i], strlen(allele[i])); safef(a, sizeof(a), "<B>%s</B>", allele[i]); char *rep = replaceString(cod->seq, (cod->regStart - cod->cdStart), (cod->regEnd - cod->cdStart), a); if (sameString(bold, rep)) continue; printf("%s > %s<BR>\n", bold, rep); if (item->chromStart == item->chromEnd && !strstr(rep, "-") && (strlen(rep)-7) % 3 != 0) { printf(" frameshift<BR>\n"); } else if (item->chromStart == item->chromEnd && countChars(allele[i], '-') == strlen(allele[i])) { printf(" wildtype<BR>\n"); } else { struct dnaSeq *dnaseq = newDnaSeq(cod->seq, strlen(cod->seq), "orig"); aaSeq *origAa = translateSeq(dnaseq, 0, FALSE); if (!strstr(origAa->dna, "X")) { char *rep2 = replaceString(cod->seq, (cod->regStart - cod->cdStart), (cod->regEnd - cod->cdStart), allele[i]); dnaseq = newDnaSeq(rep2, strlen(rep2), "rep2"); aaSeq *repAa = translateSeq(dnaseq, 0, FALSE); //freeDnaSeq(&dnaseq); if (!strstr(rep2, "-") && strlen(allele[i]) != (item->chromEnd - item->chromStart)) { //indel int diff = abs((item->chromEnd - item->chromStart) - strlen(allele[i])); if (diff % 3 == 0) printf(" in-frame indel<BR>\n"); else printf(" frameshift indel<BR>\n"); } else if (!strstr(rep2, "-") && isNotEmpty(repAa->dna)) { printf(" %s > %s<BR>\n", origAa->dna, repAa->dna); if (differentString(origAa->dna, repAa->dna)) aaProperties(origAa->dna, repAa->dna); } else if (strstr(rep2, "-") && (item->chromEnd - item->chromStart) % 3 != 0) { printf(" frameshift<BR>\n"); } else if (strstr(rep2, "-") && (item->chromEnd - item->chromStart) % 3 == 0) { printf(" in-frame deletion<BR>\n"); } } } } /* print a hr between gene models */ printf("<hr style=\"width:30%%;text-align:left;margin-left:0\" />\n"); } bedFreeList(&list); } char *aaPolarity (char *aa) /* return the polarity of the amino acid */ { if (sameString(aa, "A") || sameString(aa, "G") || sameString(aa, "I") || sameString(aa, "L") || sameString(aa, "M") || sameString(aa, "F") || sameString(aa, "P") || sameString(aa, "W") || sameString(aa, "V")) return cloneString("nonpolar"); return cloneString("polar"); } char *aaAcidity (char *aa) /* return the acidity */ { if (sameString(aa, "R")) return cloneString("basic (strongly)"); else if (sameString(aa, "H")) return cloneString("basic (weakly)"); else if (sameString(aa, "K")) return cloneString("basic"); else if (sameString(aa, "D") || sameString(aa, "E")) return cloneString("acidic"); else return cloneString("neutral"); } float aaHydropathy (char *aa) /* return the hydropathy */ { if (sameString(aa, "A")) return 1.8; if (sameString(aa, "R")) return -4.5; if (sameString(aa, "N")) return -3.5; if (sameString(aa, "D")) return -3.5; if (sameString(aa, "C")) return 2.5; if (sameString(aa, "E")) return -3.5; if (sameString(aa, "Q")) return -3.5; if (sameString(aa, "G")) return -0.4; if (sameString(aa, "H")) return -3.2; if (sameString(aa, "I")) return 4.5; if (sameString(aa, "L")) return 3.8; if (sameString(aa, "K")) return -3.9; if (sameString(aa, "M")) return 1.9; if (sameString(aa, "F")) return 2.8; if (sameString(aa, "P")) return -1.6; if (sameString(aa, "S")) return -0.8; if (sameString(aa, "T")) return -0.7; if (sameString(aa, "W")) return -0.9; if (sameString(aa, "Y")) return -1.3; if (sameString(aa, "V")) return 4.2; return 0; } void aaProperties (char *aa1, char *aa2) /* print amino acid properties for these amino acids */ { char *pol1 = aaPolarity(aa1); char *pol2 = aaPolarity(aa2); char *acid1 = aaAcidity(aa1); char *acid2 = aaAcidity(aa2); float hyd1 = aaHydropathy(aa1); float hyd2 = aaHydropathy(aa2); printf("<table class=\"descTbl\"><caption>Amino acid properties</caption>" "<tr><th> </th><th>%s</th><th>%s</th></tr>\n", aa1, aa2); /* take out highlights, not sure what is significant change for hydropathy */ //if (differentString(pol1, pol2)) //printf("<tr bgcolor=\"white\"><td>polarity</td><td>%s</td><td>%s</td></tr>\n", pol1, pol2); //else printf("<tr><td>polarity</td><td>%s</td><td>%s</td></tr>\n", pol1, pol2); //if (differentString(acid1, acid2) && //(!startsWith("basic", acid1) || !startsWith("basic", acid2)) ) //printf("<tr bgcolor=\"white\"><td>acidity</td><td>%s</td><td>%s</td></tr>\n", acid1, acid2); //else printf("<tr><td>acidity</td><td>%s</td><td>%s</td></tr>\n", acid1, acid2); //if ((hyd1 < 0 && hyd2 > 0) || (hyd1 > 0 && hyd2 < 0)) //printf("<tr bgcolor=\"white\"><td>hydropathy</td><td>%1.1f</td><td>%1.1f</td></tr></table>\n", hyd1, hyd2); //else printf("<tr><td>hydropathy</td><td>%1.1f</td><td>%1.1f</td></tr></table>\n", hyd1, hyd2); printf("<br>"); } void printPgDbLink(char *db, struct trackDb *tdb, struct pgSnp *item) /* print the links to phenotype and other databases for pgSnps */ { struct pgPhenoAssoc *el; struct sqlResult *sr; char **row; char query[512]; struct sqlConnection *conn = hAllocConn(db); char *dbList[8]; int tot = 0, i = 0, first = 1; char *tabs = trackDbSetting(tdb, "pgDbLink"); if (tabs == NULL) return; tot = chopByWhite(tabs, dbList, ArraySize(dbList)); for(i=0;i<tot;i++) { sqlSafef(query, sizeof(query), "select chrom, chromStart, chromEnd, name, srcUrl from %s where chrom = '%s' and chromStart = %d and chromEnd = %d", dbList[i], item->chrom, item->chromStart, item->chromEnd); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { if (first == 1) { printf("<br><b>Links to phenotype databases</b><br>\n"); first = 0; } el = pgPhenoAssocLoad(row); printf("<a href=\"%s\">%s</a></br>\n", el->srcUrl, el->name); } } } static char *pgSnpAutoSqlString = "table pgSnp" "\"personal genome SNP database table\"" " (" " ushort bin; \"A field to speed indexing\"" " string chrom; \"Chromosome\"" " uint chromStart; \"Start position in chrom\"" " uint chromEnd; \"End position in chrom\"" " string name; \"alleles ACTG[/ACTG]\"" " int alleleCount; \"number of alleles\"" " string alleleFreq; \"comma separated list of frequency of each allele\"" " string alleleScores; \"comma separated list of quality scores\"" " )" ; static char *pgSnpFileAutoSqlString = "table pgSnp" "\"personal genome SNP file format\"" " (" " string chrom; \"Chromosome\"" " uint chromStart; \"Start position in chrom\"" " uint chromEnd; \"End position in chrom\"" " string name; \"alleles ACTG[/ACTG]\"" " int alleleCount; \"number of alleles\"" " string alleleFreq; \"comma separated list of frequency of each allele\"" " string alleleScores; \"comma separated list of quality scores\"" " )" ; struct asObject *pgSnpAsObj() // Return asObject describing fields of pgSnp database table (includes bin) { return asParseText(pgSnpAutoSqlString); } struct asObject *pgSnpFileAsObj() // Return asObject describing fields of pgSnp file (no bin) { return asParseText(pgSnpFileAutoSqlString); } struct pgSnp *pgSnpLoadNoBin(char **row) /* load pgSnp struct from row without bin */ { struct pgSnp *ret; AllocVar(ret); ret->bin = 0; ret->chrom = cloneString(row[0]); ret->chromStart = sqlUnsigned(row[1]); ret->chromEnd = sqlUnsigned(row[2]); ret->name = cloneString(row[3]); ret->alleleCount = sqlSigned(row[4]); ret->alleleFreq = cloneString(row[5]); ret->alleleScores = cloneString(row[6]); return ret; } struct pgSnp *pgSnpLineFileLoad(char **row, struct lineFile *lf) /* Load pgSnp from a lineFile line, with error checking. */ /* Requires comma separated zeroes for frequency and scores. */ { struct pgSnp *item; AllocVar(item); item->chrom = cloneString(row[0]); item->chromStart = lineFileNeedNum(lf, row, 1); item->chromEnd = lineFileNeedNum(lf, row, 2); if (item->chromEnd < 1) lineFileAbort(lf, "chromEnd less than 1 (%d)", item->chromEnd); if (item->chromEnd < item->chromStart) lineFileAbort(lf, "chromStart after chromEnd (%d > %d)", item->chromStart, item->chromEnd); /* use pattern match to check values and counts both */ /* alleles are separated by / and can be ACTG- */ item->name = cloneString(row[3]); /* allele count, positive integer matching # of alleles */ item->alleleCount = lineFileNeedNum(lf, row, 4); char alleles[128]; /* pattern to match alleles */ safef(alleles, sizeof(alleles), "^[ACTG-]+(\\/[ACTG-]+){%d}$", item->alleleCount - 1); if (! regexMatchNoCase(row[3], alleles)) lineFileAbort(lf, "invalid alleles (%s)" " - must be slash-separated nucleotide(s) with correct number of alleles (%d)", row[3], item->alleleCount); /* read count, comma separated list of numbers with above # of items */ item->alleleFreq = cloneString(row[5]); char pattern[128]; safef(pattern, sizeof(pattern), "^[0-9]+(,[0-9]+){%d}$", item->alleleCount - 1); if (! regexMatchNoCase(row[5], pattern)) lineFileAbort(lf, "invalid allele frequency, %s with count of %d", row[5], item->alleleCount); /* scores, comma separated list of numbers with above # of items */ item->alleleScores = cloneString(row[6]); safef(pattern, sizeof(pattern), "^[0-9.]+(,[0-9.]+){%d}$", item->alleleCount - 1); if (! regexMatchNoCase(row[6], pattern)) lineFileAbort(lf, "invalid allele scores, %s with count of %d", row[6], item->alleleCount); return item; } #define VCF_MAX_ALLELE_LEN 80 static char *alleleCountsFromVcfTumorAd(const struct vcfInfoElement *ad) /* Build up comma-sep list of read counts supporting tumor alleles */ { struct dyString *dy = dyStringNew(0); // Assume only one alternate allele int r = ad->values[0].datInt; int a = ad->values[1].datInt; dyStringPrintf(dy, "%d,%d", r, a); return dyStringCannibalize(&dy); } static char *alleleCountsFromVcfRecord(struct vcfRecord *rec, int alDescCount) /* Build up comma-sep list of per-allele counts, if available, up to alDescCount * which may be less than rec->alleleCount: */ { struct dyString *dy = dyStringNew(0); int alCounts[VCF_MAX_ALLELE_LEN]; boolean gotTotalCount = FALSE, gotAltCounts = FALSE; int i; for (i = 0; i < rec->infoCount; i++) if (sameString(rec->infoElements[i].key, "AN")) { if (rec->infoElements[i].missingData[0]) break; gotTotalCount = TRUE; // Set ref allele to total count, subtract alt counts below. alCounts[0] = rec->infoElements[i].values[0].datInt; break; } for (i = 0; i < rec->infoCount; i++) if (sameString(rec->infoElements[i].key, "AC")) { if (rec->infoElements[i].count > 0) { gotAltCounts = TRUE; int j; for (j = 0; j < rec->infoElements[i].count && j < alDescCount-1; j++) { if (rec->infoElements[i].missingData[j]) alCounts[1+j] = -1; else { int ac = rec->infoElements[i].values[j].datInt; alCounts[1+j] = ac; if (gotTotalCount) alCounts[0] -= ac; } } if (gotTotalCount) dyStringPrintf(dy, "%d", alCounts[0]); else dyStringAppend(dy, "-1"); for (j = 1; j < alDescCount; j++) if (alCounts[j] >= 0) dyStringPrintf(dy, ",%d", alCounts[j]); else dyStringAppend(dy, ",-1"); } break; } if (!gotTotalCount) { for (i = 0; i < rec->infoCount; i++) { // Added specifically for UWash ExomVariants, redmine #9329 if (sameString(rec->infoElements[i].key, "TAC")) { struct vcfInfoElement *tacEl = &rec->infoElements[i]; int refIx = tacEl->count - 1; // Ref allele count is last in TAC int alleleCount = -1; if (!tacEl->missingData[refIx]) // Not in datInt, but don't assume int in datString alleleCount = atoi(tacEl->values[refIx].datString); // sqlSigned will errAbort! if (alleleCount > 0) { dyStringPrintf(dy, "%d", alleleCount); gotTotalCount = TRUE; } else dyStringAppend(dy, "-1"); int tacIx = 0; // Now continue with alt alleles. for ( ; tacIx < refIx; tacIx++) { alleleCount = -1; if (!tacEl->missingData[tacIx]) alleleCount = atoi(tacEl->values[tacIx].datString); if (alleleCount > 0) { dyStringPrintf(dy, ",%d", alleleCount); gotAltCounts = TRUE; } else dyStringAppend(dy, "-1"); } break; } } } if (gotTotalCount && !gotAltCounts) { dyStringPrintf(dy, "%d", alCounts[0]); for (i = 1; i < alDescCount; i++) dyStringAppend(dy, ",-1"); } else if (!gotTotalCount && !gotAltCounts && rec->file->genotypeCount > 0) { vcfParseGenotypes(rec); for (i = 0; i < alDescCount; i++) alCounts[i] = 0; for (i = 0; i < rec->file->genotypeCount; i++) { struct vcfGenotype *gt = &(rec->genotypes[i]); if (gt == NULL) uglyf("i=%d gt=NULL wtf?\n", i); if (gt->hapIxA >= 0) alCounts[(unsigned char)gt->hapIxA]++; if (!gt->isHaploid && gt->hapIxB >= 0) alCounts[(unsigned char)gt->hapIxB]++; } dyStringPrintf(dy, "%d", alCounts[0]); for (i = 1; i < alDescCount; i++) dyStringPrintf(dy, ",%d", alCounts[i]); } return dyStringCannibalize(&dy); } static void pgSnpFromVcfSgtIndelRecord(struct vcfRecord *rec, struct pgSnp *pgs) /* parse alleles and frequency from vcf file with SGT for indels */ { pgs->alleleCount = 2; // assume always 2: normal->tumor struct dyString *dy = dyStringCreate("%s/%s", rec->alleles[0], rec->alleles[1]); pgs->name = dyStringCannibalize(&dy); int refCnt = -1, altCnt = -1; const struct vcfGenotype *gt = vcfRecordFindGenotype(rec, "TUMOR"); if (gt) { const struct vcfInfoElement *dp = vcfGenotypeFindInfo(gt, "DP"); const struct vcfInfoElement *tar = vcfGenotypeFindInfo(gt, "TAR"); if (dp && tar) { int totCnt = dp->values[0].datInt; altCnt = tar->values[0].datInt; refCnt = totCnt - altCnt; } } if (refCnt == -1) pgs->alleleFreq = cloneString("?,?"); else { char freq[64]; safef(freq, sizeof(freq), "%d,%d", refCnt, altCnt); pgs->alleleFreq = cloneString(freq); } } static struct pgSnp *pgSnpFromVcfSgtRecord(struct vcfRecord *rec) /* Convert VCF rec with SGT (from Strelka) to pgSnp; don't free rec->file (vcfFile) until * you're done with pgSnp because pgSnp points to rec->chrom. */ { struct pgSnp *pgs; AllocVar(pgs); pgs->chrom = rec->chrom; pgs->chromStart = rec->chromStart; pgs->chromEnd = rec->chromEnd; pgs->alleleScores = cloneString("-1,-1"); //set alleles from INFO SGT const struct vcfInfoElement *sgtEl = vcfRecordFindInfo(rec, "SGT"); if (sgtEl) { char *val = sgtEl->values[0].datString; // Indels encode allele frequencies differently if (startsWith("ref->", val)) { pgSnpFromVcfSgtIndelRecord(rec, pgs); return pgs; } // Get tumor genotype, assuming SNVs only char last = val[strlen(val)-1]; char nlast = val[strlen(val)-2]; struct dyString *dy = dyStringCreate("%c/%c", nlast, last); pgs->name = dyStringCannibalize(&dy); //add read counts to frequency field if (last == nlast) pgs->alleleCount = 1; else pgs->alleleCount = 2; //set freqs from AU,CU,GU,TU for TUMOR genotype const struct vcfGenotype *gt = vcfRecordFindGenotype(rec, "TUMOR"); if (gt) { // Find counts for alleles int first = -1, second = -1; int k; for (k = 0; k < gt->infoCount; k++) { struct vcfInfoElement *info = >->infoElements[k]; if (sameString(info->key, "AU") || sameString(info->key, "CU") || sameString(info->key, "GU") || sameString(info->key, "TU")) { char base = info->key[0]; if (nlast == base) first = info->values[0].datInt; else if (last == base) second = info->values[0].datInt; } } struct dyString *freq = dyStringCreate("%d,%d", first, second); pgs->alleleFreq = dyStringCannibalize(&freq); } } else errAbort("pcfSnpFromVcfSgtRecord: no SGT found in INFO for VCF at %s:%d-%d %s", rec->chrom, rec->chromStart+1, rec->chromEnd, rec->name); return pgs; } struct pgSnp *pgSnpFromVcfRecord(struct vcfRecord *rec) /* Convert VCF rec to pgSnp; don't free rec->file (vcfFile) until * you're done with pgSnp because pgSnp points to rec->chrom. */ { // If this is Strelka VCF with SGT (Somatic GenoType) info then deduce genotypes from // other Strelka-specific info keys. if (vcfRecordFindInfo(rec, "SGT")) return pgSnpFromVcfSgtRecord(rec); struct dyString *dy = dyStringNew(0); struct pgSnp *pgs; AllocVar(pgs); pgs->chrom = rec->chrom; pgs->chromStart = rec->chromStart; pgs->chromEnd = rec->chromEnd; // Build up slash-separated allele string from rec->alleles, starting with ref allele: dyStringAppend(dy, rec->alleles[0]); int alCount = rec->alleleCount, i; if (rec->alleleCount == 2 && (sameString(rec->alleles[1], ".") || sameString(rec->alleles[1], "<X>") || sameString(rec->alleles[1], "<*>"))) // ignore N/A alternate allele alCount = 1; else if (rec->alleleCount >= 2) { // append /-sep'd alternate alleles for (i = 1; i < rec->alleleCount; i++) dyStringPrintf(dy, "/%s", rec->alleles[i]); } pgs->name = cloneStringZ(dy->string, dy->stringSize+1); pgs->alleleCount = alCount; // If this is Mutect2 TUMOR/NORMAL VCF with AD (allelic depths for ref and alt) // then display read counts from tumor sample as allele counts. const struct vcfGenotype *tumorGt = vcfRecordFindGenotype(rec, "TUMOR"); const struct vcfInfoElement *tumorAd = tumorGt ? vcfGenotypeFindInfo(tumorGt, "AD") : NULL; if (rec->file->genotypeCount == 2 && tumorAd && vcfRecordFindGenotype(rec, "NORMAL")) pgs->alleleFreq = alleleCountsFromVcfTumorAd(tumorAd); else pgs->alleleFreq = alleleCountsFromVcfRecord(rec, alCount); // Build up comma-sep list... supposed to be per-allele quality scores but I think // the VCF spec only gives us one BQ... for the reference position? should ask. dyStringClear(dy); for (i = 0; i < rec->infoCount; i++) if (sameString(rec->infoElements[i].key, "BQ") && !rec->infoElements[i].missingData[0]) { float qual = rec->infoElements[i].values[0].datFloat; dyStringPrintf(dy, "%.1f", qual); int j; for (j = 1; j < rec->alleleCount; j++) dyStringPrintf(dy, ",%.1f", qual); break; } pgs->alleleScores = dyStringCannibalize(&dy); return pgs; }