acca3deffc05c4d8d11590a1cf3d893763254712
angie
Thu Oct 31 13:43:05 2019 -0700
dbSnp153: Adding new ucscNotes suggested by Ana Benet: clinvar{Benign,Conflicting,Pathogenic}, rareAll, rareSome. refs #23283
diff --git src/hg/lib/bigDbSnp.c src/hg/lib/bigDbSnp.c
index 863d8af..8ebf546 100644
--- src/hg/lib/bigDbSnp.c
+++ src/hg/lib/bigDbSnp.c
@@ -1,477 +1,489 @@
/* bigDbSnp.c was originally generated by the autoSql program, which also
* generated bigDbSnp.h and bigDbSnp.sql. This module links the database and
* the RAM representation of objects. */
#include "common.h"
#include "linefile.h"
#include "dystring.h"
#include "jksql.h"
#include "bigDbSnp.h"
/* Comma-separated names of the bigDbSnp fields, listed in the same order as the
 * row columns that bigDbSnpLoad() reads (row[0] = chrom ... row[16] = _dataLen). */
char *bigDbSnpCommaSepFieldNames = "chrom,chromStart,chromEnd,name,ref,altCount,alts,shiftBases,freqSourceCount,minorAlleleFreq,majorAllele,minorAllele,maxFuncImpact,class,ucscNotes,_dataOffset,_dataLen";
/* definitions for class column */
/* NULL-terminated list of the strings accepted for the class column. */
static char *values_class[] = {"snv", "mnv", "ins", "del", "delins", "identity", NULL};
/* Passed by address, together with values_class, to sqlEnumParse() and sqlEnumComma();
 * starts out NULL.  NOTE(review): presumably a lookup hash built on first use -- confirm
 * against jksql. */
static struct hash *valhash_class = NULL;
struct bigDbSnp *bigDbSnpLoad(char **row)
/* Load a bigDbSnp from row fetched with select * from bigDbSnp
 * from database.  Dispose of this with bigDbSnpFree().
 * row must hold the 17 columns named in bigDbSnpCommaSepFieldNames, in that order.
 * errAborts if the number of items parsed from a list column (alts, minorAlleleFreq,
 * majorAllele, minorAllele) differs from its count column (altCount or freqSourceCount). */
{
struct bigDbSnp *ret;
int sizeOne = 0;

AllocVar(ret);
/* The count columns are parsed first so that each list column can be checked against them. */
ret->altCount = sqlSigned(row[5]);
ret->freqSourceCount = sqlSigned(row[8]);
ret->chrom = cloneString(row[0]);
ret->chromStart = sqlUnsigned(row[1]);
ret->chromEnd = sqlUnsigned(row[2]);
ret->name = cloneString(row[3]);
ret->ref = cloneString(row[4]);
/* The size checks below use errAbort instead of assert() so that inconsistent rows are
 * still rejected when compiled with NDEBUG; bigDbSnpOutput() indexes these arrays by
 * altCount and freqSourceCount. */
sqlStringDynamicArray(row[6], &ret->alts, &sizeOne);
if (sizeOne != ret->altCount)
    errAbort("bigDbSnpLoad: altCount is %d but alts has %d items",
             ret->altCount, sizeOne);
ret->shiftBases = sqlUnsigned(row[7]);
sqlDoubleDynamicArray(row[9], &ret->minorAlleleFreq, &sizeOne);
if (sizeOne != ret->freqSourceCount)
    errAbort("bigDbSnpLoad: freqSourceCount is %d but minorAlleleFreq has %d items",
             ret->freqSourceCount, sizeOne);
sqlStringDynamicArray(row[10], &ret->majorAllele, &sizeOne);
if (sizeOne != ret->freqSourceCount)
    errAbort("bigDbSnpLoad: freqSourceCount is %d but majorAllele has %d items",
             ret->freqSourceCount, sizeOne);
sqlStringDynamicArray(row[11], &ret->minorAllele, &sizeOne);
if (sizeOne != ret->freqSourceCount)
    errAbort("bigDbSnpLoad: freqSourceCount is %d but minorAllele has %d items",
             ret->freqSourceCount, sizeOne);
ret->maxFuncImpact = sqlUnsigned(row[12]);
ret->class = sqlEnumParse(row[13], values_class, &valhash_class);
ret->ucscNotes = cloneString(row[14]);
ret->_dataOffset = sqlLongLong(row[15]);
ret->_dataLen = sqlSigned(row[16]);
return ret;
}
struct bigDbSnp *bigDbSnpLoadAll(char *fileName)
/* Read every row of the whitespace-separated file fileName and return the
 * resulting bigDbSnp records as a list.
 * Dispose of the list with bigDbSnpFreeList(). */
{
char *words[17];
struct bigDbSnp *snpList = NULL;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
for (;;)
    {
    if (!lineFileRow(lf, words))
        break;
    struct bigDbSnp *snp = bigDbSnpLoad(words);
    /* Each record goes on the head of the list; the list is reversed once at the end. */
    slAddHead(&snpList, snp);
    }
lineFileClose(&lf);
slReverse(&snpList);
return snpList;
}
struct bigDbSnp *bigDbSnpLoadAllByChar(char *fileName, char chopper)
/* Read every row of fileName, whose fields are separated by the character chopper,
 * and return the resulting bigDbSnp records as a list.
 * Dispose of the list with bigDbSnpFreeList(). */
{
char *words[17];
struct bigDbSnp *snpList = NULL;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
for (;;)
    {
    if (!lineFileNextCharRow(lf, chopper, words, ArraySize(words)))
        break;
    struct bigDbSnp *snp = bigDbSnpLoad(words);
    /* Each record goes on the head of the list; the list is reversed once at the end. */
    slAddHead(&snpList, snp);
    }
lineFileClose(&lf);
slReverse(&snpList);
return snpList;
}
struct bigDbSnp *bigDbSnpCommaIn(char **pS, struct bigDbSnp *ret)
/* Parse one bigDbSnp from the comma separated text at *pS.
 * Fills in ret when it is non-NULL; otherwise allocates a new bigDbSnp.
 * List fields are read as '{' item, item, ... '}' followed by a comma.
 * On return *pS points just past the text that was consumed. */
{
char *cursor = *pS;
int ix;

if (ret == NULL)
    AllocVar(ret);

ret->chrom = sqlStringComma(&cursor);
ret->chromStart = sqlUnsignedComma(&cursor);
ret->chromEnd = sqlUnsignedComma(&cursor);
ret->name = sqlStringComma(&cursor);
ret->ref = sqlStringComma(&cursor);
ret->altCount = sqlSignedComma(&cursor);

/* alts: altCount strings */
cursor = sqlEatChar(cursor, '{');
AllocArray(ret->alts, ret->altCount);
for (ix = 0;  ix < ret->altCount;  ix++)
    ret->alts[ix] = sqlStringComma(&cursor);
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

ret->shiftBases = sqlUnsignedComma(&cursor);
ret->freqSourceCount = sqlSignedComma(&cursor);

/* minorAlleleFreq: freqSourceCount doubles */
cursor = sqlEatChar(cursor, '{');
AllocArray(ret->minorAlleleFreq, ret->freqSourceCount);
for (ix = 0;  ix < ret->freqSourceCount;  ix++)
    ret->minorAlleleFreq[ix] = sqlDoubleComma(&cursor);
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

/* majorAllele: freqSourceCount strings */
cursor = sqlEatChar(cursor, '{');
AllocArray(ret->majorAllele, ret->freqSourceCount);
for (ix = 0;  ix < ret->freqSourceCount;  ix++)
    ret->majorAllele[ix] = sqlStringComma(&cursor);
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

/* minorAllele: freqSourceCount strings */
cursor = sqlEatChar(cursor, '{');
AllocArray(ret->minorAllele, ret->freqSourceCount);
for (ix = 0;  ix < ret->freqSourceCount;  ix++)
    ret->minorAllele[ix] = sqlStringComma(&cursor);
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

ret->maxFuncImpact = sqlUnsignedComma(&cursor);
ret->class = sqlEnumComma(&cursor, values_class, &valhash_class);
ret->ucscNotes = sqlStringComma(&cursor);
ret->_dataOffset = sqlLongLongComma(&cursor);
ret->_dataLen = sqlSignedComma(&cursor);

*pS = cursor;
return ret;
}
void bigDbSnpFree(struct bigDbSnp **pEl)
/* Release one dynamically allocated bigDbSnp, such as one made by bigDbSnpLoad(),
 * and set *pEl to NULL.  Does nothing when *pEl is already NULL. */
{
struct bigDbSnp *snp = *pEl;
if (snp == NULL)
    return;

freeMem(snp->chrom);
freeMem(snp->name);
freeMem(snp->ref);

/* For each string list, the strings are allocated together, so freeing the
 * first string releases them all; then the pointer array itself is freed. */
if (snp->alts != NULL)
    freeMem(snp->alts[0]);
freeMem(snp->alts);

freeMem(snp->minorAlleleFreq);

if (snp->majorAllele != NULL)
    freeMem(snp->majorAllele[0]);
freeMem(snp->majorAllele);

if (snp->minorAllele != NULL)
    freeMem(snp->minorAllele[0]);
freeMem(snp->minorAllele);

freeMem(snp->ucscNotes);
freez(pEl);
}
void bigDbSnpFreeList(struct bigDbSnp **pList)
/* Free every bigDbSnp on the list at *pList, then set *pList to NULL. */
{
struct bigDbSnp *cur = *pList;
while (cur != NULL)
    {
    /* Remember the successor before cur is released. */
    struct bigDbSnp *following = cur->next;
    bigDbSnpFree(&cur);
    cur = following;
    }
*pList = NULL;
}
void bigDbSnpOutput(struct bigDbSnp *el, FILE *f, char sep, char lastSep)
/* Print out bigDbSnp. Separate fields with sep. Follow last field with lastSep. */
{
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->chrom);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->chromStart);
fputc(sep,f);
fprintf(f, "%u", el->chromEnd);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->name);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->ref);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%d", el->altCount);
fputc(sep,f);
{
int i;
if (sep == ',') fputc('{',f);
for (i=0; i<el->altCount; ++i)
{
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->alts[i]);
if (sep == ',') fputc('"',f);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
}
fputc(sep,f);
fprintf(f, "%u", el->shiftBases);
fputc(sep,f);
fprintf(f, "%d", el->freqSourceCount);
fputc(sep,f);
{
int i;
if (sep == ',') fputc('{',f);
for (i=0; i<el->freqSourceCount; ++i)
{
fprintf(f, "%g", el->minorAlleleFreq[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
}
fputc(sep,f);
{
int i;
if (sep == ',') fputc('{',f);
for (i=0; i<el->freqSourceCount; ++i)
{
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->majorAllele[i]);
if (sep == ',') fputc('"',f);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
}
fputc(sep,f);
{
int i;
if (sep == ',') fputc('{',f);
for (i=0; i<el->freqSourceCount; ++i)
{
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->minorAllele[i]);
if (sep == ',') fputc('"',f);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
}
fputc(sep,f);
fprintf(f, "%u", el->maxFuncImpact);
fputc(sep,f);
if (sep == ',') fputc('"',f);
sqlEnumPrint(f, el->class, values_class);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->ucscNotes);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%lld", el->_dataOffset);
fputc(sep,f);
fprintf(f, "%d", el->_dataLen);
fputc(lastSep,f);
}
/* -------------------------------- End autoSql Generated Code -------------------------------- */
struct symbolDesc
/* Pairs a symbol string with a human-readable description of it. */
{
char *symbol;        /* Symbol to look up */
char *description;   /* Text describing symbol */
};
/* Descriptions of the ucscNotes symbols; searched by bigDbSnpDescribeUcscNote(). */
struct symbolDesc ucscNotesDesc[] =
{
{ bdsAltIsAmbiguous,
"At least one alternate allele "
"contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
{ bdsClassMismatch,
"Variation class/type is inconsistent with alleles mapped to this genome assembly." },
{ bdsClinvar,
"Variant is in ClinVar." },
+ { bdsClinvarBenign,
+ "Variant is in ClinVar with clinical significance of benign and/or likely benign." },
+ { bdsClinvarConflicting,
+ "Variant is in ClinVar with reports of both benign and pathogenic significance." },
+ { bdsClinvarPathogenic,
+ "Variant is in ClinVar with clinical significance of pathogenic and/or likely pathogenic." },
{ bdsClusterError,
"This variant has the same start, end and class as another variant; "
"they probably should have been merged into one variant." },
{ bdsCommonAll,
"Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% "
"in all projects reporting frequencies." },
{ bdsCommonSome,
"Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% "
"in some, but not all, projects reporting frequencies." },
{ bdsDiffMajor,
"Different frequency sources have different major alleles "
"(see table of allele frequencies above)." },
{ bdsFreqIsAmbiguous,
"At least one allele reported by at least one project "
"contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
{ bdsFreqNotRefAlt,
"At least one allele reported by at least one project that reports frequencies "
"does not match any of the reference or alternate alleles listed by dbSNP." },
{ bdsMultiMap,
"This variant has been mapped to more than one distinct genomic location." },
{ bdsOverlapDiffClass,
"This variant overlaps another variant with a different type/class." },
{ bdsOverlapSameClass,
"This variant overlaps another with the same type/class but different start/end." },
+ { bdsRareAll,
+ "Variant is \"rare\", i.e. has a Minor Allele Frequency of less than 1% "
+ "in all projects reporting frequencies, or has been reported without frequency data." },
+ { bdsRareSome,
+ "Variant is \"rare\", i.e. has a Minor Allele Frequency of less than 1% "
+ "in some, but not all, projects reporting frequencies." },
{ bdsRefIsAmbiguous,
"The reference genome allele "
"contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
{ bdsRefIsMinor,
"The reference genome allele is not the major allele in at least one project." },
{ bdsRefIsRare,
"The reference genome allele is rare (i.e. allele frequency < 1%)." },
{ bdsRefIsSingleton,
"The reference genome allele has never been observed "
"in a population sequencing project reporting frequencies." },
{ bdsRefMismatch,
"The reference genome allele reported by dbSNP differs from the GenBank assembly sequence." },
{ bdsRevStrand,
"The orientation of the currently viewed reference genome sequence is different from "
"the orientation of dbSNP's preferred assembly; alleles are "
"presented on the forward strand of the currently viewed reference sequence." },
};
char *bigDbSnpDescribeUcscNote(char *ucscNote)
/* Return a string describing ucscNote, unless it is unrecognized (or NULL) in which case
 * return NULL.  Do not free returned value. */
{
/* A NULL note cannot match any symbol; return early instead of handing NULL to sameString. */
if (ucscNote == NULL)
    return NULL;
size_t i;
for (i = 0;  i < ArraySize(ucscNotesDesc);  i++)
    {
    if (sameString(ucscNote, ucscNotesDesc[i].symbol))
        return ucscNotesDesc[i].description;
    }
return NULL;
}
char *bigDbSnpClassToString(enum bigDbSnpClass class)
/* Map an enum bigDbSnpClass value to its string name; errAbort on a value that
 * is not one of the known classes.  Do not free result. */
{
switch (class)
    {
    case bigDbSnpSnv:
        return "snv";
    case bigDbSnpMnv:
        return "mnv";
    case bigDbSnpIns:
        return "ins";
    case bigDbSnpDel:
        return "del";
    case bigDbSnpDelins:
        return "delins";
    case bigDbSnpIdentity:
        return "identity";
    default:
        errAbort("bigDbSnpClassToString: unrecognized value %d", (int)class);
    }
return NULL;
}
static boolean abbrevNRepeat(char *allele, int n, char *buf, size_t bufLen)
/* If allele begins with a run in which its first n bases repeat for at least
 * max(2n, n+4) bases, and the form "(unit)count" followed by the rest of allele fits in
 * buf (bufLen bytes, including the terminating '\0'), write that form into buf and
 * return TRUE.  The rest of allele is itself abbreviated with the same n when possible.
 * Otherwise return FALSE; buf may have been modified and should be ignored. */
{
boolean canAbbrev = FALSE;
int len = strlen(allele);
int minAbbrevLen = max(n*2, n+4);
if (len >= minAbbrevLen && bufLen >= (size_t)minAbbrevLen)
    {
    // Count complete copies of the first n bases; i stops at the first base that breaks
    // the period-n pattern (or at the end of allele).
    int reps = 1;
    int i;
    for (i = n;  i < len;  i++)
        {
        if (allele[i] != allele[i-n])
            break;
        if (i % n == n-1)
            reps++;
        }
    if (i >= minAbbrevLen)
        {
        // End of repeating section; are there enough repeats to make the notation shorter?
        char repeatUnit[n+1];
        safencpy(repeatUnit, sizeof repeatUnit, allele, n);
        int abbrevLen = snprintf(buf, bufLen, "(%s)%d", repeatUnit, reps);
        if (abbrevLen < 0 || (size_t)abbrevLen >= bufLen)
            {
            // snprintf reports the length it wanted to write, so a value >= bufLen means
            // the output was truncated.  Stop here: buf+abbrevLen would point past the end
            // of buf and bufLen-abbrevLen would wrap around to a huge unsigned value.
            buf[0] = '\0';
            return FALSE;
            }
        // Does the rest of the sequence start with a different repeat?
        char *bufRest = buf + abbrevLen;
        size_t bufRestLen = bufLen - (size_t)abbrevLen;
        char *alRest = allele + (reps * n);
        if (bufRestLen > 5 && abbrevNRepeat(alRest, n, bufRest, bufRestLen))
            abbrevLen = strlen(buf);
        else
            abbrevLen += snprintf(bufRest, bufRestLen, "%s", alRest);
        // If copying the rest was truncated, abbrevLen is now >= bufLen.
        if (abbrevLen >= 0 && (size_t)abbrevLen < bufLen)
            canAbbrev = TRUE;
        else
            buf[0] = '\0';
        }
    }
return canAbbrev;
}
char *bigDbSnpAbbrevAllele(char *allele, char *buf, size_t bufLen)
/* If allele can be abbreviated to something shorter than itself that fits in buf,
 * and doesn't end up with a tiny bit of abbreviation followed by a bunch of unabbreviated
 * sequence, then put the abbreviation in buf and return buf; otherwise return allele.
 * If allele is the empty string, returns "-" (in buf).
 * buf is used as scratch space, so its contents are unspecified when allele is returned. */
{
if (isEmpty(allele))
    {
    safecpy(buf, bufLen, "-");
    return buf;
    }
char *abbrev = allele;
// Longest repeat unit to try.  bufLen is unsigned, so bufLen - 3 would wrap around for
// bufLen < 3; such a buffer is too small for any abbreviation, so try nothing.
int maxN = (bufLen > 3) ? (int)((bufLen - 3) / 2) : 0;
int n;
// Try the shortest repeat unit first and keep the first abbreviation that works.
for (n = 1;  n <= maxN;  n++)
    {
    if (abbrevNRepeat(allele, n, buf, bufLen))
        {
        abbrev = buf;
        break;
        }
    }
if (abbrev == buf)
    {
    int alLen = strlen(buf);
    // Everything up to and including the last ')' is abbreviated; what follows is not.
    char *abbrevEnd = strrchr(buf, ')');
    if (abbrevEnd == NULL)
        errAbort("bigDbSnpAbbrevAllele: expect abbreviated allele '%s' to contain at least one ')'",
                 buf);
    int abbrevLen = abbrevEnd + 1 - buf;
    if (abbrevLen < (alLen >> 2))
        {
        // Never mind, the abbreviated portion is much smaller than the unabbreviated portion.
        abbrev = allele;
        }
    }
return abbrev;
}