54452ec022a6073410955c04e110a1784f71fb57 angie Wed Nov 13 17:37:34 2019 -0800 dbSnp153: add new ucscNote otherMapErr for mappings with the same rs# as a mapping w/inconsistent SPDI in BadCoords/Map Err subtrack. refs #23283 diff --git src/hg/lib/bigDbSnp.c src/hg/lib/bigDbSnp.c index 1116859..9fd0390 100644 --- src/hg/lib/bigDbSnp.c +++ src/hg/lib/bigDbSnp.c @@ -1,489 +1,491 @@ /* bigDbSnp.c was originally generated by the autoSql program, which also * generated bigDbSnp.h and bigDbSnp.sql. This module links the database and * the RAM representation of objects. */ #include "common.h" #include "linefile.h" #include "dystring.h" #include "jksql.h" #include "bigDbSnp.h" char *bigDbSnpCommaSepFieldNames = "chrom,chromStart,chromEnd,name,ref,altCount,alts,shiftBases,freqSourceCount,minorAlleleFreq,majorAllele,minorAllele,maxFuncImpact,class,ucscNotes,_dataOffset,_dataLen"; /* definitions for class column */ static char *values_class[] = {"snv", "mnv", "ins", "del", "delins", "identity", NULL}; static struct hash *valhash_class = NULL; struct bigDbSnp *bigDbSnpLoad(char **row) /* Load a bigDbSnp from row fetched with select * from bigDbSnp * from database. Dispose of this with bigDbSnpFree(). */ { struct bigDbSnp *ret; AllocVar(ret); ret->altCount = sqlSigned(row[5]); ret->freqSourceCount = sqlSigned(row[8]); ret->chrom = cloneString(row[0]); ret->chromStart = sqlUnsigned(row[1]); ret->chromEnd = sqlUnsigned(row[2]); ret->name = cloneString(row[3]); ret->ref = cloneString(row[4]); { int sizeOne; sqlStringDynamicArray(row[6], &ret->alts, &sizeOne); assert(sizeOne == ret->altCount); } ret->shiftBases = sqlUnsigned(row[7]); { int sizeOne; sqlDoubleDynamicArray(row[9], &ret->minorAlleleFreq, &sizeOne); assert(sizeOne == ret->freqSourceCount); } { int sizeOne; sqlStringDynamicArray(row[10], &ret->majorAllele, &sizeOne); assert(sizeOne == ret->freqSourceCount); } { int sizeOne; sqlStringDynamicArray(row[11], &ret->minorAllele, &sizeOne); assert(sizeOne == ret->freqSourceCount); } ret->maxFuncImpact = sqlUnsigned(row[12]); ret->class = sqlEnumParse(row[13], values_class, &valhash_class); ret->ucscNotes = cloneString(row[14]); ret->_dataOffset = sqlLongLong(row[15]); ret->_dataLen = sqlSigned(row[16]); return ret; } struct bigDbSnp *bigDbSnpLoadAll(char *fileName) /* Load all bigDbSnp from a whitespace-separated file. * Dispose of this with bigDbSnpFreeList(). */ { struct bigDbSnp *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[17]; while (lineFileRow(lf, row)) { el = bigDbSnpLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct bigDbSnp *bigDbSnpLoadAllByChar(char *fileName, char chopper) /* Load all bigDbSnp from a chopper separated file. * Dispose of this with bigDbSnpFreeList(). */ { struct bigDbSnp *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[17]; while (lineFileNextCharRow(lf, chopper, row, ArraySize(row))) { el = bigDbSnpLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct bigDbSnp *bigDbSnpCommaIn(char **pS, struct bigDbSnp *ret) /* Create a bigDbSnp out of a comma separated string. * This will fill in ret if non-null, otherwise will * return a new bigDbSnp */ { char *s = *pS; if (ret == NULL) AllocVar(ret); ret->chrom = sqlStringComma(&s); ret->chromStart = sqlUnsignedComma(&s); ret->chromEnd = sqlUnsignedComma(&s); ret->name = sqlStringComma(&s); ret->ref = sqlStringComma(&s); ret->altCount = sqlSignedComma(&s); { int i; s = sqlEatChar(s, '{'); AllocArray(ret->alts, ret->altCount); for (i=0; ialtCount; ++i) { ret->alts[i] = sqlStringComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); } ret->shiftBases = sqlUnsignedComma(&s); ret->freqSourceCount = sqlSignedComma(&s); { int i; s = sqlEatChar(s, '{'); AllocArray(ret->minorAlleleFreq, ret->freqSourceCount); for (i=0; ifreqSourceCount; ++i) { ret->minorAlleleFreq[i] = sqlDoubleComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); } { int i; s = sqlEatChar(s, '{'); AllocArray(ret->majorAllele, ret->freqSourceCount); for (i=0; ifreqSourceCount; ++i) { ret->majorAllele[i] = sqlStringComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); } { int i; s = sqlEatChar(s, '{'); AllocArray(ret->minorAllele, ret->freqSourceCount); for (i=0; ifreqSourceCount; ++i) { ret->minorAllele[i] = sqlStringComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); } ret->maxFuncImpact = sqlUnsignedComma(&s); ret->class = sqlEnumComma(&s, values_class, &valhash_class); ret->ucscNotes = sqlStringComma(&s); ret->_dataOffset = sqlLongLongComma(&s); ret->_dataLen = sqlSignedComma(&s); *pS = s; return ret; } void bigDbSnpFree(struct bigDbSnp **pEl) /* Free a single dynamically allocated bigDbSnp such as created * with bigDbSnpLoad(). */ { struct bigDbSnp *el; if ((el = *pEl) == NULL) return; freeMem(el->chrom); freeMem(el->name); freeMem(el->ref); /* All strings in alts are allocated at once, so only need to free first. */ if (el->alts != NULL) freeMem(el->alts[0]); freeMem(el->alts); freeMem(el->minorAlleleFreq); /* All strings in majorAllele are allocated at once, so only need to free first. */ if (el->majorAllele != NULL) freeMem(el->majorAllele[0]); freeMem(el->majorAllele); /* All strings in minorAllele are allocated at once, so only need to free first. */ if (el->minorAllele != NULL) freeMem(el->minorAllele[0]); freeMem(el->minorAllele); freeMem(el->ucscNotes); freez(pEl); } void bigDbSnpFreeList(struct bigDbSnp **pList) /* Free a list of dynamically allocated bigDbSnp's */ { struct bigDbSnp *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; bigDbSnpFree(&el); } *pList = NULL; } void bigDbSnpOutput(struct bigDbSnp *el, FILE *f, char sep, char lastSep) /* Print out bigDbSnp. Separate fields with sep. Follow last field with lastSep. */ { if (sep == ',') fputc('"',f); fprintf(f, "%s", el->chrom); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->chromStart); fputc(sep,f); fprintf(f, "%u", el->chromEnd); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->name); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->ref); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%d", el->altCount); fputc(sep,f); { int i; if (sep == ',') fputc('{',f); for (i=0; ialtCount; ++i) { if (sep == ',') fputc('"',f); fprintf(f, "%s", el->alts[i]); if (sep == ',') fputc('"',f); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(sep,f); fprintf(f, "%u", el->shiftBases); fputc(sep,f); fprintf(f, "%d", el->freqSourceCount); fputc(sep,f); { int i; if (sep == ',') fputc('{',f); for (i=0; ifreqSourceCount; ++i) { fprintf(f, "%g", el->minorAlleleFreq[i]); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(sep,f); { int i; if (sep == ',') fputc('{',f); for (i=0; ifreqSourceCount; ++i) { if (sep == ',') fputc('"',f); fprintf(f, "%s", el->majorAllele[i]); if (sep == ',') fputc('"',f); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(sep,f); { int i; if (sep == ',') fputc('{',f); for (i=0; ifreqSourceCount; ++i) { if (sep == ',') fputc('"',f); fprintf(f, "%s", el->minorAllele[i]); if (sep == ',') fputc('"',f); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(sep,f); fprintf(f, "%u", el->maxFuncImpact); fputc(sep,f); if (sep == ',') fputc('"',f); sqlEnumPrint(f, el->class, values_class); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->ucscNotes); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%lld", el->_dataOffset); fputc(sep,f); fprintf(f, "%d", el->_dataLen); fputc(lastSep,f); } /* -------------------------------- End autoSql Generated Code -------------------------------- */ struct symbolDesc { char *symbol; char *description; }; struct symbolDesc ucscNotesDesc[] = { { bdsAltIsAmbiguous, "At least one alternate allele " "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." }, { bdsClassMismatch, "Variation class/type is inconsistent with alleles mapped to this genome assembly." }, { bdsClinvar, "Variant is in ClinVar." }, { bdsClinvarBenign, "Variant is in ClinVar with clinical significance of benign and/or likely benign." }, { bdsClinvarConflicting, "Variant is in ClinVar with reports of both benign and pathogenic significance." }, { bdsClinvarPathogenic, "Variant is in ClinVar with clinical significance of pathogenic and/or likely pathogenic." }, { bdsClusterError, "This variant has the same start, end and class as another variant; " "they probably should have been merged into one variant." }, { bdsCommonAll, "Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% " "in all projects reporting frequencies." }, { bdsCommonSome, "Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% " "in some, but not all, projects reporting frequencies." }, { bdsDiffMajor, "Different frequency sources have different major alleles " "(see table of allele frequencies above)." }, { bdsFreqIsAmbiguous, "At least one allele reported by at least one project " "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." }, { bdsFreqNotRefAlt, "The reference genome allele is not the major allele in at least one project." }, { bdsMultiMap, "This variant has been mapped to more than one distinct genomic location." }, + { bdsOtherMapErr, + "Another mapping of this variant has illegal coordinates implying indel mapping error." }, { bdsOverlapDiffClass, "This variant overlaps another variant with a different type/class." }, { bdsOverlapSameClass, "This variant overlaps another with the same type/class but different start/end." }, { bdsRareAll, "Variant is \"rare\", i.e. has a Minor Allele Frequency of less than 1% " "in all projects reporting frequencies, or has no frequency data." }, { bdsRareSome, "Variant is \"rare\", i.e. has a Minor Allele Frequency of less than 1% " "in some, but not all, projects reporting frequencies." }, { bdsRefIsAmbiguous, "The reference genome allele " "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." }, { bdsRefIsMinor, "The reference genome allele is not the major allele in at least one project." }, { bdsRefIsRare, "The reference genome allele is rare (i.e. allele frequency < 1%)." }, { bdsRefIsSingleton, "The reference genome allele has never been observed " "in a population sequencing project reporting frequencies." }, { bdsRefMismatch, "The reference genome allele reported by dbSNP differs from the GenBank assembly sequence." }, { bdsRevStrand, "The orientation of the currently viewed reference genome sequence is different from " "the orientation of dbSNP's preferred assembly; alleles are " "presented on the forward strand of the currently viewed reference sequence." }, }; char *bigDbSnpDescribeUcscNote(char *ucscNote) /* Return a string describing ucscNote, unless it is unrecognized in which case return NULL. * Do not free returned value. */ { int i; for (i = 0; i < ArraySize(ucscNotesDesc); i++) { if (sameString(ucscNote, ucscNotesDesc[i].symbol)) return ucscNotesDesc[i].description; } return NULL; } char *bigDbSnpClassToString(enum bigDbSnpClass class) /* Return the string version of enum bigDbSnpClass. Do not free result. */ { char *string = NULL; switch (class) { case bigDbSnpSnv: string = "snv"; break; case bigDbSnpMnv: string = "mnv"; break; case bigDbSnpIns: string = "ins"; break; case bigDbSnpDel: string = "del"; break; case bigDbSnpDelins: string = "delins"; break; case bigDbSnpIdentity: string = "identity"; break; default: errAbort("bigDbSnpClassToString: unrecognized value %d", (int)class); } return string; } static boolean abbrevNRepeat(char *allele, int n, char *buf, size_t bufLen) /* If allele is an N-base repeat, and a shorter representation fits in buf, return TRUE. */ { boolean canAbbrev = FALSE; int len = strlen(allele); int minAbbrevLen = max(n*2, n+4); if (len >= minAbbrevLen && bufLen >= minAbbrevLen) { int reps = 1; int i; for (i = n; i < len; i++) { if (allele[i] != allele[i-n]) break; if (i % n == n-1) reps++; } if (i >= minAbbrevLen) { // End of repeating section; are there enough repeats to make the notation shorter? char repeatUnit[n+1]; safencpy(repeatUnit, sizeof repeatUnit, allele, n); int abbrevLen = snprintf(buf, bufLen, "(%s)%d", repeatUnit, reps); // Does the rest of the sequence start with a different repeat? char *bufRest = buf+abbrevLen; size_t bufRestLen = bufLen - abbrevLen; char *alRest = allele + (reps * n); if (bufRestLen > 5 && abbrevNRepeat(alRest, n, bufRest, bufRestLen)) abbrevLen = strlen(buf); else abbrevLen += snprintf(bufRest, bufRestLen, "%s", alRest); if (abbrevLen < bufLen) canAbbrev = TRUE; else buf[0] = '\0'; } } return canAbbrev; } char *bigDbSnpAbbrevAllele(char *allele, char *buf, size_t bufLen) /* If allele can be abbreviated to something shorter than itself that fits in buf, * and doesn't end up with a tiny bit of abbreviation followed by a bunch of unabbreviated * sequence, then put the abbreviation in buf and return buf; otherwise return allele. * If allele is the empty string, returns "-" (in buf). */ { if (isEmpty(allele)) { safecpy(buf, bufLen, "-"); return buf; } char *abbrev = allele; int maxN = (bufLen - 3) / 2; int n; for (n = 1; n <= maxN; n++) { if (abbrevNRepeat(allele, n, buf, bufLen)) { abbrev = buf; break; } } if (abbrev == buf) { int alLen = strlen(buf); char *abbrevEnd = strrchr(buf, ')'); if (abbrevEnd == NULL) errAbort("bigDbSnpAbbrevAllele: expect abbreviated allele '%s' to contain at least one ')'", buf); int abbrevLen = abbrevEnd + 1 - buf; if (abbrevLen < alLen>>2) { // Never mind, the abbreviated portion is much smaller than the unabbreviated portion. abbrev = allele; } } return abbrev; }