d3752edc12da1bf08427946150f564dbdd5d2254 angie Thu Oct 24 13:55:51 2019 -0700 bigDbSnp track handler code - initial commit. refs #23283 * dnautil: Added trimRefAltLeft to get left-justified trimming (a la VCF not HGVS). * bigBedClick: do hReplaceGbdb up front in parseDetailsTablUrls instead of waiting until endpoint. * trackDbCustom.c: consolidating type-handling for wig/bigWig vs. bigBed-based big*. diff --git src/hg/lib/bigDbSnp.c src/hg/lib/bigDbSnp.c index 71c30b7..863d8af 100644 --- src/hg/lib/bigDbSnp.c +++ src/hg/lib/bigDbSnp.c @@ -1,299 +1,477 @@ /* bigDbSnp.c was originally generated by the autoSql program, which also * generated bigDbSnp.h and bigDbSnp.sql. This module links the database and * the RAM representation of objects. */ #include "common.h" #include "linefile.h" #include "dystring.h" #include "jksql.h" #include "bigDbSnp.h" char *bigDbSnpCommaSepFieldNames = "chrom,chromStart,chromEnd,name,ref,altCount,alts,shiftBases,freqSourceCount,minorAlleleFreq,majorAllele,minorAllele,maxFuncImpact,class,ucscNotes,_dataOffset,_dataLen"; /* definitions for class column */ static char *values_class[] = {"snv", "mnv", "ins", "del", "delins", "identity", NULL}; static struct hash *valhash_class = NULL; struct bigDbSnp *bigDbSnpLoad(char **row) /* Load a bigDbSnp from row fetched with select * from bigDbSnp * from database. Dispose of this with bigDbSnpFree(). */ { struct bigDbSnp *ret; AllocVar(ret); ret->altCount = sqlSigned(row[5]); ret->freqSourceCount = sqlSigned(row[8]); ret->chrom = cloneString(row[0]); ret->chromStart = sqlUnsigned(row[1]); ret->chromEnd = sqlUnsigned(row[2]); ret->name = cloneString(row[3]); ret->ref = cloneString(row[4]); { int sizeOne; sqlStringDynamicArray(row[6], &ret->alts, &sizeOne); assert(sizeOne == ret->altCount); } ret->shiftBases = sqlUnsigned(row[7]); { int sizeOne; sqlDoubleDynamicArray(row[9], &ret->minorAlleleFreq, &sizeOne); assert(sizeOne == ret->freqSourceCount); } { int sizeOne; sqlStringDynamicArray(row[10], &ret->majorAllele, &sizeOne); assert(sizeOne == ret->freqSourceCount); } { int sizeOne; sqlStringDynamicArray(row[11], &ret->minorAllele, &sizeOne); assert(sizeOne == ret->freqSourceCount); } ret->maxFuncImpact = sqlUnsigned(row[12]); ret->class = sqlEnumParse(row[13], values_class, &valhash_class); ret->ucscNotes = cloneString(row[14]); ret->_dataOffset = sqlLongLong(row[15]); ret->_dataLen = sqlSigned(row[16]); return ret; } struct bigDbSnp *bigDbSnpLoadAll(char *fileName) /* Load all bigDbSnp from a whitespace-separated file. * Dispose of this with bigDbSnpFreeList(). */ { struct bigDbSnp *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[17]; while (lineFileRow(lf, row)) { el = bigDbSnpLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct bigDbSnp *bigDbSnpLoadAllByChar(char *fileName, char chopper) /* Load all bigDbSnp from a chopper separated file. * Dispose of this with bigDbSnpFreeList(). */ { struct bigDbSnp *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[17]; while (lineFileNextCharRow(lf, chopper, row, ArraySize(row))) { el = bigDbSnpLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct bigDbSnp *bigDbSnpCommaIn(char **pS, struct bigDbSnp *ret) /* Create a bigDbSnp out of a comma separated string. * This will fill in ret if non-null, otherwise will * return a new bigDbSnp */ { char *s = *pS; if (ret == NULL) AllocVar(ret); ret->chrom = sqlStringComma(&s); ret->chromStart = sqlUnsignedComma(&s); ret->chromEnd = sqlUnsignedComma(&s); ret->name = sqlStringComma(&s); ret->ref = sqlStringComma(&s); ret->altCount = sqlSignedComma(&s); { int i; s = sqlEatChar(s, '{'); AllocArray(ret->alts, ret->altCount); for (i=0; i<ret->altCount; ++i) { ret->alts[i] = sqlStringComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); } ret->shiftBases = sqlUnsignedComma(&s); ret->freqSourceCount = sqlSignedComma(&s); { int i; s = sqlEatChar(s, '{'); AllocArray(ret->minorAlleleFreq, ret->freqSourceCount); for (i=0; i<ret->freqSourceCount; ++i) { ret->minorAlleleFreq[i] = sqlDoubleComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); } { int i; s = sqlEatChar(s, '{'); AllocArray(ret->majorAllele, ret->freqSourceCount); for (i=0; i<ret->freqSourceCount; ++i) { ret->majorAllele[i] = sqlStringComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); } { int i; s = sqlEatChar(s, '{'); AllocArray(ret->minorAllele, ret->freqSourceCount); for (i=0; i<ret->freqSourceCount; ++i) { ret->minorAllele[i] = sqlStringComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); } ret->maxFuncImpact = sqlUnsignedComma(&s); ret->class = sqlEnumComma(&s, values_class, &valhash_class); ret->ucscNotes = sqlStringComma(&s); ret->_dataOffset = sqlLongLongComma(&s); ret->_dataLen = sqlSignedComma(&s); *pS = s; return ret; } void bigDbSnpFree(struct bigDbSnp **pEl) /* Free a single dynamically allocated bigDbSnp such as created * with bigDbSnpLoad(). */ { struct bigDbSnp *el; if ((el = *pEl) == NULL) return; freeMem(el->chrom); freeMem(el->name); freeMem(el->ref); /* All strings in alts are allocated at once, so only need to free first. */ if (el->alts != NULL) freeMem(el->alts[0]); freeMem(el->alts); freeMem(el->minorAlleleFreq); /* All strings in majorAllele are allocated at once, so only need to free first. */ if (el->majorAllele != NULL) freeMem(el->majorAllele[0]); freeMem(el->majorAllele); /* All strings in minorAllele are allocated at once, so only need to free first. */ if (el->minorAllele != NULL) freeMem(el->minorAllele[0]); freeMem(el->minorAllele); freeMem(el->ucscNotes); freez(pEl); } void bigDbSnpFreeList(struct bigDbSnp **pList) /* Free a list of dynamically allocated bigDbSnp's */ { struct bigDbSnp *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; bigDbSnpFree(&el); } *pList = NULL; } void bigDbSnpOutput(struct bigDbSnp *el, FILE *f, char sep, char lastSep) /* Print out bigDbSnp. Separate fields with sep. Follow last field with lastSep. */ { if (sep == ',') fputc('"',f); fprintf(f, "%s", el->chrom); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->chromStart); fputc(sep,f); fprintf(f, "%u", el->chromEnd); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->name); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->ref); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%d", el->altCount); fputc(sep,f); { int i; if (sep == ',') fputc('{',f); for (i=0; i<el->altCount; ++i) { if (sep == ',') fputc('"',f); fprintf(f, "%s", el->alts[i]); if (sep == ',') fputc('"',f); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(sep,f); fprintf(f, "%u", el->shiftBases); fputc(sep,f); fprintf(f, "%d", el->freqSourceCount); fputc(sep,f); { int i; if (sep == ',') fputc('{',f); for (i=0; i<el->freqSourceCount; ++i) { fprintf(f, "%g", el->minorAlleleFreq[i]); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(sep,f); { int i; if (sep == ',') fputc('{',f); for (i=0; i<el->freqSourceCount; ++i) { if (sep == ',') fputc('"',f); fprintf(f, "%s", el->majorAllele[i]); if (sep == ',') fputc('"',f); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(sep,f); { int i; if (sep == ',') fputc('{',f); for (i=0; i<el->freqSourceCount; ++i) { if (sep == ',') fputc('"',f); fprintf(f, "%s", el->minorAllele[i]); if (sep == ',') fputc('"',f); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(sep,f); fprintf(f, "%u", el->maxFuncImpact); fputc(sep,f); if (sep == ',') fputc('"',f); sqlEnumPrint(f, el->class, values_class); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->ucscNotes); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%lld", el->_dataOffset); fputc(sep,f); fprintf(f, "%d", el->_dataLen); fputc(lastSep,f); } /* -------------------------------- End autoSql Generated Code -------------------------------- */ + +struct symbolDesc +{ + char *symbol; + char *description; +}; + +struct symbolDesc ucscNotesDesc[] = + { + { bdsAltIsAmbiguous, + "At least one alternate allele " + "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." }, + { bdsClassMismatch, + "Variation class/type is inconsistent with alleles mapped to this genome assembly." }, + { bdsClinvar, + "Variant is in ClinVar." }, + { bdsClusterError, + "This variant has the same start, end and class as another variant; " + "they probably should have been merged into one variant." }, + { bdsCommonAll, + "Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% " + "in all projects reporting frequencies." }, + { bdsCommonSome, + "Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% " + "in some, but not all, projects reporting frequencies." }, + { bdsDiffMajor, + "Different frequency sources have different major alleles " + "(see table of allele frequencies above)." }, + { bdsFreqIsAmbiguous, + "At least one allele reported by at least one project " + "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." }, + { bdsFreqNotRefAlt, + "The reference genome allele is not the major allele in at least one project." }, + { bdsMultiMap, + "This variant has been mapped to more than one distinct genomic location." }, + { bdsOverlapDiffClass, + "This variant overlaps another variant with a different type/class." }, + { bdsOverlapSameClass, + "This variant overlaps another with the same type/class but different start/end." }, + { bdsRefIsAmbiguous, + "The reference genome allele " + "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." }, + { bdsRefIsMinor, + "The reference genome allele is not the major allele in at least one project." }, + { bdsRefIsRare, + "The reference genome allele is rare (i.e. allele frequency < 1%)." }, + { bdsRefIsSingleton, + "The reference genome allele has never been observed " + "in a population sequencing project reporting frequencies." }, + { bdsRefMismatch, + "The reference genome allele reported by dbSNP differs from the GenBank assembly sequence." }, + { bdsRevStrand, + "The orientation of the currently viewed reference genome sequence is different from " + "the orientation of dbSNP's preferred assembly; alleles are " + "presented on the forward strand of the currently viewed reference sequence." }, + }; + +char *bigDbSnpDescribeUcscNote(char *ucscNote) +/* Return a string describing ucscNote, unless it is unrecognized in which case return NULL. + * Do not free returned value. */ +{ +int i; +for (i = 0; i < ArraySize(ucscNotesDesc); i++) + { + if (sameString(ucscNote, ucscNotesDesc[i].symbol)) + return ucscNotesDesc[i].description; + } +return NULL; +} + +char *bigDbSnpClassToString(enum bigDbSnpClass class) +/* Return the string version of enum bigDbSnpClass. Do not free result. */ +{ +char *string = NULL; +switch (class) + { + case bigDbSnpSnv: + string = "snv"; + break; + case bigDbSnpMnv: + string = "mnv"; + break; + case bigDbSnpIns: + string = "ins"; + break; + case bigDbSnpDel: + string = "del"; + break; + case bigDbSnpDelins: + string = "delins"; + break; + case bigDbSnpIdentity: + string = "identity"; + break; + default: + errAbort("bigDbSnpClassToString: unrecognized value %d", (int)class); + } +return string; +} + +static boolean abbrevNRepeat(char *allele, int n, char *buf, size_t bufLen) +/* If allele is an N-base repeat, and a shorter representation fits in buf, return TRUE. */ +{ +boolean canAbbrev = FALSE; +int len = strlen(allele); +int minAbbrevLen = max(n*2, n+4); +if (len >= minAbbrevLen && bufLen >= minAbbrevLen) + { + int reps = 1; + int i; + for (i = n; i < len; i++) + { + if (allele[i] != allele[i-n]) + break; + if (i % n == n-1) + reps++; + } + if (i >= minAbbrevLen) + { + // End of repeating section; are there enough repeats to make the notation shorter? + char repeatUnit[n+1]; + safencpy(repeatUnit, sizeof repeatUnit, allele, n); + int abbrevLen = snprintf(buf, bufLen, "(%s)%d", repeatUnit, reps); + // Does the rest of the sequence start with a different repeat? + char *bufRest = buf+abbrevLen; + size_t bufRestLen = bufLen - abbrevLen; + char *alRest = allele + (reps * n); + if (bufRestLen > 5 && abbrevNRepeat(alRest, n, bufRest, bufRestLen)) + abbrevLen = strlen(buf); + else + abbrevLen += snprintf(bufRest, bufRestLen, "%s", alRest); + if (abbrevLen < bufLen) + canAbbrev = TRUE; + else + buf[0] = '\0'; + } + } +return canAbbrev; +} + +char *bigDbSnpAbbrevAllele(char *allele, char *buf, size_t bufLen) +/* If allele can be abbreviated to something shorter than itself that fits in buf, + * and doesn't end up with a tiny bit of abbreviation followed by a bunch of unabbreviated + * sequence, then put the abbreviation in buf and return buf; otherwise return allele. + * If allele is the empty string, returns "-" (in buf). */ +{ +if (isEmpty(allele)) + { + safecpy(buf, bufLen, "-"); + return buf; + } +char *abbrev = allele; +int maxN = (bufLen - 3) / 2; +int n; +for (n = 1; n <= maxN; n++) + { + if (abbrevNRepeat(allele, n, buf, bufLen)) + { + abbrev = buf; + break; + } + } +if (abbrev == buf) + { + int alLen = strlen(buf); + char *abbrevEnd = strrchr(buf, ')'); + if (abbrevEnd == NULL) + errAbort("bigDbSnpAbbrevAllele: expect abbreviated allele '%s' to contain at least one ')'", + buf); + int abbrevLen = abbrevEnd + 1 - buf; + if (abbrevLen < alLen>>2) + { + // Never mind, the abbreviated portion is much smaller than the unabbreviated portion. + abbrev = allele; + } + } +return abbrev; +}