589685da775f5a122bb23d2b8eca372f53e4b5bc angie Wed Sep 10 16:34:34 2014 -0700
Changes to snpNcbiToUcsc to handle b141 on hg19 and hg38:
- When checking range{I,D,S}* locTypes, use strlen(refNCBI) as length only if refNCBI contains only nucleotides.
- Checking for overlapping variants at the same location (checkCluster -> DuplicateObserved, MixedObserved) used to be limited to insertions because those are easier to check. Now we check all variants that have all-nucleotide observed. Libified some code from hgTracks that can reverse-complement a slash-separated string of alleles, e.g. G/T -> A/C, -/AG -> -/CT.
- SNP IDs are getting too huge (and sparse) to use as indexes into a statically allocated array; use a hash.
- Erroneous NULL frequencies in dbSNP's SNPAlleleFreq need to be detected and ignored.
refs #13309
diff --git src/lib/dnautil.c src/lib/dnautil.c
index d02806d..7d8a665 100644
--- src/lib/dnautil.c
+++ src/lib/dnautil.c
@@ -462,30 +462,71 @@ temp = *pStart;
 *pStart = size - *pEnd;
 *pEnd = size - temp;
 }
 
 
 /* Switch start/end (zero based half open) coordinates
  * to opposite strand. */
 void reverseUnsignedRange(unsigned *pStart, unsigned *pEnd, int size)
 {
 unsigned temp;
 temp = *pStart;
 *pStart = size - *pEnd;
 *pEnd = size - temp;
 }
 
+char *reverseComplementSlashSeparated(char *alleleStr)
+/* Given a slash-separated series of sequences (a common representation of variant alleles),
+ * returns a slash-sep series with the reverse complement of each sequence (if it is a
+ * nucleotide sequence).
+ * Special behavior to support dbSNP's variant allele conventions:
+ * 1. Reverse the order of sequences (to maintain alphabetical ordering).
+ * 2. If alleleStr begins with "-/", then after reversing, move "-/" back to the beginning.
+ * alleleStr is not modified; the work is done on a local copy.  The returned string is
+ * allocated with needMem(). */
+{
+int len = strlen(alleleStr);
+char choppyCopy[len+1];
+safecpy(choppyCopy, sizeof(choppyCopy), alleleStr);
+// A string of len characters contains at most len slashes, i.e. at most len+1 fields.
+// Sizing for len+1 leaves room for every field and avoids a zero-length array (undefined
+// behavior in C) when alleleStr is empty.
+char *alleles[len+1];
+int alCount = chopByChar(choppyCopy, '/', alleles, ArraySize(alleles));
+char *outStr = needMem(len+1);
+int i;
+for (i = alCount-1; i >= 0; i--)
+    {
+    char *allele = alleles[i];
+    int alLen = strlen(allele);
+    if (isAllNt(allele, alLen))
+        reverseComplement(allele, alLen);
+    if (i != alCount-1)
+        safecat(outStr, len+1, "/");
+    safecat(outStr, len+1, allele);
+    }
+if (startsWith("-/", alleleStr))
+    {
+    // Keep "-/" at the beginning: the reversed order put that first allele and its slash in
+    // the last two characters, so slide the first len-2 characters right over them.
+    memmove(outStr+2, outStr, len-2);
+    outStr[0] = '-';
+    outStr[1] = '/';
+    }
+return outStr;
+}
+
 int cmpDnaStrings(DNA *a, DNA *b)
 /* Compare using screwy non-alphabetical DNA order TCGA */
 {
 for (;;)
     {
     DNA aa = *a++;
     DNA bb = *b++;
     if (aa != bb)
         return ntVal[(int)aa] - ntVal[(int)bb];
     if (aa == 0)
         break;
     }
 return 0;
 }
 