589685da775f5a122bb23d2b8eca372f53e4b5bc
angie
Wed Sep 10 16:34:34 2014 -0700
Changes to snpNcbiToUcsc to handle b141 on hg19 and hg38:- When checking range{I,D,S}* locTypes, use strlen(refNCBI) as length only if
refNCBI contains only nucleotides.
- Checking for overlapping variants at the same location (checkCluster ->
DuplicateObserved, MixedObserved) used to be limited to insertions because
those are easier to check. Now we check all variants that have
all-nucleotide observed. Libified some code from hgTracks that can
reverse-complement a slash-separated string of alleles, e.g. G/T -> A/C,
-/AG -> -/CT.
- SNP IDs are getting too huge (and sparse) to use as indexes into a
statically allocated array; use a hash.
- Erroneous NULL frequencies in dbSNP's SNPAlleleFreq need to be detected
and ignored.
refs #13309
diff --git src/lib/dnautil.c src/lib/dnautil.c
index d02806d..7d8a665 100644
--- src/lib/dnautil.c
+++ src/lib/dnautil.c
@@ -1,1129 +1,1164 @@
/* Some stuff that you'll likely need in any program that works with
* DNA. Includes stuff for amino acids as well.
*
* Assumes that DNA is stored as a character.
* The DNA it generates will include the bases
* as lowercase tcag. It will generally accept
* uppercase as well, and also 'n' or 'N' or '-'
* for unknown bases.
*
* Amino acids are stored as single character upper case.
*
* This file is copyright 2002 Jim Kent, but license is hereby
* granted for all use - public, private or commercial. */
#include "common.h"
#include "dnautil.h"
struct codonTable
/* The dread codon table. */
{
DNA *codon; /* Lower case. */
AA protCode; /* Upper case. The "Standard" code */
AA mitoCode; /* Upper case. Vertebrate Mitochondrial translations */
};
struct codonTable codonTable[] =
/* The master codon/protein table. */
{
{"ttt", 'F', 'F',},
{"ttc", 'F', 'F',},
{"tta", 'L', 'L',},
{"ttg", 'L', 'L',},
{"tct", 'S', 'S',},
{"tcc", 'S', 'S',},
{"tca", 'S', 'S',},
{"tcg", 'S', 'S',},
{"tat", 'Y', 'Y',},
{"tac", 'Y', 'Y',},
{"taa", 0, 0,},
{"tag", 0, 0,},
{"tgt", 'C', 'C',},
{"tgc", 'C', 'C',},
{"tga", 0, 'W',},
{"tgg", 'W', 'W',},
{"ctt", 'L', 'L',},
{"ctc", 'L', 'L',},
{"cta", 'L', 'L',},
{"ctg", 'L', 'L',},
{"cct", 'P', 'P',},
{"ccc", 'P', 'P',},
{"cca", 'P', 'P',},
{"ccg", 'P', 'P',},
{"cat", 'H', 'H',},
{"cac", 'H', 'H',},
{"caa", 'Q', 'Q',},
{"cag", 'Q', 'Q',},
{"cgt", 'R', 'R',},
{"cgc", 'R', 'R',},
{"cga", 'R', 'R',},
{"cgg", 'R', 'R',},
{"att", 'I', 'I',},
{"atc", 'I', 'I',},
{"ata", 'I', 'M',},
{"atg", 'M', 'M',},
{"act", 'T', 'T',},
{"acc", 'T', 'T',},
{"aca", 'T', 'T',},
{"acg", 'T', 'T',},
{"aat", 'N', 'N',},
{"aac", 'N', 'N',},
{"aaa", 'K', 'K',},
{"aag", 'K', 'K',},
{"agt", 'S', 'S',},
{"agc", 'S', 'S',},
{"aga", 'R', 0,},
{"agg", 'R', 0,},
{"gtt", 'V', 'V',},
{"gtc", 'V', 'V',},
{"gta", 'V', 'V',},
{"gtg", 'V', 'V',},
{"gct", 'A', 'A',},
{"gcc", 'A', 'A',},
{"gca", 'A', 'A',},
{"gcg", 'A', 'A',},
{"gat", 'D', 'D',},
{"gac", 'D', 'D',},
{"gaa", 'E', 'E',},
{"gag", 'E', 'E',},
{"ggt", 'G', 'G',},
{"ggc", 'G', 'G',},
{"gga", 'G', 'G',},
{"ggg", 'G', 'G',},
};
/* A table that gives values 0 for t
1 for c
2 for a
3 for g
* (which is order aa's are in biochemistry codon tables)
* and gives -1 for all others. */
int ntVal[256];
int ntValLower[256]; /* NT values only for lower case. */
int ntValUpper[256]; /* NT values only for upper case. */
int ntVal5[256];
int ntValNoN[256]; /* Like ntVal, but with T_BASE_VAL in place of -1 for nonexistent ones. */
DNA valToNt[(N_BASE_VAL|MASKED_BASE_BIT)+1];
/* convert tables for bit-4 indicating masked */
int ntValMasked[256];
DNA valToNtMasked[256];
static boolean inittedNtVal = FALSE;
static void initNtVal()
{
if (!inittedNtVal)
{
int i;
for (i=0; i<ArraySize(ntVal); i++)
{
ntValUpper[i] = ntValLower[i] = ntVal[i] = -1;
ntValNoN[i] = T_BASE_VAL;
if (isspace(i) || isdigit(i))
ntVal5[i] = ntValMasked[i] = -1;
else
{
ntVal5[i] = N_BASE_VAL;
ntValMasked[i] = (islower(i) ? (N_BASE_VAL|MASKED_BASE_BIT) : N_BASE_VAL);
}
}
ntVal5['t'] = ntVal5['T'] = ntValNoN['t'] = ntValNoN['T'] = ntVal['t'] = ntVal['T'] =
ntValLower['t'] = ntValUpper['T'] = T_BASE_VAL;
ntVal5['u'] = ntVal5['U'] = ntValNoN['u'] = ntValNoN['U'] = ntVal['u'] = ntVal['U'] =
ntValLower['u'] = ntValUpper['U'] = U_BASE_VAL;
ntVal5['c'] = ntVal5['C'] = ntValNoN['c'] = ntValNoN['C'] = ntVal['c'] = ntVal['C'] =
ntValLower['c'] = ntValUpper['C'] = C_BASE_VAL;
ntVal5['a'] = ntVal5['A'] = ntValNoN['a'] = ntValNoN['A'] = ntVal['a'] = ntVal['A'] =
ntValLower['a'] = ntValUpper['A'] = A_BASE_VAL;
ntVal5['g'] = ntVal5['G'] = ntValNoN['g'] = ntValNoN['G'] = ntVal['g'] = ntVal['G'] =
ntValLower['g'] = ntValUpper['G'] = G_BASE_VAL;
valToNt[T_BASE_VAL] = valToNt[T_BASE_VAL|MASKED_BASE_BIT] = 't';
valToNt[C_BASE_VAL] = valToNt[C_BASE_VAL|MASKED_BASE_BIT] = 'c';
valToNt[A_BASE_VAL] = valToNt[A_BASE_VAL|MASKED_BASE_BIT] = 'a';
valToNt[G_BASE_VAL] = valToNt[G_BASE_VAL|MASKED_BASE_BIT] = 'g';
valToNt[N_BASE_VAL] = valToNt[N_BASE_VAL|MASKED_BASE_BIT] = 'n';
/* masked values */
ntValMasked['T'] = T_BASE_VAL;
ntValMasked['U'] = U_BASE_VAL;
ntValMasked['C'] = C_BASE_VAL;
ntValMasked['A'] = A_BASE_VAL;
ntValMasked['G'] = G_BASE_VAL;
ntValMasked['t'] = T_BASE_VAL|MASKED_BASE_BIT;
ntValMasked['u'] = U_BASE_VAL|MASKED_BASE_BIT;
ntValMasked['c'] = C_BASE_VAL|MASKED_BASE_BIT;
ntValMasked['a'] = A_BASE_VAL|MASKED_BASE_BIT;
ntValMasked['g'] = G_BASE_VAL|MASKED_BASE_BIT;
valToNtMasked[T_BASE_VAL] = 'T';
valToNtMasked[C_BASE_VAL] = 'C';
valToNtMasked[A_BASE_VAL] = 'A';
valToNtMasked[G_BASE_VAL] = 'G';
valToNtMasked[N_BASE_VAL] = 'N';
valToNtMasked[T_BASE_VAL|MASKED_BASE_BIT] = 't';
valToNtMasked[C_BASE_VAL|MASKED_BASE_BIT] = 'c';
valToNtMasked[A_BASE_VAL|MASKED_BASE_BIT] = 'a';
valToNtMasked[G_BASE_VAL|MASKED_BASE_BIT] = 'g';
valToNtMasked[N_BASE_VAL|MASKED_BASE_BIT] = 'n';
inittedNtVal = TRUE;
}
}
/* Returns one letter code for protein,
* 0 for stop codon or X for bad input,
* The "Standard" Code */
AA lookupCodon(DNA *dna)
{
int ix;
int i;
char c;
if (!inittedNtVal)
initNtVal();
ix = 0;
for (i=0; i<3; ++i)
{
int bv = ntVal[(int)dna[i]];
if (bv<0)
return 'X';
ix = (ix<<2) + bv;
}
c = codonTable[ix].protCode;
return c;
}
boolean isStopCodon(DNA *dna)
/* Return TRUE if it's a stop codon. */
{
return lookupCodon(dna) == 0;
}
boolean isKozak(char *dna, int dnaSize, int pos)
/* Return TRUE if it's a Kozak compatible start. */
{
if (lookupCodon(dna+pos) != 'M')
{
return FALSE;
}
if (pos + 3 < dnaSize)
{
if (ntVal[(int)dna[pos+3]] == G_BASE_VAL)
return TRUE;
}
if (pos >= 3)
{
int c = ntVal[(int)dna[pos-3]];
if (c == A_BASE_VAL || c == G_BASE_VAL)
return TRUE;
}
return FALSE;
}
boolean isReallyStopCodon(char *dna, boolean selenocysteine)
/* Return TRUE if it's really a stop codon, even considering
* possibilility of selenocysteine. */
{
if (selenocysteine)
{
/* Luckily the mitochondria *also* replaces TGA with
* something else, even though it isn't selenocysteine */
return lookupMitoCodon(dna) == 0;
}
else
{
return lookupCodon(dna) == 0;
}
}
/* Returns one letter code for protein,
* 0 for stop codon or X for bad input,
* Vertebrate Mitochondrial Code */
AA lookupMitoCodon(DNA *dna)
{
int ix;
int i;
char c;
if (!inittedNtVal)
initNtVal();
ix = 0;
for (i=0; i<3; ++i)
{
int bv = ntVal[(int)dna[i]];
if (bv<0)
return 'X';
ix = (ix<<2) + bv;
}
c = codonTable[ix].mitoCode;
c = toupper(c);
return c;
}
Codon codonVal(DNA *start)
/* Return value from 0-63 of codon starting at start.
* Returns -1 if not a codon. */
{
int v1,v2,v3;
if ((v1 = ntVal[(int)start[0]]) < 0)
return -1;
if ((v2 = ntVal[(int)start[1]]) < 0)
return -1;
if ((v3 = ntVal[(int)start[2]]) < 0)
return -1;
return ((v1<<4) + (v2<<2) + v3);
}
DNA *valToCodon(int val)
/* Return codon corresponding to val (0-63) */
{
assert(val >= 0 && val < 64);
return codonTable[val].codon;
}
void dnaTranslateSome(DNA *dna, char *out, int outSize)
/* Translate DNA upto a stop codon or until outSize-1 amino acids,
* whichever comes first. Output will be zero terminated. */
{
int i;
int dnaSize;
int protSize = 0;
outSize -= 1; /* Room for terminal zero */
dnaSize = strlen(dna);
for (i=0; i<dnaSize-2; i+=3)
{
if (protSize >= outSize)
break;
if ((out[protSize++] = lookupCodon(dna+i)) == 0)
break;
}
out[protSize] = 0;
}
/* A little array to help us decide if a character is a
* nucleotide, and if so convert it to lower case. */
char ntChars[256];
static void initNtChars()
{
static boolean initted = FALSE;
if (!initted)
{
zeroBytes(ntChars, sizeof(ntChars));
ntChars['a'] = ntChars['A'] = 'a';
ntChars['c'] = ntChars['C'] = 'c';
ntChars['g'] = ntChars['G'] = 'g';
ntChars['t'] = ntChars['T'] = 't';
ntChars['n'] = ntChars['N'] = 'n';
ntChars['u'] = ntChars['U'] = 'u';
ntChars['-'] = 'n';
initted = TRUE;
}
}
char ntMixedCaseChars[256];
static void initNtMixedCaseChars()
{
static boolean initted = FALSE;
if (!initted)
{
zeroBytes(ntMixedCaseChars, sizeof(ntMixedCaseChars));
ntMixedCaseChars['a'] = 'a';
ntMixedCaseChars['A'] = 'A';
ntMixedCaseChars['c'] = 'c';
ntMixedCaseChars['C'] = 'C';
ntMixedCaseChars['g'] = 'g';
ntMixedCaseChars['G'] = 'G';
ntMixedCaseChars['t'] = 't';
ntMixedCaseChars['T'] = 'T';
ntMixedCaseChars['n'] = 'n';
ntMixedCaseChars['N'] = 'N';
ntMixedCaseChars['u'] = 'u';
ntMixedCaseChars['U'] = 'U';
ntMixedCaseChars['-'] = 'n';
initted = TRUE;
}
}
/* Another array to help us do complement of DNA */
DNA ntCompTable[256];
static boolean inittedCompTable = FALSE;
static void initNtCompTable()
{
zeroBytes(ntCompTable, sizeof(ntCompTable));
ntCompTable[' '] = ' ';
ntCompTable['-'] = '-';
ntCompTable['='] = '=';
ntCompTable['a'] = 't';
ntCompTable['c'] = 'g';
ntCompTable['g'] = 'c';
ntCompTable['t'] = 'a';
ntCompTable['u'] = 'a';
ntCompTable['n'] = 'n';
ntCompTable['-'] = '-';
ntCompTable['.'] = '.';
ntCompTable['A'] = 'T';
ntCompTable['C'] = 'G';
ntCompTable['G'] = 'C';
ntCompTable['T'] = 'A';
ntCompTable['U'] = 'A';
ntCompTable['N'] = 'N';
ntCompTable['R'] = 'Y';
ntCompTable['Y'] = 'R';
ntCompTable['M'] = 'K';
ntCompTable['K'] = 'M';
ntCompTable['S'] = 'S';
ntCompTable['W'] = 'W';
ntCompTable['V'] = 'B';
ntCompTable['H'] = 'D';
ntCompTable['D'] = 'H';
ntCompTable['B'] = 'V';
ntCompTable['X'] = 'N';
ntCompTable['r'] = 'y';
ntCompTable['y'] = 'r';
ntCompTable['s'] = 's';
ntCompTable['w'] = 'w';
ntCompTable['m'] = 'k';
ntCompTable['k'] = 'm';
ntCompTable['v'] = 'b';
ntCompTable['h'] = 'd';
ntCompTable['d'] = 'h';
ntCompTable['b'] = 'v';
ntCompTable['x'] = 'n';
ntCompTable['('] = ')';
ntCompTable[')'] = '(';
inittedCompTable = TRUE;
}
/* Complement DNA (not reverse). */
void complement(DNA *dna, long length)
{
int i;
if (!inittedCompTable) initNtCompTable();
for (i=0; i<length; ++i)
{
*dna = ntCompTable[(int)*dna];
++dna;
}
}
/* Reverse complement DNA. */
void reverseComplement(DNA *dna, long length)
{
reverseBytes(dna, length);
complement(dna, length);
}
/* Reverse offset - return what will be offset (0 based) to
* same member of array after array is reversed. */
long reverseOffset(long offset, long arraySize)
{
return arraySize-1 - offset;
}
/* Switch start/end (zero based half open) coordinates
* to opposite strand. */
void reverseIntRange(int *pStart, int *pEnd, int size)
{
int temp;
temp = *pStart;
*pStart = size - *pEnd;
*pEnd = size - temp;
}
/* Switch start/end (zero based half open) coordinates
* to opposite strand. */
void reverseUnsignedRange(unsigned *pStart, unsigned *pEnd, int size)
{
unsigned temp;
temp = *pStart;
*pStart = size - *pEnd;
*pEnd = size - temp;
}
+char *reverseComplementSlashSeparated(char *alleleStr)
+/* Given a slash-separated series of sequences (a common representation of variant alleles),
+ * returns a slash-sep series with the reverse complement of each sequence (if it is a
+ * nucleotide sequence).
+ * Special behavior to support dbSNP's variant allele conventions:
+ * 1. Reverse the order of sequences (to maintain alphabetical ordering).
+ * 2. If alleleStr begins with "-/", then after reversing, move "-/" back to the beginning. */
+{
+int len = strlen(alleleStr);
+char choppyCopy[len+1];
+safecpy(choppyCopy, sizeof(choppyCopy), alleleStr);
+char *alleles[len];
+int alCount = chopByChar(choppyCopy, '/', alleles, ArraySize(alleles));
+char *outStr = needMem(len+1);
+int i;
+for (i = alCount-1; i >= 0; i--)
+ {
+ char *allele = alleles[i];
+ int alLen = strlen(allele);
+ if (isAllNt(allele, alLen))
+ reverseComplement(allele, alLen);
+ if (i != alCount-1)
+ safecat(outStr, len+1, "/");
+ safecat(outStr, len+1, allele);
+ }
+if (startsWith("-/", alleleStr))
+ {
+ // Keep "-/" at the beginning:
+ memmove(outStr+2, outStr, len-2);
+ outStr[0] = '-';
+ outStr[1] = '/';
+ }
+return outStr;
+}
+
int cmpDnaStrings(DNA *a, DNA *b)
/* Compare using screwy non-alphabetical DNA order TCGA */
{
for (;;)
{
DNA aa = *a++;
DNA bb = *b++;
if (aa != bb)
return ntVal[(int)aa] - ntVal[(int)bb];
if (aa == 0)
break;
}
return 0;
}
/* Convert T's to U's */
void toRna(DNA *dna)
{
DNA c;
for (;;)
{
c = *dna;
if (c == 't')
*dna = 'u';
else if (c == 'T')
*dna = 'U';
else if (c == 0)
break;
++dna;
}
}
char *skipIgnoringDash(char *a, int size, bool skipTrailingDash)
/* Count size number of characters, and any
* dash characters. */
{
while (size > 0)
{
if (*a++ != '-')
--size;
}
if (skipTrailingDash)
while (*a == '-')
++a;
return a;
}
int countNonDash(char *a, int size)
/* Count number of non-dash characters. */
{
int count = 0;
int i;
for (i=0; i<size; ++i)
if (a[i] != '-')
++count;
return count;
}
int nextPowerOfFour(long x)
/* Return next power of four that would be greater or equal to x.
* For instance if x < 4, return 1, if x < 16 return 2....
* (From biological point of view how many bases are needed to
* code this number.) */
{
int count = 1;
while (x > 4)
{
count += 1;
x >>= 2;
}
return count;
}
long dnaOrAaFilteredSize(char *raw, char filter[256])
/* Return how long DNA will be after non-DNA is filtered out. */
{
char c;
long count = 0;
dnaUtilOpen();
while ((c = *raw++) != 0)
{
if (filter[(int)c]) ++count;
}
return count;
}
void dnaOrAaFilter(char *in, char *out, char filter[256])
/* Run chars through filter. */
{
char c;
dnaUtilOpen();
while ((c = *in++) != 0)
{
if ((c = filter[(int)c]) != 0) *out++ = c;
}
*out++ = 0;
}
long dnaFilteredSize(char *rawDna)
/* Return how long DNA will be after non-DNA is filtered out. */
{
return dnaOrAaFilteredSize(rawDna, ntChars);
}
void dnaFilter(char *in, DNA *out)
/* Filter out non-DNA characters and change to lower case. */
{
dnaOrAaFilter(in, out, ntChars);
}
void dnaFilterToN(char *in, DNA *out)
/* Change all non-DNA characters to N. */
{
DNA c;
initNtChars();
while ((c = *in++) != 0)
{
if ((c = ntChars[(int)c]) != 0) *out++ = c;
else *out++ = 'n';
}
*out++ = 0;
}
void dnaMixedCaseFilter(char *in, DNA *out)
/* Filter out non-DNA characters but leave case intact. */
{
dnaOrAaFilter(in, out, ntMixedCaseChars);
}
long aaFilteredSize(char *raw)
/* Return how long aa will be after non-aa chars is filtered out. */
{
return dnaOrAaFilteredSize(raw, aaChars);
}
void aaFilter(char *in, DNA *out)
/* Filter out non-aa characters and change to upper case. */
{
dnaOrAaFilter(in, out, aaChars);
}
void upperToN(char *s, int size)
/* Turn upper case letters to N's. */
{
char c;
int i;
for (i=0; i<size; ++i)
{
c = s[i];
if (isupper(c))
s[i] = 'n';
}
}
void lowerToN(char *s, int size)
/* Turn lower case letters to N's. */
{
char c;
int i;
for (i=0; i<size; ++i)
{
c = s[i];
if (islower(c))
s[i] = 'N';
}
}
void dnaBaseHistogram(DNA *dna, int dnaSize, int histogram[4])
/* Count up frequency of occurance of each base and store
* results in histogram. */
{
int val;
zeroBytes(histogram, 4*sizeof(int));
while (--dnaSize >= 0)
{
if ((val = ntVal[(int)*dna++]) >= 0)
++histogram[val];
}
}
bits64 basesToBits64(char *dna, int size)
/* Convert dna of given size (up to 32) to binary representation */
{
if (size > 32)
errAbort("basesToBits64 called on %d bases, can only go up to 32", size);
bits64 result = 0;
int i;
for (i=0; i<size; ++i)
{
result <<= 2;
result += ntValNoN[(int)dna[i]];
}
return result;
}
bits32 packDna16(DNA *in)
/* pack 16 bases into a word */
{
bits32 out = 0;
int count = 16;
int bVal;
while (--count >= 0)
{
bVal = ntValNoN[(int)*in++];
out <<= 2;
out += bVal;
}
return out;
}
bits16 packDna8(DNA *in)
/* Pack 8 bases into a short word */
{
bits16 out = 0;
int count = 8;
int bVal;
while (--count >= 0)
{
bVal = ntValNoN[(int)*in++];
out <<= 2;
out += bVal;
}
return out;
}
UBYTE packDna4(DNA *in)
/* Pack 4 bases into a UBYTE */
{
UBYTE out = 0;
int count = 4;
int bVal;
while (--count >= 0)
{
bVal = ntValNoN[(int)*in++];
out <<= 2;
out += bVal;
}
return out;
}
void unpackDna(bits32 *tiles, int tileCount, DNA *out)
/* Unpack DNA. Expands to 16x tileCount in output. */
{
int i, j;
bits32 tile;
for (i=0; i<tileCount; ++i)
{
tile = tiles[i];
for (j=15; j>=0; --j)
{
out[j] = valToNt[tile & 0x3];
tile >>= 2;
}
out += 16;
}
}
void unpackDna4(UBYTE *tiles, int byteCount, DNA *out)
/* Unpack DNA. Expands to 4x byteCount in output. */
{
int i, j;
UBYTE tile;
for (i=0; i<byteCount; ++i)
{
tile = tiles[i];
for (j=3; j>=0; --j)
{
out[j] = valToNt[tile & 0x3];
tile >>= 2;
}
out += 4;
}
}
static void checkSizeTypes()
/* Make sure that some of our predefined types are the right size. */
{
assert(sizeof(UBYTE) == 1);
assert(sizeof(WORD) == 2);
assert(sizeof(bits32) == 4);
assert(sizeof(bits16) == 2);
}
int intronOrientationMinSize(DNA *iStart, DNA *iEnd, int minIntronSize)
/* Given a gap in genome from iStart to iEnd, return
* Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no
* intron. Assumes DNA is lower cased. */
{
if (iEnd - iStart < minIntronSize)
return 0;
if (iStart[0] == 'g' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'g')
{
return 1;
}
else if (iStart[0] == 'c' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'c')
{
return -1;
}
else
return 0;
}
int intronOrientation(DNA *iStart, DNA *iEnd)
/* Given a gap in genome from iStart to iEnd, return
* Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no
* intron. Assumes DNA is lower cased. */
{
return intronOrientationMinSize(iStart, iEnd, 32);
}
int dnaScore2(DNA a, DNA b)
/* Score match between two bases (relatively crudely). */
{
if (a == 'n' || b == 'n') return 0;
if (a == b) return 1;
else return -1;
}
int dnaOrAaScoreMatch(char *a, char *b, int size, int matchScore, int mismatchScore,
char ignore)
/* Compare two sequences (without inserts or deletions) and score. */
{
int i;
int score = 0;
for (i=0; i<size; ++i)
{
char aa = a[i];
char bb = b[i];
if (aa == ignore || bb == ignore)
continue;
if (aa == bb)
score += matchScore;
else
score += mismatchScore;
}
return score;
}
int dnaScoreMatch(DNA *a, DNA *b, int size)
/* Compare two pieces of DNA base by base. Total mismatches are
* subtracted from total matches and returned as score. 'N's
* neither hurt nor help score. */
{
return dnaOrAaScoreMatch(a, b, size, 1, -1, 'n');
}
int aaScore2(AA a, AA b)
/* Score match between two amino acids (relatively crudely). */
{
if (a == 'X' || b == 'X') return 0;
if (a == b) return 2;
else return -1;
}
int aaScoreMatch(AA *a, AA *b, int size)
/* Compare two peptides aa by aa. */
{
return dnaOrAaScoreMatch(a, b, size, 2, -1, 'X');
}
void writeSeqWithBreaks(FILE *f, char *letters, int letterCount, int maxPerLine)
/* Write out letters with newlines every maxLine. */
{
int lettersLeft = letterCount;
int lineSize;
while (lettersLeft > 0)
{
lineSize = lettersLeft;
if (lineSize > maxPerLine)
lineSize = maxPerLine;
mustWrite(f, letters, lineSize);
fputc('\n', f);
letters += lineSize;
lettersLeft -= lineSize;
}
}
static int findTailPolyAMaybeMask(DNA *dna, int size, boolean doMask,
boolean loose)
/* Identify PolyA at end; mask to 'n' if specified. This allows a few
* non-A's as noise to be trimmed too. Returns number of bases trimmed.
* Leaves first two bases of PolyA in case there's a taa stop codon. */
{
int i;
int score = 10;
int bestScore = 10;
int bestPos = -1;
int trimSize = 0;
for (i=size-1; i>=0; --i)
{
DNA b = dna[i];
if (b == 'n' || b == 'N')
continue;
if (score > 20) score = 20;
if (b == 'a' || b == 'A')
{
score += 1;
if (score >= bestScore)
{
bestScore = score;
bestPos = i;
}
else if (loose && score >= (bestScore - 8))
{
/* If loose, keep extending even if score isn't back up to best. */
bestPos = i;
}
}
else
{
score -= 10;
}
if (score < 0)
{
break;
}
}
if (bestPos >= 0)
{
trimSize = size - bestPos - 2; // Leave two for aa in taa stop codon
if (trimSize > 0)
{
if (doMask)
for (i=size - trimSize; i<size; ++i)
dna[i] = 'n';
}
else
trimSize = 0;
}
return trimSize;
}
int tailPolyASizeLoose(DNA *dna, int size)
/* Return size of PolyA at end (if present). This allows a few non-A's as
* noise to be trimmed too, but skips first two aa for taa stop codon.
* It is less conservative in extending the polyA region than maskTailPolyA. */
{
return findTailPolyAMaybeMask(dna, size, FALSE, TRUE);
}
int maskTailPolyA(DNA *dna, int size)
/* Convert PolyA at end to n. This allows a few non-A's as noise to be
* trimmed too. Returns number of bases trimmed. Leaves very last a. */
{
return findTailPolyAMaybeMask(dna, size, TRUE, FALSE);
}
static int findHeadPolyTMaybeMask(DNA *dna, int size, boolean doMask,
boolean loose)
/* Return size of PolyT at start (if present); mask to 'n' if specified.
* This allows a few non-T's as noise to be trimmed too, but skips last
* two tt for revcomp'd taa stop codon. */
{
int i;
int score = 10;
int bestScore = 10;
int bestPos = -1;
int pastPoly = 0;
int trimSize = 0;
for (i=0; i<size; ++i)
{
DNA b = dna[i];
if (b == 'n' || b == 'N')
continue;
if (score > 20) score = 20;
if (b == 't' || b == 'T')
{
score += 1;
if (score >= bestScore)
{
bestScore = score;
bestPos = i;
}
else if (loose && score >= (bestScore - 8))
{
/* If loose, keep extending even if score isn't back up to best. */
bestPos = i;
}
}
else
{
score -= 10;
}
if (score < 0)
{
pastPoly = i;
break;
}
}
if (bestPos >= 0)
{
trimSize = bestPos+1 - 2; // Leave two for aa in taa stop codon
if (trimSize > 0)
{
if (doMask)
memset(dna, 'n', trimSize);
}
else
trimSize = 0;
}
return trimSize;
}
int headPolyTSizeLoose(DNA *dna, int size)
/* Return size of PolyT at start (if present). This allows a few non-T's as
* noise to be trimmed too, but skips last two tt for revcomp'd taa stop
* codon.
* It is less conservative in extending the polyA region than maskHeadPolyT. */
{
return findHeadPolyTMaybeMask(dna, size, FALSE, TRUE);
}
int maskHeadPolyT(DNA *dna, int size)
/* Convert PolyT at start. This allows a few non-T's as noise to be
* trimmed too. Returns number of bases trimmed. */
{
return findHeadPolyTMaybeMask(dna, size, TRUE, FALSE);
}
boolean isDna(char *poly, int size)
/* Return TRUE if letters in poly are at least 90% ACGTNU- */
{
int i;
int dnaCount = 0;
dnaUtilOpen();
for (i=0; i<size; ++i)
{
if (ntChars[(int)poly[i]])
dnaCount += 1;
}
return (dnaCount >= round(0.9 * size));
}
boolean isAllNt(char *seq, int size)
/* Return TRUE if all letters in seq are ACGTNU-. */
{
int i;
dnaUtilOpen();
for (i=0; i<size-1; ++i)
{
if (ntChars[(int)seq[i]] == 0)
return FALSE;
}
return TRUE;
}
boolean isAllDna(char *poly, int size)
/* Return TRUE if size is great than 1 and letters in poly are 100% ACGTNU- */
{
if (size <= 1)
return FALSE;
return isAllNt(poly, size);
}
/* Tables to convert from 0-20 to ascii single letter representation
* of proteins. */
int aaVal[256];
AA valToAa[21];
AA aaChars[256]; /* 0 except for value aa characters. Converts to upper case rest. */
struct aminoAcidTable
/* A little info about each amino acid. */
{
int ix;
char letter;
char abbreviation[3];
char *name;
};
struct aminoAcidTable aminoAcidTable[] =
{
{0, 'A', "ala", "alanine"},
{1, 'C', "cys", "cysteine"},
{2, 'D', "asp", "aspartic acid"},
{3, 'E', "glu", "glutamic acid"},
{4, 'F', "phe", "phenylalanine"},
{5, 'G', "gly", "glycine"},
{6, 'H', "his", "histidine"},
{7, 'I', "ile", "isoleucine"},
{8, 'K', "lys", "lysine"},
{9, 'L', "leu", "leucine"},
{10, 'M', "met", "methionine"},
{11, 'N', "asn", "asparagine"},
{12, 'P', "pro", "proline"},
{13, 'Q', "gln", "glutamine"},
{14, 'R', "arg", "arginine"},
{15, 'S', "ser", "serine"},
{16, 'T', "thr", "threonine"},
{17, 'V', "val", "valine"},
{18, 'W', "trp", "tryptophan"},
{19, 'Y', "tyr", "tyrosine"},
{20, 'X', "ter", "termination"},
};
char *aaAbbr(int i)
/* return pointer to AA abbrevation */
{
return(aminoAcidTable[i].abbreviation);
}
char aaLetter(int i)
/* return AA letter */
{
return(aminoAcidTable[i].letter);
}
static void initAaVal()
/* Initialize aaVal and valToAa tables. */
{
int i;
char c, lowc;
for (i=0; i<ArraySize(aaVal); ++i)
aaVal[i] = -1;
for (i=0; i<ArraySize(aminoAcidTable); ++i)
{
c = aminoAcidTable[i].letter;
lowc = tolower(c);
aaVal[(int)c] = aaVal[(int)lowc] = i;
aaChars[(int)c] = aaChars[(int)lowc] = c;
valToAa[i] = c;
}
aaChars['x'] = aaChars['X'] = 'X';
}
void dnaUtilOpen()
/* Initialize stuff herein. */
{
static boolean opened = FALSE;
if (!opened)
{
checkSizeTypes();
initNtVal();
initAaVal();
initNtChars();
initNtMixedCaseChars();
initNtCompTable();
opened = TRUE;
}
}