ad3a176457ad1d21a0fedc47f349ec3484e751f1 angie Fri Feb 9 16:27:51 2018 -0800 Cleaning up some old ugliness about the size parameter to isAllDna and isAllNt. hgc's printSnpAlignment code that parsed snpNNN.fa was using lineSize as length but lineSize is length+1. Then isAllDna was written with "i<size-1" as the loop test instead of "i < size". I didn't fix that properly when I separated out isAllNt from isAllDna. Later, I (re?)discovered that isAllNt needed length+1 as its size and just added some FIXME comments. Thanks Brian R for prodding me to actually fix it. refs #20895 diff --git src/lib/dnautil.c src/lib/dnautil.c index 6d81ba5..9ee6d78 100644 --- src/lib/dnautil.c +++ src/lib/dnautil.c @@ -1,1256 +1,1248 @@ /* Some stuff that you'll likely need in any program that works with * DNA. Includes stuff for amino acids as well. * * Assumes that DNA is stored as a character. * The DNA it generates will include the bases * as lowercase tcag. It will generally accept * uppercase as well, and also 'n' or 'N' or '-' * for unknown bases. * * Amino acids are stored as single character upper case. * * This file is copyright 2002 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #include "common.h" #include "dnautil.h" struct codonTable /* The dread codon table. */ { DNA *codon; /* Lower case. */ AA protCode; /* Upper case. The "Standard" code */ AA mitoCode; /* Upper case. Vertebrate Mitochondrial translations */ AA uniqCode; /* unique code for each codon */ }; struct codonTable codonTable[] = /* The master codon/protein table. */ { {"ttt", 'F', 'F', 'a'}, {"ttc", 'F', 'F', 'b'}, {"tta", 'L', 'L', 'c'}, {"ttg", 'L', 'L', 'd'}, {"tct", 'S', 'S', 'e'}, {"tcc", 'S', 'S', 'f'}, {"tca", 'S', 'S', 'g'}, {"tcg", 'S', 'S', 'h'}, {"tat", 'Y', 'Y', 'i'}, {"tac", 'Y', 'Y', 'j'}, {"taa", 0, 0, 'k'}, {"tag", 0, 0, 'l'}, {"tgt", 'C', 'C', 'm'}, {"tgc", 'C', 'C', 'n'}, {"tga", 0, 'W', 'o'}, {"tgg", 'W', 'W', 'p'}, {"ctt", 'L', 'L', 'q'}, {"ctc", 'L', 'L', 'r'}, {"cta", 'L', 'L', 's'}, {"ctg", 'L', 'L', 't'}, {"cct", 'P', 'P', 'u'}, {"ccc", 'P', 'P', 'v'}, {"cca", 'P', 'P', 'w'}, {"ccg", 'P', 'P', 'x'}, {"cat", 'H', 'H', 'y'}, {"cac", 'H', 'H', 'z'}, {"caa", 'Q', 'Q', 'A'}, {"cag", 'Q', 'Q', 'B'}, {"cgt", 'R', 'R', 'C'}, {"cgc", 'R', 'R', 'D'}, {"cga", 'R', 'R', 'E'}, {"cgg", 'R', 'R', 'F'}, {"att", 'I', 'I', 'G'}, {"atc", 'I', 'I', 'H'}, {"ata", 'I', 'M', 'I'}, {"atg", 'M', 'M', 'J'}, {"act", 'T', 'T', 'K'}, {"acc", 'T', 'T', 'L'}, {"aca", 'T', 'T', 'M'}, {"acg", 'T', 'T', 'N'}, {"aat", 'N', 'N', 'O'}, {"aac", 'N', 'N', 'P'}, {"aaa", 'K', 'K', 'Q'}, {"aag", 'K', 'K', 'R'}, {"agt", 'S', 'S', 'S'}, {"agc", 'S', 'S', 'T'}, {"aga", 'R', 0, 'U'}, {"agg", 'R', 0, 'V'}, {"gtt", 'V', 'V', 'W'}, {"gtc", 'V', 'V', 'X'}, {"gta", 'V', 'V', 'Y'}, {"gtg", 'V', 'V', 'Z'}, {"gct", 'A', 'A', '1'}, {"gcc", 'A', 'A', '2'}, {"gca", 'A', 'A', '3'}, {"gcg", 'A', 'A', '4'}, {"gat", 'D', 'D', '5'}, {"gac", 'D', 'D', '6'}, {"gaa", 'E', 'E', '7'}, {"gag", 'E', 'E', '8'}, {"ggt", 'G', 'G', '9'}, {"ggc", 'G', 'G', '0'}, {"gga", 'G', 'G', '@'}, {"ggg", 'G', 'G', '$'}, }; /* A table that gives values 0 for t 1 for c 2 for a 3 for g * (which is order aa's are in biochemistry codon tables) * and gives -1 for all others. */ int ntVal[256]; int ntValLower[256]; /* NT values only for lower case. */ int ntValUpper[256]; /* NT values only for upper case. */ int ntVal5[256]; int ntValNoN[256]; /* Like ntVal, but with T_BASE_VAL in place of -1 for nonexistent ones. */ DNA valToNt[(N_BASE_VAL|MASKED_BASE_BIT)+1]; /* convert tables for bit-4 indicating masked */ int ntValMasked[256]; DNA valToNtMasked[256]; static boolean inittedNtVal = FALSE; static void initNtVal() { if (!inittedNtVal) { int i; for (i=0; i<ArraySize(ntVal); i++) { ntValUpper[i] = ntValLower[i] = ntVal[i] = -1; ntValNoN[i] = T_BASE_VAL; if (isspace(i) || isdigit(i)) ntVal5[i] = ntValMasked[i] = -1; else { ntVal5[i] = N_BASE_VAL; ntValMasked[i] = (islower(i) ? (N_BASE_VAL|MASKED_BASE_BIT) : N_BASE_VAL); } } ntVal5['t'] = ntVal5['T'] = ntValNoN['t'] = ntValNoN['T'] = ntVal['t'] = ntVal['T'] = ntValLower['t'] = ntValUpper['T'] = T_BASE_VAL; ntVal5['u'] = ntVal5['U'] = ntValNoN['u'] = ntValNoN['U'] = ntVal['u'] = ntVal['U'] = ntValLower['u'] = ntValUpper['U'] = U_BASE_VAL; ntVal5['c'] = ntVal5['C'] = ntValNoN['c'] = ntValNoN['C'] = ntVal['c'] = ntVal['C'] = ntValLower['c'] = ntValUpper['C'] = C_BASE_VAL; ntVal5['a'] = ntVal5['A'] = ntValNoN['a'] = ntValNoN['A'] = ntVal['a'] = ntVal['A'] = ntValLower['a'] = ntValUpper['A'] = A_BASE_VAL; ntVal5['g'] = ntVal5['G'] = ntValNoN['g'] = ntValNoN['G'] = ntVal['g'] = ntVal['G'] = ntValLower['g'] = ntValUpper['G'] = G_BASE_VAL; valToNt[T_BASE_VAL] = valToNt[T_BASE_VAL|MASKED_BASE_BIT] = 't'; valToNt[C_BASE_VAL] = valToNt[C_BASE_VAL|MASKED_BASE_BIT] = 'c'; valToNt[A_BASE_VAL] = valToNt[A_BASE_VAL|MASKED_BASE_BIT] = 'a'; valToNt[G_BASE_VAL] = valToNt[G_BASE_VAL|MASKED_BASE_BIT] = 'g'; valToNt[N_BASE_VAL] = valToNt[N_BASE_VAL|MASKED_BASE_BIT] = 'n'; /* masked values */ ntValMasked['T'] = T_BASE_VAL; ntValMasked['U'] = U_BASE_VAL; ntValMasked['C'] = C_BASE_VAL; ntValMasked['A'] = A_BASE_VAL; ntValMasked['G'] = G_BASE_VAL; ntValMasked['t'] = T_BASE_VAL|MASKED_BASE_BIT; ntValMasked['u'] = U_BASE_VAL|MASKED_BASE_BIT; ntValMasked['c'] = C_BASE_VAL|MASKED_BASE_BIT; ntValMasked['a'] = A_BASE_VAL|MASKED_BASE_BIT; ntValMasked['g'] = G_BASE_VAL|MASKED_BASE_BIT; valToNtMasked[T_BASE_VAL] = 'T'; valToNtMasked[C_BASE_VAL] = 'C'; valToNtMasked[A_BASE_VAL] = 'A'; valToNtMasked[G_BASE_VAL] = 'G'; valToNtMasked[N_BASE_VAL] = 'N'; valToNtMasked[T_BASE_VAL|MASKED_BASE_BIT] = 't'; valToNtMasked[C_BASE_VAL|MASKED_BASE_BIT] = 'c'; valToNtMasked[A_BASE_VAL|MASKED_BASE_BIT] = 'a'; valToNtMasked[G_BASE_VAL|MASKED_BASE_BIT] = 'g'; valToNtMasked[N_BASE_VAL|MASKED_BASE_BIT] = 'n'; inittedNtVal = TRUE; } } /* Returns one letter code for protein, * 0 for stop codon or X for bad input, * The "Standard" Code */ AA lookupCodon(DNA *dna) { int ix; int i; char c; if (!inittedNtVal) initNtVal(); ix = 0; for (i=0; i<3; ++i) { int bv = ntVal[(int)dna[i]]; if (bv<0) return 'X'; ix = (ix<<2) + bv; } c = codonTable[ix].protCode; return c; } boolean isStopCodon(DNA *dna) /* Return TRUE if it's a stop codon. */ { return lookupCodon(dna) == 0; } boolean isKozak(char *dna, int dnaSize, int pos) /* Return TRUE if it's a Kozak compatible start. */ { if (lookupCodon(dna+pos) != 'M') { return FALSE; } if (pos + 3 < dnaSize) { if (ntVal[(int)dna[pos+3]] == G_BASE_VAL) return TRUE; } if (pos >= 3) { int c = ntVal[(int)dna[pos-3]]; if (c == A_BASE_VAL || c == G_BASE_VAL) return TRUE; } return FALSE; } boolean isReallyStopCodon(char *dna, boolean selenocysteine) /* Return TRUE if it's really a stop codon, even considering * possibilility of selenocysteine. */ { if (selenocysteine) { /* Luckily the mitochondria *also* replaces TGA with * something else, even though it isn't selenocysteine */ return lookupMitoCodon(dna) == 0; } else { return lookupCodon(dna) == 0; } } /* Returns one letter code for protein, * 0 for stop codon or X for bad input, * Vertebrate Mitochondrial Code */ AA lookupMitoCodon(DNA *dna) { int ix; int i; char c; if (!inittedNtVal) initNtVal(); ix = 0; for (i=0; i<3; ++i) { int bv = ntVal[(int)dna[i]]; if (bv<0) return 'X'; ix = (ix<<2) + bv; } c = codonTable[ix].mitoCode; c = toupper(c); return c; } AA lookupUniqCodon(DNA *dna) { int ix; int i; char c; if (!inittedNtVal) initNtVal(); ix = 0; for (i=0; i<3; ++i) { int bv = ntVal[(int)dna[i]]; if (bv<0) return 'X'; ix = (ix<<2) + bv; } c = codonTable[ix].uniqCode; c = toupper(c); return c; } Codon codonVal(DNA *start) /* Return value from 0-63 of codon starting at start. * Returns -1 if not a codon. */ { int v1,v2,v3; if ((v1 = ntVal[(int)start[0]]) < 0) return -1; if ((v2 = ntVal[(int)start[1]]) < 0) return -1; if ((v3 = ntVal[(int)start[2]]) < 0) return -1; return ((v1<<4) + (v2<<2) + v3); } DNA *valToCodon(int val) /* Return codon corresponding to val (0-63) */ { assert(val >= 0 && val < 64); return codonTable[val].codon; } void dnaTranslateSome(DNA *dna, char *out, int outSize) /* Translate DNA upto a stop codon or until outSize-1 amino acids, * whichever comes first. Output will be zero terminated. */ { int i; int dnaSize; int protSize = 0; outSize -= 1; /* Room for terminal zero */ dnaSize = strlen(dna); for (i=0; i<dnaSize-2; i+=3) { if (protSize >= outSize) break; if ((out[protSize++] = lookupCodon(dna+i)) == 0) break; } out[protSize] = 0; } /* A little array to help us decide if a character is a * nucleotide, and if so convert it to lower case. */ char ntChars[256]; static void initNtChars() { static boolean initted = FALSE; if (!initted) { zeroBytes(ntChars, sizeof(ntChars)); ntChars['a'] = ntChars['A'] = 'a'; ntChars['c'] = ntChars['C'] = 'c'; ntChars['g'] = ntChars['G'] = 'g'; ntChars['t'] = ntChars['T'] = 't'; ntChars['n'] = ntChars['N'] = 'n'; ntChars['u'] = ntChars['U'] = 'u'; ntChars['-'] = 'n'; initted = TRUE; } } char ntMixedCaseChars[256]; static void initNtMixedCaseChars() { static boolean initted = FALSE; if (!initted) { zeroBytes(ntMixedCaseChars, sizeof(ntMixedCaseChars)); ntMixedCaseChars['a'] = 'a'; ntMixedCaseChars['A'] = 'A'; ntMixedCaseChars['c'] = 'c'; ntMixedCaseChars['C'] = 'C'; ntMixedCaseChars['g'] = 'g'; ntMixedCaseChars['G'] = 'G'; ntMixedCaseChars['t'] = 't'; ntMixedCaseChars['T'] = 'T'; ntMixedCaseChars['n'] = 'n'; ntMixedCaseChars['N'] = 'N'; ntMixedCaseChars['u'] = 'u'; ntMixedCaseChars['U'] = 'U'; ntMixedCaseChars['-'] = 'n'; initted = TRUE; } } /* Another array to help us do complement of DNA */ DNA ntCompTable[256]; static boolean inittedCompTable = FALSE; static void initNtCompTable() { zeroBytes(ntCompTable, sizeof(ntCompTable)); ntCompTable[' '] = ' '; ntCompTable['-'] = '-'; ntCompTable['='] = '='; ntCompTable['a'] = 't'; ntCompTable['c'] = 'g'; ntCompTable['g'] = 'c'; ntCompTable['t'] = 'a'; ntCompTable['u'] = 'a'; ntCompTable['n'] = 'n'; ntCompTable['-'] = '-'; ntCompTable['.'] = '.'; ntCompTable['A'] = 'T'; ntCompTable['C'] = 'G'; ntCompTable['G'] = 'C'; ntCompTable['T'] = 'A'; ntCompTable['U'] = 'A'; ntCompTable['N'] = 'N'; ntCompTable['R'] = 'Y'; ntCompTable['Y'] = 'R'; ntCompTable['M'] = 'K'; ntCompTable['K'] = 'M'; ntCompTable['S'] = 'S'; ntCompTable['W'] = 'W'; ntCompTable['V'] = 'B'; ntCompTable['H'] = 'D'; ntCompTable['D'] = 'H'; ntCompTable['B'] = 'V'; ntCompTable['X'] = 'N'; ntCompTable['r'] = 'y'; ntCompTable['y'] = 'r'; ntCompTable['s'] = 's'; ntCompTable['w'] = 'w'; ntCompTable['m'] = 'k'; ntCompTable['k'] = 'm'; ntCompTable['v'] = 'b'; ntCompTable['h'] = 'd'; ntCompTable['d'] = 'h'; ntCompTable['b'] = 'v'; ntCompTable['x'] = 'n'; ntCompTable['('] = ')'; ntCompTable[')'] = '('; inittedCompTable = TRUE; } /* Complement DNA (not reverse). */ void complement(DNA *dna, long length) { int i; if (!inittedCompTable) initNtCompTable(); for (i=0; i<length; ++i) { *dna = ntCompTable[(int)*dna]; ++dna; } } /* Reverse complement DNA. */ void reverseComplement(DNA *dna, long length) { reverseBytes(dna, length); complement(dna, length); } /* Reverse offset - return what will be offset (0 based) to * same member of array after array is reversed. */ long reverseOffset(long offset, long arraySize) { return arraySize-1 - offset; } /* Switch start/end (zero based half open) coordinates * to opposite strand. */ void reverseIntRange(int *pStart, int *pEnd, int size) { int temp; temp = *pStart; *pStart = size - *pEnd; *pEnd = size - temp; } /* Switch start/end (zero based half open) coordinates * to opposite strand. */ void reverseUnsignedRange(unsigned *pStart, unsigned *pEnd, int size) { unsigned temp; temp = *pStart; *pStart = size - *pEnd; *pEnd = size - temp; } char *reverseComplementSlashSeparated(char *alleleStr) /* Given a slash-separated series of sequences (a common representation of variant alleles), * returns a slash-sep series with the reverse complement of each sequence (if it is a * nucleotide sequence). * Special behavior to support dbSNP's variant allele conventions: * 1. Reverse the order of sequences (to maintain alphabetical ordering). * 2. If alleleStr begins with "-/", then after reversing, move "-/" back to the beginning. */ { int len = strlen(alleleStr); char choppyCopy[len+1]; safecpy(choppyCopy, sizeof(choppyCopy), alleleStr); char *alleles[len]; int alCount = chopByChar(choppyCopy, '/', alleles, ArraySize(alleles)); char *outStr = needMem(len+1); int i; for (i = alCount-1; i >= 0; i--) { char *allele = alleles[i]; int alLen = strlen(allele); if (isAllNt(allele, alLen)) reverseComplement(allele, alLen); if (i != alCount-1) safecat(outStr, len+1, "/"); safecat(outStr, len+1, allele); } if (startsWith("-/", alleleStr)) { // Keep "-/" at the beginning: memmove(outStr+2, outStr, len-2); outStr[0] = '-'; outStr[1] = '/'; } return outStr; } int cmpDnaStrings(DNA *a, DNA *b) /* Compare using screwy non-alphabetical DNA order TCGA */ { for (;;) { DNA aa = *a++; DNA bb = *b++; if (aa != bb) return ntVal[(int)aa] - ntVal[(int)bb]; if (aa == 0) break; } return 0; } /* Convert T's to U's */ void toRna(DNA *dna) { DNA c; for (;;) { c = *dna; if (c == 't') *dna = 'u'; else if (c == 'T') *dna = 'U'; else if (c == 0) break; ++dna; } } char *skipIgnoringDash(char *a, int size, bool skipTrailingDash) /* Count size number of characters, and any * dash characters. */ { while (size > 0) { if (*a++ != '-') --size; } if (skipTrailingDash) while (*a == '-') ++a; return a; } int countNonDash(char *a, int size) /* Count number of non-dash characters. */ { int count = 0; int i; for (i=0; i<size; ++i) if (a[i] != '-') ++count; return count; } int nextPowerOfFour(long x) /* Return next power of four that would be greater or equal to x. * For instance if x < 4, return 1, if x < 16 return 2.... * (From biological point of view how many bases are needed to * code this number.) */ { int count = 1; while (x > 4) { count += 1; x >>= 2; } return count; } long dnaOrAaFilteredSize(char *raw, char filter[256]) /* Return how long DNA will be after non-DNA is filtered out. */ { char c; long count = 0; dnaUtilOpen(); while ((c = *raw++) != 0) { if (filter[(int)c]) ++count; } return count; } void dnaOrAaFilter(char *in, char *out, char filter[256]) /* Run chars through filter. */ { char c; dnaUtilOpen(); while ((c = *in++) != 0) { if ((c = filter[(int)c]) != 0) *out++ = c; } *out++ = 0; } long dnaFilteredSize(char *rawDna) /* Return how long DNA will be after non-DNA is filtered out. */ { return dnaOrAaFilteredSize(rawDna, ntChars); } void dnaFilter(char *in, DNA *out) /* Filter out non-DNA characters and change to lower case. */ { dnaOrAaFilter(in, out, ntChars); } void dnaFilterToN(char *in, DNA *out) /* Change all non-DNA characters to N. */ { DNA c; initNtChars(); while ((c = *in++) != 0) { if ((c = ntChars[(int)c]) != 0) *out++ = c; else *out++ = 'n'; } *out++ = 0; } void dnaMixedCaseFilter(char *in, DNA *out) /* Filter out non-DNA characters but leave case intact. */ { dnaOrAaFilter(in, out, ntMixedCaseChars); } long aaFilteredSize(char *raw) /* Return how long aa will be after non-aa chars is filtered out. */ { return dnaOrAaFilteredSize(raw, aaChars); } void aaFilter(char *in, DNA *out) /* Filter out non-aa characters and change to upper case. */ { dnaOrAaFilter(in, out, aaChars); } void upperToN(char *s, int size) /* Turn upper case letters to N's. */ { char c; int i; for (i=0; i<size; ++i) { c = s[i]; if (isupper(c)) s[i] = 'n'; } } void lowerToN(char *s, int size) /* Turn lower case letters to N's. */ { char c; int i; for (i=0; i<size; ++i) { c = s[i]; if (islower(c)) s[i] = 'N'; } } void dnaBaseHistogram(DNA *dna, int dnaSize, int histogram[4]) /* Count up frequency of occurance of each base and store * results in histogram. */ { int val; zeroBytes(histogram, 4*sizeof(int)); while (--dnaSize >= 0) { if ((val = ntVal[(int)*dna++]) >= 0) ++histogram[val]; } } bits64 basesToBits64(char *dna, int size) /* Convert dna of given size (up to 32) to binary representation */ { if (size > 32) errAbort("basesToBits64 called on %d bases, can only go up to 32", size); bits64 result = 0; int i; for (i=0; i<size; ++i) { result <<= 2; result += ntValNoN[(int)dna[i]]; } return result; } bits32 packDna16(DNA *in) /* pack 16 bases into a word */ { bits32 out = 0; int count = 16; int bVal; while (--count >= 0) { bVal = ntValNoN[(int)*in++]; out <<= 2; out += bVal; } return out; } bits16 packDna8(DNA *in) /* Pack 8 bases into a short word */ { bits16 out = 0; int count = 8; int bVal; while (--count >= 0) { bVal = ntValNoN[(int)*in++]; out <<= 2; out += bVal; } return out; } UBYTE packDna4(DNA *in) /* Pack 4 bases into a UBYTE */ { UBYTE out = 0; int count = 4; int bVal; while (--count >= 0) { bVal = ntValNoN[(int)*in++]; out <<= 2; out += bVal; } return out; } void unpackDna(bits32 *tiles, int tileCount, DNA *out) /* Unpack DNA. Expands to 16x tileCount in output. */ { int i, j; bits32 tile; for (i=0; i<tileCount; ++i) { tile = tiles[i]; for (j=15; j>=0; --j) { out[j] = valToNt[tile & 0x3]; tile >>= 2; } out += 16; } } void unpackDna4(UBYTE *tiles, int byteCount, DNA *out) /* Unpack DNA. Expands to 4x byteCount in output. */ { int i, j; UBYTE tile; for (i=0; i<byteCount; ++i) { tile = tiles[i]; for (j=3; j>=0; --j) { out[j] = valToNt[tile & 0x3]; tile >>= 2; } out += 4; } } static void checkSizeTypes() /* Make sure that some of our predefined types are the right size. */ { assert(sizeof(UBYTE) == 1); assert(sizeof(WORD) == 2); assert(sizeof(bits32) == 4); assert(sizeof(bits16) == 2); } int intronOrientationMinSize(DNA *iStart, DNA *iEnd, int minIntronSize) /* Given a gap in genome from iStart to iEnd, return * Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no * intron. Assumes DNA is lower cased. */ { if (iEnd - iStart < minIntronSize) return 0; if (iStart[0] == 'g' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'g') { return 1; } else if (iStart[0] == 'c' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'c') { return -1; } else return 0; } int intronOrientation(DNA *iStart, DNA *iEnd) /* Given a gap in genome from iStart to iEnd, return * Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no * intron. Assumes DNA is lower cased. */ { return intronOrientationMinSize(iStart, iEnd, 32); } int dnaScore2(DNA a, DNA b) /* Score match between two bases (relatively crudely). */ { if (a == 'n' || b == 'n') return 0; if (a == b) return 1; else return -1; } int dnaOrAaScoreMatch(char *a, char *b, int size, int matchScore, int mismatchScore, char ignore) /* Compare two sequences (without inserts or deletions) and score. */ { int i; int score = 0; for (i=0; i<size; ++i) { char aa = a[i]; char bb = b[i]; if (aa == ignore || bb == ignore) continue; if (aa == bb) score += matchScore; else score += mismatchScore; } return score; } int dnaScoreMatch(DNA *a, DNA *b, int size) /* Compare two pieces of DNA base by base. Total mismatches are * subtracted from total matches and returned as score. 'N's * neither hurt nor help score. */ { return dnaOrAaScoreMatch(a, b, size, 1, -1, 'n'); } int aaScore2(AA a, AA b) /* Score match between two amino acids (relatively crudely). */ { if (a == 'X' || b == 'X') return 0; if (a == b) return 2; else return -1; } int aaScoreMatch(AA *a, AA *b, int size) /* Compare two peptides aa by aa. */ { return dnaOrAaScoreMatch(a, b, size, 2, -1, 'X'); } void writeSeqWithBreaks(FILE *f, char *letters, int letterCount, int maxPerLine) /* Write out letters with newlines every maxLine. */ { int lettersLeft = letterCount; int lineSize; while (lettersLeft > 0) { lineSize = lettersLeft; if (lineSize > maxPerLine) lineSize = maxPerLine; mustWrite(f, letters, lineSize); fputc('\n', f); letters += lineSize; lettersLeft -= lineSize; } } static int findTailPolyAMaybeMask(DNA *dna, int size, boolean doMask, boolean loose) /* Identify PolyA at end; mask to 'n' if specified. This allows a few * non-A's as noise to be trimmed too. Returns number of bases trimmed. * Leaves first two bases of PolyA in case there's a taa stop codon. */ { int i; int score = 10; int bestScore = 10; int bestPos = -1; int trimSize = 0; for (i=size-1; i>=0; --i) { DNA b = dna[i]; if (b == 'n' || b == 'N') continue; if (score > 20) score = 20; if (b == 'a' || b == 'A') { score += 1; if (score >= bestScore) { bestScore = score; bestPos = i; } else if (loose && score >= (bestScore - 8)) { /* If loose, keep extending even if score isn't back up to best. */ bestPos = i; } } else { score -= 10; } if (score < 0) { break; } } if (bestPos >= 0) { trimSize = size - bestPos - 2; // Leave two for aa in taa stop codon if (trimSize > 0) { if (doMask) for (i=size - trimSize; i<size; ++i) dna[i] = 'n'; } else trimSize = 0; } return trimSize; } int tailPolyASizeLoose(DNA *dna, int size) /* Return size of PolyA at end (if present). This allows a few non-A's as * noise to be trimmed too, but skips first two aa for taa stop codon. * It is less conservative in extending the polyA region than maskTailPolyA. */ { return findTailPolyAMaybeMask(dna, size, FALSE, TRUE); } int maskTailPolyA(DNA *dna, int size) /* Convert PolyA at end to n. This allows a few non-A's as noise to be * trimmed too. Returns number of bases trimmed. Leaves very last a. */ { return findTailPolyAMaybeMask(dna, size, TRUE, FALSE); } static int findHeadPolyTMaybeMask(DNA *dna, int size, boolean doMask, boolean loose) /* Return size of PolyT at start (if present); mask to 'n' if specified. * This allows a few non-T's as noise to be trimmed too, but skips last * two tt for revcomp'd taa stop codon. */ { int i; int score = 10; int bestScore = 10; int bestPos = -1; int trimSize = 0; for (i=0; i<size; ++i) { DNA b = dna[i]; if (b == 'n' || b == 'N') continue; if (score > 20) score = 20; if (b == 't' || b == 'T') { score += 1; if (score >= bestScore) { bestScore = score; bestPos = i; } else if (loose && score >= (bestScore - 8)) { /* If loose, keep extending even if score isn't back up to best. */ bestPos = i; } } else { score -= 10; } if (score < 0) { break; } } if (bestPos >= 0) { trimSize = bestPos+1 - 2; // Leave two for aa in taa stop codon if (trimSize > 0) { if (doMask) memset(dna, 'n', trimSize); } else trimSize = 0; } return trimSize; } int headPolyTSizeLoose(DNA *dna, int size) /* Return size of PolyT at start (if present). This allows a few non-T's as * noise to be trimmed too, but skips last two tt for revcomp'd taa stop * codon. * It is less conservative in extending the polyA region than maskHeadPolyT. */ { return findHeadPolyTMaybeMask(dna, size, FALSE, TRUE); } int maskHeadPolyT(DNA *dna, int size) /* Convert PolyT at start. This allows a few non-T's as noise to be * trimmed too. Returns number of bases trimmed. */ { return findHeadPolyTMaybeMask(dna, size, TRUE, FALSE); } boolean isDna(char *poly, int size) /* Return TRUE if letters in poly are at least 90% ACGTNU- */ { int i; int dnaCount = 0; dnaUtilOpen(); for (i=0; i<size; ++i) { if (ntChars[(int)poly[i]]) dnaCount += 1; } return (dnaCount >= round(0.9 * size)); } boolean isAllNt(char *seq, int size) /* Return TRUE if all letters in seq are ACGTNU-. */ { int i; dnaUtilOpen(); -for (i=0; i<size-1; ++i) +for (i = 0; i < size; ++i) { if (ntChars[(int)seq[i]] == 0) return FALSE; } return TRUE; } -boolean isAllDna(char *poly, int size) -/* Return TRUE if size is great than 1 and letters in poly are 100% ACGTNU- */ -{ -if (size <= 1) - return FALSE; -return isAllNt(poly, size); -} - /* Tables to convert from 0-20 to ascii single letter representation * of proteins. */ int aaVal[256]; AA valToAa[21]; AA aaChars[256]; /* 0 except for value aa characters. Converts to upper case rest. */ struct aminoAcidTable /* A little info about each amino acid. */ { int ix; char letter; char abbreviation[3]; char *name; }; struct aminoAcidTable aminoAcidTable[] = { {0, 'A', "ala", "alanine"}, {1, 'C', "cys", "cysteine"}, {2, 'D', "asp", "aspartic acid"}, {3, 'E', "glu", "glutamic acid"}, {4, 'F', "phe", "phenylalanine"}, {5, 'G', "gly", "glycine"}, {6, 'H', "his", "histidine"}, {7, 'I', "ile", "isoleucine"}, {8, 'K', "lys", "lysine"}, {9, 'L', "leu", "leucine"}, {10, 'M', "met", "methionine"}, {11, 'N', "asn", "asparagine"}, {12, 'P', "pro", "proline"}, {13, 'Q', "gln", "glutamine"}, {14, 'R', "arg", "arginine"}, {15, 'S', "ser", "serine"}, {16, 'T', "thr", "threonine"}, {17, 'V', "val", "valine"}, {18, 'W', "trp", "tryptophan"}, {19, 'Y', "tyr", "tyrosine"}, {20, 'X', "ter", "termination"}, }; char *aaAbbr(int i) /* return pointer to AA abbrevation */ { return(aminoAcidTable[i].abbreviation); } char aaLetter(int i) /* return AA letter */ { return(aminoAcidTable[i].letter); } static void initAaVal() /* Initialize aaVal and valToAa tables. */ { int i; char c, lowc; for (i=0; i<ArraySize(aaVal); ++i) aaVal[i] = -1; for (i=0; i<ArraySize(aminoAcidTable); ++i) { c = aminoAcidTable[i].letter; lowc = tolower(c); aaVal[(int)c] = aaVal[(int)lowc] = i; aaChars[(int)c] = aaChars[(int)lowc] = c; valToAa[i] = c; } aaChars['x'] = aaChars['X'] = 'X'; } void dnaUtilOpen() /* Initialize stuff herein. */ { static boolean opened = FALSE; if (!opened) { checkSizeTypes(); initNtVal(); initAaVal(); initNtChars(); initNtMixedCaseChars(); initNtCompTable(); opened = TRUE; } } char aaAbbrToLetter(char *abbr) /* Convert an AA abbreviation such as "Ala", "Asp" etc., to its single letter code * such as "A", "D" etc. Return the null char '\0' if abbr is not found. */ { // Lowercase for comparison. char abbrLC[4]; safencpy(abbrLC, sizeof(abbrLC), abbr, 3); toLowerN(abbrLC, 3); int ix; for (ix = 0; ix < ArraySize(aminoAcidTable); ix++) { if (sameStringN(abbrLC, aminoAcidTable[ix].abbreviation, 3)) return aminoAcidTable[ix].letter; } return '\0'; } void aaToAbbr(char aa, char *abbrBuf, size_t abbrBufSize) /* Convert an AA single letter such as "A", "D" etc. to its abbreviation such as "Ala", "Asp" etc. * abbrBufSize must be at least 4. If aa is not found, "?%c?",aa is written into abbrBuf. */ { // Uppercase for comparison. char aaUC = toupper(aa); int ix; for (ix = 0; ix < ArraySize(aminoAcidTable); ix++) { if (aaUC == aminoAcidTable[ix].letter) { // safencpy(...3) is required here because aminoAcidTable.abbreviation is char[3] not [4] safencpy(abbrBuf, abbrBufSize, aminoAcidTable[ix].abbreviation, 3); abbrBuf[0] = toupper(abbrBuf[0]); return; } } safef(abbrBuf, abbrBufSize, "?%c?", aa); } void trimRefAlt(char *ref, char *alt, uint *pStart, uint *pEnd, int *pRefLen, int *pAltLen) /* If ref and alt have identical bases at beginning and/or end, trim those & update all params. */ { int trimStart = 0; while (ref[trimStart] != '\0' && alt[trimStart] != '\0' && ref[trimStart] == alt[trimStart]) trimStart++; int refLen = strlen(ref); int altLen = strlen(alt); int iR = refLen - 1, iA = altLen - 1, trimEnd = 0; while (iR >= trimStart && iA >= trimStart && ref[iR] == alt[iA]) { iR--; iA--; trimEnd++; } if (trimEnd) { *pEnd -= trimEnd; refLen -= trimEnd; altLen -= trimEnd; ref[refLen] = '\0'; alt[altLen] = '\0'; } if (trimStart) { *pStart += trimStart; refLen -= trimStart; altLen -= trimStart; memmove(ref, ref+trimStart, refLen+1); memmove(alt, alt+trimStart, altLen+1); } *pRefLen = refLen; *pAltLen = altLen; }