589685da775f5a122bb23d2b8eca372f53e4b5bc
angie
Wed Sep 10 16:34:34 2014 -0700
Changes to snpNcbiToUcsc to handle b141 on hg19 and hg38:- When checking range{I,D,S}* locTypes, use strlen(refNCBI) as length only if
refNCBI contains only nucleotides.
- Checking for overlapping variants at the same location (checkCluster ->
DuplicateObserved, MixedObserved) used to be limited to insertions because
those are easier to check. Now we check all variants that have
all-nucleotide observed. Libified some code from hgTracks that can
reverse-complement a slash-separated string of alleles, e.g. G/T -> A/C,
-/AG -> -/CT.
- SNP IDs are getting too huge (and sparse) to use as indexes into a
statically allocated array; use a hash.
- Erroneous NULL frequencies in dbSNP's SNPAlleleFreq need to be detected
and ignored.
refs #13309
diff --git src/inc/dnautil.h src/inc/dnautil.h
index 94b25af..7282d43 100644
--- src/inc/dnautil.h
+++ src/inc/dnautil.h
@@ -1,263 +1,268 @@
/* Some stuff that you'll likely need in any program that works with
* DNA. Includes stuff for amino acids as well.
*
* Assumes that DNA is stored as a character.
* The DNA it generates will include the bases
* as lowercase tcag. It will generally accept
* uppercase as well, and also 'n' or 'N' or '-'
* for unknown bases.
*
* Amino acids are stored as single character upper case.
*
* This file is copyright 2002 Jim Kent, but license is hereby
* granted for all use - public, private or commercial. */
#ifndef DNAUTIL_H
#define DNAUTIL_H
void dnaUtilOpen(); /* Good idea to call this before using any arrays
* here. */
/* Numerical values for bases. */
#define T_BASE_VAL 0
#define U_BASE_VAL 0
#define C_BASE_VAL 1
#define A_BASE_VAL 2
#define G_BASE_VAL 3
#define N_BASE_VAL 4 /* Used in 1/2 byte representation. */
typedef char DNA;
typedef char AA;
typedef char BIOPOL; /* Biological polymer. */
/* A little array to help us decide if a character is a
* nucleotide, and if so convert it to lower case.
* Contains zeroes for characters that aren't used
* in DNA sequence. */
extern DNA ntChars[256];
extern AA aaChars[256];
/* An array that converts alphabetical DNA representation
* to numerical one: X_BASE_VAL as above. For charaters
* other than [atgcATGC], has -1. */
extern int ntVal[256];
extern int aaVal[256];
extern int ntValLower[256]; /* NT values only for lower case. */
extern int ntValUpper[256]; /* NT values only for upper case. */
/* Like ntVal, but with T_BASE_VAL in place of -1 for nonexistent nucleotides. */
extern int ntValNoN[256];
/* Like ntVal but with N_BASE_VAL in place of -1 for 'n', 'x', '-', etc. */
extern int ntVal5[256];
/* Inverse array - takes X_BASE_VAL int to a DNA char
* value. */
extern DNA valToNt[];
/* Similar array that doesn't convert to lower case. */
extern DNA ntMixedCaseChars[256];
/* Another array to help us do complement of DNA */
extern DNA ntCompTable[256];
/* Arrays to convert between lower case indicating repeat masking, and
* a 1/2 byte representation where the 4th bit indicates if the characeter
* is masked. Uses N_BASE_VAL for `n', `x', etc.
*/
#define MASKED_BASE_BIT 8
extern int ntValMasked[256];
extern DNA valToNtMasked[256];
/*Complement DNA (not reverse)*/
void complement(DNA *dna, long length);
/* Reverse complement DNA. */
void reverseComplement(DNA *dna, long length);
/* Reverse offset - return what will be offset (0 based) to
* same member of array after array is reversed. */
long reverseOffset(long offset, long arraySize);
/* Switch start/end (zero based half open) coordinates
* to opposite strand. */
void reverseIntRange(int *pStart, int *pEnd, int size);
/* Switch start/end (zero based half open) coordinates
* to opposite strand. */
void reverseUnsignedRange(unsigned *pStart, unsigned *pEnd, int size);
+char *reverseComplementSlashSeparated(char *alleleStr);
+/* Given a slash-separated series of sequences (a common representation of variant alleles),
+ * returns a slash-sep series with the reverse complement of each sequence (if it is a
+ * nucleotide sequence), also reversing the order of sequences. */
+
enum dnaCase {dnaUpper,dnaLower,dnaMixed,};
/* DNA upper, lower, or mixed case? */
/* Convert T's to U's */
void toRna(DNA *dna);
int cmpDnaStrings(DNA *a, DNA *b);
/* Compare using screwy non-alphabetical DNA order TCGA */
typedef char Codon; /* Our codon type. */
/* Return single letter code (upper case) for protein.
* Returns X for bad input, 0 for stop codon.
* The "Standard" Code */
AA lookupCodon(DNA *dna);
boolean isStopCodon(DNA *dna);
/* Return TRUE if it's a stop codon. */
boolean isKozak(char *dna, int dnaSize, int pos);
/* Return TRUE if it's a Kozak compatible start, using a relatively
* weak definition (either A/G 3 bases before or G after) . */
boolean isReallyStopCodon(char *dna, boolean selenocysteine);
/* Return TRUE if it's really a stop codon, even considering
* possibilility of selenocysteine. */
/* Returns one letter code for protein,
* 0 for stop codon or X for bad input,
* Vertebrate Mitochondrial Code */
AA lookupMitoCodon(DNA *dna);
Codon codonVal(DNA *start);
/* Return value from 0-63 of codon starting at start.
* Returns -1 if not a codon. */
DNA *valToCodon(int val);
/* Return codon corresponding to val (0-63) */
extern char *aaAbbr(int i);
/* return pointer to AA abbrevation */
extern char aaLetter(int i);
/* return AA letter */
void dnaTranslateSome(DNA *dna, char *out, int outSize);
/* Translate DNA upto a stop codon or until outSize-1 amino acids,
* whichever comes first. Output will be zero terminated. */
char *skipIgnoringDash(char *a, int size, bool skipTrailingDash);
/* Count size number of characters, and any
* dash characters. */
int countNonDash(char *a, int size);
/* Count number of non-dash characters. */
int nextPowerOfFour(long x);
/* Return next power of four that would be greater or equal to x.
* For instance if x < 4, return 1, if x < 16 return 2....
* (From biological point of view how many bases are needed to
* code this number.) */
long dnaFilteredSize(char *rawDna);
/* Return how long DNA will be after non-DNA is filtered out. */
long aaFilteredSize(char *rawDna);
/* Return how long peptide will be after non-peptide is filtered out. */
void dnaFilter(char *in, DNA *out);
/* Filter out non-DNA characters. */
void aaFilter(char *in, DNA *out);
/* Filter out non-peptide characters. */
void dnaFilterToN(char *in, DNA *out);
/* Change all non-DNA characters to N. */
void upperToN(char *s, int size);
/* Turn upper case letters to N's. */
void lowerToN(char *s, int size);
/* Turn lower case letters to N's. */
void dnaBaseHistogram(DNA *dna, int dnaSize, int histogram[4]);
/* Count up frequency of occurance of each base and store
* results in histogram. Use X_BASE_VAL to index histogram. */
void dnaMixedCaseFilter(char *in, DNA *out);
/* Filter out non-DNA characters but leave case intact. */
bits64 basesToBits64(char *dna, int size);
/* Convert dna of given size (up to 32) to binary representation */
bits32 packDna16(DNA *in);
/* pack 16 bases into a word */
bits16 packDna8(DNA *in);
/* Pack 8 bases into a short word */
UBYTE packDna4(DNA *in);
/* Pack 4 bases into a UBYTE */
void unpackDna(bits32 *tiles, int tileCount, DNA *out);
/* Unpack DNA. Expands to 16x tileCount in output. */
void unpackDna4(UBYTE *tiles, int byteCount, DNA *out);
/* Unpack DNA. Expands to 4x byteCount in output. */
void unalignedUnpackDna(bits32 *tiles, int start, int size, DNA *unpacked);
/* Unpack into out, even though not starting/stopping on tile
* boundaries. */
int intronOrientationMinSize(DNA *iStart, DNA *iEnd, int minIntronSize);
/* Given a gap in genome from iStart to iEnd, return
* Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no
* intron. */
int intronOrientation(DNA *iStart, DNA *iEnd);
/* Given a gap in genome from iStart to iEnd, return
* Return 1 for GT/AG intron between left and right, -1 for CT/AC, 0 for no
* intron. Assumes minIntronSize of 32. */
int dnaScore2(DNA a, DNA b);
/* Score match between two bases (relatively crudely). */
int dnaScoreMatch(DNA *a, DNA *b, int size);
/* Compare two pieces of DNA base by base. Total mismatches are
* subtracted from total matches and returned as score. 'N's
* neither hurt nor help score. */
int aaScore2(AA a, AA b);
/* Score match between two bases (relatively crudely). */
int aaScoreMatch(AA *a, AA *b, int size);
/* Compare two peptides aa by aa. */
int dnaOrAaScoreMatch(char *a, char *b, int size, int matchScore, int mismatchScore,
char ignore);
/* Compare two sequences (without inserts or deletions) and score. */
void writeSeqWithBreaks(FILE *f, char *letters, int letterCount, int maxPerLine);
/* Write out letters with newlines every maxLine. */
int tailPolyASizeLoose(DNA *dna, int size);
/* Return size of PolyA at end (if present). This allows a few non-A's as
* noise to be trimmed too, but skips first two aa for taa stop codon.
* It is less conservative in extending the polyA region than maskTailPolyA. */
int headPolyTSizeLoose(DNA *dna, int size);
/* Return size of PolyT at start (if present). This allows a few non-T's as
* noise to be trimmed too, but skips last two tt for revcomp'd taa stop
* codon.
* It is less conservative in extending the polyA region than maskHeadPolyT. */
int maskTailPolyA(DNA *dna, int size);
/* Convert PolyA at end to n. This allows a few non-A's as noise to be
* trimmed too. Returns number of bases trimmed. */
int maskHeadPolyT(DNA *dna, int size);
/* Convert PolyT at start. This allows a few non-T's as noise to be
* trimmed too. Returns number of bases trimmed. */
boolean isDna(char *poly, int size);
/* Return TRUE if letters in poly are at least 90% ACGTNU- */
boolean isAllDna(char *poly, int size);
/* Return TRUE if size is great than 1 and all letters in poly are 100% ACGTNU- */
boolean isAllNt(char *seq, int size);
/* Return TRUE if all letters in seq are ACGTNU-. */
#endif /* DNAUTIL_H */