src/inc/genoFind.h 1.81
1.81 2009/10/08 18:09:38 kent
Moving repMatch calculations from blat to library so can be shared with gfServer.
Index: src/inc/genoFind.h
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/inc/genoFind.h,v
retrieving revision 1.80
retrieving revision 1.81
diff -b -B -U 1000000 -r1.80 -r1.81
--- src/inc/genoFind.h 10 Feb 2009 21:58:56 -0000 1.80
+++ src/inc/genoFind.h 8 Oct 2009 18:09:38 -0000 1.81
@@ -1,380 +1,383 @@
/* genoFind.h - Interface to modules for fast finding of sequence
* matches. */
/* Copyright 2001-2002 Jim Kent. All rights reserved. */
#ifndef GENOFIND_H
#define GENOFIND_H
#ifndef DNASEQ_H
#include "dnaseq.h"
#endif
#ifndef FUZZYFIND_H
#include "fuzzyFind.h"
#endif
#ifndef HASH_H
#include "hash.h"
#endif
#ifndef ALITYPE_H
#include "aliType.h"
#endif
#ifndef LOCALMEM_H
#include "localmem.h"
#endif
#ifndef BITS_H
#include "bits.h"
#endif
#ifndef AXT_H
#include "axt.h"
#endif
/* Compile-time constants for the genoFind module.
 * NOTE(review): the names suggest these are default values for the
 * minMatch, maxGap, tileSize and maxPat fields of struct genoFind (with
 * gfPepMaxTileUse being the peptide variant of gfMaxTileUse) - confirm
 * against the callers before relying on that. */
enum gfConstants {
gfMinMatch = 2,
gfMaxGap = 2,
gfTileSize = 11,
gfMaxTileUse = 1024,
gfPepMaxTileUse = 30000,
};
struct gfSeqSource
/* Where a block of sequence comes from. */
{
struct gfSeqSource *next; /* Next source in list. */
char *fileName; /* Name of file. */
bioSeq *seq; /* Sequences. Usually either this or fileName is NULL. */
bits32 start,end; /* Position of this source within merged sequence. */
Bits *maskedBits; /* If non-null contains repeat-masking info. */
};
struct gfHit
/* A genoFind hit. */
{
struct gfHit *next; /* Next hit in list. */
bits32 qStart; /* Where it hits in query. */
bits32 tStart; /* Where it hits in target. */
bits32 diagonal; /* tStart + qSize - qStart. */
};
/* gfHits are freed with a simple freeMem or slFreeList. */
struct gfClump
/* A clump of hits.
 * Note on target coordinates: for clumps from regular (blat) queries,
 * tStart and tEnd include target->start, but for clumps from gfPcrClumps(),
 * tStart and tEnd have already had target->start subtracted. So tStart and
 * tEnd in PCR clumps are relative to that target sequence (not the
 * collection of all target sequences). */
{
struct gfClump *next; /* Next clump. */
bits32 qStart, qEnd; /* Position in query. */
struct gfSeqSource *target; /* Target source sequence. */
bits32 tStart, tEnd; /* Position in target; see note above about
 * whether target->start is included. */
int hitCount; /* Number of hits. */
struct gfHit *hitList; /* List of hits. Not allocated here. */
int queryCoverage; /* Number of bases covered in query (thx AG!) */
};
void gfClumpFree(struct gfClump **pClump);
/* Free a single clump. */
void gfClumpFreeList(struct gfClump **pList);
/* Free a list of dynamically allocated gfClump's */
struct genoFind
/* An index of all K-mers in the genome. */
{
int maxPat; /* Max # of times pattern can occur
* before it is ignored. */
int minMatch; /* Minimum number of tile hits needed
* to trigger a clump hit. */
int maxGap; /* Max gap between tiles in a clump. */
int tileSize; /* Size of each N-mer. */
int stepSize; /* Spacing between N-mers. */
int tileSpaceSize; /* Number of N-mer values. */
int tileMask; /* 1-s for each N-mer. */
int sourceCount; /* Count of source files. */
struct gfSeqSource *sources; /* List of sequence sources. */
bool isPep; /* Is a peptide. */
bool allowOneMismatch; /* Allow a single mismatch? */
int segSize; /* Index is segmented if non-zero. */
bits32 totalSeqSize; /* Total size of all sequences. */
bits32 *listSizes; /* Size of list for each N-mer */
void *allocated; /* Storage space for all lists. */
bits32 **lists; /* A list for each N-mer. Used if the
* index is not segmented (presumably
* segSize == 0 - confirm). */
bits16 **endLists; /* A more complex list for each N-mer.
* Used if the index is segmented
* (presumably segSize != 0 - confirm).
* Values come in groups of three.
* The first is the packed last few
* letters of the tile. The next two
* are the offset in the genome. This
* would be a struct but that would take
* 8 bytes instead of 6, or nearly an
* extra gigabyte of RAM. */
};
void genoFindFree(struct genoFind **pGenoFind);
/* Free up a genoFind index. */
struct gfSeqSource *gfFindNamedSource(struct genoFind *gf, char *name);
/* Find target of given name. Return NULL if none. */
/* --- Stuff for saving results ---- */
struct gfOutput
/* A polymorphic object to help us write many file types. */
{
struct gfOutput *next;
void *data; /* Type-specific data pointer. Must be freeMem'able */
void (*out)(char *chromName, int chromSize, int chromOffset,
struct ffAli *ali, bioSeq *tSeq, struct hash *t3Hash, bioSeq *qSeq,
boolean qIsRc, boolean tIsRc,
enum ffStringency stringency, int minMatch, struct gfOutput *out);
/* This is the type of a client provided function to save an alignment.
* The parameters are:
* chromName - name of target (aka genomic or database) sequence.
* chromSize - size of target sequence.
* chromOffset - offset of genoSequence in target.
* ali - alignment with pointers into tSeq/qSeq or in
* translated target case, into t3Hash.
* tSeq - part of target sequence in normal case. In translated
* target case look at t3Hash instead.
* t3Hash - used only in translated target case. A hash keyed by
* target sequence name with values *lists* of trans3 structures.
* This hash can be searched to find both the translated and
* untranslated versions of the bits of the target that are in
* memory. (You can assume at this point all parts needed for
* output are indeed in memory.)
* qSeq - query sequence (this isn't segmented at all).
* qIsRc - True if query is reverse complemented.
* tIsRc - presumably true if target is reverse complemented; this
* parameter had no description here (TODO confirm).
* stringency - ffCdna, etc. I'm hoping to move this elsewhere.
* minMatch - minimum score to output. Also should be moved elsewhere.
* out - the gfOutput object being written to; custom data for the
* specific output function is presumably reached via out->data
* (TODO confirm).
* The interface is a bit complex - partly from the demands of translated
* output, and partly from trying not to have the entire target sequence in
* memory.
*/
void (*queryOut)(struct gfOutput *out, FILE *f);
/* Called for each query */
void (*fileHead)(struct gfOutput *out, FILE *f);
/* Write file header if any */
boolean reportTargetStrand; /* Report target as well as query strand? */
struct hash *maskHash; /* associates target sequence name and mask. */
int minGood; /* Minimum sequence identity in thousandths. */
boolean qIsProt; /* Query is peptide. */
boolean tIsProt; /* Target is peptide. */
int queryIx; /* Index of query */
boolean includeTargetFile; /* Prefix file: to target sequence name. */
};
struct gfOutput *gfOutputAny(char *format,
int goodPpt, boolean qIsProt, boolean tIsProt,
boolean noHead, char *databaseName,
int databaseSeqCount, double databaseLetters,
double minIdentity, FILE *f);
/* Initialize output in a variety of formats in file or memory.
* Parameters:
* format - either 'psl', 'pslx', 'blast', 'wublast', 'axt'
* goodPpt - minimum identity of alignments to output in parts per thousand
* qIsProt - true if query side is a protein.
* tIsProt - true if target (database) side is a protein.
* noHead - if true suppress header in psl/pslx output.
* databaseName - name of database. Only used for blast output
* databaseSeqCount - number of sequences in database - only for blast
* databaseLetters - number of bases/aas in database - only for blast
* minIdentity - minimum identity; its units and its relation to goodPpt
* are not stated here (TODO confirm against the implementation).
* f - output file.
*/
struct gfOutput *gfOutputPsl(int goodPpt,
boolean qIsProt, boolean tIsProt, FILE *f,
boolean saveSeq, boolean noHead);
/* Set up psl/pslx output */
struct gfOutput *gfOutputAxt(int goodPpt, boolean qIsProt,
boolean tIsProt, FILE *f);
/* Setup output for axt format. */
struct gfOutput *gfOutputAxtMem(int goodPpt, boolean qIsProt,
boolean tIsProt);
/* Setup output for in memory axt output. */
struct gfOutput *gfOutputBlast(int goodPpt,
boolean qIsProt, boolean tIsProt,
char *databaseName, int databaseSeqCount, double databaseLetters,
char *blastType, /* blast, blast8, blast9, wublast, or xml */
double minIdentity, FILE *f);
/* Setup output for blast/wublast format. */
void gfOutputQuery(struct gfOutput *out, FILE *f);
/* Finish writing out results for a query to file. */
void gfOutputHead(struct gfOutput *out, FILE *f);
/* Write out header if any. */
void gfOutputFree(struct gfOutput **pOut);
/* Free up output. */
/* -------- Routines to build up index ------------ */
void gfCheckTileSize(int tileSize, boolean isPep);
/* Check that tile size is legal. Abort if not. */
struct genoFind *gfIndexSeq(bioSeq *seqList,
int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
boolean isPep, boolean allowOneMismatch, boolean maskUpper,
int stepSize);
/* Make index for all seqs in list.
* minMatch - minimum number of matching tiles to trigger alignments
* maxGap - maximum deviation from diagonal of tiles
* tileSize - size of tile in nucleotides
* maxPat - maximum use of tile to not be considered a repeat
* oocFile - .ooc format file that lists repeat tiles. May be NULL.
* isPep - TRUE if indexing proteins, FALSE for DNA.
* allowOneMismatch - allow one mismatch in a tile.
* maskUpper - Mask out upper case sequence (currently only for nucleotides).
* stepSize - space between tiles. Zero means default (which is tileSize).
* For DNA sequences upper case bits will be unindexed. */
struct genoFind *gfIndexNibsAndTwoBits(int fileCount, char *fileNames[],
int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
boolean allowOneMismatch, int stepSize);
/* Make index for all .nib and .2bits in list.
* minMatch - minimum number of matching tiles to trigger alignments
* maxGap - maximum deviation from diagonal of tiles
* tileSize - size of tile in nucleotides
* maxPat - maximum use of tile to not be considered a repeat
* oocFile - .ooc format file that lists repeat tiles. May be NULL.
* allowOneMismatch - allow one mismatch in a tile.
* stepSize - space between tiles. Zero means default (which is tileSize). */
void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3],
int fileCount, char *fileNames[],
int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
boolean allowOneMismatch, boolean mask, int stepSize);
/* Make translated (6 frame) index for all .nib and .2bit files. */
/* -------- Routines to scan index for homologous areas ------------ */
struct gfClump *gfFindClumps(struct genoFind *gf, struct dnaSeq *seq,
struct lm *lm, int *retHitCount);
/* Find clumps associated with one sequence. */
struct gfClump *gfFindClumpsWithQmask(struct genoFind *gf, bioSeq *seq,
Bits *qMaskBits, int qMaskOffset,
struct lm *lm, int *retHitCount);
/* Find clumps associated with one sequence soft-masking seq according to qMaskBits */
struct gfHit *gfFindHitsInRegion(struct genoFind *gf, bioSeq *seq,
Bits *qMaskBits, int qMaskOffset, struct lm *lm,
struct gfSeqSource *target, int tMin, int tMax);
/* Find hits restricted to one particular region.
* The hits returned by this will be in target sequence
* coordinates rather than concatenated whole genome
* coordinates as hits inside of clumps usually are. */
void gfTransFindClumps(struct genoFind *gfs[3], aaSeq *seq, struct gfClump *clumps[3], struct lm *lm, int *retHitCount);
/* Find clumps associated with one sequence in three translated reading frames. */
void gfTransTransFindClumps(struct genoFind *gfs[3], aaSeq *seqs[3],
struct gfClump *clumps[3][3], struct lm *lm, int *retHitCount);
/* Find clumps associated with three sequences in three translated
* reading frames. Used for translated/translated protein comparisons. */
void gfClumpDump(struct genoFind *gf, struct gfClump *clump, FILE *f);
/* Print out info on clump. This routine subtracts clump->target->start
* from clump->tStart and from clump->tEnd for printing, so that printed
* coords are relative to that target sequence. */
void gfAlignAaClumps(struct genoFind *gf, struct gfClump *clumpList, aaSeq *seq,
boolean isRc, int minMatch, struct gfOutput *out);
/* Convert gfClumps to an actual alignment that gets saved via the
* gfOutput object 'out'. */
void gfFindAlignAaTrans(struct genoFind *gfs[3], aaSeq *qSeq, struct hash *t3Hash,
boolean tIsRc, int minMatch, struct gfOutput *out);
/* Look for qSeq alignment in three translated reading frames. Save alignment
* via the gfOutput object 'out'. */
/* --- Some routines for dealing with gfServer at a low level ---- */
char *gfSignature();
/* Return signature that starts each command to gfServer. Helps defend
* server from confused clients. */
void gfCatchPipes();
/* Set up to catch broken pipe signals. */
int gfReadMulti(int sd, void *vBuf, size_t size);
/* Read in until all is read or there is an error. */
/* --- Some routines for dealing with gfServer at a high level ---- */
struct hash *gfFileCacheNew();
/* Create hash for storing info on .nib and .2bit files. */
void gfFileCacheFree(struct hash **pCache);
/* Free up resources in cache. */
void gfAlignStrand(int *pConn, char *nibDir, struct dnaSeq *seq,
boolean isRc, int minMatch,
struct hash *tFileCache, struct gfOutput *out);
/* Search genome on server with one strand of other sequence to find homology.
* Then load homologous bits of genome locally and do detailed alignment.
* Call 'outFunction' with each alignment that is found. gfSavePsl is a handy
* outFunction to use. */
void gfAlignTrans(int *pConn, char *nibDir, aaSeq *seq,
int minMatch, struct hash *tFileHash, struct gfOutput *out);
/* Search indexed translated genome on server with an amino acid sequence.
* Then load homologous bits of genome locally and do detailed alignment.
* Call 'outFunction' with each alignment that is found. */
void gfAlignTransTrans(int *pConn, char *nibDir, struct dnaSeq *seq,
boolean qIsRc, int minMatch, struct hash *tFileCache,
struct gfOutput *out, boolean isRna);
/* Search indexed translated genome on server with a DNA sequence. Translate
* this sequence in three frames. Load homologous bits of genome locally
* and do detailed alignment. Call 'outFunction' with each alignment
* that is found. */
int gfConnect(char *hostName, char *portName);
/* Set up our network connection to server. */
+int gfDefaultRepMatch(int tileSize, int stepSize, boolean protTiles);
+/* Figure out appropriate default repMatch value for the given tile size and step size. */
+
void gfMakeOoc(char *outName, char *files[], int fileCount,
int tileSize, bits32 maxPat, enum gfType tType);
/* Count occurrences of tiles in the given files and make a .ooc file. */
void gfLongDnaInMem(struct dnaSeq *query, struct genoFind *gf,
boolean isRc, int minScore, Bits *qMaskBits, struct gfOutput *out,
boolean fastMap, boolean band);
/* Chop up query into pieces, align each, and stitch back
* together again. */
void gfLongTransTransInMem(struct dnaSeq *query, struct genoFind *gfs[3],
struct hash *t3Hash, boolean qIsRc, boolean tIsRc, boolean qIsRna,
int minScore, struct gfOutput *out);
/* Chop up query into pieces, align each in translated space, and stitch back
* together again as nucleotides. */
struct gfClump *gfPcrClumps(struct genoFind *gf,
char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize,
int minDistance, int maxDistance);
/* Find possible PCR hits. The fPrimer and rPrimer are on opposite strands.
* Note: unlike clumps from other query functions, PCR clumps from this
* function have already had clump->target->start subtracted from
* clump->tStart and clump->tEnd so that the coords are relative to that
* target sequence (not the collection of all target sequences). */
-#define gfVersion "34x4" /* Current BLAT version number */
+#define gfVersion "34x5" /* Current BLAT version number */
#endif /* GENOFIND_H */