ef98cc65cd9626157087f5ad3dd6eac5012c50d8 galt Thu Jan 24 18:13:10 2013 -0800 Add ability to tolerate paths in .2bit filename with gfServer. Paths are removed leaving only the filename. Clients are on other machines and therefore cannnot use them in general. diff --git src/inc/genoFind.h src/inc/genoFind.h index fb6fa0c..8cc84de 100644 --- src/inc/genoFind.h +++ src/inc/genoFind.h @@ -1,385 +1,385 @@ /* genoFind.h - Interface to modules for fast finding of sequence * matches. */ /* Copyright 2001-2002 Jim Kent. All rights reserved. */ #ifndef GENOFIND_H #define GENOFIND_H #ifndef DNASEQ_H #include "dnaseq.h" #endif #ifndef FUZZYFIND_H #include "fuzzyFind.h" #endif #ifndef HASH_H #include "hash.h" #endif #ifndef ALITYPE_H #include "aliType.h" #endif #ifndef LOCALMEM_H #include "localmem.h" #endif #ifndef BITS_H #include "bits.h" #endif #ifndef AXT_H #include "axt.h" #endif enum gfConstants { gfMinMatch = 2, gfMaxGap = 2, gfTileSize = 11, gfMaxTileUse = 1024, gfPepMaxTileUse = 30000, }; struct gfSeqSource /* Where a block of sequence comes from. */ { struct gfSeqSource *next; char *fileName; /* Name of file. */ bioSeq *seq; /* Sequences. Usually either this or fileName is NULL. */ bits32 start,end; /* Position within merged sequence. */ Bits *maskedBits; /* If non-null contains repeat-masking info. */ }; struct gfHit /* A genoFind hit. */ { struct gfHit *next; bits32 qStart; /* Where it hits in query. */ bits32 tStart; /* Where it hits in target. */ bits32 diagonal; /* tStart + qSize - qStart. */ }; /* gfHits are free'd with simple freeMem or slFreeList. */ struct gfClump /* A clump of hits. */ /* Note: for clumps from regular (blat) queries, tStart and tEnd include * target->start, but for clumps from gfPcrClumps(), tStart and tEnd have * already had target->start subtracted. So tStart and tEnd in PCR clumps * are relative to that target sequence (not the collection of all target * sequences). */ { struct gfClump *next; /* Next clump. */ bits32 qStart, qEnd; /* Position in query. */ struct gfSeqSource *target; /* Target source sequence. */ bits32 tStart, tEnd; /* Position in target. */ int hitCount; /* Number of hits. */ struct gfHit *hitList; /* List of hits. Not allocated here. */ int queryCoverage; /* Number of bases covered in query (thx AG!) */ }; void gfClumpFree(struct gfClump **pClump); /* Free a single clump. */ void gfClumpFreeList(struct gfClump **pList); /* Free a list of dynamically allocated gfClump's */ struct genoFind /* An index of all K-mers in the genome. */ { int maxPat; /* Max # of times pattern can occur * before it is ignored. */ int minMatch; /* Minimum number of tile hits needed * to trigger a clump hit. */ int maxGap; /* Max gap between tiles in a clump. */ int tileSize; /* Size of each N-mer. */ int stepSize; /* Spacing between N-mers. */ int tileSpaceSize; /* Number of N-mer values. */ int tileMask; /* 1-s for each N-mer. */ int sourceCount; /* Count of source files. */ struct gfSeqSource *sources; /* List of sequence sources. */ bool isPep; /* Is a peptide. */ bool allowOneMismatch; /* Allow a single mismatch? */ int segSize; /* Index is segmented if non-zero. */ bits32 totalSeqSize; /* Total size of all sequences. */ bits32 *listSizes; /* Size of list for each N-mer */ void *allocated; /* Storage space for all lists. */ bits32 **lists; /* A list for each N-mer. Used if * isSegmented is false. */ bits16 **endLists; /* A more complex list for each N-mer. * Used if isSegmented is true. * Values come in groups of threes. * The first is the packed last few * letters of the tile. The next two * are the offset in the genome. This * would be a struct but that would take * 8 bytes instead of 6, or nearly an * extra gigabyte of RAM. */ }; void genoFindFree(struct genoFind **pGenoFind); /* Free up a genoFind index. */ struct gfSeqSource *gfFindNamedSource(struct genoFind *gf, char *name); /* Find target of given name. Return NULL if none. */ /* --- Stuff for saving results ---- */ struct gfOutput /* A polymorphic object to help us write many file types. */ { struct gfOutput *next; void *data; /* Type-specific data pointer. Must be freeMem'able */ void (*out)(char *chromName, int chromSize, int chromOffset, struct ffAli *ali, bioSeq *tSeq, struct hash *t3Hash, bioSeq *qSeq, boolean qIsRc, boolean tIsRc, enum ffStringency stringency, int minMatch, struct gfOutput *out); /* This is the type of a client provided function to save an alignment. * The parameters are: * chromName - name of target (aka genomic or database) sequence. * chromSize - size of target sequence. * chromOffset - offset of genoSequence in target. * ffAli - alignment with pointers into tSeq/qSeq or in * translated target case, into t3Hash. * tSeq - part of target sequence in normal case. In translated * target case look at t3Hash instead. * t3Hash - used only in translated target case. A hash keyed by * target sequence name with values *lists* of trans3 structures. * This hash can be searched to find both the translated and * untranslated versions of the bits of the target that are in * memory. (You can assume at this point all parts needed for * output are indeed in memory.) * qSeq - query sequence (this isn't segmented at all). * isRc - True if query is reverse complemented. * stringency - ffCdna, etc. I'm hoping to move this elsewhere. * minMatch - minimum score to output. Also should be moved elsewhere. * outputData - custom data for specific output function. * The interface is a bit complex - partly from the demands of translated * output, and partly from trying not to have the entire target sequence in * memory. */ void (*queryOut)(struct gfOutput *out, FILE *f); /* Called for each query */ void (*fileHead)(struct gfOutput *out, FILE *f); /* Write file header if any */ boolean reportTargetStrand; /* Report target as well as query strand? */ struct hash *maskHash; /* associates target sequence name and mask. */ int minGood; /* Minimum sequence identity in thousandths. */ boolean qIsProt; /* Query is peptide. */ boolean tIsProt; /* Target is peptide. */ int queryIx; /* Index of query */ boolean includeTargetFile; /* Prefix file: to target sequence name. */ }; struct gfOutput *gfOutputAny(char *format, int goodPpt, boolean qIsProt, boolean tIsProt, boolean noHead, char *databaseName, int databaseSeqCount, double databaseLetters, double minIdentity, FILE *f); /* Initialize output in a variety of formats in file or memory. * Parameters: * format - either 'psl', 'pslx', 'blast', 'wublast', 'axt' * goodPpt - minimum identity of alignments to output in parts per thousand * qIsProt - true if query side is a protein. * tIsProt - true if target (database) side is a protein. * noHead - if true suppress header in psl/pslx output. * databaseName - name of database. Only used for blast output * databaseSeq - number of sequences in database - only for blast * databaseLetters - number of bases/aas in database - only blast * FILE *f - file. */ struct gfOutput *gfOutputPsl(int goodPpt, boolean qIsProt, boolean tIsProt, FILE *f, boolean saveSeq, boolean noHead); /* Set up psl/pslx output */ struct gfOutput *gfOutputAxt(int goodPpt, boolean qIsProt, boolean tIsProt, FILE *f); /* Setup output for axt format. */ struct gfOutput *gfOutputAxtMem(int goodPpt, boolean qIsProt, boolean tIsProt); /* Setup output for in memory axt output. */ struct gfOutput *gfOutputBlast(int goodPpt, boolean qIsProt, boolean tIsProt, char *databaseName, int databaseSeqCount, double databaseLetters, char *blastType, /* blast, blast8, blast9, wublast, or xml */ double minIdentity, FILE *f); /* Setup output for blast/wublast format. */ void gfOutputQuery(struct gfOutput *out, FILE *f); /* Finish writing out results for a query to file. */ void gfOutputHead(struct gfOutput *out, FILE *f); /* Write out header if any. */ void gfOutputFree(struct gfOutput **pOut); /* Free up output. */ /* -------- Routines to build up index ------------ */ void gfCheckTileSize(int tileSize, boolean isPep); /* Check that tile size is legal. Abort if not. */ struct genoFind *gfIndexSeq(bioSeq *seqList, int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean isPep, boolean allowOneMismatch, boolean maskUpper, int stepSize); /* Make index for all seqs in list. * minMatch - minimum number of matching tiles to trigger alignments * maxGap - maximum deviation from diagonal of tiles * tileSize - size of tile in nucleotides * maxPat - maximum use of tile to not be considered a repeat * oocFile - .ooc format file that lists repeat tiles. May be NULL. * isPep - TRUE if indexing proteins, FALSE for DNA. * maskUpper - Mask out upper case sequence (currently only for nucleotides). * stepSize - space between tiles. Zero means default (which is tileSize). * For DNA sequences upper case bits will be unindexed. */ struct genoFind *gfIndexNibsAndTwoBits(int fileCount, char *fileNames[], int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean allowOneMismatch, int stepSize); /* Make index for all .nib and .2bits in list. * minMatch - minimum number of matching tiles to trigger alignments * maxGap - maximum deviation from diagonal of tiles * tileSize - size of tile in nucleotides * maxPat - maximum use of tile to not be considered a repeat * oocFile - .ooc format file that lists repeat tiles. May be NULL. * allowOneMismatch - allow one mismatch in a tile. * stepSize - space between tiles. Zero means default (which is tileSize). */ void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3], int fileCount, char *fileNames[], int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean allowOneMismatch, boolean mask, int stepSize); /* Make translated (6 frame) index for all .nib and .2bit files. */ /* -------- Routines to scan index for homolgous areas ------------ */ struct gfClump *gfFindClumps(struct genoFind *gf, struct dnaSeq *seq, struct lm *lm, int *retHitCount); /* Find clumps associated with one sequence. */ struct gfClump *gfFindClumpsWithQmask(struct genoFind *gf, bioSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount); /* Find clumps associated with one sequence soft-masking seq according to qMaskBits */ struct gfHit *gfFindHitsInRegion(struct genoFind *gf, bioSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, struct gfSeqSource *target, int tMin, int tMax); /* Find hits restricted to one particular region. * The hits returned by this will be in target sequence * coordinates rather than concatenated whole genome * coordinates as hits inside of clumps usually are. */ void gfTransFindClumps(struct genoFind *gfs[3], aaSeq *seq, struct gfClump *clumps[3], struct lm *lm, int *retHitCount); /* Find clumps associated with one sequence in three translated reading frames. */ void gfTransTransFindClumps(struct genoFind *gfs[3], aaSeq *seqs[3], struct gfClump *clumps[3][3], struct lm *lm, int *retHitCount); /* Find clumps associated with three sequences in three translated * reading frames. Used for translated/translated protein comparisons. */ void gfClumpDump(struct genoFind *gf, struct gfClump *clump, FILE *f); /* Print out info on clump. This routine subtracts clump->target->start * from clump->tStart and from clump->tEnd for printing, so that printed * coords are relative to that target sequence. */ void gfAlignAaClumps(struct genoFind *gf, struct gfClump *clumpList, aaSeq *seq, boolean isRc, int minMatch, struct gfOutput *out); /* Convert gfClumps to an actual alignment that gets saved via * outFunction/outData. */ void gfFindAlignAaTrans(struct genoFind *gfs[3], aaSeq *qSeq, struct hash *t3Hash, boolean tIsRc, int minMatch, struct gfOutput *out); /* Look for qSeq alignment in three translated reading frames. Save alignment * via outFunction/outData. */ /* --- Some routines for dealing with gfServer at a low level ---- */ char *gfSignature(); /* Return signature that starts each command to gfServer. Helps defend * server from confused clients. */ void gfCatchPipes(); /* Set up to catch broken pipe signals. */ int gfReadMulti(int sd, void *vBuf, size_t size); /* Read in until all is read or there is an error. */ /* --- Some routines for dealing with gfServer at a high level ---- */ struct hash *gfFileCacheNew(); /* Create hash for storing info on .nib and .2bit files. */ void gfFileCacheFree(struct hash **pCache); /* Free up resources in cache. */ void gfAlignStrand(int *pConn, char *nibDir, struct dnaSeq *seq, boolean isRc, int minMatch, struct hash *tFileCache, struct gfOutput *out); /* Search genome on server with one strand of other sequence to find homology. * Then load homologous bits of genome locally and do detailed alignment. * Call 'outFunction' with each alignment that is found. gfSavePsl is a handy * outFunction to use. */ void gfAlignTrans(int *pConn, char *nibDir, aaSeq *seq, int minMatch, struct hash *tFileHash, struct gfOutput *out); /* Search indexed translated genome on server with an amino acid sequence. * Then load homologous bits of genome locally and do detailed alignment. * Call 'outFunction' with each alignment that is found. */ void gfAlignTransTrans(int *pConn, char *nibDir, struct dnaSeq *seq, boolean qIsRc, int minMatch, struct hash *tFileCache, struct gfOutput *out, boolean isRna); /* Search indexed translated genome on server with an dna sequence. Translate * this sequence in three frames. Load homologous bits of genome locally * and do detailed alignment. Call 'outFunction' with each alignment * that is found. */ int gfConnect(char *hostName, char *portName); /* Set up our network connection to server. */ int gfDefaultRepMatch(int tileSize, int stepSize, boolean protTiles); /* Figure out appropriate step repMatch value. */ void gfMakeOoc(char *outName, char *files[], int fileCount, int tileSize, bits32 maxPat, enum gfType tType); /* Count occurences of tiles in seqList and make a .ooc file. */ void gfLongDnaInMem(struct dnaSeq *query, struct genoFind *gf, boolean isRc, int minScore, Bits *qMaskBits, struct gfOutput *out, boolean fastMap, boolean band); /* Chop up query into pieces, align each, and stitch back * together again. */ void gfLongTransTransInMem(struct dnaSeq *query, struct genoFind *gfs[3], struct hash *t3Hash, boolean qIsRc, boolean tIsRc, boolean qIsRna, int minScore, struct gfOutput *out); /* Chop up query into pieces, align each in translated space, and stitch back * together again as nucleotides. */ struct gfClump *gfPcrClumps(struct genoFind *gf, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize, int minDistance, int maxDistance); /* Find possible PCR hits. The fPrimer and rPrimer are on opposite strands. * Note: unlike clumps from other query functions, PCR clumps from this * function have already had clump->target->start subtracted from * clump->tStart and clump->tEnd so that the coords are relative to that * target sequence (not the collection of all target sequences). */ #define MAXSINGLEPIECESIZE 5000 /* maximum size of a single piece */ -#define gfVersion "35x1" /* Current BLAT version number */ +#define gfVersion "35x2" /* Current BLAT version number */ #endif /* GENOFIND_H */