fc93e4649cf7f0c6b3ac1cf3bf176e1b39012cd5 hartera Sat Jan 3 15:58:54 2015 -0800 Changed code to use only 2bit files instead of nib files. diff --git src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c index 628f500..a99877f 100644 --- src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c +++ src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c @@ -1,29 +1,29 @@ /* pslCDnaGenomeMatch - check if retroGene aligns better to parent or retroGene */ /* Copyright (C) 2011 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "memalloc.h" #include "psl.h" #include "hdb.h" #include "axt.h" #include "dnautil.h" #include "dnaseq.h" -#include "nib.h" +#include "twoBit.h" #include "fa.h" #include "dlist.h" #include "dystring.h" #include "binRange.h" #include "options.h" #include "genePred.h" #include "genePredReader.h" #include "dystring.h" #include "pseudoGeneLink.h" #include "verbose.h" #include "nibTwo.h" #include "bed.h" #include "snp125.h" #include "pipeline.h" #define MINDIFF 5 @@ -34,35 +34,36 @@ static char na[3] = "NA"; struct axtScoreScheme *ss = NULL; /* blastz scoring matrix */ struct hash *snpHash = NULL, *mrnaHash = NULL, *faHash = NULL, *tHash = NULL, *species1Hash = NULL, *species2Hash = NULL; int maxGap = 100; int minDiff = MINDIFF; int notAlignPenalty = NOTALIGNEDFACTOR; int aliCount = 0; /* number of alignments read */ int mrnaCount = 0; /* number of mrna read */ int filterCount = 0; /* number of mrna where best hit can be determined */ int outputCount = 0; /* number of alignments written */ int verbosity = 1; int lociCounter = 0; bool passthru = FALSE; bool computeSS = FALSE; struct dlList *fileCache = NULL; -struct hash *nibHash = NULL; +//struct hash *nibHash = NULL; FILE *outFile = NULL; /* output tab sep file */ FILE *bedFile = NULL; /* output bed file */ FILE *scoreFile = NULL; /* output score file */ -struct twoBitFile *twoBitFile = NULL; +struct twoBitFile *cdnaSeqFile = NULL; +struct twoBitFile *genomeSeqFile = NULL; char *species1 = NULL; char *species2 = NULL; char *nibdir1 = NULL; char *nibdir2 = NULL; char *mrna1 = NULL; char *mrna2 = NULL; char *bedOut = NULL; char *scoreOut = NULL; char *snpFile = NULL; /* snp tab file (browser format)*/ int histogram[256][256]; /* histogram of counts for suff statistics, index is a,c,g,t,-,. */ float histoNorm[256][256]; /* normalized histogram for suff statistics, index is a,c,g,t,-,. */ /* command line */ static struct optionSpec optionSpecs[] = { {"species1", OPTION_STRING}, @@ -109,66 +110,57 @@ char **snpObs; /* snp observed bases */ char **snps; /* snp ids */ /* not used */ int retroLoc; /* genomic location of mismatch in retroGene */ int parentLoc; /* genomic location of mismatch in parent Gene */ char retroBase; /* contents of base in retroGene */ char parentBase; /* contents of base in parent gene */ }; struct alignment /* alignment with context info */ { struct alignment *next; struct psl *psl; /* alignment of mrna for this species */ char *db; /* name of database of species */ - char *nibDir; /* directory containing nib files */ + struct twoBitFile *twoBitSeqFile; /* open two bit file */ char *mrnaPath; /* path to location of mrna sequence */ struct dnaSeq *tSeq; /* sequence in genome */ struct dnaSeq *qSeq; /* mrna sequence */ char **qSequence; /* query sequence for each block */ char **tSequence; /* target sequence for each block */ }; struct cachedFile /* File in cache. */ { struct cachedFile *next; /* next in list. */ char *name; /* File name (allocated here) */ FILE *f; /* Open file. */ }; -struct seqFilePos -/* Where a sequence is in a file. */ - { - struct filePos *next; /* Next in list. */ - char *name; /* Sequence name. Allocated in hash. */ - char *file; /* Sequence file name, allocated in hash. */ - long pos; /* Position in fa file/size of nib. */ - bool isNib; /* True if a nib file. */ - }; void usage() /* Print usage instructions and exit. */ { errAbort( "pslCDnaGenomeMatch - check if retroGene aligns better to parent or retroGene \n" "usage:\n" - " pslCDnaGenomeMatch input.psl chrom.sizes cdna.2bit nibDir output.psl\n" + " pslCDnaGenomeMatch input.psl chrom.sizes cdna.2bit genome.2bit output.psl\n" "input.psl contains input mRNA/EST alignment psl file SORTED by qName\n" "chrom.sizes is a list of chromosome followed by size\n" - "cdna.2bit contains fasta sequences of all mrnas/EST \n" - "directory containing nibs of genome\n" + "cdna.2bit contains fasta sequences of all mrnas/EST in 2bit format\n" + "genome.2bit contains fasta genome sequences in 2bit format\n" "output.psl contains filtered alignments for best matches and cases where no filtering occurred.\n\n" "Options: \n" " -score=output.tab is output containing mismatch info\n" " -computeSS compute sufficient statistics instead of filtering\n" " -passthru if best hit cannot be decided, then pass through all alignments\n" " -minDiff=N minimum difference in score to filter out 2nd best hit (default %d)\n" " -notAlignPenalty=N score non-aligning bases with 1/N (default %d)\n" " -bedOut=bed output file of mismatches.\n" " -species1=psl file with alignment of mrna/EST to other species.\n" " -species2=psl file with alignment of mrna/EST to other species.\n" " -snp=snp.tab.gz contains snps with or without bin field in snp125 format\n" " -nibdir1=sequence of species 1 \n" " -nibdir2=sequence of species 2 \n" " -mrna1=fasta file with sequence of mrna/EST used in alignment 1\n" " -mrna2=fasta file with sequence of mrna/EST used in alignment 2\n", @@ -178,103 +170,99 @@ void alignFree(struct alignment **pEl) /* Free a single dynamically allocated alignment */ { struct alignment *el; struct psl *p; struct dnaSeq *tSeq; struct dnaSeq *qSeq; if ((el = *pEl) == NULL) return; p = el->psl; pslFree(&p); tSeq = el->tSeq; qSeq = el->qSeq; freeDnaSeq(&tSeq); freeDnaSeq(&qSeq); -//freeMem(el->db); freez(pEl); } void alignFreeList(struct alignment **pList) /* Free a list of dynamically allocated alignment's */ { struct alignment *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; //pslFreeList(&el->psl); alignFree(&el); } *pList = NULL; } -struct dnaSeq *nibInfoLoadSeq(struct nibInfo *nib, int start, int size) -/* Load in a sequence in mixed case from nib file. */ -{ -return nibLdPartMasked(NIB_MASK_MIXED, nib->fileName, nib->f, nib->size, - start, size); -} - bool purine(char a) { if (toupper(a) == 'A' || toupper(a) == 'G') return(TRUE); else return(FALSE); } bool transversion(char a, char b) { if (purine(a) != purine(b)) return TRUE; else return FALSE; } bool transition(char a, char b) { if (!transversion(a,b)) return TRUE; else return FALSE; } -struct dnaSeq *nibInfoLoadStrand(struct nibInfo *nib, char *seqName, int start, int end, +struct dnaSeq *twoBitGetSeqForStrand(struct twoBitFile *tbf, char *seqName, int start, int end, char strand) -/* Load in a mixed case sequence from nib file, from reverse strand if +/* Load in a mixed case sequence from from a twoBit file, from reverse strand if * strand is '-'. */ { struct dnaSeq *seq; int size = end - start; +int seqSize = twoBitSeqSize(tbf, seqName); assert(size >= 0); + if (strand == '-') { verbose(6,"before start %d end %d size %d %s\n", - start, end, nib->size, seqName); - reverseIntRange(&start, &end, nib->size); + start, end, size, seqName); + reverseIntRange(&start, &end, seqSize); verbose(6,"after start %d end %d size %d %s\n", - start, end, nib->size, seqName); + start, end, size, seqName); assert(start >= 0); - seq = nibInfoLoadSeq(nib, start, size); + seq = twoBitReadSeqFrag(tbf, seqName, start, end); + //seq = nibInfoLoadSeq(nib, start, size); // seq = nibTwoCacheSeqPart(ntc, seqName, // start, size, &retFullSeqSize); reverseComplement(seq->dna, seq->size); } else { - seq = nibInfoLoadSeq(nib, start, size); + seq = twoBitReadSeqFrag(tbf, seqName, start, end); + //seq = nibInfoLoadSeq(nib, start, size); // seq = nibTwoCacheSeqPart(ntc, seqName, // start, size, &retFullSeqSize); } return seq; } void addLoci(struct loci **lociList, struct psl *psl) /* add a new loci from psl, if not already in list */ { struct loci *loci = NULL; bool found = FALSE; if (!found) { AllocVar(loci); @@ -948,69 +936,70 @@ for (i = 0 ; i < 256 ; i++) for (j = 0 ; j < 256 ; j++) { histogram[i][j] = 0; histoNorm[i][j] = 0.0; } } void computeSuffStats(struct misMatch **misMatchList, struct alignment *align, struct loci *lociList) /* compute sufficient statistics from each alignment */ { int blockIx = 0; //int i, j, sum = 0; struct psl *psl = align->psl; -struct nibInfo *tNib = nibInfoFromCache(nibHash, align->nibDir, psl->tName); +//struct nibInfo *tNib = nibInfoFromCache(nibHash, align->nibDir, psl->tName); //nibTwoCache static struct dnaSeq *tSeq = NULL; int tStart = psl->tStart; int tEnd = psl->tEnd; //int misMatchCount = 0; char genomeStrand = psl->strand[1] == '-' ? '-' : '+'; if (genomeStrand == '-') reverseIntRange(&tStart, &tEnd, psl->tSize); for (blockIx=0; blockIx < psl->blockCount; ++blockIx) /* for each alignment block get sequence for both strands */ { struct snp125 *snp = NULL, *snpList = NULL; - if (hashFindVal(twoBitFile->hash, psl->qName) == NULL) + if (hashFindVal(cdnaSeqFile->hash, psl->qName) == NULL) { printf("skipping %s not found \n",psl->qName); return; } - int size = twoBitSeqSize(twoBitFile, psl->qName); + int size = twoBitSeqSize(cdnaSeqFile, psl->qName); int i = 0; int ts = psl->tStarts[blockIx]; int te = psl->tStarts[blockIx]+(psl->blockSizes[blockIx]); int qe = psl->qStarts[blockIx]+(psl->blockSizes[blockIx]); struct dnaSeq *qSeq = NULL; if (qe > size) { printf("skipping %s %d > %d \n",psl->qName, qe, size ); return; } - qSeq = twoBitReadSeqFrag(twoBitFile, psl->qName, psl->qStarts[blockIx], + qSeq = twoBitReadSeqFrag(cdnaSeqFile, psl->qName, psl->qStarts[blockIx], psl->qStarts[blockIx]+(psl->blockSizes[blockIx])); if (genomeStrand == '-') reverseIntRange(&ts, &te, psl->tSize); verbose(5," xyz %s t %s:%d-%d q %d-%d %s strand %c\n", psl->qName, psl->tName, ts, te, psl->qStarts[blockIx], psl->qStarts[blockIx]+psl->blockSizes[blockIx], psl->strand, genomeStrand); - tSeq = nibInfoLoadStrand(tNib, psl->tName, psl->tStarts[blockIx], + tSeq = twoBitGetSeqForStrand(align->twoBitSeqFile, psl->tName, + psl->tStarts[blockIx], psl->tStarts[blockIx]+(psl->blockSizes[blockIx]), genomeStrand); verbose(6,"tSeq %s len %d %d\n",tSeq->dna, tSeq->size, (int)strlen(tSeq->dna)); verbose(6,"qSeq %s\n",qSeq->dna); assert(psl->strand[0] == '+'); /* count match and mismatches in each alignment block */ for (i = 0 ; i < tSeq->size; i++) { char t = toupper(tSeq->dna[i]); char q = toupper(qSeq->dna[i]); int genomeStart = ts+i; //int mrnaLoc = psl->qStarts[blockIx]+i; if (q == 'n' || q == 'N') @@ -1040,81 +1029,81 @@ verbose(4," [%d] snp match %s %s %s:%d %s %s offset %d gs %c ts %d genomic%c %c valid %s\n", getLoci(lociList, psl->tName, psl->tStart), snp->name, psl->qName, snp->chrom, snp->chromStart, snp->observed, snp->strand, offset, genomeStrand, ts, genomeStrand, t, snp->valid); } } /* add indel for portions of mrna that do not align at all */ // for (i = 0 ; i < psl->qStarts[0] ; i++) // { // char q = toupper(qSeq->dna[i]); // } freeDnaSeq(&tSeq); freeDnaSeq(&qSeq); } } -void addOtherAlignments( struct alignment *alignList, struct hash *speciesHash, char *name, char *nibDir, char *mrnaPath) +void addOtherAlignments( struct alignment *alignList, struct hash *speciesHash, char *name, struct twoBitFile *twoBitFile, char *mrnaPath) { /* add alignemnts from other species to the list */ struct hashEl *speciesList = NULL , *el = NULL; struct alignment *align = NULL; if (speciesHash != NULL) speciesList = hashLookup(speciesHash,name); for (el = speciesList ; el != NULL ; el = el->next) { struct psl *psl = el->val; AllocVar(align); align->psl = psl; - align->nibDir = nibDir; + align->twoBitSeqFile = twoBitFile; align->mrnaPath = mrnaPath; slAddHead(&alignList, align); } slReverse(&alignList); } void fillinMatches(struct misMatch **misMatchList, struct alignment *align, struct loci *lociList) /* for each mismatch , lookup bases in other loci */ { int blockIx = 0; static struct dnaSeq *tSeq = NULL; struct psl *psl = align->psl; - struct nibInfo *tNib = nibInfoFromCache(nibHash, align->nibDir, psl->tName); + //struct nibInfo *tNib = nibInfoFromCache(nibHash, align->nibDir, psl->tName); int tStart = psl->tStart; int tEnd = psl->tEnd; char genomeStrand = psl->strand[1] == '-' ? '-' : '+'; int index = getLoci(lociList, psl->tName, psl->tStart); if (genomeStrand == '-') reverseIntRange(&tStart, &tEnd, psl->tSize); for (blockIx=0; blockIx < psl->blockCount; ++blockIx) /* for each alignment block get sequence for both strands */ { - struct dnaSeq *qSeq = twoBitReadSeqFrag(twoBitFile, psl->qName, psl->qStarts[blockIx], + struct dnaSeq *qSeq = twoBitReadSeqFrag(cdnaSeqFile, psl->qName, psl->qStarts[blockIx], psl->qStarts[blockIx]+(psl->blockSizes[blockIx])); int ts = psl->tStarts[blockIx]; int te = psl->tStarts[blockIx]+(psl->blockSizes[blockIx]); int qs = psl->qStarts[blockIx]; int qe = psl->qStarts[blockIx]+(psl->blockSizes[blockIx]); struct misMatch *mm = NULL; if (genomeStrand == '-') reverseIntRange(&ts, &te, psl->tSize); - tSeq = nibInfoLoadStrand(tNib, psl->tName, psl->tStarts[blockIx], + tSeq = twoBitGetSeqForStrand(align->twoBitSeqFile, psl->tName, psl->tStarts[blockIx], psl->tStarts[blockIx]+(psl->blockSizes[blockIx]), genomeStrand); for (mm = *misMatchList ; mm != NULL ; mm = mm->next) { /* i = offset within the block */ int i = mm->mrnaLoc - qs; int genomeStart = ts+i; if (genomeStrand == '-') genomeStart = te-i-1; /* if genomic chr not filled in yet (only mismatches are filled in ), and we are within the block, lookup genomic loc and base*/ if (index == mm->loci) { if (sameString(mm->chrom , na) && mm->mrnaLoc >= qs && mm->mrnaLoc < qe && i >= 0) { @@ -1163,53 +1152,53 @@ verbose(4," snp %s %s %s:%d offset %d %s %s gs %c ts %d genomic%c valid %s\n", snp->name, psl->qName, snp->chrom, snp->chromStart, offset, snp->observed, snp->strand, genomeStrand, ts, genomeStrand, snp->valid); } */ freeDnaSeq(&tSeq); freeDnaSeq(&qSeq); } } void buildMisMatches(struct misMatch **misMatchList, struct alignment *align, struct loci *lociList) /* build list of mismatches for each alignment (loci) */ { int blockIx = 0; struct psl *psl = align->psl; -struct nibInfo *tNib = nibInfoFromCache(nibHash, align->nibDir, psl->tName); +//struct nibInfo *tNib = nibInfoFromCache(nibHash, align->nibDir, psl->tName); static struct dnaSeq *tSeq = NULL; int tStart = psl->tStart; int tEnd = psl->tEnd; int misMatchCount = 0; int transitionCount = 0; int transversionCount = 0; char genomeStrand = psl->strand[1] == '-' ? '-' : '+'; int matchCount = 0; for (blockIx=0; blockIx < psl->blockCount; ++blockIx) /* for each alignment block get sequence for both strands */ { struct snp125 *snp = NULL, *snpList = NULL; - struct dnaSeq *qSeq = twoBitReadSeqFrag(twoBitFile, psl->qName, psl->qStarts[blockIx], + struct dnaSeq *qSeq = twoBitReadSeqFrag(cdnaSeqFile, psl->qName, psl->qStarts[blockIx], psl->qStarts[blockIx]+(psl->blockSizes[blockIx])); int i = 0; int ts = psl->tStarts[blockIx]; int te = psl->tStarts[blockIx]+(psl->blockSizes[blockIx]); if (genomeStrand == '-') reverseIntRange(&ts, &te, psl->tSize); - tSeq = nibInfoLoadStrand(tNib, psl->tName, psl->tStarts[blockIx], + tSeq = twoBitGetSeqForStrand(align->twoBitSeqFile, psl->tName, psl->tStarts[blockIx], psl->tStarts[blockIx]+(psl->blockSizes[blockIx]), genomeStrand); verbose(5," buildMisMatches for blk %s t %s:%d-%d q %d-%d %s strand %c\n", psl->qName, psl->tName, ts, te, psl->qStarts[blockIx], psl->qStarts[blockIx]+psl->blockSizes[blockIx], psl->strand, genomeStrand); verbose(6,"tSeq %s len %d %d\n",tSeq->dna, tSeq->size, (int)strlen(tSeq->dna)); verbose(6,"qSeq %s\n",qSeq->dna); assert(psl->strand[0] == '+'); /* check for mismatches in exon (alignment block)*/ for (i = 0 ; i < tSeq->size; i++) { char t = toupper(tSeq->dna[i]); char q = toupper(qSeq->dna[i]); @@ -1377,31 +1366,31 @@ if (align->psl->strand[1] == '+' || align->psl->strand[1] == '-') align->psl->strand[1] = '\0'; pslTabOut(align->psl, outFile); outputCount++; } filterCount++;; } misMatchFreeList(&misMatchList); } else if (computeSS) { } lociFreeList(&lociList); } -void pslCDnaGenomeMatch(char *inName, char *tNibDir) +void pslCDnaGenomeMatch(char *inName) { struct psl *psl = NULL, *pslList = NULL ; struct alignment *subList = NULL; char lastName[512]; unsigned int i , j; pslList = readPslList(inName); /* rev compl alignments so mrna is always on + strand */ /* Note: this should flip the loci also */ safef(lastName,sizeof(lastName), "%s", pslList->qName); psl = pslList; while (psl != NULL ) { bool rm = FALSE; @@ -1415,31 +1404,31 @@ } if (differentString(lastName, psl->qName) && subList != NULL && lastName != NULL) { slSort(&subList, pslCmpMatch); verbose(2, "sort on match pslList %d subList %d last %s\n",slCount(pslList), slCount(subList), lastName); doOneMrna(lastName, subList); alignFreeList(&subList); verbose(5, "3 pslList %d\n",slCount(pslList)); safef(lastName, sizeof(lastName), "%s", psl->qName); } AllocVar(align); newPsl = psl; psl = psl->next; align->psl = newPsl; align->db = "db"; - align->nibDir = tNibDir; + align->twoBitSeqFile = genomeSeqFile; align->mrnaPath = NULL; rm = slRemoveEl(&pslList, newPsl); if (rm) { slAddHead(&subList, align); verbose(5, "add %s %s\n",align->psl->qName, align->psl->tName); } } verbose(5,"last doOneMrna\n"); slSort(&subList, pslCmpMatch); doOneMrna(lastName, subList); alignFreeList(&subList); //pslFreeList(&pslList); verbose(1,"Wrote %d alignments out of %d \n", outputCount, aliCount); verbose(1,"%d out of %d mRNAs passed strict criteria (pass thru not counted)\n", filterCount, mrnaCount); @@ -1480,63 +1469,65 @@ getHistoNorm('n','a')+ getHistoNorm('n','c')+ getHistoNorm('n','g')+ getHistoNorm('n','t') ); for (i = 0 ; i < 128 ; i++) for (j = 0 ; j < 128 ; j++) if (getHistogram(i,j) > 0) fprintf(outFile, "%c %c = %f \n",(char)i, (char)j, getHistoNorm(i,j)) ; } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); if (argc != 6) usage(); -if (nibHash == NULL) - nibHash = hashNew(0); +//if (nibHash == NULL) +// nibHash = hashNew(0); lociCounter = 0; fileCache = newDlList(); verbosity = optionInt("verbose", verbosity); verboseSetLogFile("stdout"); verboseSetLevel(verbosity); ss = axtScoreSchemeDefault(); -twoBitFile = twoBitOpen(argv[3]); +cdnaSeqFile = twoBitOpen(argv[3]); +genomeSeqFile = twoBitOpen(argv[4]); outFile = fopen(argv[5],"w"); initSS(); minDiff = optionInt("minDiff", MINDIFF); notAlignPenalty = optionInt("notAlignPenalty", NOTALIGNEDFACTOR); snpFile = optionVal("snp", NULL); if (snpFile != NULL) { verbose(1,"Reading snps from %s\n",snpFile); snpHash = readSnpToBinKeeper(argv[2],snpFile); } scoreOut = optionVal("score", NULL); if (scoreOut != NULL) scoreFile = fopen(scoreOut,"w"); bedOut = optionVal("bedOut", NULL); if (bedOut != NULL) bedFile = fopen(bedOut,"w"); species1 = optionVal("species1", NULL); if (species1 != NULL) species1Hash = readPslQnameToHash(species1); species2 = optionVal("species2", NULL); if (species2 != NULL) species2Hash = readPslQnameToHash(species2); nibdir1 = optionVal("nibdir1", NULL); nibdir2 = optionVal("nibdir2", NULL); mrna1 = optionVal("mrna1", NULL); mrna2 = optionVal("mrna2", NULL); passthru = optionExists("passthru"); verbose(1,"Reading alignments from %s\n",argv[1]); -pslCDnaGenomeMatch(argv[1], argv[4]); +pslCDnaGenomeMatch(argv[1]); fclose(outFile); if (bedOut != NULL) fclose(bedFile); if (scoreOut != NULL) fclose(scoreFile); freeDlList(&fileCache); -twoBitClose(&twoBitFile); +twoBitClose(&cdnaSeqFile); +twoBitClose(&genomeSeqFile); return(0); }