src/hg/affyTransciptome/nmerAlign.c e70152e44cc66cc599ff6b699eb8adc07f3e656a

e70152e44cc66cc599ff6b699eb8adc07f3e656a
kent
  Sat May 24 21:09:34 2014 -0700
Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment.
diff --git src/hg/affyTransciptome/nmerAlign.c src/hg/affyTransciptome/nmerAlign.c
index bb72d75..9daae50 100644
--- src/hg/affyTransciptome/nmerAlign.c
+++ src/hg/affyTransciptome/nmerAlign.c
@@ -1,269 +1,272 @@
+/* Copyright (C) 2011 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
+
 #include "common.h"
 #include "hash.h"
 #include "dnaseq.h"
 #include "nib.h"
 #include "linefile.h"
 #include "dnautil.h"
 #include "fa.h"
 #include "cheapcgi.h"
 
 
 struct genomePos 
 {
     struct genomePos *next;     /* Next in list. */
     short chrom;                /* Number for chromosome. */
     unsigned int position;      /* Position in chrom. Always a 25mer so don't need end. */
 };
 
 struct chromInfo
 {
     struct chromInfo *next;    /* Next in list. */
     char *chrom;               /* Chrom. */
     char *fileName;            /* Name of file. */
 };
 
 struct chromInfo *ciList = NULL;  /* List of information about the chromsomes, should be length 1 now. */
 struct hash *bigHash = NULL;      /* Hash of all the nmers. */
 double totalCount = 0;            /* Total count of all unique nmers. */
 double allCount = 0;              /* Count of all nmers examined. */
 struct dnaSeq *genomeSeq = NULL;  /* Genomic sequence, usually a chromsome subset. */
 int genomeOffset = 0;             /* Offset in chromosome of gneomSeq. */
 FILE *commonFile = NULL;          /* File to print common oligos to. */
 int nmerSize = 0;                 /* Size of nmers to align. */
 
 void usage() 
 {
 errAbort("Program to find perfect alignments for nmerSize mers in a portion of the genome.\n"
 	 "usage:\n\t"
 	 "nmerAlign chromStart chromEnd chromFile.nib query.fa outFile.tab");
 }
 
 void initChromList(int numChrom, char *chrom[]) 
 /* Load all of the chromosome names into a list so that they can be
    named by the index in the list and thus require only 1 byte each. */
 {
 int i;
 struct chromInfo *ci = NULL;
 char *tmpMark = NULL;
 for(i=0; i<numChrom; i++) 
     {
     AllocVar(ci);
     ci->chrom = cloneString(chrom[i]);
     chopSuffix(ci->chrom);
     tmpMark = strrchr(ci->chrom, '/'); /* look for the end of the path. */
     if(tmpMark != NULL)
 	ci->chrom = tmpMark+1;
     ci->fileName = cloneString(chrom[i]);
     slAddHead(&ciList, ci);
     }
 slReverse(&ciList);
 }
 
 char *chromFromIndex(int i) 
 /* Translate the index of a chromosome into a name .*/
 {
 struct chromInfo *ci = slElementFromIx(ciList, i);
 if(ci == NULL)
     errAbort("count25Mers::chromFromIndex() - Couldn't find index %d in a list that is only %d long.\n", i, slCount(ciList));
 return ci->chrom;
 }
 
 int indexFromChrom(char *chrom) 
 /* Get the index for a given chromosome by name.  */
 {
 struct chromInfo *ci = NULL;
 int count =0;
 for(ci = ciList; ci != NULL; ci = ci->next) 
     {
     if(sameString(ci->chrom, chrom))
 	return count;
     count++;
     }
 errAbort("count25Mers::indexFromChrom() - Couldn't find chrom: %s in list.\n", chrom);
 }
 
 boolean masked(struct dnaSeq *seq, unsigned int start, unsigned int end)
 /* Return true if there are lower case bases or N's in sequence. */
 {
 unsigned int i = 0;
 for(i=start; i<end; i++) 
     {
     if(seq->dna[i] == 'N' || seq->dna[i] == 'n' || islower(seq->dna[i]))
 	return TRUE;
     }
 return FALSE;
 }
 
 void addToHash(struct dnaSeq *seq, unsigned int start, unsigned int end, double *chromCount, int chromOffset)
 /* Check to see if oligo from start to end is in hash, if so add a position
    to list, otherwise add a new hash element. */
 {
 char *buff = NULL;
 struct genomePos *gp = NULL; 
 struct genomePos *gpNew = NULL;
 buff = needMem((end-start+1)*sizeof(char));
 
 //snprintf(buff, (end-start+1), "%s", seq->dna+start);
 strncpy(buff, seq->dna+start, end-start);
 gp = hashFindVal(bigHash, buff);
 
 AllocVar(gpNew);
 gpNew->chrom = indexFromChrom(seq->name);
 gpNew->position = start + chromOffset;
 allCount++;
 if(gp == NULL) 
     {
     hashAddUnique(bigHash, buff, gpNew);
     (*chromCount)++;
     totalCount++;
     }
 else 
     {
     slAddTail(&gp, gpNew);
     }
 freez(&buff);
 }
 
 void countChromNMers(struct dnaSeq *seq, double *chromCount, int chromOffset)
 /* Count how many unique in genome so far unmasked Nmers occur on this chromsome. */
 {
 unsigned int i = 0;
 unsigned limit = seq->size - nmerSize;
 int size = 1000000;
 warn("Looking for %d dots.", limit/size);
 for(i=0; i < limit; i++) 
     {
     if(i%size == 0)
 	{
 	fprintf(stderr, ".");
 	fflush(stderr);
 	}
     if(!masked(seq, i, i+nmerSize))
 	{
 	addToHash(seq, i, i+nmerSize, chromCount, chromOffset);
 	}
     }
 fprintf(stderr, "\n");
 fflush(stderr);
 }
 
 void changeChromName(struct dnaSeq *chrom)
 /* Convert something like /here/there/chr22.nib:1-1000 to chr22 */
 {
 char *tmp = NULL;
 tmp = strrchr(chrom->name, '.');
 if(tmp != NULL)
     *tmp = '\0';
 tmp = strrchr(chrom->name, '/');
 if(tmp != NULL)
     {
     tmp = tmp+1;
     chrom->name = tmp;
     }
 }
 
 void reportCommonTiles(void *val)
 /* If there are more than 50 places where a tile occurs call it common and report it. */
 {
 struct genomePos *gp = val;
 int size = slCount(gp);
 if(size > 20)
     {
     char *buff = needMem(26*sizeof(char));
     strncpy(buff, genomeSeq->dna + (gp->position - genomeOffset), nmerSize);
     fprintf(commonFile, "%s\t%d\n", buff, size);
     freez(&buff);
     }
 }
 
 void buildBigHash(char *chromName, int chromStart, int chromEnd, char *outFile)
 {
 struct dnaSeq *chrom = NULL;
 double chromCount = 0;
 int seqSize = 0;
 FILE *f;
 char buff[1024];
 bigHash = newHash(20);
 snprintf(buff, sizeof(buff), "%s.common", outFile);
 commonFile = mustOpen(buff, "w");
 warn("Reading chromosome %s", chromFromIndex(0));
 nibOpenVerify(chromName, &f, &seqSize);
 chrom = nibLdPartMasked(NIB_MASK_MIXED, chromName, f, seqSize, chromStart, (chromEnd-chromStart));
 fclose(f);
 warn("Counting");
 changeChromName(chrom);
 countChromNMers(chrom, &chromCount, chromStart);
 warn("%f %dmers in chrom %s", chromCount, nmerSize, chromFromIndex(0));
 
 /* Do some reporting about common tiles. */
 genomeOffset = chromStart;
 genomeSeq = chrom;
 hashTraverseVals(bigHash, reportCommonTiles);
 genomeOffset = 0;
 genomeSeq = NULL;
 
 /* Cleanup. */
 dnaSeqFree(&chrom);
 warn("%f unique %dmers in %f examined.", totalCount, nmerSize, allCount);
 carefulClose(&commonFile);
 }
 
 void writeAlign(char *name, DNA *seq,  struct genomePos *gpList, FILE *out, char *strand)
 {
 struct genomePos *gp = NULL;
 for(gp = gpList; gp != NULL; gp = gp->next)
     {
     fprintf(out, "%s\t%d\t%d\t%s\t%s\t%s\n", chromFromIndex(gp->chrom), gp->position, gp->position+strlen(seq), name, strand, seq);
     }
 }
 
 void alignNMers(char *fastaFile, char *outFile)
 {
 struct lineFile *fa = lineFileOpen(fastaFile, TRUE);
 FILE *out = mustOpen(outFile, "w");
 DNA *dna = NULL;
 int seqSize = 0;
 int aligned = 0;
 int numSeq = 0;
 char *seqName = NULL;
 dnaUtilOpen();
 while(faSpeedReadNext(fa, &dna, &seqSize, &seqName))
     {
     boolean found = FALSE;
     struct genomePos *gpList = NULL, *gp = NULL;
     numSeq++;
     toUpperN(dna, strlen(dna));
     reverseBytes(dna, seqSize);
     gpList = hashFindVal(bigHash, dna);
     if(gpList != NULL)
 	{
 	aligned++;
 	found = TRUE;
 	}
     writeAlign(seqName, dna, gpList, out, "+");
     reverseComplement(dna, seqSize);
 
     gpList = hashFindVal(bigHash, dna);
     if(gpList != NULL && !found) 
 	aligned++;
     writeAlign(seqName, dna, gpList, out, "-");
     }
 warn("%d of %d sequences aligned.", aligned, numSeq);
 lineFileClose(&fa);
 carefulClose(&out);
 }
 
 int main(int argc, char *argv[]) 
 {
 unsigned int chromStart = 0, chromEnd = 0;
 cgiSpoof(&argc, argv);
 nmerSize = cgiOptionalInt("nmerSize", 25);
 if(argc != 6) 
     usage();
 initChromList(1, argv+3); 
 chromStart = atoi(argv[1]);
 chromEnd = atoi(argv[2]);
 buildBigHash(argv[3], chromStart, chromEnd, argv[5]);
 alignNMers(argv[4], argv[5]);
 return 0;
 }