3ff74a4107b59f4d14c1e06025077dd24900888b
braney
  Wed May 1 11:15:37 2024 -0700
allow underbars in names

diff --git src/hg/utils/paSNP/pa.c src/hg/utils/paSNP/pa.c
index 9d38ae3..9cd3ba3 100644
--- src/hg/utils/paSNP/pa.c
+++ src/hg/utils/paSNP/pa.c
@@ -1,410 +1,410 @@
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "phyloTree.h"
 #include "axt.h"
 #include "math.h"
 #include "pa.h"
 
 struct  hashContent
 {
 char *seqBuffer;
 char *position;
 };
 
 
 #define MAX_POSITION_SIZE	100*1024
 #define SEQBUFFER_SIZE		30*1024
 
 struct slName *getOrderList(char *file)
 /* Read in the species list. */
 {
 struct lineFile *lf = lineFileOpen(file, TRUE);
 char *row[1];
 struct slName *nameList = NULL;
 
 while (lineFileRow(lf, row))
     {
     struct slName *name = newSlName(row[0]);
 
     slAddHead(&nameList, name);
     }
 
 slReverse(&nameList);
 lineFileClose(&lf);
 
 return nameList;
 }
 
 int fillSeqHash(struct slName *list, struct hash *hash)
 /* Put sequences into sequence hash. */
 {
 struct slName *name;
 int count = 0;
 
 for(name = list; name; name = name->next)
     {
     struct hashContent *hc;
 
     AllocVar(hc);
     char *buffer = needMem( SEQBUFFER_SIZE);
 
     hc->seqBuffer = buffer;
     hc->position = needMem(MAX_POSITION_SIZE);
 
     hashAdd(hash, name->name, hc);
     count++;
     }
 
 return count;
 }
 
 static void clearSpecies(struct hash *seqHash, int exonSize)
 /* Empty the sequence buffers. */
 {
 struct hashCookie cook = hashFirst(seqHash);
 struct hashEl *hel;
 
 while((hel = hashNext(&cook)) != NULL)
     {
     struct hashContent *hc = hel->val;
     char *seqBuffer = hc->seqBuffer;
 
     memset(seqBuffer, '-', exonSize);
     }
 }
 
 boolean isDashXorZ(char aa)
 /* Is this aa a dash, X, or Z? */
 {
 if ( (aa == '-') || (aa == 'X') || (aa == 'Z') )
     return TRUE;
 
 return FALSE;
 }
 
 
 int globalCount = 0;
 
 void analyzeAlign(struct alignDetail *detail, alignFunc aFunc, 
     columnFunc cfunc,  void *closure)
 /* Analyze this one alignment. */
 {
 aFunc(detail, cfunc, closure);
 }
 
 void parseFasta(struct lineFile *lf, int numSpecies, struct slName *list,
     struct hash *seqHash, alignFunc afunc, columnFunc cfunc, void *closure)
 /* Parse an AA fasta, calling the alignment and column functions where appropriate. */
 {
 char *words[5000];
 int wordsRead;
 boolean expectGreat = TRUE;
 char *seqBuffer = NULL;
 struct alignDetail detail;
 struct slName *name;
 
 memset(&detail, 0, sizeof(detail));
 detail.numSpecies = numSpecies;
 detail.seqBuffers = needMem(numSpecies * sizeof(struct seqBuffer));
 int ii = 0;
 
 for(name = list; name; name = name->next)
     {
     struct hashContent *hc = hashMustFindVal(seqHash, name->name);
 
     detail.seqBuffers[ii].position = hc->position;
     detail.seqBuffers[ii].buffer = hc->seqBuffer;
     detail.seqBuffers[ii].species = name->name;
     detail.seqBuffers[ii].lastTwo[0] = '-';
     detail.seqBuffers[ii].lastTwo[1] = '-';
     ii++;
     }
 
 while( (wordsRead = lineFileChopNext(lf, words, sizeof(words)/sizeof(char *)) ))
     {
     if (expectGreat)
 	{
 	if (*words[0] != '>')
 	    errAbort("expect '>' as first char on line %d",lf->lineIx);
 
 	char *pName = words[0];
 	char *exonCountStr = strrchr(pName, '_');
 	if (exonCountStr == NULL)
 	    errAbort("expected to find underbar on line %d",lf->lineIx);
 	int newExonCount = atoi(exonCountStr + 1);
 	if (newExonCount == 0)
 	    errAbort("bad exon count on line %d",lf->lineIx);
 	*exonCountStr = 0;
 	char *exonNumStr = strrchr(pName, '_');
 	if (exonNumStr == NULL)
 	    errAbort("expected to find underbar on line %d",lf->lineIx);
 	int newExonNum = atoi(exonNumStr + 1);
 	*exonNumStr = 0;
-	char *species = strrchr(pName, '_');
+	char *species = strchr(pName, '_');
 	*species++ = 0;
 	if (species == NULL)
 	    errAbort("expected to find species underbar on line %d",lf->lineIx);
 	int newExonSize = atoi(words[1]);
 
 	pName++;
 
 	if (newExonSize > SEQBUFFER_SIZE)
 	    errAbort("exon is too big (%d) for constant SEQBUFFER (%d)",
 		newExonSize, SEQBUFFER_SIZE);
 	
 	if (newExonSize <= 0)
 	    errAbort("expected size > 0 (2nd arg)  on line %d",lf->lineIx);
 
 
 	if ((detail.proName == NULL) || 
 	    !sameString(pName, detail.proName)||
 	    (detail.exonNum != newExonNum) ||
 	    (detail.exonSize != newExonSize))
 	    {
 	    if (detail.proName != NULL)
 		analyzeAlign(&detail, afunc, cfunc, closure);
 	    else
 		detail.exonSize = SEQBUFFER_SIZE; // initialize whole buffer
 
 	    struct seqBuffer *sb = detail.seqBuffers;
 	    struct seqBuffer *lastSb = &detail.seqBuffers[detail.numSpecies];
 
 	    for(; sb < lastSb; sb++)
 		{
 		if (detail.exonSize == 1)
 		    {
 		    sb->lastTwo[0] = '-';
 		    sb->lastTwo[1] = sb->buffer[detail.exonSize-1];
 		    }
 		else
 		    memcpy(sb->lastTwo, &sb->buffer[detail.exonSize-2], 2); 
 		}
 	    clearSpecies(seqHash, detail.exonSize);
 
 	    freez(&detail.proName);
 	    detail.proName = cloneString(pName);
 
 	    detail.exonNum = newExonNum;
 	    detail.exonCount = newExonCount;
 	    detail.exonSize = newExonSize;
 	    detail.startFrame = atoi(words[2]);
 	    detail.endFrame = atoi(words[3]);
 	    freez(&detail.geneName);
 	    if (wordsRead == 6)
 		detail.geneName = cloneString(words[5]);
 	    }
 
 	struct hashContent *hc = hashMustFindVal(seqHash, species);
 	seqBuffer = hc->seqBuffer;
 	if (strlen(words[4]) >  MAX_POSITION_SIZE)
 	    errAbort("overflowed position buffer have %d need %d",
 		MAX_POSITION_SIZE, (int)strlen(words[4]));
 
 	strcpy(hc->position, words[4]);
 
 	expectGreat = FALSE;
 	}
     else
 	{
 	expectGreat = TRUE;
 	if (wordsRead != 1)
 	    errAbort("expect only one word with sequence on line %d\n",lf->lineIx);
 	if (strlen(words[0]) != detail.exonSize)
 	    errAbort("expect exonSize to be %d on line %d\n",detail.exonSize,lf->lineIx);
 	memcpy(seqBuffer, words[0], detail.exonSize);
 	}
     }
 if (detail.proName != NULL)
     analyzeAlign(&detail, afunc, cfunc, closure);
 }
 
 
 void fullColumns( struct alignDetail *detail, columnFunc cfunc, void *closure)
 /* Call column function on columns that are fully populated. */
 {
 int ii;
 
 for(ii = 0; ii < detail->exonSize; ii++)
     {
     int jj;
 
     for(jj= 0; jj < detail->numSpecies; jj++)
 	{
 	struct seqBuffer *sb = &detail->seqBuffers[jj];
 
 	if (isDashXorZ( sb->buffer[ii]))
 	    break;
 	}
 
     if (jj == detail->numSpecies)
 	(*cfunc)(detail, ii, closure);
     }
 }
 
 void binColumns( struct alignDetail *detail, columnFunc cfunc, void *closure)
 /* Call column function on columns that have exactly two values. */
 {
 int ii;
 
 for(ii = 0; ii < detail->exonSize; ii++)
     {
     int jj;
     char firstVal = 0;
     char anotherVal = 0;
 
     for(jj= 0; jj < detail->numSpecies; jj++)
 	{
 	struct seqBuffer *sb = &detail->seqBuffers[jj];
 	char ch = sb->buffer[ii];
 
 	if (isDashXorZ( ch ))
 	    break;
 
 	if (firstVal == 0)
 	    {
 	    firstVal = ch;
 	    }
 	else if (firstVal != ch)
 	    {
 	    if (anotherVal == 0)
 		anotherVal = ch;
 	    else
 		{
 		if (anotherVal != ch)
 		    break;
 		}
 	    }
 	}
 
     if ((anotherVal != 0) && (jj == detail->numSpecies))
 	(*cfunc)(detail, ii, closure);
     }
 }
 
 void allColumns( struct alignDetail *detail, columnFunc cfunc, void *closure)
 /* Call column function on all columns. */
 {
 int ii;
 
 for(ii = 0; ii < detail->exonSize; ii++)
     (*cfunc)(detail, ii, closure);
 }
 
 void parseAli(char *orderFileName, char *fastaFile, 
     alignFunc afunc, columnFunc cfunc, void *closure)
 /* Parse a file of alignments. */
 {
 struct hash *seqHash = newHash(5);
 struct slName *list = getOrderList(orderFileName);
 int numSpecies = fillSeqHash(list, seqHash);
 struct lineFile *lf = lineFileOpen(fastaFile, TRUE);
 
 parseFasta(lf, numSpecies, list, seqHash, afunc, cfunc,  closure);
 slNameFreeList(&list);
 }
 
 struct hash *getOrderHash(char *file)
 /* Parse species order file into a hash. */
 {
 struct lineFile *lf = lineFileOpen(file, TRUE);
 char *row[1];
 int count = 0;
 struct hash *hash = newHash(0);
 while (lineFileRow(lf, row))
     {
     hashAddInt(hash, row[0], count);
     count++;
     }
 lineFileClose(&lf);
 return hash;
 }
 
 struct aaMap *allocAAMap()
 /* Allocate space for a amino acid counts. */
 {
 struct aaMap *ret;
 
 AllocVar(ret);
 
 ret->empty = needMem(sizeof(int) * 256);
 ret->map = needMem(sizeof(double) * 256);
 
 return ret;
 }
 
 struct aaMap *readAAMap(char *fileName)
 /* Read in a amino acid counts from a file. */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *words[1024];
 int wordsRead;
 int ii;
 struct aaMap *am;
 
 am = allocAAMap();
 
 for(ii=0; ii < 256; ii++)
     am->empty[ii] = TRUE;
 
 while( (wordsRead = lineFileChopNext(lf, words, sizeof(words)/sizeof(char *))) > 0)
     {
     if (wordsRead != 2)
 	errAbort("expect two words on line %d of %s\n", lf->lineIx, fileName);
 
     if (strlen(words[0]) > 1)
 	errAbort("expect first word to be one char on line %d of %s\n", lf->lineIx, fileName);
 
     int aa = *words[0];
     am->map[aa] = atof(words[1]);
     am->empty[aa] = FALSE;
     }
 lineFileClose(&lf);
 
 return am;
 }
 
 char *getPosString(char *oldPos, char strand, int resNum, int sfn, int efn)
 /* Generated a position sequence in UCSC standard format. */
 {
 static char buffer[10*1024];
 
 char *chrom = oldPos;
 char *startPos = strchr(oldPos,':');
 *startPos = 0;
 startPos++;
 
 int start = atoi(startPos);
 
 char *endPos = strchr(startPos,'-');
 endPos++;
 
 int end = atoi(endPos);
 resNum *= 3;
 if (strand == '+')
     {
     start += resNum;
     if (sfn == 1)
 	start--;
     if (sfn == 2)
 	start++;
     end = start + 2;
     }
 else
     {
     end -= resNum;
     if (sfn == 2)
 	end--;
     if (sfn == 1)
 	end++;
     start = end - 2;
     }
 
 
 safef(buffer, sizeof buffer, "%s:%d-%d", chrom, start, end);
 startPos--;
 *startPos = ':';
 
 return buffer;
 }