src/hg/txCds/txCdsEvFromProtein/txCdsEvFromProtein.c 4898794edd81be5285ea6e544acbedeaeb31bf78

4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/txCds/txCdsEvFromProtein/txCdsEvFromProtein.c src/hg/txCds/txCdsEvFromProtein/txCdsEvFromProtein.c
index 3ac5903..93fa3d2 100644
--- src/hg/txCds/txCdsEvFromProtein/txCdsEvFromProtein.c
+++ src/hg/txCds/txCdsEvFromProtein/txCdsEvFromProtein.c
@@ -1,449 +1,449 @@
 /* txCdsEvFromProtein - Convert transcript/protein alignments and other evidence into a transcript CDS evidence (tce) file. */
 
 /* Copyright (C) 2007 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dnautil.h"
 #include "dnaseq.h"
 #include "fa.h"
 #include "psl.h"
 #include "genbank.h"
 #include "rangeTree.h"
 
 /* Variables set from command line. */
 char *refStatusFile = NULL;
 char *uniStatusFile = NULL;
 FILE *fUnmapped = NULL;
 char *defaultSource = "blatUniprot";
 struct hash *pepToRefHash = NULL;
 int dodgeStop = 0;
 double minCoverage = 0.75;
 
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "txCdsEvFromProtein - Convert transcript/protein alignments and other evidence \n"
   "into a transcript CDS evidence (tce) file\n"
   "usage:\n"
   "   txCdsEvFromProtein protein.fa txProtein.psl tx.fa output.tce\n"
   "options:\n"
   "   -refStatus=refStatus.tab - include two column file with refSeq status\n"
   "         Selectively overrides source field of output\n"
   "   -uniStatus=uniStatus.tab - include two column file with uniProt status\n"
   "         Selectively overrides source field of output\n"
   "   -source=name - Name to put in tce source field. Default is \"%s\"\n"
   "   -unmapped=name - Put info about why stuff didn't map here\n"
   "   -exceptions=xxx.exceptions - Include file with info on selenocysteine\n"
   "            and other exceptions.  You get this file by running\n"
   "            files txCdsRaExceptions on ra files parsed out of genbank flat\n"
   "            files.\n"
   "   -refToPep=refToPep.tab - Put refSeq mrna to protein mapping file here\n"
   "            Usually used with exceptions flag when processing refSeq\n"
   "   -dodgeStop=N - Dodge (put gaps in place of) up to this many stop codons\n"
   "            Remark about it in unmapped file though. Also allows frame shifts\n"
   "            This *only* is applied to refPep/refSeq alignments\n"
   "   -minCoverage=0.N - minimum coverage of protein to accept, default %g\n"
   , defaultSource, minCoverage
   );
 }
 
 struct hash *selenocysteineHash = NULL;
 struct hash *altStartHash = NULL;
 
 static struct optionSpec options[] = {
    {"refStatus", OPTION_STRING},
    {"uniStatus", OPTION_STRING},
    {"source", OPTION_STRING},
    {"unmapped", OPTION_STRING},
    {"exceptions", OPTION_STRING},
    {"refToPep", OPTION_STRING},
    {"dodgeStop", OPTION_INT},
    {"minCoverage", OPTION_DOUBLE},
    {NULL, 0},
 };
 
 void unmappedPrint(char *format, ...)
 /* Print out info to unmapped file if it exists. */
 {
 if (fUnmapped != NULL)
     {
     va_list args;
     va_start(args, format);
     vfprintf(fUnmapped, format, args);
     va_end(args);
     }
 }
 
 struct hash *hashTwoColumnFileReverse(char *fileName)
 /* Return hash of strings with keys from second column
  * in file, values from first column. */
 {
 struct hash *hash = hashNew(19);
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *row[2];
 while (lineFileRow(lf, row))
     hashAdd(hash, row[1], cloneString(row[0]));
 lineFileClose(&lf);
 return hash;
 }
 
 struct hash *getSourceHash()
 /* Return hash of sources to override default source. */
 {
 struct hash *sourceHash = hashNew(18);
 struct hash *typeHash = hashNew(0);
 if (refStatusFile != NULL)
     {
     struct lineFile *lf = lineFileOpen(refStatusFile, TRUE);
     char *row[2];
     while (lineFileRow(lf, row))
         {
 	char typeBuf[128];
 	safef(typeBuf, sizeof(typeBuf), "RefPep%s", row[1]);
 	char *type = hashStoreName(typeHash, typeBuf);
 	hashAdd(sourceHash, row[0], type);
 	}
     lineFileClose(&lf);
     }
 if (uniStatusFile != NULL)
     {
     struct lineFile *lf = lineFileOpen(uniStatusFile, TRUE);
     char *row[2];
     while (lineFileRow(lf, row))
         {
 	char *source;
 	if (row[1][0] == '1')
 	    source = "swissProt";
 	else
 	    source = "trembl";
 	hashAdd(sourceHash, row[0], source);
 	}
     lineFileClose(&lf);
     }
 verbose(2, "%d sources from %d elements\n", typeHash->elCount, sourceHash->elCount);
 return sourceHash;
 }
 
 void removeNegativeGaps(struct psl *psl)
 /* It can happen that there will be negative sized gaps in a 
  * translated psl.  This gets rid of these.  It's easier here
  * because we assume the strand is ++. */
 {
 int i, lastBlock = psl->blockCount-1;
 for (i=0; i<lastBlock; ++i)
     {
     int qBlockSize = psl->blockSizes[i];
     int tBlockSize = qBlockSize*3;
     int iqStart = psl->qStarts[i] + qBlockSize;
     int itStart = psl->tStarts[i] + tBlockSize;
     int iqEnd = psl->qStarts[i+1];
     int itEnd = psl->tStarts[i+1];
     int qGap = iqEnd - iqStart;
     int tGap = itEnd - itStart;
     if (qGap < 0)
 	{
 	psl->blockSizes[i] += qGap;
 	verbose(3, "%s - correcting %d size qGap.\n", psl->qName, qGap);
 	}
     else if (tGap < 0)
         {
 	int gapSize = (-tGap + 2)/3;
 	psl->blockSizes[i] -= gapSize;
 	verbose(3, "%s - correcting %d size tGap.\n", psl->qName, tGap);
 	}
     }
 }
 
 int aliCountStopCodons(struct psl *psl, struct dnaSeq *seq,
 	boolean selenocysteine)
 /* Return count of stop codons in alignment. */
 {
 int blockIx;
 int stopCount = 0;
 for (blockIx=0; blockIx<psl->blockCount; ++blockIx)
     {
     int qBlockSize = psl->blockSizes[blockIx];
     int tStart = psl->tStarts[blockIx];
     DNA *dna = seq->dna + tStart;
     int i;
     for (i=0; i<qBlockSize; ++i)
         {
 	if (isReallyStopCodon(dna, selenocysteine))
 	    {
 	    ++stopCount;
 	    }
 	dna += 3;
 	}
     }
 return stopCount;
 }
 
 
 int totalAminoAcids(struct psl *psl)
 /* Count total number of amino acids in psl. */
 {
 int i = 0, blockCount = psl->blockCount;
 int total = 0;
 for (i=0; i<blockCount; ++i)
     total += psl->blockSizes[i];
 return total;
 }
 
 int countUntilNextStop(char *dna, int maxCount, boolean selenocysteine)
 /* Count up number of codons until next stop */
 {
 int count;
 for (count=0; count<maxCount; ++count)
     {
     if (isReallyStopCodon(dna, selenocysteine))
         break;
     dna += 3;
     }
 return count;
 }
 
 
 void outputDodgedBlocks(struct psl *psl, aaSeq *protSeq, struct dnaSeq *txSeq, 
 	boolean selenocysteine, FILE *f)
 /* Write out block count and the actual blocks to file.  What makes this
  * fun is that we are inserting gaps to avoid stop codone. */
 {
 /* Build up list of ranges corresponding to parts of blocks 
  * broken up by stops. */
 struct range *rangeList = NULL, *range;
 int blockIx;
 for (blockIx = 0; blockIx < psl->blockCount; ++blockIx)
     {
     int tStart = psl->tStarts[blockIx];
     int size = psl->blockSizes[blockIx]*3;
     char *dna = txSeq->dna + tStart;
     int pos, brokenSize;
     for (pos = 0; pos<size; pos += brokenSize+3)
         {
 	brokenSize = 3*countUntilNextStop(dna+pos, (size-pos)/3, selenocysteine);
 	verbose(3, "blockIx %d, size %d, pos %d, size-pos %d, brokenSize %d\n", 
 		blockIx, pos, size, size-pos, brokenSize);
 	if (brokenSize > 0)
 	    {
 	    AllocVar(range);
 	    range->start = tStart+pos;
 	    range->end = range->start + brokenSize;
 	    slAddHead(&rangeList, range);
 	    }
 	}
     }
 slReverse(&rangeList);
 
 /* Output to file */
 fprintf(f, "%d\t", slCount(rangeList));	
 for (range = rangeList; range != NULL; range = range->next)
     fprintf(f, "%d,", range->start);
 fprintf(f, "\t");
 for (range = rangeList; range != NULL; range = range->next)
     fprintf(f, "%d,", range->end - range->start);
 fprintf(f, "\n");
 
 /* Clean up and go home. */
 slFreeList(&rangeList);
 }
 
 void mapAndOutput(char *source, aaSeq *protSeq, struct psl *psl, 
 	struct dnaSeq *txSeq, FILE *f)
 /* Attempt to define CDS in txSeq by looking at alignment between
  * it and a protein. */
 {
 boolean selenocysteine = FALSE;
 boolean doDodge = FALSE;
 int lDodgeStop = 0;
 if (hashLookup(selenocysteineHash, psl->qName))
     selenocysteine = TRUE;
 if (pepToRefHash != NULL)
     {
     char *refAcc = hashFindVal(pepToRefHash, psl->qName);
     if (refAcc != NULL)
 	{
         if (hashLookup(selenocysteineHash, refAcc))
 	    selenocysteine = TRUE;
 	if (endsWith(psl->tName, refAcc))
 	    lDodgeStop = dodgeStop;
 	}
     }
 	    
 if (!sameString(psl->strand, "++"))
     {
     unmappedPrint("%s alignment with %s funny strand %s\n", psl->qName, psl->tName,
     	psl->strand);
     return;
     }
 removeNegativeGaps(psl);
 int stopCount = aliCountStopCodons(psl, txSeq, selenocysteine);
 if (stopCount > 0)
     {
     if (lDodgeStop)
 	{
 	if (stopCount > lDodgeStop)
 	    {
 	    unmappedPrint("%s alignment with %s has too many (%d) stop codons\n", 
 	    	psl->qName, psl->tName, stopCount);
 	    return;
 	    }
 	else
 	    {
 	    unmappedPrint("%s alignment with %s dodging %d stop codons\n", 
 	    	psl->qName, psl->tName, stopCount);
 	    doDodge = TRUE;
 	    }
 	}
     else
 	{
 	unmappedPrint("%s alignment with %s has stop codons\n", psl->qName, psl->tName);
 	return;
 	}
     }
 if (psl->blockCount > 1)
     {
     /* Make sure that all blocks are in same frame. */
     int frame = psl->tStarts[0]%3;
     int i;
     for (i=1; i<psl->blockCount; ++i)
         if (frame != psl->tStarts[i]%3)
 	    {
 	    if (lDodgeStop > 0)
 		{
 		unmappedPrint("%s alignment with %s allowing frame shift\n", 
 		    psl->qName, psl->tName, stopCount);
 		}
 	    else
 		{
 		unmappedPrint("%s alignment with %s in multiple different frames\n", 
 		    psl->qName, psl->tName);
 		return;
 		}
 	    }
 
     }
 int mappedStart = psl->tStart;
 int mappedEnd = psl->tEnd;
 char *s = txSeq->dna + mappedStart;
 char *e = txSeq->dna + mappedEnd;
 
 /* If next base is a stop codon, add it to alignment. */
 if (mappedEnd + 3 <= psl->tSize && isStopCodon(e))
     {
     mappedEnd += 3;
     psl->blockSizes[psl->blockCount-1] += 1;
     psl->tEnd += 3;
     psl->qEnd += 1;
     }
 
 verbose(3, "%s s %c%c%c e %c%c%c %d %d \n", psl->qName, s[0], s[1], s[2], e[0], e[1], e[2], mappedStart, mappedEnd);
 int milliBad = pslCalcMilliBad(psl, FALSE);
 int score = 1000 - 10*milliBad;
 if (score <= 0)
     {
     unmappedPrint("%s too divergent compared to %s (%d score (of 1000), %d milliBad)\n", 
     	psl->qName, psl->tName, score, milliBad);
     return;
     }
 int totalCovAa = totalAminoAcids(psl);
 double protCoverage = (double)totalCovAa/psl->qSize;
 if (protCoverage < minCoverage)
     {
     unmappedPrint("%s covers too little of %s (%d of %d amino acids)\n",
     	psl->qName, psl->tName, totalCovAa, psl->qSize);
     return;
     }
 score *= protCoverage;
 double mrnaInternalCoverage = 3.0*totalCovAa/(psl->tEnd - psl->tStart);
 score *= mrnaInternalCoverage*mrnaInternalCoverage;
 fprintf(f, "%s\t", psl->tName);
 fprintf(f, "%d\t", mappedStart);
 fprintf(f, "%d\t", mappedEnd);
 fprintf(f, "%s\t", source);
 fprintf(f, "%s\t", psl->qName);
 fprintf(f, "%d\t", score);
 fprintf(f, "%d\t", startsWith("atg", s));
 fprintf(f, "%d\t", isStopCodon(e));
 if (doDodge)
     {
     outputDodgedBlocks(psl, protSeq, txSeq, selenocysteine, f);
     }
 else
     {
     fprintf(f, "%d\t", psl->blockCount);	
     int i;
     for (i=0; i < psl->blockCount; ++i)
 	fprintf(f, "%d,", psl->tStarts[i]);
     fprintf(f, "\t");
     for (i=0; i < psl->blockCount; ++i)
 	fprintf(f, "%d,", psl->blockSizes[i]*3);
     fprintf(f, "\n");
     }
 }
 
 void makeExceptionHashes()
 /* Create hash that has accessions using selanocysteine in it
  * if using the exceptions option.  Otherwise the hash will be
  * empty. */
 {
 char *fileName = optionVal("exceptions", NULL);
 if (fileName != NULL)
     genbankExceptionsHash(fileName, &selenocysteineHash, &altStartHash);
 else
     selenocysteineHash = altStartHash = hashNew(4);
 }
 
 void txCdsEvFromProtein(char *protFa, char *txProtPsl, char *txFa, char *output)
 /* txCdsEvFromProtein - Convert transcript/protein alignments and other evidence 
  * into a transcript CDS evidence (tce) file. */
 {
 struct hash *protHash = faReadAllIntoHash(protFa, dnaUpper);
 struct hash *txHash = faReadAllIntoHash(txFa, dnaLower);
 struct lineFile *lf = pslFileOpen(txProtPsl);
 struct hash *sourceHash = getSourceHash();
 FILE *f = mustOpen(output, "w");
 struct psl *psl;
 
 while ((psl = pslNext(lf)) != NULL)
     {
     struct dnaSeq *txSeq = hashFindVal(txHash, psl->tName);
     if (txSeq == NULL)
         errAbort("%s in %s but not %s\n", psl->tName, txProtPsl, txFa);
     aaSeq *protSeq = hashFindVal(protHash, psl->qName);
     if (protSeq == NULL)
         errAbort("%s in %s but not %s\n", psl->qName, txProtPsl, protFa);
     char *source = hashFindVal(sourceHash, psl->qName);
     if (source == NULL)
 	source = defaultSource;
     mapAndOutput(source, protSeq, psl, txSeq, f);
     }
 
 carefulClose(&f);
 lineFileClose(&lf);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 5)
     usage();
 refStatusFile = optionVal("refStatus", refStatusFile);
 uniStatusFile = optionVal("uniStatus", uniStatusFile);
 defaultSource = optionVal("source", defaultSource);
 if (optionExists("unmapped"))
     fUnmapped = mustOpen(optionVal("unmapped", NULL), "w");
 if (optionExists("refToPep"))
     pepToRefHash = hashTwoColumnFileReverse(optionVal("refToPep", NULL));
 dodgeStop = optionInt("dodgeStop", dodgeStop);
 minCoverage = optionDouble("minCoverage", minCoverage);
 makeExceptionHashes();
 txCdsEvFromProtein(argv[1], argv[2], argv[3], argv[4]);
 carefulClose(&fUnmapped);
 return 0;
 }