src/hg/txCds/txCdsSuspect/txCdsSuspect.c 4898794edd81be5285ea6e544acbedeaeb31bf78

4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/txCds/txCdsSuspect/txCdsSuspect.c src/hg/txCds/txCdsSuspect/txCdsSuspect.c
index 73290ed..d94c713 100644
--- src/hg/txCds/txCdsSuspect/txCdsSuspect.c
+++ src/hg/txCds/txCdsSuspect/txCdsSuspect.c
@@ -1,383 +1,383 @@
 /* txCdsSuspect - Flag cases where the CDS prediction is very suspicious, 
  * including CDSs that lie entirely in an intron or in the 3' UTR of another,
  * better looking transcript. */
 
 /* Copyright (C) 2011 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "bed.h"
 #include "rangeTree.h"
 #include "txInfo.h"
 #include "txCluster.h"
 #include "txGraph.h"
 #include "cdsEvidence.h"
 
 
 
 /* Globals set from command line. */
 FILE *fNiceProt = NULL;
 int minAaOverlap = 10;
 int minCdsOverlap;	/* Set to 3x minAaOverlap. */
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "txCdsSuspect - Flag cases where the CDS prediction is very suspicious, including\n"
   "CDSs that lie entirely in an intron or in the 3' UTR of another, better looking\n"
   "transcript.\n"
   "usage:\n"
   "   txCdsSuspect in.bed in.txg in.cluster in.info out.suspect out.info\n"
   "options:\n"
   "   -niceProt=name - Put the 'nice' proteins and what cluster they're in here.\n"
   "   -minAaOverlap=N - minimum number of amino acids to overlap with to consider\n"
   "                     a CDS pick compatible with one of the primary proteins for\n"
   "                     the cluster. Default %d\n"
   , minAaOverlap
   );
 }
 
 static struct optionSpec options[] = {
    {"niceProt", OPTION_STRING},
    {NULL, 0},
 };
 
 int overlapInSameFrame(struct bed *a, struct bed *b)
 /* Return amount of overlap between coding regions (in same frame)
  * between two beds. */
 {
 int overlap = 0;
 
 /* Allocate range trees for each frame. */
 struct rbTree *frameTrees[3];
 int frame;
 for (frame = 0; frame<3; ++frame)
     frameTrees[frame] = rangeTreeNew();
 
 /* Fill in frame trees with coding exons of a. */
 int cdsPos = 0;
 int block, blockCount = a->blockCount;
 for (block = 0; block < blockCount; ++block)
     {
     int start = a->chromStart + a->chromStarts[block];
     int end = start + a->blockSizes[block];
     start = max(start, a->thickStart);
     end = min(end, a->thickEnd);
     if (start < end)
 	{
 	int size = end - start;
 	int frame = (start - cdsPos)%3;
 	rangeTreeAdd(frameTrees[frame], start, end);
 	cdsPos += size;
 	}
     }
 
 /* Add up overlaps by comparing bed b against frameTrees */
 cdsPos = 0;
 blockCount = b->blockCount;
 for (block = 0; block < blockCount; ++block)
     {
     int start = b->chromStarts[block] + b->chromStart;
     int end = start + b->blockSizes[block];
     start = max(start, b->thickStart);
     end = min(end, b->thickEnd);
     if (start < end)
 	{
 	int size = end - start;
 	int frame = (start - cdsPos)%3;
 	overlap += rangeTreeOverlapSize(frameTrees[frame], start, end);
 	cdsPos += size;
 	}
     }
 
 /* Clean up and go home. */
 for (frame = 0; frame<3; ++frame)
     rangeTreeFree(&frameTrees[frame]);
 return overlap;
 }
 
 boolean allInUtr3(struct bed *bed, int start, int end)
 /* Return TRUE if interval start-end lies entirely in final
  * exon's UTR region. */
 {
 int utrStart, utrEnd;
 if (bed->strand[0] == '+')
     {
     int ix = bed->blockCount-1;
     utrStart = bed->chromStart + bed->chromStarts[ix];
     utrEnd = utrStart + bed->blockSizes[ix];
     utrStart = max(utrStart, bed->thickEnd);
     }
 else
     {
     utrStart = bed->chromStart + bed->chromStarts[0];
     utrEnd = utrStart + bed->blockSizes[0];
     utrEnd = min(utrEnd, bed->thickStart);
     }
 return (start >= utrStart && end <= utrEnd);
 }
 
 boolean allInIntron(struct bed *bed, int start, int end)
 /* Return TRUE if region lies entirely in an intron. */
 {
 int lastBlock = bed->blockCount - 1;
 int i;
 for (i=0; i<lastBlock; ++i)
     {
     int iStart = bed->chromStart + bed->chromStarts[i] + bed->blockSizes[i];
     int iEnd = bed->chromStart + bed->chromStarts[i+1];
     if (start >= iStart && end <= iEnd)
         return TRUE;
     }
 return FALSE;
 }
 
 int countCdsExons(struct bed *bed)
 /* Count number of coding exons */
 {
 int count = 0;
 int i;
 for (i=0; i<bed->blockCount; ++i)
     {
     int start = bed->chromStart + bed->chromStarts[i];
     int end = start + bed->blockSizes[i];
     if (rangeIntersection(start, end, bed->thickStart, bed->thickEnd) > 0)
         ++count;
     }
 return count;
 }
 
 void flagProblems(struct bed *txBed, struct slRef *goodRefList, 
 	struct txInfo *info, FILE *f)
 /* Write out info on problems with CDS of txBed, updating info structure 
  * writing to file. */
 {
 if (txBed->thickStart >= txBed->thickEnd)
     return; /* We only care about tx with CDS. */
 if (refOnList(goodRefList, txBed))
     return; /* Don't flag problems in our good set. */
 if (countCdsExons(txBed) > 1)
     return; /* We're mostly about retained introns. */
 struct slRef *goodRef;
 boolean cdsInIntron = FALSE;
 boolean cdsInUtr = FALSE;
 boolean cdsGoodOverlap = FALSE;
 for (goodRef = goodRefList; goodRef != NULL; goodRef = goodRef->next)
     {
     struct bed *goodBed = goodRef->val;
     if (overlapInSameFrame(txBed, goodBed) >= minCdsOverlap)
         cdsGoodOverlap = TRUE;
     if (allInUtr3(goodBed, txBed->thickStart, txBed->thickEnd))
         cdsInUtr = TRUE;
     if (allInIntron(goodBed, txBed->thickStart, txBed->thickEnd))
         cdsInIntron = TRUE;
     }
 if (!cdsGoodOverlap)
     {
     if (cdsInIntron)
 	{
         fprintf(f, "%s\tcdsInIntron\n", txBed->name);
 	info->cdsSingleInIntron = TRUE;
 	}
     if (cdsInUtr)
 	{
         fprintf(f, "%s\tcdsInUtr\n", txBed->name);
 	info->cdsSingleInUtr3 = TRUE;
 	}
     }
 }
 
 void txCdsSuspect(char *inBed, char *inTxg, char *inCluster, char *inInfo, 
 	char *outSuspect, char *outInfo)
 /* txCdsSuspect - Flag cases where the CDS prediction is very suspicious, including 
  * CDSs that lie entirely in an intron or in the 3' UTR of another, better looking 
  * transcript. */
 {
 /* Read input bed into hash.  Also calculate number with CDS set. */
 struct hash *bedHash = hashNew(16);
 struct bed *bed, *bedList = bedLoadNAll(inBed, 12);
 int txWithCdsCount = 0;
 for (bed = bedList; bed != NULL; bed = bed->next)
     {
     if (bed->thickStart < bed->thickEnd)
         txWithCdsCount += 1;
     hashAdd(bedHash, bed->name, bed);
     }
 verbose(2, "Read %d beds from %s\n", bedHash->elCount, inBed);
 
 /* Read input transcript graphs into list, and into a hash
  * keyed by transcript names. */
 struct hash *graphHash = hashNew(16);
 struct txGraph *txg, *txgList = txGraphLoadAll(inTxg);
 for (txg = txgList; txg != NULL; txg = txg->next)
     {
     int i;
     for (i=0; i<txg->sourceCount; ++i)
         hashAdd(graphHash, txg->sources[i].accession, txg);
     }
 verbose(2, "Read %d graphs (%d transcripts) from %s\n", slCount(txgList),
 	graphHash->elCount, inTxg);
 
 /* Read input protein cluster into list, and into a hash
  * keyed by transcript name */
 struct hash *clusterHash = hashNew(16);
 struct txCluster *cluster, *clusterList = txClusterLoadAll(inCluster);
 for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
     {
     int i;
     for (i=0; i<cluster->txCount; ++i)
         hashAdd(clusterHash, cluster->txArray[i], cluster);
     }
 verbose(2, "Read %d protein clusters (%d transcripts) from  %s\n", 
 	slCount(clusterList), clusterHash->elCount, inCluster);
 
 /* Read in txInfo into a hash keyed by transcript name */
 struct hash *infoHash = hashNew(16);
 struct txInfo *info, *infoList = txInfoLoadAll(inInfo);
 for (info = infoList; info != NULL; info = info->next)
     hashAdd(infoHash, info->name, info);
 verbose(2, "Read info on %d transcripts from %s\n", infoHash->elCount, 
 	inInfo);
 
 /* Create a hash containing what looks to be the best protein-coding
  * transcript in each protein cluster.  This is keyed by cluster name
  * with a list of beds for values.  All refSeqs end up in this list.
  * If no refSeq in a cluster then the best by some heuristic ends up
  * there. */
 struct hash *bestInClusterHash = hashNew(16);
 for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
     {
     /* Figure out best scoring one in cluster. */
     double bestScore = -BIGNUM;
     char *bestTx = NULL;
     int i;
     for (i=0; i<cluster->txCount; ++i)
         {
 	char *tx = cluster->txArray[i];
 	info = hashMustFindVal(infoHash, tx);
 	double score = txInfoCodingScore(info, TRUE);
 	if (score > bestScore)
 	    {
 	    bestTx = tx;
 	    bestScore = score;
 	    }
 	}
 
     /* Make list of beds for cluster, starting with the best, and
      * adding in any refSeqs. */
     struct bed *bestBed = hashMustFindVal(bedHash, bestTx);
     struct bed *bedList = NULL;
     slAddHead(&bedList, bestBed);
     for (i=0; i<cluster->txCount; ++i)
         {
 	char *tx = cluster->txArray[i];
 	info = hashMustFindVal(infoHash, tx);
 	if (info->isRefSeq && bestTx != tx)
 	    {
 	    struct bed *bed = hashMustFindVal(bedHash, tx);
 	    slAddHead(&bedList, bed);
 	    }
 	}
     slReverse(&bedList);
 
     /* Add bed list to hash, and optionally save it to file. */
     hashAdd(bestInClusterHash, cluster->name, bedList);
     if (fNiceProt != NULL)
         {
 	struct bed *bed;
 	for (bed = bedList; bed != NULL; bed = bed->next)
 	    fprintf(fNiceProt, "%s\t%s\n", cluster->name, bed->name);
 	}
     }
 verbose(2, "Picked best protein for each protein cluster\n");
 
 /* Loop through each transcript cluster (graph).  Make a list of
  * protein clusters associated with that graph. Armed with this
  * information call flagging routine on each transcript in the graph. */
 FILE *f = mustOpen(outSuspect, "w");
 for (txg = txgList; txg != NULL; txg = txg->next)
     {
     /* Build up list of protein clusters associated with transcript cluster. */
     struct slRef *protClusterRefList = NULL, *protClusterRef;
     int i;
     for (i=0; i<txg->sourceCount; ++i)
 	{
 	char *tx = txg->sources[i].accession;
 	struct txCluster *protCluster = hashFindVal(clusterHash, tx);
 	if (protCluster != NULL)
 	    refAddUnique(&protClusterRefList, protCluster);
 	}
 
     /* Figure out best scoring protein in RNA cluster, and set threshold
      * to eliminate ones scoring less than half this much. */
     double bestProtScore = 0;
     for (protClusterRef = protClusterRefList; protClusterRef != NULL;
     	protClusterRef = protClusterRef->next)
 	{
 	struct txCluster *protCluster = protClusterRef->val;
 	struct bed *bedList = hashMustFindVal(bestInClusterHash, protCluster->name);
 	struct bed *bed;
 	for (bed = bedList; bed != NULL; bed = bed->next)
 	    {
 	    struct txInfo *info = hashMustFindVal(infoHash, bed->name);
 	    double score = txInfoCodingScore(info, FALSE);
 	    bestProtScore = max(score, bestProtScore);
 	    }
 	}
     double protScoreThreshold = bestProtScore * 0.5;
 
     /* Get list of references to beds of best proteins in protein clusters. */
     struct slRef *protRefList = NULL;
     for (protClusterRef = protClusterRefList; protClusterRef != NULL;
     	protClusterRef = protClusterRef->next)
 	{
 	struct txCluster *protCluster = protClusterRef->val;
 	struct bed *bedList = hashMustFindVal(bestInClusterHash, protCluster->name);
 	struct bed *bed;
 	for (bed = bedList; bed != NULL; bed = bed->next)
 	    {
 	    struct txInfo *info = hashMustFindVal(infoHash, bed->name);
 	    double score = txInfoCodingScore(info, TRUE);
 	    if (score >= protScoreThreshold)
 		refAdd(&protRefList, bed);
 	    }
 	}
 
     /* Go do flagging for each transcript in RNA cluster. */
     for (i=0; i<txg->sourceCount; ++i)
         {
 	char *tx = txg->sources[i].accession;
 	struct bed *bed = hashMustFindVal(bedHash, tx);
 	struct txInfo *info = hashMustFindVal(infoHash, bed->name);
 	flagProblems(bed, protRefList, info, f);
 	}
     slFreeList(&protClusterRefList);
     }
 carefulClose(&f);
 
 /* Write out updated info file. */
 f = mustOpen(outInfo, "w");
 for (info = infoList; info != NULL; info = info->next)
     txInfoTabOut(info, f);
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 7)
     usage();
 char *fileName = optionVal("niceProt", NULL);
 if (fileName != NULL)
     fNiceProt = mustOpen(fileName, "w");
 minAaOverlap = optionInt("minAaOverlap", minAaOverlap);
 minCdsOverlap = minAaOverlap*3;
 txCdsSuspect(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]);
 carefulClose(&fNiceProt);
 return 0;
 }