src/hg/txCds/txInfoAssemble/txInfoAssemble.c 4898794edd81be5285ea6e544acbedeaeb31bf78

4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/txCds/txInfoAssemble/txInfoAssemble.c src/hg/txCds/txInfoAssemble/txInfoAssemble.c
index 75e6627..439150c 100644
--- src/hg/txCds/txInfoAssemble/txInfoAssemble.c
+++ src/hg/txCds/txInfoAssemble/txInfoAssemble.c
@@ -1,551 +1,551 @@
 /* txInfoAssemble - Assemble information from various sources into txInfo table.. */
 
 /* Copyright (C) 2011 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "obscure.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "bed.h"
 #include "cdsEvidence.h"
 #include "binRange.h"
 #include "genbank.h"
 #include "psl.h"
 #include "txInfo.h"
 #include "txCommon.h"
 #include "rangeTree.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "txInfoAssemble - Assemble information from various sources into txInfo table.\n"
   "usage:\n"
   "   txInfoAssemble bestCds.bed best.tce predict.tce altSplice.bed txWalk.exceptions sizePolyA.tab all.psl flippedRna out.info\n"
   "options:\n"
   "   -xxx=XXX\n"
   );
 }
 
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 struct minChromSize
 /* Associate chromosome and size. */
     {
     char *name;        /* Chromosome name, Not alloced here */
     int minSize;                
     };
     
 struct hash *chromMinSizeHash(struct bed *bedList)
 /* Return hash full of lower bounds on chromosome sizes, taken
  * from chromEnds on bedList (which just needs to be bed 3).
  * Typically this is used when you want to make bin-keepers on
  * these chromosomes.  */
 {
 struct bed *bed;
 struct hash *sizeHash = hashNew(16);
 for (bed = bedList; bed != NULL; bed = bed->next)
     {
     struct minChromSize *chrom = hashFindVal(sizeHash, bed->chrom);
     if (chrom == NULL)
         {
         lmAllocVar(sizeHash->lm, chrom);
         hashAddSaveName(sizeHash, bed->chrom, chrom, &chrom->name);
         }
     chrom->minSize = max(chrom->minSize, bed->chromEnd);
     }
 return sizeHash;
 }
 
 struct hash *bedsIntoHashOfKeepers(struct bed *bedList)
 /* Return a hash full of binKeepers, keyed by chromosome (or contig)
  * that contains the bedList */
 {
 struct hash *sizeHash = chromMinSizeHash(bedList);
 struct hash *keeperHash = hashNew(16);
 struct bed *bed;
 for (bed = bedList; bed != NULL; bed = bed->next)
     {
     struct binKeeper *keeper = hashFindVal(keeperHash, bed->chrom);
     if (keeper == NULL)
         {
 	struct minChromSize *chrom = hashMustFindVal(sizeHash, bed->chrom);
 	keeper = binKeeperNew(0, chrom->minSize);
 	hashAdd(keeperHash, chrom->name, keeper);
 	}
     binKeeperAdd(keeper, bed->chromStart, bed->chromEnd, bed);
     }
 hashFree(&sizeHash);
 return keeperHash;
 }
 
 boolean isRfam(char *accession)
 /* isRfam - determine if the sequence comes from Rfam */
 {
 /* If the sequence comes from Rfam, its source accession should begin              
  * with the letters RF, and then should have a string of digits                    
  * followed by a semicolon (and subsequent characters).  No other                  
  * known accessions match this pattern. */
 /* Note: At this time (9/15/11), Rfam accessions include five digits after         
  * the RF.  But I wouldn't want to bet on that never changing... */
 int ii;
 if (strlen(accession) >= 3)
     {
     if (accession[0] == 'R' && accession[1] == 'F' && isdigit(accession[2]))
         {
         for (ii = 3; ii < strlen(accession); ii++)
             {
             if (!isdigit(accession[ii]) && accession[ii] != ';')
                 {
                 break;
                 }
             if (accession[ii] == ';')
                 {
                 return(TRUE);
                 }
             }
         }
 
     }
 return(FALSE);
 }
 
 
 
 
 boolean isNonsenseMediatedDecayTarget(struct bed *bed)
 /* Return TRUE if there's an intron more than 55 bases past
  * end of CDS. */
 {
 const int nmdMax = 55;
 if (bed->thickStart >= bed->thickEnd)
     return FALSE;		/* Not even coding! */
 int blockCount = bed->blockCount;
 if (blockCount == 1)
     return FALSE;		/* No introns at all. */
 if (bed->strand[0] == '+')
     {
     /* ----===  ====  ====----    NMD FALSE  */
     /* ----===  ===-  --------    NMD FALSE  */
     /* ----===  =---  --------    NMD TRUE  */
     /* ----===  == -  --------    NMD FALSE  */
     /* Step backwards looking for first real intron (> 3 bases) */
     int i;
     for (i=bed->blockCount-2; i>=0; --i)
         {
 	int gapStart = bed->chromStarts[i] + bed->blockSizes[i] + bed->chromStart;
 	int gapEnd = bed->chromStarts[i+1] + bed->chromStart;
 	int gapSize = gapEnd - gapStart;
 	if (gapSize > 16)
 	    {
 	    int nmdStart = bed->thickEnd;
 	    int nmdEnd = gapStart;
 	    if (nmdStart < nmdEnd)
 		{
 		int nmdBases = 0;
 		int j;
 		for (j=0; j<bed->blockCount; ++j)
 		    {
 		    int start = bed->chromStart + bed->chromStarts[j];
 		    int end = start + bed->blockSizes[j];
 		    nmdBases += positiveRangeIntersection(nmdStart, nmdEnd, start, end);
 		    }
 		return nmdBases >= nmdMax;
 		}
 	    }
 	}
     }
 else
     {
     /* Step forward looking for first gap big enough to be an intron*/
     int i;
     int lastBlock = bed->blockCount-1;
     for (i=0; i<lastBlock; ++i)
         {
 	int gapStart = bed->chromStarts[i] + bed->blockSizes[i] + bed->chromStart;
 	int gapEnd = bed->chromStarts[i+1] + bed->chromStart;
 	int gapSize = gapEnd - gapStart;
 	if (gapSize > 16)
 	    {
 	    /* -----===   ====    ====----    NMD FALSE  */
 	    /* -----  -===  === ====------    NMD FALSE  */
 	    /* -----  ---=  === ====------    NMD TRUE  */
 	    int nmdStart = gapEnd;
 	    int nmdEnd = bed->thickStart;
 	    if (nmdStart < nmdEnd)
 		{
 		int nmdBases = 0;
 		int j;
 		for (j=0; j<bed->blockCount; ++j)
 		    {
 		    int start = bed->chromStart + bed->chromStarts[j];
 		    int end = start + bed->blockSizes[j];
 		    nmdBases += positiveRangeIntersection(nmdStart, nmdEnd, start, end);
 		    }
 		return nmdBases >= nmdMax;
 		}
 	    }
 	}
     }
 return FALSE;
 }
 
 boolean hasRetainedIntron(struct bed *bed, struct hash *altSpliceHash)
 /* See if any exons in bed enclose any retained introns in keeper-hash */
 {
 struct binKeeper *keeper = hashFindVal(altSpliceHash, bed->chrom);
 boolean gotOne = FALSE;
 if (keeper == NULL)
      return FALSE;
 int i;
 for (i=0; i<bed->blockCount; ++i)
     {
     int start = bed->chromStarts[i] + bed->chromStart;
     int end = start + bed->blockSizes[i];
     struct binElement *bin, *binList = binKeeperFind(keeper, start, end);
     for (bin = binList; bin != NULL; bin = bin->next)
         {
 	struct bed *intron = bin->val;
 	if (sameString(intron->name, "retainedIntron"))
 	    {
 	    if (intron->strand[0] == bed->strand[0] 
 		    && start < intron->chromStart && end > intron->chromEnd)
 		{
 		gotOne = TRUE;
 		break;
 		}
 	    }
 	}
     slFreeList(&binList);
     if (gotOne)
         break;
     }
 return gotOne;
 }
 
 int countMatchingIntrons(struct bed *bed, char *type, struct hash *altSpliceHash)
 /* Count number of introns of a particular type . */
 {
 struct binKeeper *keeper = hashFindVal(altSpliceHash, bed->chrom);
 if (keeper == NULL)
      return 0;
 int total = 0;
 int i, lastBlock = bed->blockCount-1;
 for (i=0; i<lastBlock; ++i)
     {
     int start = bed->chromStarts[i] + bed->blockSizes[i] + bed->chromStart;
     int end = bed->chromStart + bed->chromStarts[i+1];
     struct binElement *bin, *binList = binKeeperFind(keeper, start, end);
     for (bin = binList; bin != NULL; bin = bin->next)
         {
 	struct bed *intron = bin->val;
 	if (sameString(intron->name, type))
 	    {
 	    if (intron->strand[0] == bed->strand[0] 
 		    && start == intron->chromStart && end == intron->chromEnd)
 		{
 		if (end - start > 3)
 		    {
 		    ++total;
 		    break;
 		    }
 		}
 	    }
 	}
     slFreeList(&binList);
     }
 return total;
 }
 
 int countStrangeSplices(struct bed *bed, struct hash *altSpliceHash)
 /* Count number of introns with strange splice sites. */
 {
 return countMatchingIntrons(bed, "strangeSplice", altSpliceHash);
 }
 
 int countAtacIntrons(struct bed *bed, struct hash *altSpliceHash)
 /* Count number of introns with at/ac splice sites. */
 {
 return countMatchingIntrons(bed, "atacIntron", altSpliceHash);
 }
 
 int addIntronBleed(struct bed *bed, struct hash *altSpliceHash)
 /* Return the number of bases at start or end that bleed into introns
  * of other, probably better, transcripts. */
 {
 struct binKeeper *keeper = hashFindVal(altSpliceHash, bed->chrom);
 if (keeper == NULL)
      return 0;
 int i;
 int total = 0;
 int lastBlock = bed->blockCount-1;
 if (lastBlock == 0)
     return 0;	/* Single exon case. */
 /* This funny loop just checks first and last block. */
 for (i=0; i<=lastBlock; i += lastBlock)
     {
     int start = bed->chromStarts[i] + bed->chromStart;
     int end = start + bed->blockSizes[i];
     struct binElement *bin, *binList = binKeeperFind(keeper, start, end);
     for (bin = binList; bin != NULL; bin = bin->next)
         {
 	struct bed *bleeder = bin->val;
 	if (sameString(bleeder->name, "bleedingExon"))
 	    {
 	    if (bleeder->strand[0] == bed->strand[0])
 		{
 		if (i == 0)  /* First block, want start to be same */
 		    {
 		    if (bleeder->chromStart == start && bleeder->chromEnd < end)
 		        total += bleeder->chromEnd - bleeder->chromStart;
 		    break;
 		    }
 		else if (i == lastBlock)
 		    {
 		    if (bleeder->chromEnd == end && bleeder->chromStart > start)
 		        total += bleeder->chromEnd - bleeder->chromStart;
 		    break;
 		    }
 		}
 	    }
 	}
     slFreeList(&binList);
     }
 return total;
 }
 
 int pslBedOverlap(struct psl *psl, struct bed *bed)
 /* Return number of bases psl and bed overlap at the block level */
 {
 /* No overlap if on wrong chromosome or wrong strand. */
 if (psl->strand[0] != bed->strand[0])
     return 0;
 if (!sameString(psl->tName, bed->chrom)) 
     return 0;
 
 /* Build up range tree covering bed */
 struct rbTree *rangeTree = rangeTreeNew();
 int i;
 for (i=0; i<bed->blockCount; ++i)
     {
     int start = bed->chromStart + bed->chromStarts[i];
     int end = start + bed->blockSizes[i];
     rangeTreeAdd(rangeTree, start, end);
     }
 
 /* Loop through psl accumulating total overlap. */
 int totalOverlap = 0;
 for (i=0; i<psl->blockCount; ++i)
     {
     int start = psl->tStarts[i];
     int end = start + psl->blockSizes[i];
     totalOverlap += rangeTreeOverlapSize(rangeTree, start, end);
     }
 
 /* Clean up and return result. */
 rangeTreeFree(&rangeTree);
 return totalOverlap;
 }
 
 void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile,
 	char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile, char *outFile)
 /* txInfoAssemble - Assemble information from various sources into txInfo table.. */
 {
 /* Build up hash of evidence keyed by transcript name. */
 struct hash *cdsEvHash = hashNew(18);
 struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
 for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
     hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
 verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);
 
 /* Build up hash of bestorf structures keyed by transcript name */
 struct hash *predictHash = hashNew(18);
 struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
 for (predict = predictList; predict != NULL; predict = predict->next)
      hashAddUnique(predictHash, predict->name, predict);
 verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);
 
 /* Build up structure for random access of retained introns */
 struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
 verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
 struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);
 
 /* Read in exception info. */
 struct hash *selenocysteineHash, *altStartHash;
 genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);
 
 /* Read in polyA sizes */
 struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
 verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);
 
 /* Read in psls */
 struct hash *pslHash = hashNew(20);
 struct psl *psl, *pslList = pslLoadAll(pslFile);
 for (psl = pslList; psl != NULL; psl = psl->next)
     hashAdd(pslHash, psl->qName, psl);
 verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);
 
 /* Read in accessions that we flipped for better splice sites. */
 struct hash *flipHash = hashWordsInFile(flipFile, 0);
 
 /* Open primary gene input and output. */
 struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
 FILE *f = mustOpen(outFile, "w");
 
 /* Main loop - process each gene */
 char *row[12];
 while (lineFileRow(lf, row))
     {
     struct bed *bed = bedLoad12(row);
     verbose(3, "Processing %s\n", bed->name);
 
     /* Initialize info to zero */
     struct txInfo info;
     ZeroVar(&info);
 
     /* Figure out name, sourceAcc, and isRefSeq from bed->name */
     info.name = bed->name;
     info.category = "n/a";
     if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
 	{
 	info.sourceAcc = cloneString(bed->name);
 	}
     else 
 	{
 	info.sourceAcc = txAccFromTempName(bed->name);
 	}
     info.isRefSeq = startsWith("NM_", info.sourceAcc);
 
     if (startsWith("antibody.", info.sourceAcc) 
 	|| startsWith("CCDS", info.sourceAcc) || isRfam(info.sourceAcc)
 	|| stringIn("tRNA", info.sourceAcc) != NULL)
         {
 	/* Fake up some things for antibody frag and CCDS that don't have alignments. */
 	info.sourceSize = bedTotalBlockSize(bed);
 	info.aliCoverage = 1.0;
 	info.aliIdRatio = 1.0;
 	info. genoMapCount = 1;
 	}
     else
 	{
 	/* Loop through all psl's associated with our RNA.  Figure out
 	 * our overlap with each, and pick best one. */
 	struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
 	if (firstPslHel == NULL)
 	    errAbort("%s is not in %s", info.sourceAcc, pslFile);
 	int mapCount = 0;
 	struct psl *psl, *bestPsl = NULL;
 	int coverage, bestCoverage = 0;
 	boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
 	for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
 	    {
 	    psl = hel->val;
 	    mapCount += 1;
 	    coverage = pslBedOverlap(psl, bed);
 	    if (coverage > bestCoverage)
 		{
 		bestCoverage = coverage;
 		bestPsl = psl;
 		}
 	    /* If we flipped it, try it on the opposite strand too. */
 	    if (isFlipped)
 		{
 		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
 		coverage = pslBedOverlap(psl, bed);
 		if (coverage > bestCoverage)
 		    {
 		    bestCoverage = coverage;
 		    bestPsl = psl;
 		    }
 		psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
 		}
 	    }
 	if (bestPsl == NULL)
 	    errAbort("%s has no overlapping alignments with %s in %s", 
 		    bed->name, info.sourceAcc, pslFile);
 
 	/* Figure out and save alignment statistics. */
 	int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
 	info.sourceSize = bestPsl->qSize - polyA;
 	info.aliCoverage = (double)bestCoverage / info.sourceSize;
 	info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch)/
 			    (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
 	info. genoMapCount = mapCount;
 	}
 
 
     /* Get orf size and start/end complete from cdsEv. */
     if (bed->thickStart < bed->thickEnd)
 	{
 	cdsEv = hashFindVal(cdsEvHash, bed->name);
 	if (cdsEv != NULL)
 	    {
 	    info.orfSize = cdsEv->end - cdsEv->start;
 	    info.startComplete = cdsEv->startComplete;
 	    info.endComplete = cdsEv->endComplete;
 	    }
 	}
 
     /* Get score from prediction. */
     predict = hashFindVal(predictHash, bed->name);
     if (predict != NULL)
         info.cdsScore = predict->score;
 
     /* Figure out nonsense-mediated-decay from bed itself. */
     info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);
 
     /* Figure out if retained intron from bed and alt-splice keeper hash */
     info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
     info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
     info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
     info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);
 
     /* Look up selenocysteine info. */
     info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);
 
     /* Loop through bed looking for small gaps indicative of frame shift/stop */
     int i, lastBlock = bed->blockCount-1;
     int exonCount = 1;
     for (i=0; i < lastBlock; ++i)
         {
 	int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
 	int gapEnd = bed->chromStarts[i+1];
 	int gapSize = gapEnd - gapStart;
 	switch (gapSize)
 	    {
 	    case 1:
 	    case 2:
 	        info.genomicFrameShift = TRUE;
 		break;
 	    case 3:
 	        info.genomicStop = TRUE;
 		break;
 	    default:
 	        exonCount += 1;
 		break;
 	    }
 	}
     info.exonCount = exonCount;
 
     /* Write info, free bed. */
     txInfoTabOut(&info, f);
     bedFree(&bed);
     }
 
 /* Clean up and go home. */
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 10)
     usage();
 txInfoAssemble(argv[1], argv[2], argv[3], argv[4], argv[5], 
 	argv[6], argv[7], argv[8], argv[9]);
 return 0;
 }