src/hg/txCds/txCdsToGene/txCdsToGene.c 4898794edd81be5285ea6e544acbedeaeb31bf78

4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/txCds/txCdsToGene/txCdsToGene.c src/hg/txCds/txCdsToGene/txCdsToGene.c
index 6fbc4ba..4aff71d 100644
--- src/hg/txCds/txCdsToGene/txCdsToGene.c
+++ src/hg/txCds/txCdsToGene/txCdsToGene.c
@@ -1,426 +1,426 @@
 /* txCdsToGene - Convert transcript bed and best cdsEvidence to genePred and 
  * protein sequence.. */
 
 /* Copyright (C) 2013 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "localmem.h"
 #include "dystring.h"
 #include "options.h"
 #include "fa.h"
 #include "bed.h"
 #include "dnautil.h"
 #include "genbank.h"
 #include "genePred.h"
 #include "rangeTree.h"
 #include "cdsEvidence.h"
 
 /* Variables set from command line. */
 FILE *fBed = NULL;
 FILE *fTweaked = NULL;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "txCdsToGene - Convert transcript bed and best cdsEvidence to genePred and \n"
   "protein sequence.\n"
   "usage:\n"
   "   txCdsToGene in.bed in.fa in.tce out.gtf out.fa\n"
   "Where:\n"
   "   in.bed describes the genome position of transcripts, often from txWalk\n"
   "   in.fa contains the transcript sequence\n"
   "   in.tce is the best cdsEvidence (from txCdsPick) for the transcripts\n"
   "          For noncoding transcripts it need not have a line\n"
   "   out.gtf is the output gene predictions\n"
   "   out.fa is the output protein predictions\n"
   "options:\n"
   "   -bedOut=output.bed - Save bed (with thickStart/thickEnd set)\n"
   "   -exceptions=xxx.exceptions - Include file with info on selenocysteine\n"
   "            and other exceptions.  You get this file by running\n"
   "            files txCdsRaExceptions on ra files parsed out of genbank flat\n"
   "            files.\n"
   "   -tweaked=output.tweaked - Save list of places we had to tweak beds to dodge\n"
   "            frame shifts, etc. here\n"
   );
 }
 
 static struct optionSpec options[] = {
    {"bedOut", OPTION_STRING},
    {"exceptions", OPTION_STRING},
    {"tweaked", OPTION_STRING},
    {NULL, 0},
 };
 
 struct hash *selenocysteineHash = NULL;
 struct hash *altStartHash = NULL;
 
 void makeExceptionHashes()
 /* Create hash that has accessions using selanocysteine in it
  * if using the exceptions option.  Otherwise the hash will be
  * empty. */
 {
 char *fileName = optionVal("exceptions", NULL);
 if (fileName != NULL)
     genbankExceptionsHash(fileName, &selenocysteineHash, &altStartHash);
 else
     selenocysteineHash = altStartHash = hashNew(4);
 }
 
 void flipExonList(struct range **pList, int regionSize)
 /* Flip exon list to other strand */
 {
 struct range *exon;
 for (exon = *pList; exon != NULL; exon = exon->next)
     reverseIntRange(&exon->start, &exon->end, regionSize);
 slReverse(pList);
 }
 
 struct range *bedToExonList(struct bed *bed, struct lm *lm)
 /* Convert bed to list of exons, allocated out of lm memory. 
  * These exons will be in coordinates relative to the bed start. */
 {
 struct range *exon, *exonList = NULL;
 int i;
 for (i=0; i<bed->blockCount; ++i)
     {
     lmAllocVar(lm, exon);
     exon->start = bed->chromStarts[i];
     exon->end = exon->start + bed->blockSizes[i];
     slAddHead(&exonList, exon);
     }
 slReverse(&exonList);
 return exonList;
 }
 
 struct bed *breakUpBedAtCdsBreaks(struct cdsEvidence *cds, struct bed *bed)
 /* Create a new broken-up that excludes part of gene between CDS breaks.  
  * Also jiggles cds->end coordinate to cope with the sequence we remove.
  * Deals with transcript to genome coordinate mapping including negative
  * strand.  Be afraid, be very afraid! */
 {
 /* Create range tree covering all breaks.  The coordinates here
  * are transcript coordinates.  While we're out it shrink outer CDS
  * since we are actually shrinking transcript. */
 struct rbTree *gapTree = rangeTreeNew();
 int bedSize = bed->chromEnd - bed->chromStart;
 struct lm *lm = gapTree->lm;	/* Convenient place to allocate memory. */
 int i, lastCds = cds->cdsCount-1;
 for (i=0; i<lastCds; ++i)
     {
     int gapStart = cds->cdsStarts[i] + cds->cdsSizes[i];
     int gapEnd = cds->cdsStarts[i+1];
     int gapSize = gapEnd - gapStart;
     cds->end -= gapSize;
     rangeTreeAdd(gapTree, gapStart, gapEnd);
     }
 
 /* Get list of exons in bed, flipped to reverse strand if need be. */
 struct range *exon, *exonList = bedToExonList(bed, lm);
 if (bed->strand[0] == '-')
     flipExonList(&exonList, bedSize);
 
 /* Go through exon list, mapping each exon to transcript
  * coordinates. Check if exon needs breaking up, and if
  * so do so, as we copy it to new list. */
 /* Copy exons to new list, breaking them up if need be. */
 struct range *newList = NULL, *nextExon, *newExon;
 int txStartPos = 0, txEndPos;
 for (exon = exonList; exon != NULL; exon = nextExon)
     {
     txEndPos = txStartPos + exon->end - exon->start;
     nextExon = exon->next;
     struct range *gapList = rangeTreeAllOverlapping(gapTree, txStartPos, txEndPos);
     if (gapList != NULL)
         {
 	verbose(3, "Splitting exon because of CDS gap\n");
 
 	/* Make up exons from current position up to next gap.  This is a little
 	 * complicated by possibly the gap starting before the exon. */
 	int exonStart = exon->start;
 	int txStart = txStartPos;
 	struct range *gap;
 	for (gap = gapList; gap != NULL; gap = gap->next)
 	    {
 	    int txEnd = gap->start;
 	    int gapSize = rangeIntersection(gap->start, gap->end, txStart, txEndPos);
 	    int exonSize = txEnd - txStart;
 	    if (exonSize > 0)
 		{
 		lmAllocVar(lm, newExon);
 		newExon->start = exonStart;
 		newExon->end = exonStart + exonSize;
 		slAddHead(&newList, newExon);
 		}
 	    else /* This case happens if gap starts before exon */
 	        {
 		exonSize = 0;
 		}
 
 	    /* Update current position in both transcript and genome space. */
 	    exonStart += exonSize + gapSize;
 	    txStart += exonSize + gapSize;
 	    }
 
 	/* Make up final exon from last gap to end, at least if we don't end in a gap. */
 	if (exonStart < exon->end)
 	    {
 	    lmAllocVar(lm, newExon);
 	    newExon->start = exonStart;
 	    newExon->end = exon->end;
 	    slAddHead(&newList, newExon);
 	    }
 	}
     else
         {
 	/* Easy case where we don't intersect any gaps. */
 	slAddHead(&newList, exon);
 	}
     txStartPos= txEndPos;
     }
 slReverse(&newList);
 
 /* Flip exons back to forward strand if need be */
 if (bed->strand[0] == '-')
     flipExonList(&newList, bedSize);
 
 /* Convert exons to bed12 */
 struct bed *newBed;
 AllocVar(newBed);
 newBed->chrom = cloneString(bed->chrom);
 newBed->chromStart = newList->start + bed->chromStart;
 newBed->chromEnd = newList->end + bed->chromStart;
 newBed->name  = cloneString(bed->name);
 newBed->score = bed->score;
 newBed->strand[0] = bed->strand[0];
 newBed->blockCount = slCount(newList);
 AllocArray(newBed->blockSizes,  newBed->blockCount);
 AllocArray(newBed->chromStarts,  newBed->blockCount);
 for (exon = newList, i=0; exon != NULL; exon = exon->next, i++)
     {
     newBed->chromStarts[i] = exon->start;
     newBed->blockSizes[i] = exon->end - exon->start;
     newBed->chromEnd = exon->end + bed->chromStart;
     }
 
 /* Clean up and go home. */
 rbTreeFree(&gapTree);
 return newBed;
 }
 
 void bedToGtf(struct bed *bed, char *exonSource, char *cdsSource, char *geneName, FILE *f)
 /* Write out bed as a section of a GTF file.
  * Parameters: 
  *      bed - a bed 12, ideally with thickStart/thickEnd set.
  *      exonSource - name to include as source for exons.  NULL is ok.
  *      cdsSource - name to include as source for CDS. NULL is ok.
  *      geneName - gene (as opposed to transcript name). NULL is ok.
  *      f - file to write to. */
 {
 /* Check input and supply defaults for any NULLs. */
 if (bed->blockCount == 0)
     errAbort("bed with no blocks passed to bedToGtf");
 if (exonSource == NULL)
     exonSource = ".";
 if (cdsSource == NULL)
     cdsSource = exonSource;
 if (geneName == NULL) 
     geneName = bed->name;
 
 /* Get bounds of CDS in relative coordinates. */
 int cdsStart = bed->thickStart - bed->chromStart;
 int cdsEnd = bed->thickEnd - bed->chromStart;
 boolean gotCds = (cdsStart != cdsEnd);
 
 /* Get exons in relative coordinates. */
 struct lm *lm = lmInit(0);
 struct range *exon, *exonList = bedToExonList(bed, lm);
 
 /* On minus strand flip relative coordinates. */
 int bedSize = bed->chromEnd - bed->chromStart;
 if (bed->strand[0] == '-')
     {
     flipExonList(&exonList, bedSize);
     reverseIntRange(&cdsStart, &cdsEnd, bedSize);
     }
 
 /* Loop though and output exons and coding regions. */
 int cdsPos = 0;	/* Track position within CDS */
 for (exon = exonList; exon != NULL; exon = exon->next)
     {
     int exonStart = exon->start;
     int exonEnd = exon->end;
     if (bed->strand[0] == '-')
         reverseIntRange(&exonStart, &exonEnd, bedSize);
     fprintf(f, "%s\t%s\texon\t%d\t%d\t.\t%s\t.\t",
     	bed->chrom, exonSource, exonStart + 1 + bed->chromStart, 
 	exonEnd + bed->chromStart, bed->strand);
     fprintf(f, "gene_id \"%s\"; transcript_id \"%s\";\n", geneName, bed->name);
     if (gotCds)
 	{
 	int exonCdsStart = max(exon->start, cdsStart);
 	int exonCdsEnd = min(exon->end, cdsEnd);
 	int exonCdsSize = exonCdsEnd - exonCdsStart;
 	if (exonCdsSize > 0)
 	    {
 	    int frame = cdsPos%3;
 	    int phase = (3-frame)%3;
 	    if (bed->strand[0] == '-')
 		reverseIntRange(&exonCdsStart, &exonCdsEnd, bedSize);
 	    fprintf(f, "%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\t", bed->chrom, cdsSource, 
 	    	exonCdsStart + 1 + bed->chromStart, 
 		exonCdsEnd + bed->chromStart, bed->strand, phase);
 	    fprintf(f, "gene_id \"%s\"; transcript_id \"%s\";\n", geneName, bed->name);
 	    cdsPos += exonCdsSize;
 	    }
 	}
     }
 lmCleanup(&lm);
 }
 
 boolean outputProtein(struct cdsEvidence *cds, struct dnaSeq *txSeq, FILE *f)
 /* Translate txSeq to protein guided by cds, and output to file.
  * The implementation is a little complicated by checking for internal
  * stop codons and other error conditions. Return True if a sequence was
  * generated and was not impacted by error conditions, False otherwise */
 {
 boolean selenocysteine = FALSE;
 if (selenocysteineHash != NULL)
     {
     if (hashLookup(selenocysteineHash, txSeq->name))
 	selenocysteine = TRUE;
     }
 struct dyString *dy = dyStringNew(4*1024);
 int blockIx;
 for (blockIx=0; blockIx<cds->cdsCount; ++blockIx)
     {
     DNA *dna = txSeq->dna + cds->cdsStarts[blockIx];
     int rnaSize = cds->cdsSizes[blockIx];
     if (rnaSize%3 != 0)
         {
 	warn("size of block (%d) #%d not multiple of 3 in %s (source %s %s)",
 		 rnaSize, blockIx, cds->name, cds->source, cds->accession);
 	return FALSE;
 	}
     int aaSize = rnaSize/3;
     int i;
     for (i=0; i<aaSize; ++i)
         {
 	AA aa = lookupCodon(dna);
 	if (aa == 0) 
 	    {
 	    aa = '*';
 	    if (selenocysteine)
 	        {
 		if (!isReallyStopCodon(dna, TRUE))
 		    aa = 'U';
 		}
 	    }
 	dyStringAppendC(dy, aa);
 	dna += 3;
 	}
     }
 int lastCharIx = dy->stringSize-1;
 if (dy->string[lastCharIx] == '*')
     {
     dy->string[lastCharIx] = 0;
     dy->stringSize = lastCharIx;
     }
 char *prematureStop = strchr(dy->string, '*');
 if (prematureStop != NULL)
     {
     warn("Stop codons in CDS at position %d for %s, (source %s %s)", 
 	 (int)(prematureStop - dy->string), cds->name, cds->source, cds->accession);
     return(FALSE);
     }
 else
     {
     faWriteNext(f, cds->name, dy->string, dy->stringSize);
     dyStringFree(&dy);
     return(TRUE);
     }
 }
 
 void txCdsToGene(char *txBed, char *txFa, char *txCds, char *outGtf, char *outFa)
 /* txCdsToGene - Convert transcript bed and best cdsEvidence to genePred and 
  * protein sequence. */
 {
 struct hash *txSeqHash = faReadAllIntoHash(txFa, dnaLower);
 verbose(2, "Read %d transcript sequences from %s\n", txSeqHash->elCount, txFa);
 struct hash *cdsHash = cdsEvidenceReadAllIntoHash(txCds);
 verbose(2, "Read %d cdsEvidence from %s\n", cdsHash->elCount, txCds);
 struct lineFile *lf = lineFileOpen(txBed, TRUE);
 FILE *fGtf = mustOpen(outGtf, "w");
 FILE *fFa = mustOpen(outFa, "w");
 char *row[12];
 while (lineFileRow(lf, row))
     {
     struct bed *bed = bedLoad12(row);
     verbose(2, "processing %s\n", bed->name);
     struct cdsEvidence *cds = hashFindVal(cdsHash, bed->name);
     struct dnaSeq *txSeq = hashFindVal(txSeqHash, bed->name);
     char *cdsSource = NULL;
     boolean freeOfCdsErrors = TRUE;
     if (txSeq == NULL)
         errAbort("%s is in %s but not %s", bed->name, txBed, txFa);
     if (cds != NULL)
 	{
         freeOfCdsErrors = outputProtein(cds, txSeq, fFa);
 	if (cds->cdsCount > 1)
 	    {
 	    struct bed *newBed = breakUpBedAtCdsBreaks(cds, bed);
 	    if (fTweaked)
 	        fprintf(fTweaked, "%s\n", newBed->name);
 	    bedFree(&bed);
 	    bed = newBed;
 	    }
 	cdsSource = cds->accession;
 	if (sameString(cds->accession, "."))
 	    cdsSource = cds->source;
 	}
 
     /* Set bed CDS bounds and optionally output bed. */
     cdsEvidenceSetBedThick(cds, bed, freeOfCdsErrors);
     if (fBed)
         bedTabOutN(bed, 12, fBed);
 
     /* Parse out bed name, which is in format chrom.geneId.txId.accession */
     char *geneName = cloneString(bed->name);
     char *accession = strrchr(geneName, '.');
     assert(accession != NULL);
     *accession++ = 0;
     chopSuffix(geneName);
 
     /* Output as GTF */
     bedToGtf(bed, accession, cdsSource, geneName, fGtf);
 
     /* Clean up for next iteration of loop. */
     freez(&geneName);
     bedFree(&bed);
     }
 lineFileClose(&lf);
 carefulClose(&fFa);
 carefulClose(&fGtf);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 dnaUtilOpen();
 if (argc != 6)
     usage();
 char *bedOut = optionVal("bedOut", NULL);
 makeExceptionHashes();
 if (bedOut != NULL)
     fBed = mustOpen(bedOut, "w");
 char *tweaked = optionVal("tweaked", NULL);
 if (tweaked != NULL)
     fTweaked = mustOpen(tweaked, "w");
 txCdsToGene(argv[1], argv[2], argv[3], argv[4], argv[5]);
 carefulClose(&fBed);
 carefulClose(&fTweaked);
 return 0;
 }