src/hg/txCds/txCdsEvFromBed/txCdsEvFromBed.c 4898794edd81be5285ea6e544acbedeaeb31bf78

4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/txCds/txCdsEvFromBed/txCdsEvFromBed.c src/hg/txCds/txCdsEvFromBed/txCdsEvFromBed.c
index 3a9e457..ee73c06 100644
--- src/hg/txCds/txCdsEvFromBed/txCdsEvFromBed.c
+++ src/hg/txCds/txCdsEvFromBed/txCdsEvFromBed.c
@@ -1,401 +1,401 @@
 /* txCdsEvFromBed - Make a cds evidence file (.tce) from an existing bed file.  
  * Used mostly in transferring CCDS coding regions currently.. */
 
 /* Copyright (C) 2011 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dnautil.h"
 #include "dystring.h"
 #include "binRange.h"
 #include "bed.h"
 #include "txRnaAccs.h"
 #include "twoBit.h"
 
 struct hash *compatibleTxPerCds = NULL;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "txCdsEvFromBed - Make a cds evidence file (.tce) from an existing bed file.  Used mostly \n"
   "in transferring CCDS coding regions currently.\n"
   "usage:\n"
   "   txCdsEvFromBed cds.bed type tx.bed genome.2bit output.tce\n"
   "where:\n"
   "   cds.bed is a bed12 format file with thickStart/thickEnd defining coding region\n"
   "   type is the name that will appear in the type column of output.tce\n"
   "   tx.bed is a bed12 format file containing the transcripts we're annotating\n"
   "   genome.2bit is a file containing DNA sequence for the associated genome\n"
   "   output.tce is the tab-delimited output in the same format used by txCdsEvFromRna\n"
   "      txCdsEvFromBed, txCdsPick and txCdsToGene.  It mostly defines the CDS in terms of\n"
   "      the virtual transcript\n"
   "example:\n"
   "    txCdsEvFromBed ccds.bed ccds txWalk.bed hg18.2bit ccds.tce\n"
   "options:\n"
   "   -xxx=XXX\n"
   );
 }
 
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 struct cdsTxPair
 /* A pair of beds - one a coding region, the other an enclosing transcript. */
     {
     struct cdsTxPair *next;	/* Next in list. */
     struct bed *cds;		/* Coding region. */
     struct bed *tx;		/* Transcript. */
     struct usedBed *usedCds;	/* CDS with a flag. */
     struct usedBed *usedTx;	/* Transcript with a flag. */
     };
 
 struct usedBed
 /* Keeps track of whether a bed is used */
     {
     struct usedBed *next;
     struct bed *bed;	/* Pointer to underlying bed. */
     struct slRef *pairList;	/* List of pairs using this bed. */
     boolean used;	/* True if BED is used. */
     };
 
 struct cdsTxCluster
 /* A cluster of pairs - a pairs that share a tx or a cds. */
     {
     struct cdsTxCluster *next;	/* Next in list. */
     struct usedBed *txList;     /* List of transcripts used. */
     struct usedBed *cdsList;	/* List of cds's used. */
     struct slRef *pairList;	/* List of pairs. */
     };
 
 int cmpCompatibleTx(const void *va, const void *vb)
 /* Sort so that the CDS with the fewest compatible transcripts will be first */
 {
 const struct usedBed *a = *((struct usedBed **)va);
 const struct usedBed *b = *((struct usedBed **)vb);
 int txCountA = hashIntVal(compatibleTxPerCds, a->bed->name);
 int txCountB = hashIntVal(compatibleTxPerCds, b->bed->name);
 return (txCountA - txCountB);
 }
 
 void outputCdsEv(struct bed *tx, struct bed *cds, char *tceType, FILE *f)
 /* Output CDS for transcript. */
 {
 int txSize = bedTotalBlockSize(tx);
 int cdsStartInRna = bedBlockSizeInRange(tx, tx->chromStart, cds->chromStart);
 int cdsEndInRna = bedBlockSizeInRange(tx, tx->chromStart, cds->chromEnd);
 if (tx->strand[0] == '-')
     reverseIntRange(&cdsStartInRna, &cdsEndInRna, txSize);
 fprintf(f, "%s\t", tx->name);
 fprintf(f, "%d\t", cdsStartInRna);
 fprintf(f, "%d\t", cdsEndInRna);
 fprintf(f, "%s\t", tceType);
 fprintf(f, "%s\t", cds->name);
 fprintf(f, "1000\t");	// score
 fprintf(f, "1\t");	// starts with start codon
 fprintf(f, "1\t");	// ends with stop codon
 fprintf(f, "1\t");	// block count
 fprintf(f, "%d,\t", cdsStartInRna);
 fprintf(f, "%d,\n", cdsEndInRna - cdsStartInRna);
 }
 
 struct dyString *dnaOfIntersection(struct bed *bed, int rangeStart, int rangeEnd, 
 	struct twoBitFile *tbf)
 /* Fetch DNA of part of (blocked) bed that is between rangeStart/rangeEnd. */
 {
 int i;
 struct dyString *dy = dyStringNew(0);
 for (i=0; i<bed->blockCount; ++i)
     {
     int blockStart = bed->chromStart + bed->chromStarts[i];
     int blockEnd = blockStart + bed->blockSizes[i];
     if (blockStart < rangeStart) blockStart = rangeStart;
     if (blockEnd > rangeEnd) blockEnd = rangeEnd;
     if (blockStart < blockEnd)
         {
 	struct dnaSeq *seq = twoBitReadSeqFragLower(tbf, bed->chrom, blockStart, blockEnd);
 	dyStringAppendN(dy, seq->dna, seq->size);
 	dnaSeqFree(&seq);
 	}
     }
 return dy;
 }
 
 boolean upstreamStartInString(struct dyString *utr5)
 /* Give 5' UTR sequence return TRUE if there is a start codon in frame before any
  * start codons, starting from end. */
 {
 int codonStart;
 for (codonStart = utr5->stringSize - 3; codonStart >= 0; codonStart -= 3)
     {
     DNA *codon = utr5->string + codonStart;
     if (startsWith("atg", codon))
         return TRUE;
     if (isStopCodon(codon))
         return FALSE;
     }
 return FALSE;
 }
 
 boolean upstreamStartToCds(struct bed *cds, struct bed *tx, struct twoBitFile *tbf)
 /* Return TRUE if there is a start codon in tx that precedes cds. This does allow
  * upstream ORFs though - where there is a stop codon and then a start codon. */
 {
 struct dyString *dna = NULL;
 if (tx->strand[0] == '-')
    {
    dna = dnaOfIntersection(tx, cds->chromEnd, tx->chromEnd, tbf);
    reverseComplement(dna->string, dna->stringSize);
    }
 else
    {
    dna = dnaOfIntersection(tx, tx->chromStart, cds->chromStart, tbf);
    }
 boolean gotStart = upstreamStartInString(dna);
 dyStringFree(&dna);
 return gotStart;
 }
 
 int scoreCdsMatch(struct bed *cds, struct bed *tx, struct twoBitFile *tbf)
 /* Score based on how well tx handles cds as a subset.  This is an ad-hoc score with
  * the following properties:
  *    Large minus score for cds not being a proper subset of tx
  *    Moderate minus score for having an upstream start codon
  *    Slight minus score for having bases in UTR. */
 {
 if (!bedCompatibleExtension(cds, tx))
     return -1000000;
 if (upstreamStartToCds(cds, tx, tbf))
     return -100000;
 return bedTotalBlockSize(cds) - bedTotalBlockSize(tx);
 }
 
 struct usedBed *findBestTxForCds(struct usedBed *cdsUsed, struct usedBed *txList,
     struct twoBitFile *tbf, boolean useTarget)
 /* Return transcript on list that is best match to cds.  This uses score 
  * function defined above. */
 {
 struct bed *cds = cdsUsed->bed;
 struct usedBed *bestTx = NULL;
 int bestScore = -BIGNUM;
 struct usedBed *txUsed;
 for (txUsed = txList; txUsed != NULL; txUsed = txUsed->next)
     {
     if (txUsed->used == useTarget)
         {
 	int score = scoreCdsMatch(cds, txUsed->bed, tbf);
 	if (score > bestScore)
 	    {
 	    bestScore = score;
 	    bestTx = txUsed;
 	    }
 	}
     }
 return bestTx;
 }
 
 void txCdsEvOnCluster(struct cdsTxCluster *cluster, struct twoBitFile *tbf, char *tceType, FILE *f)
 /* Given a cluster of transcripts and cds's that are compatible, try to make sure
  * there is one transcript for each cds, and that each cds has a unique transcript,
  * and that where possible there are no upstream start codons for a CDS. */
 {
 verbose(3, "Cluster %d %d %d:\n", slCount(cluster->pairList), 
 	slCount(cluster->txList), slCount(cluster->cdsList));
 if (cluster->txList->next == NULL && cluster->cdsList->next == NULL)
     {
     /* Special case for clusters with only one CDS and one tx. */
     outputCdsEv(cluster->txList->bed, cluster->cdsList->bed, tceType, f);
     }
 else
     {
     /* Clear used flags for txList. */
     struct usedBed *tx;
     for (tx = cluster->txList; tx != NULL; tx = tx->next)
 	tx->used = FALSE;
 
     /* Sort cdsList so that the ones with fewest compatible transcripts will be first, 
      * and then find best (unused) transcript for each cds. */
     slSort(&cluster->cdsList, cmpCompatibleTx);
     struct usedBed *cds;
     for (cds = cluster->cdsList;  cds != NULL; cds = cds->next)
 	{
 	tx = findBestTxForCds(cds, cluster->txList, tbf, FALSE);
 	if (tx == NULL)
 	    {
 	    tx = findBestTxForCds(cds, cluster->txList, tbf, TRUE);
 	    assert(tx != NULL);
 	    warn("Couldn't find unused transcript for %s, reusing %s",
 		    cds->bed->name, tx->bed->name);
 	    }
 	tx->used = TRUE;
 	outputCdsEv(tx->bed, cds->bed, tceType, f);
 	}
     }
 }
 
 struct usedBed *addToUsedBedHash(struct hash *hash, struct bed *bed, struct cdsTxPair *pair)
 /* If bed name is new, create a usedBed for it and add it to hash and list. */
 {
 struct hashEl *hel;
 if ((hel = hashLookup(hash, bed->name)) == NULL)
     {
     struct usedBed *ub;
     AllocVar(ub);
     ub->bed = bed;
     hel = hashAdd(hash, bed->name, ub);
     }
 struct usedBed *ub = hel->val;
 refAdd(&ub->pairList, pair);
 return ub;
 }
 
 void rAddPair(struct cdsTxCluster *cluster, struct cdsTxPair *pair)
 /* Add pair to cluster.   Also add any pairs connected to this pair to cluster. */
 {
 struct usedBed *cds = pair->usedCds, *tx = pair->usedTx;
 if (cds->used && tx->used)
     return;
 refAdd(&cluster->pairList, pair);
 if (!cds->used)
     {
     slAddHead(&cluster->cdsList, cds);
     cds->used = TRUE;
     struct slRef *ref;
     for (ref = cds->pairList; ref != NULL; ref = ref->next)
         rAddPair(cluster, ref->val);
     }
 if (!tx->used)
     {
     slAddHead(&cluster->txList, tx);
     tx->used = TRUE;
     struct slRef *ref;
     for (ref = tx->pairList; ref != NULL; ref = ref->next)
         rAddPair(cluster, ref->val);
     }
 }
 
 
 void txCdsEvOnChrom(struct cdsTxPair *pairList, struct twoBitFile *tbf, char *tceType, FILE *f)
 /* Cluster together pairs that share any common elements, and then call routine
  * to further process each cluster. */
 {
 /* Create hashes of uniq beds with a usage flag. */
 struct cdsTxPair *pair;
 struct hash *txHash = hashNew(18), *cdsHash = hashNew(18);
 for (pair = pairList; pair != NULL; pair = pair->next)
     {
     pair->usedTx = addToUsedBedHash(txHash, pair->tx, pair);
     pair->usedCds = addToUsedBedHash(cdsHash, pair->cds, pair);
     }
 verbose(2, "Chrom %s has %d tx and %d cds\n", pairList->tx->chrom, txHash->elCount, cdsHash->elCount);
 
 /* Construct clusters, somewhat recursively. */
 struct cdsTxCluster *cluster, *clusterList = NULL;
 for (pair = pairList; pair != NULL; pair = pair->next)
     {
     if (!pair->usedTx->used || !pair->usedCds->used)
         {
 	AllocVar(cluster);
 	slAddHead(&clusterList, cluster);
 	rAddPair(cluster, pair);
 	}
     }
 slReverse(&clusterList);
 
 /* Do further processing on each cluster. */
 for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
     {
     txCdsEvOnCluster(cluster, tbf, tceType, f);
     }
 }
 
 
 struct cdsTxPair *getCompatiblePairs(struct bed *txBedList, struct bed *cdsBedList)
 /* Given list of transcripts and cds's, get list of all compatible tx/cds pairs. */
 {
 struct cdsTxPair *pairList = NULL;
 struct bed *cdsBed;
 struct hash *txKeepers = bedsIntoKeeperHash(txBedList);
 
 for (cdsBed = cdsBedList; cdsBed != NULL; cdsBed = cdsBed->next)
     {
     hashAddInt(compatibleTxPerCds, cdsBed->name, 0);
     struct cdsTxPair *pair = NULL;
     struct binKeeper *bk = hashFindVal(txKeepers, cdsBed->chrom);
     if (bk != NULL)
         {
 	struct binElement *bel, *belList = binKeeperFind(bk, cdsBed->chromStart, 
 		cdsBed->chromEnd);
 	for (bel = belList; bel != NULL; bel = bel->next)
 	    {
 	    struct bed *txBed = bel->val;
 	    if (bedCompatibleExtension(cdsBed, txBed))
 	        {
 		AllocVar(pair);
 		pair->cds = cdsBed;
 		pair->tx = txBed;
 		slAddHead(&pairList, pair);
 		hashIncInt(compatibleTxPerCds, cdsBed->name);
 		}
 	    }
 	slFreeList(&belList);
 	}
     if (pair == NULL)
         warn("No tx for %s\n", cdsBed->name);
     }
 hashFreeWithVals(&txKeepers, binKeeperFree);
 slReverse(&pairList);
 return pairList;
 }
 
 
 void txCdsEvFromBed(char *cdsBedFile, char *tceType, char *txBedFile, char *genomeSeq,
 	char *outFile)
 /* txCdsEvFromBed - Make a cds evidence file (.tce) from an existing bed file.  Used 
  * mostly in transferring CCDS coding regions currently. */
 {
 /* Load cds and tx beds, genome, and get list of all compatible pairs. */
 struct bed *txBedList = bedLoadNAll(txBedFile, 12);
 struct bed *cdsBedList = bedLoadNAll(cdsBedFile, 12);
 compatibleTxPerCds = hashNew(0);
 struct cdsTxPair *pairList = getCompatiblePairs(txBedList, cdsBedList);
 
 /* Separate pairs into chromosomes.  This is mostly to avoid problems with ccds IDs 
  * only being unique per-chromosome.  (The pseudo-autosomal regions of X and Y have
  * genes with the same CCDS id. */
 struct hash *chromHash = hashNew(0);
 struct cdsTxPair *pair, *nextPair;
 for (pair = pairList; pair != NULL; pair = nextPair)
     {
     nextPair = pair->next;
     struct hashEl *hel = hashLookup(chromHash, pair->cds->chrom);
     if (hel == NULL)
         hel = hashAdd(chromHash, pair->cds->chrom, NULL);
     slAddHead(&hel->val, pair);
     }
 
 /* Open output file, DNA file, and call routine to process each chromosome. */
 FILE *f = mustOpen(outFile, "w");
 struct twoBitFile *tbf = twoBitOpen(genomeSeq);
 struct hashEl *hel, *helList = hashElListHash(chromHash);
 for (hel = helList; hel != NULL; hel = hel->next)
      txCdsEvOnChrom(hel->val, tbf, tceType, f);
 
 /* Clean up and go home. */
 twoBitClose(&tbf);
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 dnaUtilOpen();
 if (argc != 6)
     usage();
 txCdsEvFromBed(argv[1], argv[2], argv[3], argv[4], argv[5]);
 return 0;
 }