4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/mrnaToGene/mrnaToGene.c src/hg/mrnaToGene/mrnaToGene.c
index fc913a5..51b9428 100644
--- src/hg/mrnaToGene/mrnaToGene.c
+++ src/hg/mrnaToGene/mrnaToGene.c
@@ -1,402 +1,402 @@
 /* mrnaToGene - convert PSL alignments of mRNAs to gene annotation.
  */
 
 /* Copyright (C) 2013 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "options.h"
 #include "jksql.h"
 #include "genbank.h"
 #include "genePred.h"
 #include "psl.h"
 #include "hash.h"
 #include "linefile.h"
 
 
 /* command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"db", OPTION_STRING},
     {"cdsDb", OPTION_STRING},
     {"cdsFile", OPTION_STRING},
     {"requireUtr", OPTION_BOOLEAN},
     {"smallInsertSize", OPTION_INT},
     {"insertMergeSize", OPTION_INT},
     {"smallInsertSize", OPTION_INT},
     {"cdsMergeSize", OPTION_INT},
     {"cdsMergeMod3", OPTION_BOOLEAN},
     {"utrMergeSize", OPTION_INT},
     {"genePredExt", OPTION_BOOLEAN},
     {"allCds", OPTION_BOOLEAN},
     {"noCds", OPTION_BOOLEAN},
     {"keepInvalid", OPTION_BOOLEAN},
     {"ignoreUniqSuffix", OPTION_BOOLEAN},
     {"quiet", OPTION_BOOLEAN},
     {NULL, 0}
 };
 
 /* command line options */
 static int gCdsMergeSize = -1;
 static int gUtrMergeSize = -1;
 static unsigned gPslOptions = genePredPslDefaults;
 static boolean gRequireUtr = FALSE;
 static boolean gKeepInvalid = FALSE;
 static boolean gAllCds = FALSE;
 static boolean gNoCds = FALSE;
 static boolean gGenePredExt = FALSE;
 static boolean gQuiet = FALSE;
 static boolean gIgnoreUniqSuffix = FALSE;
 
 /* hash table of accession to CDS */
 static struct hash *gCdsTable = NULL;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "mrnaToGene - convert PSL alignments of mRNAs to gene annotations\n"
   "usage:\n"
   "   mrnaToGene [options] psl genePredFile\n"
   "\n"
   "Convert PSL alignments with CDS annotation from genbank to  gene\n"
   "annotations in genePred format.  Accessions without valids CDS are\n"
   "optionally dropped. A best attempt is made to convert incomplete CDS\n"
   "annotations.\n"
   "\n"
   "The psl argument may either be a PSL file or a table in a databases,\n"
   "depending on options.  CDS maybe obtained from the database or file.\n"
   "Accession in PSL files are tried with and with out genbank versions.\n"
   "\n"
   "Options:\n"
   "  -db=db - get PSLs and CDS from this database, psl specifies the table.\n"
   "  -cdsDb=db - get CDS from this database, psl is a file.\n"
   "  -cdsFile=file - get CDS from this file, psl is a file.\n"
   "   File is tab separate with name as the first column and\n"
   "   NCBI CDS the second\n"
   "  -insertMergeSize=%d - Merge inserts (gaps) no larger than this many bases.\n"
   "   A negative size disables merging of blocks.  This differs from specifying zero\n"
   "   in that adjacent blocks will not be merged, allowing tracking of frame for\n"
   "   each block. Defaults to %d unless -cdsMergeSize or -utrMergeSize are specified,\n"
   "   if either of these are specified, this option is ignored.\n"
   "  -smallInsertSize=n - alias for -insertMergetSize\n"
   "  -cdsMergeSize=-1 - merge gaps in CDS no larger than this size.\n"
   "   A negative values disables.\n"
   "  -cdsMergeMod3 - only merge CDS gaps if they mod 3\n"
   "  -utrMergeSize=-1  - merge gaps in UTR no larger than this size.\n"
   "   A negative values disables.\n"
   "  -requireUtr - Drop sequences that don't have both 5' and 3' UTR annotated.\n"
   "  -genePredExt - create a extended genePred, including frame information.\n"
   "  -allCds - consider PSL to be all CDS.\n"
   "  -noCds - consider PSL to not contain any CDS.\n"
   "  -keepInvalid - Keep sequences with invalid CDS.\n"
   "  -quiet - Don't print print info about dropped sequences.\n"
   "  -ignoreUniqSuffix - ignore all characters after last `-' in qName\n"
   "   when looking up CDS. Used when a suffix has been added to make qName\n"
   "   unique.  It is not removed from the name in the genePred.\n"
   "\n", genePredStdInsertMergeSize, genePredStdInsertMergeSize);
 }
 
 void loadCdsFile(char *cdsFile)
 /* read a CDS file into the global hash */
 {
 struct lineFile *lf = lineFileOpen(cdsFile, TRUE);
 char *row[2];
 
 gCdsTable = hashNew(20);
 while (lineFileNextRowTab(lf, row, 2))
     hashAdd(gCdsTable, row[0], 
             lmCloneString(gCdsTable->lm, row[1]));
 
 lineFileClose(&lf);
 }
 
 int getGbVersionIdx(char *acc)
 /* determine if a accession appears to have a genbank version extension.  If
  * so, return index of the dot, otherwise -1 */
 {
 char *verPtr = strrchr(acc, '.');
 int dotIdx;
 if (verPtr == NULL)
     return -1;
 dotIdx = verPtr - acc;
 verPtr++;
 while (*verPtr != '\0')
     {
     if (!isdigit(*verPtr))
         return -1;
     verPtr++;
     }
 return dotIdx;
 }
 
 char *cdsQuery(struct sqlConnection *conn, char *acc, char *cdsBuf, int cdsBufSize)
 /* query for a CDS, either in the hash table or database */
 {
 if (gCdsTable != NULL)
     return hashFindVal(gCdsTable, acc);
 else
     {
     char query[512];
     sqlSafef(query, sizeof(query),
           "SELECT cds.name FROM cds,gbCdnaInfo WHERE (gbCdnaInfo.acc = '%s') AND (gbCdnaInfo.cds !=0) AND (gbCdnaInfo.cds = cds.id)",
           acc);
     return sqlQuickQuery(conn, query, cdsBuf, cdsBufSize);
     }
 }
 
 char *getCdsForAcc(struct sqlConnection *conn, char *acc, char *cdsBuf, int cdsBufSize)
 /* look up a cds, trying with and without version, and optionally dropping unique suffix */
 {
 char *dash = NULL;
 if (gIgnoreUniqSuffix)
     {
     dash = strrchr(acc, '-');
     if (dash != NULL)
         *dash = '\0';
     }
 
 char *cdsStr = cdsQuery(conn, acc, cdsBuf, cdsBufSize);
 if (cdsStr == NULL)
     {
     int dotIdx = getGbVersionIdx(acc);
     if (dotIdx >= 0)
         {
         acc[dotIdx] = '\0';
         cdsStr = cdsQuery(conn, acc, cdsBuf, cdsBufSize);
         acc[dotIdx] = '.';
         }
     }
 if (dash != NULL)
     *dash = '-';
 return cdsStr;
 }
 
 /* CDS return when the is no CDS */
 static struct genbankCds NO_CDS = {-1, -1, FALSE, FALSE};
 
 static struct genbankCds getCdsAll(struct psl *psl)
 /* return CDS structure when it is assume the transcript is CDS */
 {
 struct genbankCds cds;
 cds.start = psl->qStart;
 cds.end = psl->qEnd;
 if (psl->strand[0] == '-')
     reverseIntRange(&cds.start, &cds.end, psl->qSize);
 cds.startComplete = TRUE;
 cds.endComplete = TRUE;
 return cds;
 }
 
 static struct genbankCds getCdsFromSpec(struct sqlConnection *conn, struct psl *psl)
 /* get CDS from specification in database or file */
 {
 struct genbankCds cds = NO_CDS;
 char cdsBuf[4096];
 char *cdsStr = getCdsForAcc(conn, psl->qName, cdsBuf, sizeof(cdsBuf));
 if (cdsStr == NULL)
     {
     if (!gQuiet)
         fprintf(stderr, "Warning: no CDS for %s\n", psl->qName);
     }
 else if (!genbankCdsParse(cdsStr, &cds))
     {
     if (!gQuiet)
         fprintf(stderr, "Warning: invalid CDS for %s: %s\n",
                 psl->qName, cdsStr);
     }
 else if ((cds.end-cds.start) > psl->qSize)
     {
     if (!gQuiet)
         fprintf(stderr, "Warning: CDS for %s (%u..%u) longer than qSize (%u)\n",
                 psl->qName, cds.start, cds.end, psl->qSize);
     cds = NO_CDS;
     }
 return cds;
 }
 
 static struct genbankCds getCds(struct sqlConnection *conn, struct psl *psl)
 /* Lookup the CDS, either in the database or hash, or generate for query.  If
  * not found and looks like a it has a genbank version, try without the
  * version.  If allCds is true, generate a cds that covers the query.  Conn
  * maybe null if gCdsTable exists or gAllCds or gNoCds are true.  If CDS can't be
  * obtained, start and end are both set to -1.  If there is an error parsing
  * it, start and end are both set to 0. */
 {
 if (gNoCds)
     return NO_CDS;
 else if (gAllCds)
     return getCdsAll(psl);
 else
     return getCdsFromSpec(conn, psl);
 }
 
 struct genePred* pslToGenePred(struct psl *psl, struct genbankCds *cds)
 /* Convert a psl to genePred with specified CDS string; return NULL
  * if should be skipped.  cdsStr maybe NULL if not available. */
 {
 unsigned optFields = gGenePredExt ? (genePredAllFlds) : 0;
 
 if ((cds->start == cds->end) && !(gKeepInvalid || gNoCds))
     return NULL;
 if (gRequireUtr && ((cds->start == 0) || (cds->end == psl->qSize)))
     {
     if (!gQuiet)
         fprintf(stderr, "Warning: no 5' or 3' UTR for %s\n", psl->qName);
     return NULL;
     }
 return genePredFromPsl3(psl, cds, optFields, gPslOptions,
                         gCdsMergeSize, gUtrMergeSize);
 }
 
 void convertPsl(struct psl *psl, struct genbankCds *cds, FILE *genePredFh)
 /* convert a cds and psl and output */
 {
 struct genePred *genePred = pslToGenePred(psl, cds);
 if (genePred != NULL)
     {
     genePredTabOut(genePred, genePredFh);
     genePredFree(&genePred);
     }
 }
 
 void convertPslTableRow(char **row, FILE *genePredFh)
 /* A row from the PSL query that includes CDS */
 {
 struct psl *psl = pslLoad(row+1);
 struct  genbankCds cds;
 genbankCdsParse(row[0], &cds);
 convertPsl(psl, &cds, genePredFh);
 pslFree(&psl);
 }
 
 void convertPslTable(struct sqlConnection *conn, char *pslTable, FILE *genePredFh)
 /* convert mrnas in a psl table to genePred objects */
 {
 char query[512], **row;
 struct sqlResult *sr;
 
 /* generate join of cds with psls */
 sqlSafef(query, sizeof(query),
       "SELECT cds.name,matches,misMatches,repMatches,nCount,qNumInsert,qBaseInsert,tNumInsert,tBaseInsert,strand,qName,qSize,qStart,qEnd,tName,tSize,tStart,tEnd,blockCount,blockSizes,qStarts,tStarts "
       "FROM cds,%s,gbCdnaInfo WHERE (%s.qName = gbCdnaInfo.acc) AND (gbCdnaInfo.cds !=0) AND (gbCdnaInfo.cds = cds.id)",
       pslTable, pslTable);
 sr = sqlGetResult(conn, query);
 while ((row = sqlNextRow(sr)) != NULL)
     convertPslTableRow(row, genePredFh);
 sqlFreeResult(&sr);
 }
 
 void convertPslFileRow(struct sqlConnection *conn, char **row, FILE *genePredFh)
 /* A row from the PSL file, getting CDS */
 {
 struct psl *psl = pslLoad(row);
 struct  genbankCds cds = getCds(conn, psl);
 convertPsl(psl, &cds, genePredFh);
 pslFree(&psl);
 }
 
 void convertPslFile(struct sqlConnection *conn, char *pslFile, FILE *genePredFh)
 /* convert mrnas in a psl file to genePred objects */
 {
 struct lineFile *lf = pslFileOpen(pslFile);
 char *row[PSL_NUM_COLS];
 
 while (lineFileNextRowTab(lf, row, PSL_NUM_COLS))
     convertPslFileRow(conn, row, genePredFh);
 lineFileClose(&lf);
 }
 
 void mrnaToGene(char *db, char *cdsDb, char *cdsFile, char *pslSpec,
                 char *genePredFile)
 /* convert an PSL mRNA table to a genePred file */
 {
 struct sqlConnection *conn = NULL;
 FILE* genePredFh;
 
 if (db != NULL)
     conn = sqlConnect(db);
 else if (cdsDb != NULL)
     conn = sqlConnect(cdsDb);
 genePredFh = mustOpen(genePredFile, "w");
 
 if (cdsFile != NULL)
     loadCdsFile(cdsFile);
 
 if (db == NULL)
     convertPslFile(conn, pslSpec, genePredFh);
 else
     convertPslTable(conn, pslSpec, genePredFh);
 
 if (ferror(genePredFh))
     errAbort("error writing %s", genePredFile);
 carefulClose(&genePredFh);
 sqlDisconnect(&conn);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 char *db, *cdsDb, *cdsFile, *pslSpec, *genePredFile;
 int optCnt;
 
 optionInit(&argc, argv, optionSpecs);
 if (argc != 3)
     usage();
 pslSpec = argv[1];
 genePredFile = argv[2];
 db = optionVal("db", NULL);
 cdsDb = optionVal("cdsDb", NULL);
 cdsFile = optionVal("cdsFile", NULL);
 gRequireUtr = optionExists("requireUtr");
 if (optionExists("cdsMergeMod3") && !optionExists("cdsMergeSize"))
     errAbort("must specify -cdsMergeSize with -cdsMergeMod3");
 if (optionExists("cdsMergeSize") || optionExists("utrMergeSize"))
     {
     gCdsMergeSize = optionInt("cdsMergeSize", -1);
     gUtrMergeSize = optionInt("utrMergeSize", -1);
     if (optionExists("cdsMergeMod3"))
         gPslOptions |= genePredPslCdsMod3;
     if (optionExists("smallInsertSize") || optionExists("insertMergeSize"))
         errAbort("can't specify -smallInsertSize or -insertMergeSize with -cdsMergeSize or -utrMergeSize");
     }
 else
     {
     int insertMergeSize = genePredStdInsertMergeSize;
     if (optionExists("smallInsertSize"))
         insertMergeSize = optionInt("smallInsertSize", genePredStdInsertMergeSize);
     insertMergeSize = optionInt("insertMergeSize", genePredStdInsertMergeSize);
     gCdsMergeSize = gUtrMergeSize = insertMergeSize;
     }
 gGenePredExt = optionExists("genePredExt");
 gKeepInvalid = optionExists("keepInvalid");
 gAllCds = optionExists("allCds");
 gNoCds = optionExists("noCds");
 gQuiet = optionExists("quiet");
 gIgnoreUniqSuffix = optionExists("ignoreUniqSuffix");
 
 if ((gAllCds || gNoCds) && ((cdsDb != NULL) || (cdsFile != NULL)))
     errAbort("can't specify -allCds or -noCds with -cdsDb or -cdsFile");
 if (gAllCds && gRequireUtr)
     errAbort("can't specify -allCds with -requireUtr");
 /* this is a bit of work to implement */
 if ((gAllCds || gNoCds) && (db != NULL))
     errAbort("can't specify -allCds or -noCds with -db");
 
 optCnt = 0;
 if (db != NULL)
     optCnt++;
 if (cdsDb == NULL)
     optCnt++;
 if (cdsFile != NULL)
     optCnt++;
 if (gAllCds)
     optCnt++;
 if (gNoCds)
     optCnt++;
 
 if (optCnt == 1)
     errAbort("must specify one and only one of -db, -cdsDb, -cdsFile, -allCds, or -noCds");
 
 mrnaToGene(db, cdsDb, cdsFile, pslSpec, genePredFile);
 return 0;
 }