4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/utils/refSeqGet/refSeqGet.c src/hg/utils/refSeqGet/refSeqGet.c
index b90ce85..24c4bec 100644
--- src/hg/utils/refSeqGet/refSeqGet.c
+++ src/hg/utils/refSeqGet/refSeqGet.c
@@ -1,276 +1,276 @@
 /* refSeqGet - retrieve refseq data from the database. */
 
 /* Copyright (C) 2013 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "refSeqVerInfo.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "hdb.h"
 #include "psl.h"
 #include "pslReader.h"
 #include "genePred.h"
 #include "genePredReader.h"
 #include "genbank.h"
 #include "fa.h"
 
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "refSeqGet - retrieve refseq data from the database\n"
   "usage:\n"
   "   refSeqGet db\n"
   "\n"
   "Get a consistent snapshot of the RefSeq data from a genome browser database.\n"
   "All accessions will include version numbers.  Only ReqSeqs for the organism\n"
   "associated with the specified genome assembly are retrieved.\n"
   "\n"
   "options:\n"
   "   -aligns=pslFile - get the PSL alignments of the RefSeqs to the genome.\n"
   "   -geneAnns=genePred - get the genome annotations in genePred format.\n"
   "   -rnaSeqs=mrnaFa - get a FASTA file of the mRNA/RNA sequences with the CDS capitalized.\n"
   "   -protSeqs=protFa - get a FASTA file of the protein sequences.\n"
   "   -metaData=metaTsv - get a tab-separated file of the meta-data for the RefSeqs\n"
   "   -getNM - get NM sequences, if neither -getNM or -getNR is specified, all are return\n"
   "   -getNR - get the NR sequences\n"
   "   -accList=accfile - only get data for this set of accessions.  If version is included,\n"
   "    then only the specified version is retrieved, otherwise the version loaded is\n"
   "    retrieved.  This is incompatible with -getNR or -inclNM\n");
 }
 
 /* Notes:
  * This isn't very fast, it was done for simplicity more than speed.
  */
 
 static struct optionSpec options[] = {
     {"aligns", OPTION_STRING},
     {"geneAnns", OPTION_STRING},
     {"rnaSeqs", OPTION_STRING},
     {"protSeqs", OPTION_STRING},
     {"metaData", OPTION_STRING},
     {"getNM", OPTION_BOOLEAN},
     {"getNR", OPTION_BOOLEAN},
     {"accList", OPTION_STRING},
     {NULL, 0},
 };
 
 static char *addVer(char *acc, int ver, char *buf, int bufSize)
 /* construct acc.ver in buffer */
 {
 safef(buf, bufSize, "%s.%d", acc, ver);
 return buf;
 }
 
 static void processPsl(FILE *fh, struct hash *refSeqVerInfoTbl, struct psl *psl)
 /* check if a psl has been select, if so, write including version in qName */
 {
 struct refSeqVerInfo *rsvi = hashFindVal(refSeqVerInfoTbl, psl->qName);
 if (rsvi != NULL)
     {
     char buf[GENBANK_ACC_BUFSZ], *hold = psl->qName;
     psl->qName = addVer(psl->qName, rsvi->ver, buf, sizeof(buf));
     pslTabOut(psl, fh);
     psl->qName = hold;
     }
 }
 
 static void getAligns(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
 /* get request alignments from database */
 {
 struct psl *psls = pslReaderLoadQuery(conn, "refSeqAli", NULL);
 slSort(psls, pslCmpQuery);
 FILE *fh = mustOpen(outFile, "w");
 struct psl *psl;
 for (psl = psls; psl != NULL; psl = psl->next)
     processPsl(fh, refSeqVerInfoTbl, psl);
 carefulClose(&fh);
 pslFreeList(&psls);
 }
 
 static void processGenePred(FILE *fh, struct hash *refSeqVerInfoTbl, struct genePred *gp)
 /* check if a genePred has been select, if so, write including version in name */
 {
 struct refSeqVerInfo *rsvi = hashFindVal(refSeqVerInfoTbl, gp->name);
 if (rsvi != NULL)
     {
     char buf[GENBANK_ACC_BUFSZ], *hold = gp->name;
     gp->name = addVer(gp->name, rsvi->ver, buf, sizeof(buf));
     genePredTabOut(gp, fh);
     gp->name = hold;
     }
 }
 
 static void getGeneAnns(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
 /* get request genePred annotations from database */
 {
 struct genePred *gps = genePredReaderLoadQuery(conn, "refGene", NULL);
 slSort(&gps, genePredNameCmp);
 FILE *fh = mustOpen(outFile, "w");
 struct genePred *gp;
 for (gp = gps; gp != NULL; gp = gp->next)
     processGenePred(fh, refSeqVerInfoTbl, gp);
 carefulClose(&fh);
 genePredFreeList(&gps);
 }
 
 static void processRnaSeq(FILE *fh, struct sqlConnection *conn, struct refSeqVerInfo *rsvi)
 /* get an RNA sequence, which already includes version in name */
 {
 struct dnaSeq *seq = hGenBankGetMrnaC(conn, rsvi->acc, NULL);
 if (seq == NULL)
     errAbort("failed to get %s RNA sequence from database", rsvi->acc);
 faWriteNext(fh, seq->name, seq->dna, seq->size);
 dnaSeqFree(&seq);
 }
 
 static void getRnaSeqs(struct sqlConnection *conn, struct refSeqVerInfo *refSeqVerInfoLst, char *outFile)
 /* get request RNA sequences from database */
 {
 FILE *fh = mustOpen(outFile, "w");
 struct refSeqVerInfo *rsvi;
 for (rsvi = refSeqVerInfoLst; rsvi != NULL; rsvi = rsvi->next)
     processRnaSeq(fh, conn, rsvi);
 carefulClose(&fh);
 }
 
 static void processProtSeq(FILE *fh, struct sqlConnection *conn, struct refSeqVerInfo *rsvi, struct hash *doneProts)
 /* get an protein sequence, which already includes version in name. Don't duplicate NPs */
 {
 char query[128];
 sqlSafef(query, sizeof(query), "SELECT protAcc FROM hgFixed.refLink WHERE mrnaAcc = \"%s\"", rsvi->acc);
 char *protAcc = sqlNeedQuickString(conn, query);
 if (isNotEmpty(protAcc) && hashLookup(doneProts, protAcc) == NULL)
     {
     struct dnaSeq *seq = hGenBankGetPepC(conn, protAcc, NULL);
     if (seq == NULL)
         errAbort("failed to get %s peptide sequence from database", protAcc);
     faWriteNext(fh, seq->name, seq->dna, seq->size);
     dnaSeqFree(&seq);
     hashAdd(doneProts, protAcc, NULL);
     }
 freeMem(protAcc);
 }
 
 static void getProtSeqs(struct sqlConnection *conn, struct refSeqVerInfo *refSeqVerInfoLst, char *outFile)
 /* get request prot sequences from database */
 {
 struct hash *doneProts = hashNew(16);
 FILE *fh = mustOpen(outFile, "w");
 struct refSeqVerInfo *rsvi;
 for (rsvi = refSeqVerInfoLst; rsvi != NULL; rsvi = rsvi->next)
     processProtSeq(fh, conn, rsvi, doneProts);
 carefulClose(&fh);
 hashFree(&doneProts);
 }
 
 static char *getProtAccVerIf(struct sqlConnection *conn, char *mrnaAcc, char *protAcc, char *buf, int bufSize)
 /* if protAcc is not empty, get acc.version, otherwise return empty */
 {
 if (isEmpty(protAcc))
     return "";
 else
     {
     int protVer = refSeqVerInfoGetVersion(protAcc, conn);
     if (protVer == 0)
         errAbort("no protein version entry for %s associated with %s", protAcc, mrnaAcc);
     safef(buf, bufSize, "%s.%d", protAcc, protVer);
     return buf;
     }
 }
 
 static char *getCds(struct sqlConnection *conn, char *acc)
 /* get CDS for an NM, results should be freed */
 {
 char query[1024];
 sqlSafef(query, sizeof(query), "SELECT hgFixed.cds.name FROM hgFixed.gbCdnaInfo,hgFixed.cds WHERE (hgFixed.gbCdnaInfo.acc = \"%s\") AND (hgFixed.gbCdnaInfo.cds = hgFixed.cds.id)", acc);
 return sqlNeedQuickString(conn, query);
 }
 
 static void processMetaData(FILE *fh, struct sqlConnection *conn, struct sqlConnection *conn2, struct refSeqVerInfo *rsvi)
 /* get meta data for an accession */
 {
 boolean isCoding = genbankIsRefSeqCodingMRnaAcc(rsvi->acc);
 char query[256];
 sqlSafef(query, sizeof(query), "SELECT rl.name,rl.product,rl.protAcc,rl.locusLinkId,rs.status FROM hgFixed.refLink rl, hgFixed.refSeqStatus rs WHERE (rl.mrnaAcc = \"%s\") and (rs.mrnaAcc = rl.mrnaAcc)", rsvi->acc);
 struct sqlResult *sr = sqlGetResult(conn, query);
 char **row = sqlNextRow(sr);
 if (row == NULL)
     errAbort("no RefLink entry for %s", rsvi->acc);
 char buf[32];
 char *protAccVer = getProtAccVerIf(conn2, rsvi->acc, row[2], buf, sizeof(buf));
 char *cds = isCoding ? getCds(conn2, rsvi->acc) : "";
 fprintf(fh, "%s.%d\t%s\t%s\t%s\t%s\t%s\t%s\n", rsvi->acc, rsvi->ver, protAccVer, row[0], row[3], cds, row[1], row[4]);
 sqlFreeResult(&sr);
 if (isCoding)
     freeMem(cds);
 }
 
 static void getMetaData(struct sqlConnection *conn, struct refSeqVerInfo *refSeqVerInfoLst, char *outFile)
 /* get request prot sequences from database */
 {
 struct sqlConnection *conn2 = sqlConnect(sqlGetDatabase(conn));
 static char *hdr = "#mrnaAcc\t" "protAcc\t" "geneName\t" "ncbiGeneId\t" "cds\t" "product\t" "status\n";
 FILE *fh = mustOpen(outFile, "w");
 fputs(hdr, fh);
 struct refSeqVerInfo *rsvi;
 for (rsvi = refSeqVerInfoLst; rsvi != NULL; rsvi = rsvi->next)
     processMetaData(fh, conn, conn2, rsvi);
 carefulClose(&fh);
 sqlDisconnect(&conn2);
 }
 
 static void refSeqGet(char *db, char *aligns, char *geneAnns, char *rnaSeqs, char *protSeqs,
                       char *metaData, boolean getNM, boolean getNR, char *accList)
 /* refSeqGet - retrieve refseq data from the database. */
 {
 struct sqlConnection *conn = sqlConnect(db);
 struct refSeqVerInfo *refSeqVerInfoLst = NULL;
 struct hash *refSeqVerInfoTbl = (accList == NULL)
     ? refSeqVerInfoFromDb(conn, getNM, getNR, &refSeqVerInfoLst)
     : refSeqVerInfoFromFile(conn, accList, &refSeqVerInfoLst);
 if (aligns != NULL)
     getAligns(conn, refSeqVerInfoTbl, aligns);
 if (geneAnns != NULL)
     getGeneAnns(conn, refSeqVerInfoTbl, geneAnns);
 if (rnaSeqs != NULL)
     getRnaSeqs(conn, refSeqVerInfoLst, rnaSeqs);
 if (protSeqs != NULL)
     getProtSeqs(conn, refSeqVerInfoLst, protSeqs);
 if (metaData != NULL)
     getMetaData(conn, refSeqVerInfoLst, metaData);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 2)
     usage();
 
 if (optionExists("accList"))
     {
     if (optionExists("getNM"))
         errAbort("can't specify both -accList and -getNM");
     if (optionExists("getNR"))
         errAbort("can't specify both -accList and -getNR");
     }
 boolean getNM = optionExists("getNM");
 boolean getNR = optionExists("getNR");
 if (!(getNM || getNR))
     getNM = getNR = TRUE;
 
 initGenbankTableNames("hgFixed");
 refSeqGet(argv[1],
           optionVal("aligns", NULL),
           optionVal("geneAnns", NULL),
           optionVal("rnaSeqs", NULL),
           optionVal("protSeqs", NULL),
           optionVal("metaData", NULL),
           getNM, getNR,
           optionVal("accList", NULL));
 return 0;
 }