59e29ecdf6202d659db238f69903cb27c5e8f69c
jcasper
  Fri Aug 23 10:49:52 2024 -0700
A few code changes and docs in support of knownGene V47, refs #34219

diff --git src/hg/makeDb/makeGencodeKnownGene/makeGencodeKnownGene.c src/hg/makeDb/makeGencodeKnownGene/makeGencodeKnownGene.c
index 355645c..6dfa14a 100644
--- src/hg/makeDb/makeGencodeKnownGene/makeGencodeKnownGene.c
+++ src/hg/makeDb/makeGencodeKnownGene/makeGencodeKnownGene.c
@@ -55,99 +55,99 @@
 struct hash *hash = newHash(6);
 char *words[2];
 int wordsRead;
 
 while( (wordsRead = lineFileChopNext(lf, words, sizeof(words)/sizeof(char *)) ))
     hashAdd(hash, words[0], cloneString(words[1]));
 
 lineFileClose(&lf);
 return hash;
 }
 
 static struct hash *getRefSeqTable(struct sqlConnection *conn,  char *version)
 {
 char versionQuery[4096];
 
-safef(versionQuery, sizeof versionQuery, "select * from wgEncodeGencodeRefSeq%s",  version);
+sqlSafef(versionQuery, sizeof versionQuery, "select * from wgEncodeGencodeRefSeq%s",  version);
 
 char **row;
 struct sqlResult *sr;
 struct hash *hash = newHash(6);
 
 sr = sqlGetResult(conn, versionQuery);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     struct wgEncodeGencodeRefSeq *wga = wgEncodeGencodeRefSeqLoad(row);
     hashAdd(hash, wga->transcriptId, wga);
     }
 sqlFreeResult(&sr);
 
 return hash;
 }
 
 static struct hash *getAttrsTable(struct sqlConnection *conn,  char *version)
 {
 char versionQuery[4096];
 
-safef(versionQuery, sizeof versionQuery, "select * from wgEncodeGencodeAttrs%s",  version);
+sqlSafef(versionQuery, sizeof versionQuery, "select * from wgEncodeGencodeAttrs%s",  version);
 
 char **row;
 struct sqlResult *sr;
 struct hash *hash = newHash(6);
 
 sr = sqlGetResult(conn, versionQuery);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     struct wgEncodeGencodeAttrs *wga = wgEncodeGencodeAttrsLoad(row, sqlCountColumns(sr));
     hashAdd(hash, wga->transcriptId, wga);
     }
 sqlFreeResult(&sr);
 
 return hash;
 }
 
 static struct hash *getMapTable(struct sqlConnection *conn, char *query, char *version)
 {
 char versionQuery[4096];
 
 if (version != NULL)
-    safef(versionQuery, sizeof versionQuery, "%s%s", query, version);
+    sqlSafef(versionQuery, sizeof versionQuery, query, version);
 else
-    safecpy(versionQuery, sizeof versionQuery,  query);
+    sqlSafef(versionQuery, sizeof versionQuery, query, NULL);
 
 char **row;
 struct sqlResult *sr;
 struct hash *hash = newHash(6);
 
 sr = sqlGetResult(conn, versionQuery);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     hashAdd(hash, row[0], cloneString(row[1]));
     }
 sqlFreeResult(&sr);
 
 return hash;
 }
 
 static struct genePred *loadGenePreds(struct sqlConnection *conn, char *query, char *version)
 {
 struct genePred *gpList = NULL;
 char versionQuery[4096];
 char **row;
 struct sqlResult *sr;
 
-safef(versionQuery, sizeof versionQuery, "%s%s", query, version);
+sqlSafef(versionQuery, sizeof versionQuery, query, version);
 sr = sqlGetResult(conn, versionQuery);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     struct genePred *gp = genePredExtLoad(&row[1], 15);
 
     slAddHead(&gpList, gp);
     }
 
 sqlFreeResult(&sr);
 
 slReverse(&gpList);
 return gpList;
 }
 
 static void writeOutOneKnownGeneNoNl(FILE *f, struct genePred *gp, struct hashes *hashes)
@@ -254,66 +254,68 @@
 FILE *fx = mustOpen("knownGeneExt.gp", "w");
 
 for (gp = compGenePreds; gp; gp = gp->next)
     {
     writeOutOneKnownGene(f, gp, hashes);
     writeOutOneKnownGeneExt(fx, gp, hashes);
     }
 fclose(f);
 fclose(fx);
 }
 
 static char *descriptionFromAcc(struct sqlConnection *conn, char *acc)
 {
 char query[4096];
 
-safef(query, sizeof query, "select d.name from all_mrna a, hgFixed.gbCdnaInfo c, hgFixed.description d where a.qName=\"%s\" and  a.qName=c.acc and c.description = d.id", acc );
+sqlSafef(query, sizeof query, "select d.name from all_mrna a, hgFixed.gbCdnaInfo c, hgFixed.description d where a.qName=\"%s\" and  a.qName=c.acc and c.description = d.id", acc );
 return sqlQuickString(conn, query);
 }
 
 static char *uniProtDescriptionFromAcc(struct sqlConnection *conn, char *acc)
 {
 char query[4096];
 
-safef(query, sizeof query, "select v.val from commentVal v, comment c where c.acc = '%s' and v.id=c.commentVal", acc);
+sqlSafef(query, sizeof query, "select v.val from commentVal v, comment c where c.acc = '%s' and v.id=c.commentVal", acc);
 return sqlQuickString(conn, query);
 }
 
 static char *displayIdFromAcc(struct sqlConnection *conn, char *acc)
 {
 char query[4096];
 
-safef(query, sizeof query, "select val from displayId where acc = '%s'", acc);
+sqlSafef(query, sizeof query, "select val from displayId where acc = '%s'", acc);
 return sqlQuickString(conn, query);
 }
 
 
 static void outputDescription(FILE *f, struct sqlConnection *conn,  struct sqlConnection *uconn,struct sqlConnection *pconn, struct genePred *gp, struct hashes *hashes)
 {
 char *description;
 // try refSeq
 struct wgEncodeGencodeRefSeq *wgr = (struct wgEncodeGencodeRefSeq *)hashFindVal(hashes->genToRefSeq, gp->name);
 
 if (wgr != NULL)
     {
     char buffer[256];
 
     safecpy(buffer, sizeof buffer, wgr->rnaAcc);
+
+/*  No need to strip the version: descriptions now come from ncbiRefSeqLink, whose ids include it.
     char *ptr = strrchr(buffer, '.');
     if (ptr != NULL)
 	*ptr = 0;
-
+*/
     description = (char *)hashFindVal(hashes->refSeqToDescription, buffer);
 
     if (!isEmpty(description))
 	{
 	fprintf(f, "%s (from RefSeq %s)\t", description, buffer);
 	return;
 	}
     }
 //
 //  try Uniprot
 char *uniProtAcc = (char *)hashFindVal(hashes->genToUniProt, gp->name);
 
 if (uniProtAcc != NULL)
     {
     //char *description = (char *)hashFindVal(hashes->descriptionFromUniProtId, gp->name);
@@ -627,47 +629,48 @@
     }
 }
 #endif
 
 static void makeGencodeKnownGene(char *database, char *tempDatabase, char *version, char *txToAccTab)
 /* makeGencodeKnownGene - make knownGene from Gencode tables. */
 {
 struct sqlConnection *conn = sqlConnect(database);
 struct sqlConnection *tconn = sqlConnect(tempDatabase);
 struct sqlConnection *pconn = sqlConnect("proteome");
 //struct sqlConnection *uconn = sqlConnect("uniProt");
 struct hashes hashes;
 hashes.genToUC = getMap(txToAccTab);
 hashes.genToAttrs = getAttrsTable(conn, version);
 //hashes.genToAnnRemark = getMapTable(conn, "select transcriptId, remark from wgEncodeGencodeAnnotationRemark", version);
-hashes.genToUniProt = getMapTable(conn, "select transcriptId, acc from wgEncodeGencodeUniProt", version);
+hashes.genToUniProt = getMapTable(conn, "select transcriptId, acc from wgEncodeGencodeUniProt%s", version);
 hashes.genToRefSeq = getRefSeqTable(conn, version);
 //hashes.genToRefSeq = getMapTable(tconn, "select name, value from knownToRefSeq", NULL);
-struct genePred *compGenePreds = loadGenePreds(conn, "select * from wgEncodeGencodeComp", version);
-struct genePred *pseudoGenePreds = loadGenePreds(conn, "select * from wgEncodeGencodePseudoGene", version);
+struct genePred *compGenePreds = loadGenePreds(conn, "select * from wgEncodeGencodeComp%s", version);
+struct genePred *pseudoGenePreds = loadGenePreds(conn, "select * from wgEncodeGencodePseudoGene%s", version);
 compGenePreds = slCat(compGenePreds, pseudoGenePreds);
 hashes.refSeqToPepName = getMapTable(conn, "select mrnaAcc,protAcc from hgFixed.refLink", NULL);
 //hashes.mrnaToDescription = getMapTable(conn, "select a.name,d.name from all_mrna a, gbCdnaInfo c, description d where a.qName=c.acc and c.description = d.id", NULL);
-hashes.refSeqToDescription = getMapTable(conn, "select g.name,d.name from refGene g, hgFixed.gbCdnaInfo c, hgFixed.description d where g.name=c.acc and c.description = d.id", NULL);
+//hashes.refSeqToDescription = getMapTable(conn, "select g.name,d.name from refGene g, hgFixed.gbCdnaInfo c, hgFixed.description d where g.name=c.acc and c.description = d.id", NULL);
+hashes.refSeqToDescription = getMapTable(conn, "select id,product from ncbiRefSeqLink", NULL);
 hashes.hgncDescriptionFromGeneName = getMapTable(pconn, "select symbol, name from hgnc", NULL);
 hashes.refSeqToStatus = getMapTable(conn, "select mrnaAcc, status from hgFixed.refSeqStatus", NULL);
-hashes.genToPdb = getMapTable(conn, "select transcriptId, pdbId from wgEncodeGencodePdb", version);
+hashes.genToPdb = getMapTable(conn, "select transcriptId, pdbId from wgEncodeGencodePdb%s", version);
 //hashes.displayIdFromUniProtId = getMapTable(uconn, "select acc, val from displayId", NULL);
 //printf("displayIdFromUniProtId %ld\n", time(NULL) - start);
 //hashes.descriptionFromUniProtId = getMapTable(uconn, "select c.acc, v.val from commentVal v, comment c where v.id=c.commentVal", NULL);
 //printf("descriptionFromUniProtId %ld\n", time(NULL) - start);
-hashes.genToTags = getMapTable(conn, "select transcriptId, tag from wgEncodeGencodeTag", version);
+hashes.genToTags = getMapTable(conn, "select transcriptId, tag from wgEncodeGencodeTag%s", version);
 if (!justKnown)
     outputKnownCanonical(compGenePreds, &hashes);
 outputKnownGene(compGenePreds, &hashes);
 if (justKnown)
     exit(0);
 outputKnownGeneColor(compGenePreds, &hashes);
 hashes.genToMrna = getMapTable(tconn, "select name, value from knownToMrnaSingle", NULL);
 outputKgXref(conn, compGenePreds, &hashes);
 outputKnownCanonical(compGenePreds, &hashes);
 //outputKnownToRefSeq(compGenePreds, &hashes);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {