59e29ecdf6202d659db238f69903cb27c5e8f69c jcasper Fri Aug 23 10:49:52 2024 -0700 A few code changes and docs in support of knownGene V47, refs #34219 diff --git src/hg/makeDb/makeGencodeKnownGene/makeGencodeKnownGene.c src/hg/makeDb/makeGencodeKnownGene/makeGencodeKnownGene.c index 355645c..6dfa14a 100644 --- src/hg/makeDb/makeGencodeKnownGene/makeGencodeKnownGene.c +++ src/hg/makeDb/makeGencodeKnownGene/makeGencodeKnownGene.c @@ -55,99 +55,99 @@ struct hash *hash = newHash(6); char *words[2]; int wordsRead; while( (wordsRead = lineFileChopNext(lf, words, sizeof(words)/sizeof(char *)) )) hashAdd(hash, words[0], cloneString(words[1])); lineFileClose(&lf); return hash; } static struct hash *getRefSeqTable(struct sqlConnection *conn, char *version) { char versionQuery[4096]; -safef(versionQuery, sizeof versionQuery, "select * from wgEncodeGencodeRefSeq%s", version); +sqlSafef(versionQuery, sizeof versionQuery, "select * from wgEncodeGencodeRefSeq%s", version); char **row; struct sqlResult *sr; struct hash *hash = newHash(6); sr = sqlGetResult(conn, versionQuery); while ((row = sqlNextRow(sr)) != NULL) { struct wgEncodeGencodeRefSeq *wga = wgEncodeGencodeRefSeqLoad(row); hashAdd(hash, wga->transcriptId, wga); } sqlFreeResult(&sr); return hash; } static struct hash *getAttrsTable(struct sqlConnection *conn, char *version) { char versionQuery[4096]; -safef(versionQuery, sizeof versionQuery, "select * from wgEncodeGencodeAttrs%s", version); +sqlSafef(versionQuery, sizeof versionQuery, "select * from wgEncodeGencodeAttrs%s", version); char **row; struct sqlResult *sr; struct hash *hash = newHash(6); sr = sqlGetResult(conn, versionQuery); while ((row = sqlNextRow(sr)) != NULL) { struct wgEncodeGencodeAttrs *wga = wgEncodeGencodeAttrsLoad(row, sqlCountColumns(sr)); hashAdd(hash, wga->transcriptId, wga); } sqlFreeResult(&sr); return hash; } static struct hash *getMapTable(struct sqlConnection *conn, char *query, char *version) { char versionQuery[4096]; if (version != NULL) - safef(versionQuery, sizeof versionQuery, "%s%s", query, version); + sqlSafef(versionQuery, sizeof versionQuery, query, version); else - safecpy(versionQuery, sizeof versionQuery, query); + sqlSafef(versionQuery, sizeof versionQuery, query, NULL); char **row; struct sqlResult *sr; struct hash *hash = newHash(6); sr = sqlGetResult(conn, versionQuery); while ((row = sqlNextRow(sr)) != NULL) { hashAdd(hash, row[0], cloneString(row[1])); } sqlFreeResult(&sr); return hash; } static struct genePred *loadGenePreds(struct sqlConnection *conn, char *query, char *version) { struct genePred *gpList = NULL; char versionQuery[4096]; char **row; struct sqlResult *sr; -safef(versionQuery, sizeof versionQuery, "%s%s", query, version); +sqlSafef(versionQuery, sizeof versionQuery, query, version); sr = sqlGetResult(conn, versionQuery); while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredExtLoad(&row[1], 15); slAddHead(&gpList, gp); } sqlFreeResult(&sr); slReverse(&gpList); return gpList; } static void writeOutOneKnownGeneNoNl(FILE *f, struct genePred *gp, struct hashes *hashes) @@ -254,66 +254,68 @@ FILE *fx = mustOpen("knownGeneExt.gp", "w"); for (gp = compGenePreds; gp; gp = gp->next) { writeOutOneKnownGene(f, gp, hashes); writeOutOneKnownGeneExt(fx, gp, hashes); } fclose(f); fclose(fx); } static char *descriptionFromAcc(struct sqlConnection *conn, char *acc) { char query[4096]; -safef(query, sizeof query, "select d.name from all_mrna a, hgFixed.gbCdnaInfo c, hgFixed.description d where a.qName=\"%s\" and a.qName=c.acc and c.description = d.id", acc ); +sqlSafef(query, sizeof query, "select d.name from all_mrna a, hgFixed.gbCdnaInfo c, hgFixed.description d where a.qName=\"%s\" and a.qName=c.acc and c.description = d.id", acc ); return sqlQuickString(conn, query); } static char *uniProtDescriptionFromAcc(struct sqlConnection *conn, char *acc) { char query[4096]; -safef(query, sizeof query, "select v.val from commentVal v, comment c where c.acc = '%s' and v.id=c.commentVal", acc); +sqlSafef(query, sizeof query, "select v.val from commentVal v, comment c where c.acc = '%s' and v.id=c.commentVal", acc); return sqlQuickString(conn, query); } static char *displayIdFromAcc(struct sqlConnection *conn, char *acc) { char query[4096]; -safef(query, sizeof query, "select val from displayId where acc = '%s'", acc); +sqlSafef(query, sizeof query, "select val from displayId where acc = '%s'", acc); return sqlQuickString(conn, query); } static void outputDescription(FILE *f, struct sqlConnection *conn, struct sqlConnection *uconn,struct sqlConnection *pconn, struct genePred *gp, struct hashes *hashes) { char *description; // try refSeq struct wgEncodeGencodeRefSeq *wgr = (struct wgEncodeGencodeRefSeq *)hashFindVal(hashes->genToRefSeq, gp->name); if (wgr != NULL) { char buffer[256]; safecpy(buffer, sizeof buffer, wgr->rnaAcc); + +/* Don't need to strip the version; we're getting info from ncbiRefSeqLink now, which includes them char *ptr = strrchr(buffer, '.'); if (ptr != NULL) *ptr = 0; - +*/ description = (char *)hashFindVal(hashes->refSeqToDescription, buffer); if (!isEmpty(description)) { fprintf(f, "%s (from RefSeq %s)\t", description, buffer); return; } } // // try Uniprot char *uniProtAcc = (char *)hashFindVal(hashes->genToUniProt, gp->name); if (uniProtAcc != NULL) { //char *description = (char *)hashFindVal(hashes->descriptionFromUniProtId, gp->name); @@ -627,47 +629,48 @@ } } #endif static void makeGencodeKnownGene(char *database, char *tempDatabase, char *version, char *txToAccTab) /* makeGencodeKnownGene - make knownGene from Gencode tables. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlConnection *tconn = sqlConnect(tempDatabase); struct sqlConnection *pconn = sqlConnect("proteome"); //struct sqlConnection *uconn = sqlConnect("uniProt"); struct hashes hashes; hashes.genToUC = getMap(txToAccTab); hashes.genToAttrs = getAttrsTable(conn, version); //hashes.genToAnnRemark = getMapTable(conn, "select transcriptId, remark from wgEncodeGencodeAnnotationRemark", version); -hashes.genToUniProt = getMapTable(conn, "select transcriptId, acc from wgEncodeGencodeUniProt", version); +hashes.genToUniProt = getMapTable(conn, "select transcriptId, acc from wgEncodeGencodeUniProt%s", version); hashes.genToRefSeq = getRefSeqTable(conn, version); //hashes.genToRefSeq = getMapTable(tconn, "select name, value from knownToRefSeq", NULL); -struct genePred *compGenePreds = loadGenePreds(conn, "select * from wgEncodeGencodeComp", version); -struct genePred *pseudoGenePreds = loadGenePreds(conn, "select * from wgEncodeGencodePseudoGene", version); +struct genePred *compGenePreds = loadGenePreds(conn, "select * from wgEncodeGencodeComp%s", version); +struct genePred *pseudoGenePreds = loadGenePreds(conn, "select * from wgEncodeGencodePseudoGene%s", version); compGenePreds = slCat(compGenePreds, pseudoGenePreds); hashes.refSeqToPepName = getMapTable(conn, "select mrnaAcc,protAcc from hgFixed.refLink", NULL); //hashes.mrnaToDescription = getMapTable(conn, "select a.name,d.name from all_mrna a, gbCdnaInfo c, description d where a.qName=c.acc and c.description = d.id", NULL); -hashes.refSeqToDescription = getMapTable(conn, "select g.name,d.name from refGene g, hgFixed.gbCdnaInfo c, hgFixed.description d where g.name=c.acc and c.description = d.id", NULL); +//hashes.refSeqToDescription = getMapTable(conn, "select g.name,d.name from refGene g, hgFixed.gbCdnaInfo c, hgFixed.description d where g.name=c.acc and c.description = d.id", NULL); +hashes.refSeqToDescription = getMapTable(conn, "select id,product from ncbiRefSeqLink", NULL); hashes.hgncDescriptionFromGeneName = getMapTable(pconn, "select symbol, name from hgnc", NULL); hashes.refSeqToStatus = getMapTable(conn, "select mrnaAcc, status from hgFixed.refSeqStatus", NULL); -hashes.genToPdb = getMapTable(conn, "select transcriptId, pdbId from wgEncodeGencodePdb", version); +hashes.genToPdb = getMapTable(conn, "select transcriptId, pdbId from wgEncodeGencodePdb%s", version); //hashes.displayIdFromUniProtId = getMapTable(uconn, "select acc, val from displayId", NULL); //printf("displayIdFromUniProtId %ld\n", time(NULL) - start); //hashes.descriptionFromUniProtId = getMapTable(uconn, "select c.acc, v.val from commentVal v, comment c where v.id=c.commentVal", NULL); //printf("descriptionFromUniProtId %ld\n", time(NULL) - start); -hashes.genToTags = getMapTable(conn, "select transcriptId, tag from wgEncodeGencodeTag", version); +hashes.genToTags = getMapTable(conn, "select transcriptId, tag from wgEncodeGencodeTag%s", version); if (!justKnown) outputKnownCanonical(compGenePreds, &hashes); outputKnownGene(compGenePreds, &hashes); if (justKnown) exit(0); outputKnownGeneColor(compGenePreds, &hashes); hashes.genToMrna = getMapTable(tconn, "select name, value from knownToMrnaSingle", NULL); outputKgXref(conn, compGenePreds, &hashes); outputKnownCanonical(compGenePreds, &hashes); //outputKnownToRefSeq(compGenePreds, &hashes); } int main(int argc, char *argv[]) /* Process command line. */ {