src/hg/utils/gencodeGeneSymVerTx/gencodeGeneSymVerTx.c 6eaed8d72f6b12600d7fc8f6bb94b14e396b0948

6eaed8d72f6b12600d7fc8f6bb94b14e396b0948
kent
  Mon Jan 18 20:31:32 2021 -0800
Adding in refGene name2 column search as basically the last thing to try.  The way it selects the best transcript out of the refGene pile-up is pretty ad-hoc, just favoring protein coding and better curated, and often just taking the first of many equivalently scoring ones.

diff --git src/hg/utils/gencodeGeneSymVerTx/gencodeGeneSymVerTx.c src/hg/utils/gencodeGeneSymVerTx/gencodeGeneSymVerTx.c
index 3d799b6..39595af 100644
--- src/hg/utils/gencodeGeneSymVerTx/gencodeGeneSymVerTx.c
+++ src/hg/utils/gencodeGeneSymVerTx/gencodeGeneSymVerTx.c
@@ -1,81 +1,83 @@
 /* gencodeGeneSymVerTx - Create a tab separated file with gene ID/symbol/best transcript mapping for many 
  * version of gencode.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "fieldedTable.h"
 #include "genePred.h"
+#include "jksql.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "gencodeGeneSymVerTx - Create a tab separated file with gene ID/symbol/best transcript mapping \n"
   "for many version of gencode.  Internal to UCSC - depends on local directory structures.\n"
   "        /hive/data/genomes/hg*/bed/gencodeV17/data/gencode.tsv\n"
-  "Needs to contain exactly the files we need for input.\n"
+  "Needs to contain exactly the files we need for input. Also this will poll the refGene table\n"
   "usage:\n"
   "   gencodeGeneSymVerTx input.list\toutput.tsv\n"
   "where input.lst is files to be run on.  The file names are parsed and should be\n"
   "of the form:\n"
   "        /hive/data/genomes/hg*/bed/gencodeV17/data/gencode.tsv\n"
   "and there needs to be a gencode.gp in same dir\n"
   "options:\n"
   "   -xxx=XXX\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 struct transcript
 /* Collects info for a single transcript */
     {
     struct transcript *next;
     char *name;	    /* ENST000... */
     char *status;
     char *type;
     struct slName *tags;
+    struct genePred *gp;
     };
 
 struct gene
 /* Collects info from multiple transcripts */
     {
     struct gene *next;
     char *name;			/* ENSG000... */
     char *symbol;	/* HUGO name */
     struct transcript *txList;	/* List of transcripts */
     struct transcript *bestTx;  /* Best transcript */
     };
 
 struct version
 /* One record per gencode version */
     {
     struct version *next;	/* Next in list */
     char *name;			/* gencodeV19 gencodeV34 etc */
-    char *metaName;		/* Full path to gencode.tsv file */
-    char *genePredName;		/* Full path to gencode.gp file */
+    char *metaName;		/* Full path to gencode.tsv file if any*/
+    char *genePredName;		/* Full path to gencode.gp file if any */
     char *ucscDb;		/* hg19, hg38, etc */
     struct gene *geneList;	/* List of genes */
     struct hash *gpHash;	/* Hash of genePredictions keyed by transcript ID */
     };
 
-int scoreTranscript(struct transcript *tx)
+int scoreGencodeTx(struct transcript *tx)
 /* Return a score for transcripts, higher is better.  Weights are ad hoc fit to data */
 {
 int score = 0;
 struct slName *tag;
 for (tag = tx->tags; tag != NULL; tag = tag->next)
      {
      char *name = tag->name;
      if (sameString("appris_principal", name))
           score += 30000;
      if (sameString("appris_principal_1", name))
           score += 30000;
      else if (startsWith("appris_principal", name))
           score += 15000;
      else if (sameString("appris_candidate_highest_score", name))
           score += 20000;
@@ -112,114 +114,210 @@
 else if (endsWith("pseudogene", type))
     score -= 1000;
 return score;
 }
 
 struct slName *parseTags(struct hash *tagHash, char *tagString)
 /* Parse tagString to reveal string list. */
 {
 struct slName *tagList = slNameListFromString(tagString, ',');
 struct slName *tag;
 for (tag = tagList; tag != NULL; tag = tag->next)
     hashIncInt(tagHash, tag->name);
 return tagList;
 }
 
-struct hash *hashGenePredFile(char *fileName)
-/* Load in a genepred into a hash keyed by name */
+struct hash *hashGenePredList(struct genePred *gpList)
+/* Load a hash up with gene preds keyed by name */
 {
-struct genePred *gp, *gpList = genePredLoadAll(fileName);
 struct hash *hash = hashNew(0);
+struct genePred *gp;
 for (gp = gpList; gp != NULL; gp = gp->next)
     hashAdd(hash, gp->name, gp);
 return hash;
 }
 
+struct hash *hashGenePredFile(char *fileName)
+/* Load in a genepred into a hash keyed by name */
+{
+struct genePred *gpList = genePredLoadAll(fileName);
+return hashGenePredList(gpList);
+}
+
+struct gene *geneNew(char *name, char *symbol, struct hash *geneHash)
+/* Make a new nearly empty gene structure */
+{
+struct gene *gene;
+struct lm *lm = geneHash->lm;
+lmAllocVar(lm, gene);
+gene->symbol = lmCloneString(lm, symbol);
+hashAddSaveName(geneHash, name, gene, &gene->name);
+return gene;
+}
 
-struct gene *makeGeneList(struct version *v)
-/* makeGeneList - Create a table that links gene with a canonical transcript to 
+struct transcript *transcriptNew(char *name)
+/* Make a new nearly empty transcript */
+{
+struct transcript *tx;
+AllocVar(tx);
+tx->name = cloneString(name);
+return tx;
+}
+
+
+struct gene *makeGencodeGeneList(struct version *v)
+/* makeGencodeGeneList - Create a table that links gene with a canonical transcript to 
  *  represent the gene for a particular version of gencode. */
 {
 char *metaIn = v->metaName;
 
 /* Read in table and set up indexes into required fields */
 static char *required[] = {"geneId", "geneName", "transcriptId", "tags", 
 	"transcriptStatus", "transcriptType"};
 struct fieldedTable *table = fieldedTableFromTabFile(metaIn, metaIn, required, ArraySize(required));
 verbose(1, "Read %s with %d rows and %d fields\n", metaIn, table->rowCount, table->fieldCount);
 int geneIx = fieldedTableFindFieldIx(table, "geneId");
 int geneNameIx = fieldedTableFindFieldIx(table, "geneName");
 int transcriptIx = fieldedTableFindFieldIx(table, "transcriptId");
 int tagsIx = fieldedTableFindFieldIx(table, "tags");
 int statusIx = fieldedTableFindFieldIx(table, "transcriptStatus");
 int typeIx = fieldedTableFindFieldIx(table, "transcriptType");
 
 struct hash *statusHash = hashNew(0);   // Null val uniqueness cache for status
 struct hash *typeHash = hashNew(0);	// Null val uniqueness cache for type
 struct hash *tagHash = hashNew(0);	// Null val uniqueness cache for tag
 struct hash *txHash = hashNew(0);	// Null val uniqueness cache for transcripts
 
 struct hash *geneHash = hashNew(0);	// gene valued
 struct gene *geneList = NULL;
 
 /* Scan through table building up transcripts and genes. */
-struct fieldedRow *row;
-for (row = table->rowList; row != NULL; row = row->next)
+struct fieldedRow *fr;
+for (fr = table->rowList; fr != NULL; fr = fr->next)
     {
-    char *geneName = row->row[geneIx];
+    char **row = fr->row;
+    char *geneName = row[geneIx];
     struct gene *gene = hashFindVal(geneHash, geneName);
     if (gene == NULL)
         {
-	AllocVar(gene);
-	gene->symbol = cloneString(row->row[geneNameIx]);
-	hashAddSaveName(geneHash, geneName, gene, &gene->name);
+	gene = geneNew(geneName, row[geneNameIx], geneHash);
 	slAddHead(&geneList, gene);
 	}
-    struct transcript *tx;
-    AllocVar(tx);
-    tx->status = hashStoreName(statusHash, row->row[statusIx]);
-    tx->type = hashStoreName(typeHash,row->row[typeIx]);
-    tx->tags = parseTags(tagHash, row->row[tagsIx]);
-    tx->name = hashStoreName(txHash, row->row[transcriptIx]);
+    struct transcript *tx = transcriptNew(row[transcriptIx]);
+    tx->status = hashStoreName(statusHash, row[statusIx]);
+    tx->type = hashStoreName(typeHash,row[typeIx]);
+    tx->tags = parseTags(tagHash, row[tagsIx]);
     slAddHead(&gene->txList, tx);
     }
 slReverse(&geneList);
-verbose(1, "Have %d genes, %d transcripts, %d status, %d type, %d tags\n", 
+verbose(2, "Have %d genes, %d transcripts, %d status, %d type, %d tags\n", 
     geneHash->elCount, txHash->elCount, statusHash->elCount, typeHash->elCount, tagHash->elCount);
 
 /* Find best for each gene */
 struct gene *gene;
 for (gene = geneList; gene != NULL; gene = gene->next)
     {
     slReverse(&gene->txList);
     struct transcript *bestTx = NULL, *tx = NULL;
     int bestScore = -BIGNUM;
     for (tx = gene->txList; tx != NULL; tx = tx->next)
         {
-	int score = scoreTranscript(tx);
+	int score = scoreGencodeTx(tx);
 	if (score > bestScore)
 	     {
 	     bestScore = score;
 	     bestTx = tx;
 	     }
 	}
     gene->bestTx = bestTx;
     }
-verbose(1, "Found best genes\n");
+verbose(2, "Found best genes\n");
 return geneList;
 }
 
+int scoreRefSeqTx(struct transcript *tx)
+/* Return a score for transcripts, higher is better.  Weights are ad hoc fit to data */
+{
+char *name = tx->name;
+int score = 0;
+if (startsWith("NM_", name))
+     score += 9000;
+else if (startsWith("NR_", name))
+     score += 8000;
+else if (startsWith("NP_", name))
+     score += 7000;
+struct genePred *gp = tx->gp;
+if (gp->cdsStart < gp->cdsEnd)
+     score += 600;
+
+/* Favor mappings to real chromosomes */
+score += 90 - strlen(gp->chrom);
+return score;
+}
+
+struct transcript *findBestRefTx(struct transcript *txList)
+/* Return best looking transcript in list assuming it's a refSeq thing */
+{
+struct transcript *tx;
+int bestScore = -BIGNUM;
+struct transcript *bestTx = NULL;
+
+for (tx = txList; tx != NULL; tx = tx->next)
+    {
+    int score = scoreRefSeqTx(tx);
+    if (score > bestScore)
+        {
+	bestScore = score;
+	bestTx = tx;
+	}
+    }
+return bestTx;
+}
+
+
+struct gene *makeRefGeneList(struct version *v, struct genePred *gpList)
+/* We turn genePred format list of transcripts into list of genes by fusing on 
+ * name2 field */
+{
+struct hash *geneHash = hashNew(0);
+struct genePred *gp;
+struct gene *geneList = NULL;
+
+for (gp = gpList; gp != NULL; gp = gp->next)
+    {
+    char *name = gp->name2;
+    struct gene *gene = hashFindVal(geneHash, name);
+    if (gene == NULL)
+        {
+	gene = geneNew(name, name, geneHash);
+	slAddHead(&geneList, gene);
+	}
+    struct transcript *tx = transcriptNew(gp->name);
+    tx->gp = gp;
+    slAddHead(&gene->txList, tx);
+    }
+
+/* Now go through trying to make best transcript for the gene */
+struct gene *gene;
+for (gene = geneList; gene != NULL; gene = gene->next)
+    {
+    slReverse(&gene->txList);
+    gene->bestTx = findBestRefTx(gene->txList);
+    gene->name = cloneString(gene->bestTx->name);   
+    }
+return geneList;
+}
 
 
 struct version *versionsFromFile(char *input)
 // Read in input file which is a list of things like
 //      /hive/data/genomes/hg*/bed/gencodeV17/data/gencode.tsv
 // and return a list of versions */
 {
 struct lineFile *lf = lineFileOpen(input, TRUE);
 char *line;
 struct version *list = NULL, *v;
 while (lineFileNextReal(lf, &line))
     {
     line = trimSpaces(line);
     if (!fileExists(line))
         errAbort("%s in %s doesn't exist", line, input);
@@ -229,58 +327,106 @@
     if ((v->ucscDb = stringBetween("/genomes/", "/bed/", line)) == NULL)
         errAbort("Couldn't parse ucscDb version from %s", line);
     v->metaName = cloneString(line);
 
     /* Make .gp file name from metaName*/
     char buf[PATH_LEN];
     chopSuffix(line);
     safef(buf, sizeof(buf), "%s.gp", line);
     v->genePredName = cloneString(buf);
     slAddHead(&list, v);
     }
 slReverse(&list);
 return list;
 }
 
+struct genePred *gpLoadAllFromTable(char *db, char *table)
+/* Connect to database and load all genePreds from table */
+{
+struct genePred *list = NULL;
+struct sqlConnection *conn = sqlConnect(db);
+char query[512];
+sqlSafef(query, sizeof(query), "select * from %s", table);
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    struct genePred *gp = genePredKnownLoad(row+1, 12); // Skip bin
+    slAddHead(&list, gp);
+    }
+slReverse(&list);
+sqlFreeResult(&sr);
+sqlDisconnect(&conn);
+return list;
+}
+
+struct version *refSeqVersions()
+/* Return list of refSeq versions, just from hg19 or hg38 for now */
+{
+char *targets[] = {"hg19", "hg38"};
+int targetCount = ArraySize(targets);
+struct version *list = NULL;
+int i;
+for (i=0; i<targetCount; ++i)
+    {
+    char *db = targets[i];
+    struct version *v;
+    AllocVar(v);
+    v->name = "refGene";
+    v->ucscDb = db;
+    struct genePred *gpList = gpLoadAllFromTable(db, v->name);
+    v->geneList = makeRefGeneList(v, gpList);
+    v->gpHash = hashGenePredList(gpList);
+    verbose(1, "refSeq %s %d transcripts into %d genes\n", db, slCount(gpList), slCount(v->geneList));
+    slAddHead(&list, v);
+    }
+return list;
+}
+
 void gencodeGeneSymVerTx(char *input, char *output)
 /* gencodeGeneSymVerTx - Create a tab separated file with gene ID/symbol/best transcript 
  * mapping for many version of gencode. */
 {
+/* Read in refSeq */
+struct version *refSeqVersionList = refSeqVersions();
+verbose(1, "refSeq %d versions\n", slCount(refSeqVersionList));
+
 struct version *v, *versionList = versionsFromFile(input);
-verbose(1, "Read %d versions\n", slCount(versionList));
+verbose(1, "Gencode %d versions\n", slCount(versionList));
 
-/* Read them all in before making output.  */
+/* Read in all gencode from file from output.  */
 for (v = versionList; v != NULL; v = v->next)
     {
-    v->geneList = makeGeneList(v);
+    v->geneList = makeGencodeGeneList(v);
     v->gpHash = hashGenePredFile(v->genePredName);
     }
 
+versionList = slCat(refSeqVersionList, versionList);
+
 /* Now make output */
 FILE *f = mustOpen(output, "w");
 fprintf(f, "#gene\tsymbol\tgencodeVersion\tucscDb\tchrom\tchromStart\tchromEnd\ttranscript\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tblockCount\tblockSizes\tblockStarts\n");
 for (v = versionList; v != NULL; v = v->next)
     {
     struct gene *gene;
     for (gene = v->geneList; gene != NULL; gene = gene->next)
         {
-	struct hashEl *hel;
-	for (hel = hashLookup(v->gpHash, gene->bestTx->name); hel != NULL; hel = hashLookupNext(hel))
+	struct genePred *gp = hashFindVal(v->gpHash, gene->bestTx->name);
+	if (gp != NULL)
 	    {
 	    fprintf(f, "%s\t%s\t%s\t%s\t",   
 		gene->name, gene->symbol, v->name, v->ucscDb);
-	    struct genePred *gp = hel->val;
 	    /* Print scalar bed fields. */
 	    fprintf(f, "%s\t", gp->chrom);
 	    fprintf(f, "%u\t", gp->txStart);
 	    fprintf(f, "%u\t", gp->txEnd);
 	    fprintf(f, "%s\t", gene->bestTx->name);
 	    fprintf(f, "%u\t", 0);
 	    fprintf(f, "%s\t", gp->strand);
 	    fprintf(f, "%u\t", gp->cdsStart);
 	    fprintf(f, "%u\t", gp->cdsEnd);
 	    fprintf(f, "%u\t", 0);
 	    fprintf(f, "%u\t", gp->exonCount);
 
 	    /* Print exon-by-exon fields. */
 	    int i;