e32ecd59b34531455b424a6f0249353fe5ec4262
braney
  Fri Jun 24 10:54:16 2022 -0700
fix problem with geneType in knownGene bigGenePred file

diff --git src/hg/utils/genePredToBigGenePred/genePredToBigGenePred.c src/hg/utils/genePredToBigGenePred/genePredToBigGenePred.c
index d656223..e1676d2 100644
--- src/hg/utils/genePredToBigGenePred/genePredToBigGenePred.c
+++ src/hg/utils/genePredToBigGenePred/genePredToBigGenePred.c
@@ -13,63 +13,66 @@
 /* Explain usage and exit. */
 {
 errAbort(
   "genePredToBigGenePred - converts genePred or genePredExt to bigGenePred input (bed format with extra fields)\n"
   "usage:\n"
   "  genePredToBigGenePred [-known] [-score=scores] [-geneNames=geneNames] [-colors=colors] file.gp stdout | sort -k1,1 -k2,2n > file.bgpInput\n"
   "NOTE: to build bigBed:\n"
   "   wget https://genome.ucsc.edu/goldenpath/help/examples/bigGenePred.as\n"
   "   bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as file.bgpInput chrom.sizes output.bb\n"
   "options:\n"
   "    -known                input file is a genePred in knownGene format\n"
   "    -score=scores         scores is two column file with id's mapping to scores\n"
   "    -geneNames=geneNames  geneNames is a three column file with id's mapping to two gene names\n"
   "    -colors=colors        colors is a four column file with id's mapping to r,g,b\n"
   "    -cds=cds              cds is a five column file with id's mapping to cds status codes and exonFrames (see knownCds.as)\n"
+  "    -geneType=geneType    geneType is a two column file with id's mapping to geneType\n"
   );
 }
 
 
 struct cds
 {
 char *name;
 enum cdsStatus  cdsStartStat;       /* enum('none','unk','incmpl','cmpl') */
 enum cdsStatus cdsEndStat;  /* enum('none','unk','incmpl','cmpl') */
 int exonCount;
 int *exonFrames;    /* Exon frame {0,1,2}, or -1 if no frame for exon */
 };
 
 struct geneNames 
 {
 char *name;
 char *name2;
 };
 
 struct hash *colorsHash = NULL;
 
 struct hash *scoreHash = NULL;
 struct hash *geneHash = NULL;
 struct hash *cdsHash = NULL;
+struct hash *geneTypeHash = NULL;
 boolean isKnown;
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"known", OPTION_BOOLEAN},
    {"score", OPTION_STRING},
    {"geneNames", OPTION_STRING},
    {"colors", OPTION_STRING},
    {"cds", OPTION_STRING},
+   {"geneType", OPTION_STRING},
    {NULL, 0},
 };
 
 #define MAX_BLOCKS 10000
 unsigned blockSizes[MAX_BLOCKS];
 unsigned blockStarts[MAX_BLOCKS];
 
 void outBigGenePred(FILE *fp, struct genePred *gp)
 {
 struct bigGenePred bgp;
 
 if (gp->exonCount > MAX_BLOCKS)
     errAbort("genePred has more than %d exons, make MAX_BLOCKS bigger in source", MAX_BLOCKS);
 
 if (gp->exonFrames == NULL)
@@ -131,30 +134,32 @@
 bgp.geneName = gp->name;
 bgp.geneName2 = gp->name2;
 if (geneHash)
     {
     struct geneNames *gn = hashFindVal(geneHash, gp->name);
     if (gn == NULL)
         warn("Warning: no gene name found for %s", gp->name);
     else
         {
         bgp.geneName = gn->name;
         bgp.geneName2 = gn->name2;
         }
     }
 
 bgp.geneType = NULL;
+if (geneTypeHash)
+    bgp.geneType = hashFindVal(geneTypeHash, gp->name);
 
 bigGenePredOutput(&bgp, fp, '\t', '\n');
 }
 
 void genePredToBigGenePred(char *genePredFile, char *bigGeneOutput)
 /* genePredToBigGenePred - converts genePred or genePredExt to bigGenePred. */
 {
 struct genePred *gp;
 if (isKnown)
     gp = genePredKnownLoadAll(genePredFile) ;
 else
     gp = genePredExtLoadAll(genePredFile) ;
 
 FILE *fp = mustOpen(bigGeneOutput, "w");
 
@@ -229,30 +234,34 @@
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 isKnown = optionExists("known");
 
 char *scoreFile = optionVal("score", NULL);
 if (scoreFile != NULL)
     scoreHash = hashTwoColumnFile(scoreFile);
 
+char *geneTypeFile = optionVal("geneType", NULL);
+if (geneTypeFile != NULL)
+    geneTypeHash = hashTwoColumnFile(geneTypeFile);
+
 char *geneNames = optionVal("geneNames", NULL);
 if (geneNames != NULL)
     geneHash = hashGeneNames(geneNames);
 
 char *cdsValues = optionVal("cds", NULL);
 if (cdsValues != NULL)
     cdsHash = hashCds(cdsValues);
 
 char *colors = optionVal("colors", NULL);
 if (colors != NULL)
     colorsHash = hashColors(colors);
 
 genePredToBigGenePred(argv[1], argv[2]);
 return 0;
 }