e32ecd59b34531455b424a6f0249353fe5ec4262 braney Fri Jun 24 10:54:16 2022 -0700 fix problem with geneType in knownGene bigGenePred file diff --git src/hg/utils/genePredToBigGenePred/genePredToBigGenePred.c src/hg/utils/genePredToBigGenePred/genePredToBigGenePred.c index d656223..e1676d2 100644 --- src/hg/utils/genePredToBigGenePred/genePredToBigGenePred.c +++ src/hg/utils/genePredToBigGenePred/genePredToBigGenePred.c @@ -13,63 +13,66 @@ /* Explain usage and exit. */ { errAbort( "genePredToBigGenePred - converts genePred or genePredExt to bigGenePred input (bed format with extra fields)\n" "usage:\n" " genePredToBigGenePred [-known] [-score=scores] [-geneNames=geneNames] [-colors=colors] file.gp stdout | sort -k1,1 -k2,2n > file.bgpInput\n" "NOTE: to build bigBed:\n" " wget https://genome.ucsc.edu/goldenpath/help/examples/bigGenePred.as\n" " bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as file.bgpInput chrom.sizes output.bb\n" "options:\n" " -known input file is a genePred in knownGene format\n" " -score=scores scores is two column file with id's mapping to scores\n" " -geneNames=geneNames geneNames is a three column file with id's mapping to two gene names\n" " -colors=colors colors is a four column file with id's mapping to r,g,b\n" " -cds=cds cds is a five column file with id's mapping to cds status codes and exonFrames (see knownCds.as)\n" + " -geneType=geneType geneType is a two column file with id's mapping to geneType\n" ); } struct cds { char *name; enum cdsStatus cdsStartStat; /* enum('none','unk','incmpl','cmpl') */ enum cdsStatus cdsEndStat; /* enum('none','unk','incmpl','cmpl') */ int exonCount; int *exonFrames; /* Exon frame {0,1,2}, or -1 if no frame for exon */ }; struct geneNames { char *name; char *name2; }; struct hash *colorsHash = NULL; struct hash *scoreHash = NULL; struct hash *geneHash = NULL; struct hash *cdsHash = NULL; +struct hash *geneTypeHash = NULL; boolean isKnown; /* Command line validation table. */ static struct optionSpec options[] = { {"known", OPTION_BOOLEAN}, {"score", OPTION_STRING}, {"geneNames", OPTION_STRING}, {"colors", OPTION_STRING}, {"cds", OPTION_STRING}, + {"geneType", OPTION_STRING}, {NULL, 0}, }; #define MAX_BLOCKS 10000 unsigned blockSizes[MAX_BLOCKS]; unsigned blockStarts[MAX_BLOCKS]; void outBigGenePred(FILE *fp, struct genePred *gp) { struct bigGenePred bgp; if (gp->exonCount > MAX_BLOCKS) errAbort("genePred has more than %d exons, make MAX_BLOCKS bigger in source", MAX_BLOCKS); if (gp->exonFrames == NULL) @@ -131,30 +134,32 @@ bgp.geneName = gp->name; bgp.geneName2 = gp->name2; if (geneHash) { struct geneNames *gn = hashFindVal(geneHash, gp->name); if (gn == NULL) warn("Warning: no gene name found for %s", gp->name); else { bgp.geneName = gn->name; bgp.geneName2 = gn->name2; } } bgp.geneType = NULL; +if (geneTypeHash) + bgp.geneType = hashFindVal(geneTypeHash, gp->name); bigGenePredOutput(&bgp, fp, '\t', '\n'); } void genePredToBigGenePred(char *genePredFile, char *bigGeneOutput) /* genePredToBigGenePred - converts genePred or genePredExt to bigGenePred. */ { struct genePred *gp; if (isKnown) gp = genePredKnownLoadAll(genePredFile) ; else gp = genePredExtLoadAll(genePredFile) ; FILE *fp = mustOpen(bigGeneOutput, "w"); @@ -229,30 +234,34 @@ } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); isKnown = optionExists("known"); char *scoreFile = optionVal("score", NULL); if (scoreFile != NULL) scoreHash = hashTwoColumnFile(scoreFile); +char *geneTypeFile = optionVal("geneType", NULL); +if (geneTypeFile != NULL) + geneTypeHash = hashTwoColumnFile(geneTypeFile); + char *geneNames = optionVal("geneNames", NULL); if (geneNames != NULL) geneHash = hashGeneNames(geneNames); char *cdsValues = optionVal("cds", NULL); if (cdsValues != NULL) cdsHash = hashCds(cdsValues); char *colors = optionVal("colors", NULL); if (colors != NULL) colorsHash = hashColors(colors); genePredToBigGenePred(argv[1], argv[2]); return 0; }