68efbf55fbd3a55194ef24ca5e21d4bb09221c22 kent Sat Dec 26 13:07:43 2020 -0800 First version of something to make file to help assign gencode genes and symbols to best transcripts is done. diff --git src/hg/utils/gencodeGeneSymVerTx/gencodeGeneSymVerTx.c src/hg/utils/gencodeGeneSymVerTx/gencodeGeneSymVerTx.c new file mode 100644 index 0000000..031f29a --- /dev/null +++ src/hg/utils/gencodeGeneSymVerTx/gencodeGeneSymVerTx.c @@ -0,0 +1,310 @@ +/* gencodeGeneSymVerTx - Create a tab separated file with gene ID/symbol/best transcript mapping for many + * version of gencode.. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "fieldedTable.h" +#include "genePred.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "gencodeGeneSymVerTx - Create a tab separated file with gene ID/symbol/best transcript mapping \n" + "for many version of gencode.\n" + " /hive/data/genomes/hg*/bed/gencodeV17/data/gencode.tsv\n" + "containing exactly the files we need for input.\n" + "usage:\n" + " gencodeGeneSymVerTx input.list\toutput.tsv\n" + "where input.lst is files to be run on. The file names are parsed and should be\n" + "of the form:\n" + " /hive/data/genomes/hg*/bed/gencodeV17/data/gencode.tsv\n" + "and there needs to be a gencode.gp in same dir\n" + "options:\n" + " -xxx=XXX\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + +struct transcript +/* Collects info for a single transcript */ + { + struct transcript *next; + char *name; /* ENST000... */ + char *status; + char *type; + struct slName *tags; + }; + +struct gene +/* Collects info from multiple transcripts */ + { + struct gene *next; + char *name; /* ENSG000... */ + char *symbol; /* HUGO name */ + struct transcript *txList; /* List of transcripts */ + struct transcript *bestTx; /* Best transcript */ + }; + +struct version +/* One record per gencode version */ + { + struct version *next; /* Next in list */ + char *name; /* gencodeV19 gencodeV34 etc */ + char *metaName; /* Full path to gencode.tsv file */ + char *genePredName; /* Full path to gencode.gp file */ + char *ucscDb; /* hg19, hg38, etc */ + struct gene *geneList; /* List of genes */ + struct hash *gpHash; /* Hash of genePredictions keyed by transcript ID */ + }; + +int scoreTranscript(struct transcript *tx) +/* Return a score for transcripts, higher is better. Weights are ad hoc fit to data */ +{ +int score = 0; +struct slName *tag; +for (tag = tx->tags; tag != NULL; tag = tag->next) + { + char *name = tag->name; + if (sameString("appris_principal", name)) + score += 30000; + if (sameString("appris_principal_1", name)) + score += 30000; + else if (startsWith("appris_principal", name)) + score += 15000; + else if (sameString("appris_candidate_highest_score", name)) + score += 20000; + else if (sameString("appris_candidate_longest_ccds", name)) + score += 15100; + else if (sameString("appris_candidate_longest", name)) + score += 15000; + else if (sameString("appris_candidate_longest_seq", name)) + score += 15000; + else if (sameString("appris_candidate_ccds", name)) + score += 14000; + else if (sameString("appris_candidate", name)) + score += 10000; + else if (startsWith("appris_alternative", name)) + score += 0; + else if (startsWith("appris_", name)) + errAbort("Unknown appris %s", name); + else if (sameString("CCDS", name)) + score += 3000; + } +char *status = tx->status; +if (sameString("KNOWN", status)) + score += 300; +else if (sameString("NOVEL", status)) + score += 200; + +char *type = tx->type; +if (sameString("protein_coding", type)) + score += 300; +else if (sameString("antisense", type)) + score -= 100; +else if (sameString("sense_intronic", type)) + score -= 100; +else if (endsWith("pseudogene", type)) + score -= 1000; +return score; +} + +struct slName *parseTags(struct hash *tagHash, char *tagString) +/* Parse tagString to reveal string list. */ +{ +struct slName *tagList = slNameListFromString(tagString, ','); +struct slName *tag; +for (tag = tagList; tag != NULL; tag = tag->next) + hashIncInt(tagHash, tag->name); +return tagList; +} + +struct hash *hashGenePredFile(char *fileName) +/* Load in a genepred into a hash keyed by name */ +{ +struct genePred *gp, *gpList = genePredLoadAll(fileName); +struct hash *hash = hashNew(0); +for (gp = gpList; gp != NULL; gp = gp->next) + hashAdd(hash, gp->name, gp); +return hash; +} + + +struct gene *makeGeneList(struct version *v) +/* makeGeneList - Create a table that links gene with a canonical transcript to + * represent the gene for a particular version of gencode. */ +{ +char *metaIn = v->metaName; + +/* Read in table and set up indexes into required fields */ +static char *required[] = {"geneId", "geneName", "transcriptId", "tags", + "transcriptStatus", "transcriptType"}; +struct fieldedTable *table = fieldedTableFromTabFile(metaIn, metaIn, required, ArraySize(required)); +verbose(1, "Read %s with %d rows and %d fields\n", metaIn, table->rowCount, table->fieldCount); +int geneIx = fieldedTableFindFieldIx(table, "geneId"); +int geneNameIx = fieldedTableFindFieldIx(table, "geneName"); +int transcriptIx = fieldedTableFindFieldIx(table, "transcriptId"); +int tagsIx = fieldedTableFindFieldIx(table, "tags"); +int statusIx = fieldedTableFindFieldIx(table, "transcriptStatus"); +int typeIx = fieldedTableFindFieldIx(table, "transcriptType"); + +struct hash *statusHash = hashNew(0); // Null val uniqueness cache for status +struct hash *typeHash = hashNew(0); // Null val uniqueness cache for type +struct hash *tagHash = hashNew(0); // Null val uniqueness cache for tag +struct hash *txHash = hashNew(0); // Null val uniqueness cache for transcripts + +struct hash *geneHash = hashNew(0); // gene valued +struct gene *geneList = NULL; + +/* Scan through table building up transcripts and genes. */ +struct fieldedRow *row; +for (row = table->rowList; row != NULL; row = row->next) + { + char *geneName = row->row[geneIx]; + struct gene *gene = hashFindVal(geneHash, geneName); + if (gene == NULL) + { + AllocVar(gene); + gene->symbol = cloneString(row->row[geneNameIx]); + hashAddSaveName(geneHash, geneName, gene, &gene->name); + slAddHead(&geneList, gene); + } + struct transcript *tx; + AllocVar(tx); + tx->status = hashStoreName(statusHash, row->row[statusIx]); + tx->type = hashStoreName(typeHash,row->row[typeIx]); + tx->tags = parseTags(tagHash, row->row[tagsIx]); + tx->name = hashStoreName(txHash, row->row[transcriptIx]); + slAddHead(&gene->txList, tx); + } +slReverse(&geneList); +verbose(1, "Have %d genes, %d transcripts, %d status, %d type, %d tags\n", + geneHash->elCount, txHash->elCount, statusHash->elCount, typeHash->elCount, tagHash->elCount); + +/* Find best for each gene */ +struct gene *gene; +for (gene = geneList; gene != NULL; gene = gene->next) + { + slReverse(&gene->txList); + struct transcript *bestTx = NULL, *tx = NULL; + int bestScore = -BIGNUM; + for (tx = gene->txList; tx != NULL; tx = tx->next) + { + int score = scoreTranscript(tx); + if (score > bestScore) + { + bestScore = score; + bestTx = tx; + } + } + gene->bestTx = bestTx; + } +verbose(1, "Found best genes\n"); +return geneList; +} + + + +struct version *versionsFromFile(char *input) +// Read in input file which is a list of things like +// /hive/data/genomes/hg*/bed/gencodeV17/data/gencode.tsv +// and return a list of versions */ +{ +struct lineFile *lf = lineFileOpen(input, TRUE); +char *line; +struct version *list = NULL, *v; +while (lineFileNextReal(lf, &line)) + { + line = trimSpaces(line); + if (!fileExists(line)) + errAbort("%s in %s doesn't exist", line, input); + AllocVar(v); + if ((v->name = stringBetween("/bed/", "/data/", line)) == NULL) + errAbort("Couldn't parse gencode version from %s", line); + if ((v->ucscDb = stringBetween("/genomes/", "/bed/", line)) == NULL) + errAbort("Couldn't parse ucscDb version from %s", line); + v->metaName = cloneString(line); + + /* Make .gp file name from metaName*/ + char buf[PATH_LEN]; + chopSuffix(line); + safef(buf, sizeof(buf), "%s.gp", line); + v->genePredName = cloneString(buf); + slAddHead(&list, v); + } +slReverse(&list); +return list; +} + +void gencodeGeneSymVerTx(char *input, char *output) +/* gencodeGeneSymVerTx - Create a tab separated file with gene ID/symbol/best transcript + * mapping for many version of gencode. */ +{ +struct version *v, *versionList = versionsFromFile(input); +verbose(1, "Read %d versions\n", slCount(versionList)); + +/* Read them all in before making output. */ +for (v = versionList; v != NULL; v = v->next) + { + v->geneList = makeGeneList(v); + v->gpHash = hashGenePredFile(v->genePredName); + } + +/* Now make output */ +FILE *f = mustOpen(output, "w"); +fprintf(f, "#gene\tsymbol\tgencodeVersion\tucscDb\tchrom\tchromStart\tchromEnd\ttranscript\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tblockCount\tblockSizes\tblockStarts\n"); +for (v = versionList; v != NULL; v = v->next) + { + struct gene *gene; + for (gene = v->geneList; gene != NULL; gene = gene->next) + { + struct hashEl *hel; + for (hel = hashLookup(v->gpHash, gene->bestTx->name); hel != NULL; hel = hashLookupNext(hel)) + { + fprintf(f, "%s\t%s\t%s\t%s\t", + gene->name, gene->symbol, v->name, v->ucscDb); + struct genePred *gp = hel->val; + /* Print scalar bed fields. */ + fprintf(f, "%s\t", gp->chrom); + fprintf(f, "%u\t", gp->txStart); + fprintf(f, "%u\t", gp->txEnd); + fprintf(f, "%s\t", gene->bestTx->name); + fprintf(f, "%u\t", 0); + fprintf(f, "%s\t", gp->strand); + fprintf(f, "%u\t", gp->cdsStart); + fprintf(f, "%u\t", gp->cdsEnd); + fprintf(f, "%u\t", 0); + fprintf(f, "%u\t", gp->exonCount); + + /* Print exon-by-exon fields. */ + int i; + + /* Print exon sizes */ + for (i=0; iexonCount; ++i) + fprintf(f, "%u,", gp->exonEnds[i] - gp->exonStarts[i]); + fprintf(f, "\t"); + + /* Print exons starts */ + for (i=0; iexonCount; ++i) + fprintf(f, "%u,", gp->exonStarts[i] - gp->txStart); + fprintf(f, "\n"); + } + } + } +carefulClose(&f); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); +gencodeGeneSymVerTx(argv[1], argv[2]); +return 0; +}