79124adf60b5a8e99e4025e8f982389e13db955e braney Wed Sep 4 15:09:02 2019 -0700 starting V32 of hg38 knownGene diff --git src/hg/makeDb/kgAllocId/kgAllocId.c src/hg/makeDb/kgAllocId/kgAllocId.c new file mode 100644 index 0000000..9b1ea5f --- /dev/null +++ src/hg/makeDb/kgAllocId/kgAllocId.c @@ -0,0 +1,154 @@ +/* kgAllocId - Assign new knownGene ids to Gencode IDs. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "txCommon.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "kgAllocId - Assign new knownGene ids to Gencode IDs\n" + "usage:\n" + " kgAllocId oldMap newIds startIdx newMap\n" + "options:\n" + " -xxx=XXX\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + +struct version +{ +unsigned number; +char *id; +}; + +struct hash *readMapNoVersion(char *name) +{ +struct hash *hash = newHash(10); +struct lineFile *lf = lineFileOpen(name, TRUE); + +char *row[2]; +while (lineFileRow(lf, row)) + { + char *ptr = strrchr(row[0], '.'); + *ptr++ = 0; + struct version *version; + AllocVar(version); + version->number = atoi(ptr); + version->id = cloneString(row[1]); + + hashAdd(hash, row[0], version); + } + +lineFileClose(&lf); + +return hash; +} + +struct hash *readMap(char *name) +{ +struct hash *hash = newHash(10); +struct lineFile *lf = lineFileOpen(name, TRUE); + +char *row[2]; +while (lineFileRow(lf, row)) + hashAdd(hash, row[0], cloneString(row[1])); + +lineFileClose(&lf); + +return hash; +} + +unsigned txId; +char *newId() +{ +char *newAcc = needMem(100); +txGeneAccFromId(++txId, newAcc); +strcat(newAcc, ".1"); +return newAcc; +} + +char *addOne(char *id) +{ +if (startsWith("uc064bas", id)) + printf("big\n"); +char *copyId = cloneString(id); +char *ptr = strrchr(copyId, '.'); +*ptr++ = 0; +unsigned number = atoi(ptr) + 1; + +char buffer[4096]; +safef(buffer, sizeof buffer, "%s.%d", copyId, number); + +return cloneString(buffer); +} + +void kgAllocId(char *oldMap, char *newIds, char * startIdStr, char *newMap) +/* kgAllocId - Assign new knownGene ids to Gencode IDs. */ +{ +txId = atoi(startIdStr); +struct hash *oldMapHash = readMap(oldMap); +struct hash *oldMapHashNoVer = readMapNoVersion(oldMap); +struct lineFile *lf = lineFileOpen(newIds, TRUE); +FILE *out = mustOpen(newMap, "w"); + +char *row[1]; +while (lineFileRow(lf, row)) + { + char *thisId = cloneString(row[0]); + // first look to see if this id already in map + char *val = hashFindVal(oldMapHash, row[0]); + if (val) + { + fprintf(out, "%s\t%s\n", row[0], val); + continue; + } + + // check to see if we have the id with a different version + char *ptr = strrchr(row[0], '.'); + *ptr++ = 0; + struct hashEl *hel = hashLookup(oldMapHashNoVer, row[0]); + + if (hel) + { + struct hashEl *iter = hel; + char *id = NULL; + unsigned max = 0; + for(; iter; iter = iter->next) + { + if (differentString(iter->name, row[0])) + continue; + struct version *version =((struct version *)iter->val); + unsigned value = version->number; + if (value > max) + { + max = value; + id = version->id; + } + } + + fprintf(out, "%s\t%s\n", thisId, addOne(id)); + continue; + } + + fprintf(out, "%s\t%s\n", thisId, newId()); + + } +fprintf(stderr,"lastId %d\n", txId); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 5) + usage(); +kgAllocId(argv[1], argv[2], argv[3], argv[4]); +return 0; +}