2670d57795b4b8ff02a480f97adcfdc0b3304a11 braney Mon Sep 9 13:59:07 2019 -0700 add some comments in response to code review diff --git src/hg/makeDb/kgAllocId/kgAllocId.c src/hg/makeDb/kgAllocId/kgAllocId.c index 9b1ea5f..5077345 100644 --- src/hg/makeDb/kgAllocId/kgAllocId.c +++ src/hg/makeDb/kgAllocId/kgAllocId.c @@ -1,154 +1,161 @@ /* kgAllocId - Assign new knownGene ids to Gencode IDs. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "txCommon.h" void usage() /* Explain usage and exit. */ { errAbort( "kgAllocId - Assign new knownGene ids to Gencode IDs\n" "usage:\n" " kgAllocId oldMap newIds startIdx newMap\n" - "options:\n" - " -xxx=XXX\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; struct version { -unsigned number; +unsigned number; // version number char *id; }; struct hash *readMapNoVersion(char *name) +/* Read in a mapping between ENST ids and UC id's. + * Strip off version number of ENST id before adding to hash. */ { struct hash *hash = newHash(10); struct lineFile *lf = lineFileOpen(name, TRUE); char *row[2]; while (lineFileRow(lf, row)) { char *ptr = strrchr(row[0], '.'); *ptr++ = 0; struct version *version; AllocVar(version); version->number = atoi(ptr); version->id = cloneString(row[1]); hashAdd(hash, row[0], version); } lineFileClose(&lf); return hash; } struct hash *readMap(char *name) +/* Read in a mapping between ENST ids and UC id's. */ { struct hash *hash = newHash(10); struct lineFile *lf = lineFileOpen(name, TRUE); char *row[2]; while (lineFileRow(lf, row)) hashAdd(hash, row[0], cloneString(row[1])); lineFileClose(&lf); return hash; } -unsigned txId; +static unsigned txId; // the next UC id to allocate + char *newId() +// Allocate a new UC ID. */ { char *newAcc = needMem(100); txGeneAccFromId(++txId, newAcc); strcat(newAcc, ".1"); return newAcc; } char *addOne(char *id) +// Add one to the version number of this id. { -if (startsWith("uc064bas", id)) - printf("big\n"); char *copyId = cloneString(id); char *ptr = strrchr(copyId, '.'); *ptr++ = 0; unsigned number = atoi(ptr) + 1; char buffer[4096]; safef(buffer, sizeof buffer, "%s.%d", copyId, number); return cloneString(buffer); } void kgAllocId(char *oldMap, char *newIds, char * startIdStr, char *newMap) /* kgAllocId - Assign new knownGene ids to Gencode IDs. */ { txId = atoi(startIdStr); struct hash *oldMapHash = readMap(oldMap); struct hash *oldMapHashNoVer = readMapNoVersion(oldMap); struct lineFile *lf = lineFileOpen(newIds, TRUE); FILE *out = mustOpen(newMap, "w"); char *row[1]; while (lineFileRow(lf, row)) { char *thisId = cloneString(row[0]); + // first look to see if this id already in map char *val = hashFindVal(oldMapHash, row[0]); if (val) { fprintf(out, "%s\t%s\n", row[0], val); continue; } // check to see if we have the id with a different version char *ptr = strrchr(row[0], '.'); *ptr++ = 0; struct hashEl *hel = hashLookup(oldMapHashNoVer, row[0]); if (hel) { + // we found the id with a different version number struct hashEl *iter = hel; char *id = NULL; unsigned max = 0; + + // make sure we find the id with the highest version number for(; iter; iter = iter->next) { if (differentString(iter->name, row[0])) continue; struct version *version =((struct version *)iter->val); unsigned value = version->number; if (value > max) { max = value; id = version->id; } } fprintf(out, "%s\t%s\n", thisId, addOne(id)); continue; } + // didn't find it, so allocate a new id fprintf(out, "%s\t%s\n", thisId, newId()); } fprintf(stderr,"lastId %d\n", txId); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 5) usage(); kgAllocId(argv[1], argv[2], argv[3], argv[4]); return 0; }