78817dc87e7ff4c691c17f5724410bf548877181
kent
Wed May 21 16:18:08 2014 -0700
Adding 'addId' to this utility that handles fa files with the same name for multiple records. Needed for sponge.
diff --git src/utils/faUniqify/faUniqify.c src/utils/faUniqify/faUniqify.c
index caac6a8..6f403cf 100644
--- src/utils/faUniqify/faUniqify.c
+++ src/utils/faUniqify/faUniqify.c
@@ -1,60 +1,83 @@
 /* faUniqify - Remove redundant sequences from fasta. Warn if different sequences have same name.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "fa.h"
 
+/* Globals that hold command line options. */
+boolean addId = FALSE;
+
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "faUniqify - Remove redundant sequences from fasta. Warn if different sequences have same name.\n"
   "usage:\n"
   " faUniqify in.fa out.fa\n"
   "options:\n"
   " -verbose=0 - suppress warning messages\n"
+  " -addId - add an ID suffix to make things unique\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
+   {"addId", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
+int hashCountMatches(struct hash *hash, char *key)
+/* Count number of things matching key in hash */
+{
+int count = 0;
+struct hashEl *hel;
+for (hel = hashLookup(hash, key); hel != NULL; hel = hashLookupNext(hel))
+    ++count;
+return count;
+}
 void faUniqify(char *inFile, char *outFile)
 /* faUniqify - Remove redundant sequences from fasta. Warn if different sequences have same name.. */
 {
 struct hash *uniqHash = hashNew(0);
 struct lineFile *lf = lineFileOpen(inFile, FALSE);
 FILE *f = mustOpen(outFile, "w");
 DNA *dna;
 int size;
 char *name;
 while (faMixedSpeedReadNext(lf, &dna, &size, &name))
     {
     char *newDna = cloneMem(dna, size+1);
     char *oldDna = hashFindVal(uniqHash, name);
     if (oldDna != NULL)
         {
         if (!sameString(oldDna, newDna))
             warn("Name %s reused for different sequences", name);
         }
-    else
+    if (addId || oldDna == NULL)
         {
         hashAdd(uniqHash, name, newDna);
-        faWriteNext(f, name, newDna, size);
+        char *fullName = name;
+        char buf[PATH_LEN];
+        if (addId)
+            {
+            int count = hashCountMatches(uniqHash, name);
+            safef(buf, sizeof(buf), "%sv%d", name, count);
+            fullName = buf;
+            }
+        faWriteNext(f, fullName, newDna, size);
         }
     }
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
+addId = optionExists("addId");
 faUniqify(argv[1], argv[2]);
 return 0;
 }