a44421a79fb36cc2036fe116b97ea3bc9590cd0c braney Fri Dec 2 09:34:39 2011 -0800 removed rcsid (#295) diff --git src/utils/raToSvmLight/raToSvmLight.c src/utils/raToSvmLight/raToSvmLight.c index cb71c60..581f8c3 100644 --- src/utils/raToSvmLight/raToSvmLight.c +++ src/utils/raToSvmLight/raToSvmLight.c @@ -1,183 +1,182 @@ /* raToSvmLight - Convert .ra file to feature vector input for svmLight.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "localmem.h" #include "obscure.h" #include "ra.h" -static char const rcsid[] = "$Id: raToSvmLight.c,v 1.2 2007/03/13 21:31:05 kent Exp $"; boolean good = FALSE; boolean bad = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "raToSvmLight - Convert .ra file to feature vector input for svmLight.\n" "usage:\n" " raToSvmLight in.ra keyField out.feature out.keys\n" "where:\n" " in.ra is a ra file. All fields except the keyField should be numeric\n" " keyField is the name of the field that identifies the record, often 'name' or 'acc'\n" " out.feature is the converted output, ready for svm_learn or svm_classify\n" " out.keys - contains one line per ra record, with just the keyField value.\n" " You want this because it is not in out.feature. The svm_learn/svm_classify\n" " rely on you keeping track externally what feature is in what line.\n" "options:\n" " -good - Mark this as positive training set\n" " -bad - Mark this as negative training set\n" " -fields=fields.tab Save the field/numerical id values here\n" ); } static struct optionSpec options[] = { {"good", OPTION_BOOLEAN}, {"bad", OPTION_BOOLEAN}, {"fields", OPTION_STRING}, {NULL, 0}, }; struct idVal /* An id (index into feature vector) and it's value */ { struct idVal *next; int id; double val; }; int idValCmp(const void *va, const void *vb) /* Compare to sort based on id. */ { const struct idVal *a = *((struct idVal **)va); const struct idVal *b = *((struct idVal **)vb); int diff = a->id - b->id; if (diff == 0) { double valDiff = b->val - a->val; if (valDiff < 0) diff = -1; else if (valDiff > 0) diff = 1; } return diff; } void idValAdd(struct lm *lm, struct idVal **pList, int id, double val) /* Make up new idVal and add it to head of list. */ { struct idVal *iv; AllocVar(iv); iv->id = id; iv->val = val; slAddHead(pList, iv); } void raToSvmLight(char *inFile, char *keyField, char *outFeatures, char *outKeys) /* raToSvmLight - Convert .ra file to feature vector input for svmLight. */ { /* Read file into a list of ra hashes. Build up symbol table mapping */ struct lineFile *lf = lineFileOpen(inFile, TRUE); struct hash *ra, *raList = NULL; struct hash *symHash = hashNew(0); int id = 0; while ((ra = raNextRecord(lf)) != NULL) { struct hashCookie cookie = hashFirst(ra); struct hashEl *el; while ((el = hashNext(&cookie)) != NULL) { if (!sameString(el->name, keyField)) if (!hashLookup(symHash, el->name)) hashAdd(symHash, el->name, NULL); } slAddHead(&raList, ra); } lineFileClose(&lf); slReverse(&raList); /* Alphabetize symbols and assign IDs. (The alphabetization is so that * different files missing data in different places still end up with * same IDs. */ struct hashEl *el, *list = hashElListHash(symHash); slSort(&list, hashElCmp); for (el = list; el != NULL; el = el->next) { struct hashEl *realEl = hashLookup(symHash, el->name); realEl->val = intToPt(++id); } /* For each ra, convert to feature list, sort, and output. */ FILE *f = mustOpen(outFeatures, "w"); for (ra = raList; ra != NULL; ra = ra->next) { struct lm *lm = lmInit(0); struct idVal *iv, *ivList = NULL; struct hashCookie cookie = hashFirst(ra); struct hashEl *el; while ((el = hashNext(&cookie)) != NULL) { if (!sameString(el->name, keyField)) { int id = hashIntVal(symHash, el->name); char *valString = el->val; char c = valString[0]; if (isdigit(c) || (c == '-' && isdigit(valString[1]))) idValAdd(lm, &ivList, id, atof(valString)); else errAbort("%s has non-numeric value %s", el->name, valString); } } slSort(&ivList, idValCmp); if (good) fprintf(f, "+1"); else if (bad) fprintf(f, "-1"); else fprintf(f, "0"); for (iv = ivList; iv != NULL; iv = iv->next) fprintf(f, " %d:%g", iv->id, iv->val); fprintf(f, "\n"); } carefulClose(&f); /* Write out key fields in same order as feature vector lines. */ f = mustOpen(outKeys, "w"); for (ra = raList; ra != NULL; ra = ra->next) { char *key = hashMustFindVal(ra, keyField); fprintf(f, "%s\n", key); } carefulClose(&f); /* Optionally write out correspondence between feature field ID and ra field names. */ if (optionExists("fields")) { char *fileName = optionVal("fields", NULL); f = mustOpen(fileName, "w"); struct hashEl *el, *elList = hashElListHash(symHash); slSort(&elList, hashElCmp); for (el = elList; el != NULL; el = el->next) { fprintf(f, "%s\t%d\n", el->name, ptToInt(el->val)); } carefulClose(&f); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 5) usage(); good = optionExists("good"); bad = optionExists("bad"); if (bad && good) errAbort("The options good and bad don't go together."); raToSvmLight(argv[1], argv[2], argv[3], argv[4]); return 0; }