a44421a79fb36cc2036fe116b97ea3bc9590cd0c braney Fri Dec 2 09:34:39 2011 -0800 removed rcsid (#295) diff --git src/oneShot/scrapeCruzBiotech/scrapeCruzBiotech.c src/oneShot/scrapeCruzBiotech/scrapeCruzBiotech.c index d084960..f51a981 100644 --- src/oneShot/scrapeCruzBiotech/scrapeCruzBiotech.c +++ src/oneShot/scrapeCruzBiotech/scrapeCruzBiotech.c @@ -1,90 +1,89 @@ /* scrapeCruzBiotech - Do some screen scraping of Santa Cruz biotech site looking for antibodies with immunofluorescence.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "htmlPage.h" -static char const rcsid[] = "$Id: scrapeCruzBiotech.c,v 1.1 2006/06/15 15:27:00 kent Exp $"; void usage() /* Explain usage and exit. */ { errAbort( "scrapeCruzBiotech - Do some screen scraping of Santa Cruz biotech site looking for antibodies with immunofluorescence.\n" "usage:\n" " scrapeCruzBiotech startUrl destFile\n" "options:\n" " -xxx=XXX\n" ); } static struct optionSpec options[] = { {NULL, 0}, }; char *catBase = "http://www.scbt.com/catalog/"; void weedString(char *weed, char *s) { s = stringIn(weed, s); if (s != NULL) { int i, len = strlen(weed); for (i=0; i<len; ++i) s[i] = 'X'; } } void scrapeProduct(char *url, int id, FILE *f) /* Scrape one product */ { struct htmlPage *page = htmlPageGet(url); weedString("Multicolor FCM Systems", page->htmlText); boolean gotFcm = (stringIn("FCM", page->htmlText) != NULL); boolean gotIhc = (stringIn("IHC", page->htmlText) != NULL); char *scIdStart = stringIn("sc-", page->htmlText); char *fcmStart = stringIn("FCM", page->htmlText); char *ihcStart = stringIn("IHC", page->htmlText); char *type = "other"; if (stringIn(">Transcription Regulators<", page->htmlText)) type = "txn"; else if (stringIn(">Homeodomain Proteins<", page->htmlText)) type = "hox"; if (scIdStart != NULL) { char *scId = nextWord(&scIdStart); fprintf(f, "%s\tsc-%d\t%s\t%d\t%d\n", scId, id, type, gotFcm, gotIhc); uglyf("%s\tsc-%d\t%s\t%d\t%d\n", scId, id, type, gotFcm, gotIhc); } htmlPageFree(&page); } void scrapeCruzBiotech(char *outFile) /* scrapeCruzBiotech - Do some screen scraping of Santa Cruz biotech site looking for antibodies with immunofluorescence.. */ { FILE *f = mustOpen(outFile, "w"); char url[1024]; int id, minId = 1, maxId=46000; for (id=minId; id<=maxId; ++id) { safef(url, sizeof(url), "http://www.scbt.com/catalog/detail.lasso?-token.order_id=&-database=catalog&-layout=web_detail&-Op=eq&catalog_number=sc-%d&-search", id); scrapeProduct(url, id, f); uglyf("Done %d of %d\n", id, maxId); } carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 2) usage(); scrapeCruzBiotech(argv[1]); return 0; }