38e1bbe7c81547be3306dc87e44bc08e3cf4cd97 braney Wed Apr 8 11:24:17 2015 -0700 first cut at little feet mode diff --git src/hg/lib/customAdjacency.c src/hg/lib/customAdjacency.c new file mode 100644 index 0000000..bcbd0a4 --- /dev/null +++ src/hg/lib/customAdjacency.c @@ -0,0 +1,263 @@ +#include "common.h" +#include "errCatch.h" +#include "hash.h" +#include "linefile.h" +#include "portable.h" +#include "obscure.h" +#include "binRange.h" +#include "pipeline.h" +#include "jksql.h" +#include "net.h" +#include "bed.h" +#include "psl.h" +#include "gff.h" +#include "wiggle.h" +#include "genePred.h" +#include "trackDb.h" +#include "hgConfig.h" +#include "hdb.h" +#include "hui.h" +#include "customTrack.h" +#include "customPp.h" +#include "customFactory.h" +#include "trashDir.h" +#include "jsHelper.h" +#include "encode/encodePeak.h" +#include "udc.h" +#include "bbiFile.h" +#include "bigWig.h" +#include "bigBed.h" +#ifdef USE_BAM +#include "hgBam.h" +#endif//def USE_BAM +#include "vcf.h" +#include "makeItemsItem.h" +#include "bedDetail.h" +#include "pgSnp.h" +#include "regexHelper.h" +#include "chromInfo.h" +#include "adjacency.h" + +char *ctGenomeOrCurrent(struct customTrack *ct); +/*** adjacency Factory - allow adjacency(personal genome SNP) custom tracks ***/ + +static boolean rowIsAdjacency (char **row, char *db, char *type) +/* return TRUE if row looks like a adjacency row */ +{ +return TRUE; +#ifdef NOTNOW +boolean isAdjacency = rowIsBed(row, 3, db); +if (type != NULL && !sameWord(type, "adjacency")) + return FALSE; +if (!isAdjacency && type == NULL) + return FALSE; +else if (!isAdjacency) + errAbort("Error line 1 of custom track, type is adjacency but first 3 fields are not BED"); +if (!isdigit(row[4][0]) && type == NULL) + return FALSE; +else if (!isdigit(row[4][0])) + errAbort("Error line 1 of custom track, type is adjacency but count is not an integer (%s)", row[4]); +int count = atoi(row[4]); +if (count < 1 && type == NULL) + return FALSE; +else if (count < 1) + errAbort("Error line 1 of custom track, type is adjacency but count is less than 1"); +char pattern[128]; /* include count in pattern */ +safef(pattern, sizeof(pattern), "^[ACTG-]+(\\/[ACTG-]+){%d}$", count - 1); +if (! regexMatchNoCase(row[3], pattern) && type == NULL) + return FALSE; +else if (! regexMatchNoCase(row[3], pattern)) + errAbort("Error line 1 of custom track, type is adjacency with a count of %d but allele is invalid %s", count, row[3]); +safef(pattern, sizeof(pattern), "^[0-9]+(,[0-9]+){%d}$", count - 1); +if (! regexMatchNoCase(row[5], pattern) && type == NULL) + return FALSE; +else if (! regexMatchNoCase(row[5], pattern)) + errAbort("Error line 1 of custom track, type is adjacency with a count of %d but frequency is invalid (%s)", count, row[5]); +safef(pattern, sizeof(pattern), "^[0-9.]+(,[0-9.]+){%d}$", count - 1); +if (! regexMatchNoCase(row[6], pattern) && type == NULL) + return FALSE; +else if (! regexMatchNoCase(row[6], pattern)) + errAbort("Error line 1 of custom track, type is adjacency with a count of %d but score is invalid (%s)", count, row[6]); +/* if get here must be adjacency format */ +return TRUE; +#endif +} + +static boolean sameType(char *a, char *b) +/* Case-sensitive comparison of first word if multiple words, + * so that we can compare types like "bed" vs. "bed 3 ." etc. + * Tolerates one null input but not two. */ +{ +if (a == NULL && b == NULL) + errAbort("sameType should not be called when both inputs are NULL."); +else if (a == NULL || b == NULL) + return FALSE; +char *aCopy = cloneString(a); +char *bCopy = cloneString(b); +char *aWord = firstWordInLine(aCopy); +char *bWord = firstWordInLine(bCopy); +boolean same = sameString(aWord, bWord); +freeMem(aCopy); +freeMem(bCopy); +return same; +} + +static boolean adjacencyRecognizer(struct customFactory *fac, + struct customPp *cpp, char *type, + struct customTrack *track) +/* Return TRUE if looks like we're handling an adjacency track */ +{ +if (type != NULL && !sameType(type, fac->name)) + return FALSE; +char *line = customFactoryNextRealTilTrack(cpp); +if (line == NULL) + return FALSE; +char *dupe = cloneString(line); +char *row[9+3]; +int wordCount = chopLine(dupe, row); +boolean isAdjacency = FALSE; +if (wordCount == 9) + { + track->fieldCount = wordCount; + char *ctDb = ctGenomeOrCurrent(track); + isAdjacency = rowIsAdjacency(row, ctDb, type); + } +freeMem(dupe); +customPpReuse(cpp, line); +return (isAdjacency); +} + +static struct pipeline *adjacencyLoaderPipe(struct customTrack *track) +/* Set up pipeline that will load the adjacency into database. */ +{ +/* running the single command: + * hgLoadBed -customTrackLoader -sqlTable=loader/adjacency.sql -renameSqlTable + * -trimSqlTable -notItemRgb -tmpDir=/data/tmp + * -maxChromNameLength=${nameLength} customTrash tableName stdin + */ +struct dyString *tmpDy = newDyString(0); +char *cmd1[] = {"loader/hgLoadBed", "-customTrackLoader", + "-sqlTable=loader/adjacency.sql", "-noBin", "-renameSqlTable", "-trimSqlTable", "-notItemRgb", NULL, NULL, NULL, NULL, NULL, NULL}; +char *tmpDir = cfgOptionDefault("customTracks.tmpdir", "/data/tmp"); +struct stat statBuf; +int index = 6; + +if (stat(tmpDir,&statBuf)) + errAbort("can not find custom track tmp load directory: '%s'
\n" + "create directory or specify in hg.conf customTracks.tmpdir", tmpDir); +dyStringPrintf(tmpDy, "-tmpDir=%s", tmpDir); +cmd1[index++] = dyStringCannibalize(&tmpDy); tmpDy = newDyString(0); +dyStringPrintf(tmpDy, "-maxChromNameLength=%d", track->maxChromName); +cmd1[index++] = dyStringCannibalize(&tmpDy); +cmd1[index++] = CUSTOM_TRASH; +cmd1[index++] = track->dbTableName; +cmd1[index++] = "stdin"; +assert(index <= ArraySize(cmd1)); + +/* the "/dev/null" file isn't actually used for anything, but it is used + * in the pipeLineOpen to properly get a pipe started that isn't simply + * to STDOUT which is what a NULL would do here instead of this name. + * This function exits if it can't get the pipe created + * The dbStderrFile will get stderr messages from hgLoadBed into the + * our private error log so we can send it back to the user + */ +return pipelineOpen1(cmd1, pipelineWrite | pipelineNoAbort, + "/dev/null", track->dbStderrFile); +} + + +//struct adjacency *adjacencyLineFileLoad(char **row, struct lineFile *lf); +void customFactoryCheckChromNameDb(char *genomeDb, char *word, struct lineFile *lf); +void pipelineFailExit(struct customTrack *track); + +/* customTrackAdjacency load item */ +static struct adjacency *customTrackAdjacency(char *db, char **row, + struct hash *chromHash, struct lineFile *lf) +/* Convert a row of strings to adjacency. */ +{ +//struct adjacency *item = adjacencyLineFileLoad(row, lf); +struct adjacency *item = adjacencyLoad(row); +hashStoreName(chromHash, item->chrom); +customFactoryCheckChromNameDb(db, item->chrom, lf); +int chromSize = hChromSize(db, item->chrom); +if (item->chromEnd > chromSize) + lineFileAbort(lf, "chromEnd larger than chrom %s size (%d > %d)", + item->chrom, item->chromEnd, chromSize); +return item; +} + +/* remember to set all the custom track settings necessary */ +static struct customTrack *adjacencyFinish(struct customTrack *track, struct adjacency *itemList) +/* Finish up adjacency tracks. */ +{ +struct adjacency *item; +char buf[50]; +track->tdb->type = cloneString("adjacency"); +track->dbTrackType = cloneString("adjacency"); +safef(buf, sizeof(buf), "%d", track->fieldCount); +ctAddToSettings(track, "fieldCount", cloneString(buf)); +safef(buf, sizeof(buf), "%d", slCount(itemList)); +ctAddToSettings(track, "itemCount", cloneString(buf)); +safef(buf, sizeof(buf), "%s:%u-%u", itemList->chrom, + itemList->chromStart, itemList->chromEnd); +ctAddToSettings(track, "firstItemPos", cloneString(buf)); + +/* If necessary add track offsets. */ +int offset = track->offset; +if (offset != 0) + { + /* Add track offsets if any */ + for (item = itemList; item != NULL; item = item->next) + { + item->chromStart += offset; + item->chromEnd += offset; + } + track->offset = 0; /* so DB load later won't do this again */ + hashMayRemove(track->tdb->settingsHash, "offset"); /* nor the file reader*/ + } + +/* If necessary load database */ +customFactorySetupDbTrack(track); +struct pipeline *dataPipe = adjacencyLoaderPipe(track); +FILE *out = pipelineFile(dataPipe); +for (item = itemList; item != NULL; item = item->next) + adjacencyOutput(item, out, '\t', '\n'); +fflush(out); /* help see error from loader failure */ +if(ferror(out) || pipelineWait(dataPipe)) + pipelineFailExit(track); /* prints error and exits */ +unlink(track->dbStderrFile); /* no errors, not used */ +pipelineFree(&dataPipe); +return track; +} + +static struct customTrack *adjacencyLoader(struct customFactory *fac, + struct hash *chromHash, + struct customPp *cpp, struct customTrack *track, boolean dbRequested) +/* Load up adjacency data until get next track line. */ +{ +char *line; +char *db = ctGenomeOrCurrent(track); +struct adjacency *itemList = NULL; +if (!dbRequested) + errAbort("adjacency custom track type unavailable without custom trash database. Please set that up in your hg.conf"); +while ((line = customFactoryNextRealTilTrack(cpp)) != NULL) + { + char *row[9]; + int wordCount = chopLine(line, row); + struct lineFile *lf = cpp->fileStack; + lineFileExpectAtLeast(lf, track->fieldCount, wordCount); + struct adjacency *item = customTrackAdjacency(db, row, chromHash, lf); + slAddHead(&itemList, item); + } +slReverse(&itemList); +return adjacencyFinish(track, itemList); +} + +struct customFactory adjacencyFactory = +/* Factory for adjacency tracks */ + { + NULL, + "adjacency", + adjacencyRecognizer, + adjacencyLoader, + };