ff5e23b66198677fa1002eb60698aeef959faca5
kent
  Thu Oct 28 12:37:17 2010 -0700
Moving many routines from hubCheck to trackHub.c library file.
diff --git src/hg/lib/trackHub.c src/hg/lib/trackHub.c
new file mode 100644
index 0000000..ddd01de
--- /dev/null
+++ src/hg/lib/trackHub.c
@@ -0,0 +1,281 @@
+/* trackHub - supports collections of tracks hosted on a remote site.
+ * The basic layout of a data hub is:
+ *        hub.ra - contains information about the hub itself
+ *        genomes.ra - says which genomes are supported by hub
+ *                 Contains file name of trackDb.ra for each genome
+ *        trackDb.ra - contains a stanza for each track.  Stanzas
+ *                 are in a subset of the usual trackDb format. 
+ * The routines here are most commonly used like so:
+ *     struct trackHub *hub = trackHubOpen(hubRaUrl);
+ *     struct trackHubGenome *hubGenome = trackHubFindGenome(hub, "hg19");
+ *     struct trackDb *tdbList = trackHubTracksForGenome(hub, hubGenome);
+ *          // do something with tdbList
+ *     trackHubClose(&hub);
+ * Note that the tdbList returned does not have the parent/subtrack pointers set.
+ * It is just a simple list of tracks, not a tree.  
+ */
+
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "udc.h"
+#include "ra.h"
+#include "filePath.h"
+#include "htmlPage.h"
+#include "trackDb.h"
+#include "trackHub.h"
+
+static boolean hasProtocol(char *urlOrPath)
+/* Return TRUE if urlOrPath contains "://", as it would after http, ftp, etc. */
+{
+return stringIn("://", urlOrPath) != NULL;
+}
+
+char *trackHubRelativeUrl(char *hubUrl, char *path)
+/* Resolve path, which may be relative to hubUrl, into a full path (in
+ * URL form if it's a remote hub).  Do a freeMem of the result when
+ * done with it. */
+{
+char *fullPath;
+if (hasProtocol(path))
+    /* path is already a URL in its own right, so just copy it. */
+    fullPath = cloneString(path);
+else if (hasProtocol(hubUrl))
+    /* Remote hub: let the html URL expander combine hubUrl and path. */
+    fullPath = htmlExpandUrl(hubUrl, path);
+else
+    /* Hub and path are both local: do standard path parsing. */
+    fullPath = pathRelativeToFile(hubUrl, path);
+return fullPath;
+}
+
+static void badGenomeStanza(struct lineFile *lf)
+/* errAbort, reporting the expected stanza format and lf's current line and file name. */
+{
+errAbort("Genome stanza should have exactly two lines, one with 'genome' and one with 'trackDb'\n"
+         "Bad stanza format ending line %d of %s", lf->lineIx, lf->fileName);
+}
+
+static struct trackHubGenome *trackHubGenomeReadRa(char *url, struct hash *hash)
+/* Read the genomes.ra format file at url and return its stanzas as a list of
+ * trackHubGenomes in file order.  Each one is also added to hash, keyed by genome. */
+{
+struct trackHubGenome *genomeList = NULL, *hubGenome;
+struct lineFile *lf = udcWrapShortLineFile(url, NULL, 16*1024*1024);
+struct hash *stanza;
+
+for (stanza = raNextRecord(lf); stanza != NULL; stanza = raNextRecord(lf))
+    {
+    /* Checks run in this order: field count, genome tag, duplicate, trackDb tag. */
+    char *genome = hashFindVal(stanza, "genome");
+    char *trackDb = hashFindVal(stanza, "trackDb");
+    if (stanza->elCount != 2 || genome == NULL)
+        badGenomeStanza(lf);
+    if (hashLookup(hash, genome) != NULL)
+        errAbort("Duplicate genome %s in stanza ending line %d of %s",
+                 genome, lf->lineIx, lf->fileName);
+    if (trackDb == NULL)
+        badGenomeStanza(lf);
+    /* Record the genome; its trackDb file may be given relative to url. */
+    AllocVar(hubGenome);
+    hubGenome->name = cloneString(genome);
+    hubGenome->trackDbFile = trackHubRelativeUrl(url, trackDb);
+    hashAdd(hash, hubGenome->name, hubGenome);
+    slAddHead(&genomeList, hubGenome);
+    hashFree(&stanza);
+    }
+
+/* Elements were added at the head, so reverse the list into file order. */
+lineFileClose(&lf);
+slReverse(&genomeList);
+return genomeList;
+}
+
+char *trackHubSetting(struct trackHub *hub, char *name)
+/* Return value of the named setting from hub's settings hash, or NULL if it is absent. */
+{
+return hashFindVal(hub->settings, name);
+}
+
+char *trackHubRequiredSetting(struct trackHub *hub, char *name)
+/* Fetch the named hub setting.  If it is missing, errAbort with a message naming the hub url. */
+{
+char *setting = trackHubSetting(hub, name);
+if (!setting)
+    errAbort("Missing required setting %s from %s", name, hub->url);
+return setting;
+}
+
+struct trackHub *trackHubOpen(char *url)
+/* Open the track hub at url: read the one stanza of its hub.ra plus the
+ * genomesFile that stanza names.  Free the result with trackHubClose(). */
+{
+struct trackHub *hub;
+char *genomesUrl;
+struct lineFile *lf = udcWrapShortLineFile(url, NULL, 256*1024);
+struct hash *settings = raNextRecord(lf);
+/* The hub file must hold exactly one record. */
+if (settings == NULL)
+    errAbort("empty %s in trackHubOpen", url);
+if (raNextRecord(lf) != NULL)
+    errAbort("multiple records in %s", url);
+
+/* The record becomes the hub's settings; required fields are looked up in it. */
+AllocVar(hub);
+hub->settings = settings;
+hub->url = cloneString(url);
+hub->shortLabel = trackHubRequiredSetting(hub, "shortLabel");
+hub->longLabel = trackHubRequiredSetting(hub, "longLabel");
+hub->genomesFile = trackHubRequiredSetting(hub, "genomesFile");
+lineFileClose(&lf);
+
+/* genomesFile may be relative to the hub url, so resolve it before reading. */
+genomesUrl = trackHubRelativeUrl(hub->url, hub->genomesFile);
+hub->genomeHash = hashNew(8);
+hub->genomeList = trackHubGenomeReadRa(genomesUrl, hub->genomeHash);
+freez(&genomesUrl);
+return hub;
+}
+
+void trackHubClose(struct trackHub **pHub)
+/* Free the hub at *pHub: its genome list, url, settings hash and genome hash,
+ * and then the hub itself via freez(pHub).  Does nothing if *pHub is NULL. */
+{
+struct trackHub *hub = *pHub;
+if (hub == NULL)
+    return;
+trackHubGenomeFreeList(&hub->genomeList);
+freeMem(hub->url);
+hashFree(&hub->settings);
+hashFree(&hub->genomeHash);
+freez(pHub);
+}
+
+void trackHubGenomeFree(struct trackHubGenome **pGenome)
+/* Free the genome at *pGenome: its name and trackDbFile strings, and then
+ * the genome itself via freez(pGenome).  Does nothing if *pGenome is NULL. */
+{
+struct trackHubGenome *genome = *pGenome;
+if (genome == NULL)
+    return;
+freeMem(genome->name);
+freeMem(genome->trackDbFile);
+freez(pGenome);
+}
+
+void trackHubGenomeFreeList(struct trackHubGenome **pList)
+/* Free every trackHubGenome on the list at *pList, then set *pList to NULL. */
+{
+struct trackHubGenome *genome = *pList;
+while (genome != NULL)
+    {
+    struct trackHubGenome *rest = genome->next;   /* saved before the free */
+    trackHubGenomeFree(&genome);
+    genome = rest;
+    }
+*pList = NULL;
+}
+
+static char *requiredSetting(struct trackHub *hub, struct trackHubGenome *genome,
+        struct trackDb *tdb, char *setting)
+/* Return tdb's value for setting.  If it has none, errAbort with a message that
+ * names the hub, genome and track, unlike trackDbRequiredSetting()'s message. */
+{
+char *value = trackDbSetting(tdb, setting);
+if (!value)
+    errAbort("Missing required %s setting in hub %s genome %s track %s",
+             setting, hub->url, genome->name, tdb->track);
+return value;
+}
+
+static void checkTagsLegal(struct trackHub *hub, struct trackHubGenome *genome,
+        struct trackDb *tdb)
+/* Make sure tdb has the tags required of a hub track, calling errAbort on the
+ * first problem found.  A track whose subtracks field is set (meaning some other
+ * track names it as parent) must be a compositeTrack or container.  Any other
+ * track must have a supported type and a bigDataUrl. */
+{
+char *supportedTypes[] = {"bigWig", "bigBed", "bam"};
+int supportedCount = sizeof(supportedTypes) / sizeof(supportedTypes[0]);
+int i;
+char *type;
+
+/* Fields required in all tracks. */
+requiredSetting(hub, genome, tdb, "shortLabel");
+requiredSetting(hub, genome, tdb, "longLabel");
+
+/* A parent track needs nothing further here beyond being declared as a
+ * compositeTrack or a container. */
+if (tdb->subtracks != NULL)
+    {
+    if (!trackDbSetting(tdb, "compositeTrack") && !trackDbSetting(tdb, "container"))
+        errAbort("Parent track %s is not compositeTrack or container in hub %s genome %s",
+                 tdb->track, hub->url, genome->name);
+    return;
+    }
+
+/* Not a parent track, so its type must pass startsWithWord for one of the
+ * supported type words.  The words are tried in table order. */
+type = requiredSetting(hub, genome, tdb, "type");
+for (i = 0; i < supportedCount; ++i)
+    {
+    if (startsWithWord(supportedTypes[i], type))
+        break;
+    }
+if (i == supportedCount)
+    errAbort("Unsupported type %s in hub %s genome %s track %s", type,
+             hub->url, genome->name, tdb->track);
+
+/* Tracks that are not parents need a bigDataUrl setting too. */
+requiredSetting(hub, genome, tdb, "bigDataUrl");
+}
+
+struct trackHubGenome *trackHubFindGenome(struct trackHub *hub, char *genomeName)
+/* Look up genomeName in hub's genomeHash and return the matching trackHubGenome.
+ * Return NULL if hub has no such genome. */
+{
+return hashFindVal(hub->genomeHash, genomeName);
+}
+
+struct trackDb *trackHubTracksForGenome(struct trackHub *hub, struct trackHubGenome *genome)
+/* Get the list of tracks in genome's trackDb file.  Each track is checked for legal
+ * tags and an existing parent; errAbort is called on the first problem found. */
+{
+struct lineFile *lf = udcWrapShortLineFile(genome->trackDbFile, NULL, 16*1024*1024);
+struct trackDb *tdb, *tdbList = trackDbFromOpenRa(lf);
+struct hash *trackHash;
+lineFileClose(&lf);
+
+/* Index the tracks by name so that parent settings can be resolved. */
+trackHash = hashNew(0);
+for (tdb = tdbList; tdb != NULL; tdb = tdb->next)
+    hashAdd(trackHash, tdb->track, tdb);
+
+/* Use the parent and subtracks fields purely as markers so the tag check can tell
+ * parents from other tracks.  No track leaves the list.  Non-existent parents abort. */
+for (tdb = tdbList; tdb != NULL; tdb = tdb->next)
+    {
+    char *parentName = trackDbLocalSetting(tdb, "parent");
+    struct trackDb *parent;
+    if (parentName == NULL)
+        continue;
+    parent = hashFindVal(trackHash, parentName);
+    if (parent == NULL)
+        errAbort("Parent %s of track %s doesn't exist in hub %s genome %s", parentName,
+                 tdb->track, hub->url, genome->name);
+    tdb->parent = parent;
+    parent->subtracks = tdb;
+    }
+hashFree(&trackHash);
+
+/* Check each track's tags, then clear the marker fields set above. */
+for (tdb = tdbList; tdb != NULL; tdb = tdb->next)
+    {
+    checkTagsLegal(hub, genome, tdb);
+    tdb->subtracks = NULL;
+    tdb->parent = NULL;
+    }
+return tdbList;
+}
+