src/hg/utils/tdbQuery/tdbQuery.c 1.19

1.19 2009/12/05 22:47:39 kent
Adding additional checks on child/parent relationships. Forcing children to be close to parent in file.
Index: src/hg/utils/tdbQuery/tdbQuery.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/tdbQuery/tdbQuery.c,v
retrieving revision 1.18
retrieving revision 1.19
diff -b -B -U 4 -r1.18 -r1.19
--- src/hg/utils/tdbQuery/tdbQuery.c	5 Dec 2009 19:30:24 -0000	1.18
+++ src/hg/utils/tdbQuery/tdbQuery.c	5 Dec 2009 22:47:39 -0000	1.19
@@ -151,39 +151,8 @@
 		var->name, var->name, glTagTypeFile);
     }
 }
 
-static void doRecordChecks(struct tdbRecord *recordList, struct lm *lm)
-/* Do additional checks on records. */
-{
-struct tdbRecord *record;
-for (record = recordList; record != NULL; record = record->next)
-    {
-    struct tdbField *typeField = tdbRecordField(record, "type");
-    char *fullType = (typeField != NULL ? typeField->val : record->key);
-    char *type = lmCloneFirstWord(lm, fullType);
-    struct tdbField *field;
-    for (field = record->fieldList; field != NULL; field = field->next)
-        {
-	struct slName *typeList = hashFindVal(glTagTypes, field->name);
-	if (typeList == NULL)
-	    {
-	    recordAbort(record, 
-	    	"Tag '%s' not found in %s.\nIf it's not a typo please add %s to that file.  "
-		"The tag is", 
-	    	field->name, glTagTypeFile, field->name);
-	    }
-	if (!matchAnyWild(typeList, type))
-	    {
-	    recordAbort(record, 
-	    	"Tag '%s' not allowed for tracks of type '%s'.  Please add it to supported types\n"
-		"in %s if this is not a mistake.  The tag is", 
-	    	field->name, type, glTagTypeFile);
-	    }
-	}
-    }
-}
-
 struct dbPath
 /* A database directory and path. */
     {
     struct dbPath *next;
@@ -577,9 +546,9 @@
 static int parentChildFileDistance(struct tdbRecord *parent, struct tdbRecord *child)
 /* Return distance of two records.  If they're in different files the
  * distance gets pretty big.  Would be flaky on records split across
  * different files, hence the ad-hoc in the name.  Not worth implementing
- * somthing that handles this though with the hope that the parent/child
+ * something that handles this though with the hope that the parent/child
  * relationship will become indentation rather than ID based. */
 {
 struct tdbFilePos *parentFp = parent->posList, *childFp = child->posList;
 if (!sameString(parentFp->fileName, childFp->fileName))
@@ -598,12 +567,8 @@
 {
 struct tdbField *parentField = tdbRecordField(rec, parentFieldName);
 if (parentField == NULL)
     return NULL;
-#ifdef OLD
-if (!recordMatchesRelease(rec, alpha))
-    return NULL;
-#endif /* OLD */
 char *parentLine = parentField->val;
 int len = strlen(parentLine);
 char buf[len+1];
 strcpy(buf, parentLine);
@@ -615,20 +580,15 @@
 for (hel = hashLookup(hash, parentName); hel != NULL; hel = hashLookupNext(hel))
     {
     gotParentSomeRelease = TRUE;
     struct tdbRecord *parent = hel->val;
-#ifdef OLD
-    if (recordMatchesRelease(parent, alpha))
-#endif /* OLD */
-	{
 	int distance = parentChildFileDistance(parent, rec);
 	if (distance < closestDistance)
 	    {
 	    closestParent = parent;
 	    closestDistance = distance;
 	    }
 	}
-    }
 if (closestParent != NULL)
     return closestParent;
 
 /* If we haven't matched so far, it could be that the release tag is set in the parent
@@ -679,14 +639,14 @@
 hashFree(&hash);
 }
 
 
-struct tdbRecord *tdbsForDbPath(struct dbPath *p, struct lm *lm, struct hash *recordHash,
+struct tdbRecord *tdbsForDbPath(struct dbPath *p, struct lm *lm, 
 	char *parentField, boolean alpha)
 /* Assemble recordList for given database.  This looks at the root/organism/assembly
- * levels.  It returns a list, and fills in a hash (which should be passed in empty)
- * of the records keyed by record->key. */
+ * levels.  It returns a list of records. */
 {
+struct hash *recordHash = hashNew(0);
 struct slName *fileLevelList = dbPathToFiles(p), *fileLevel;
 struct tdbRecord *recordList = NULL;
 for (fileLevel = fileLevelList; fileLevel != NULL; fileLevel = fileLevel->next)
     {
@@ -721,10 +681,10 @@
 	    slAddHead(&recordList, record);
 	    }
 	}
     }
+hashFree(&recordHash);
 slReverse(&recordList);
-
 return recordList;
 }
 
 static void mergeParentRecord(struct tdbRecord *record, struct tdbRecord *parent, 
@@ -896,8 +856,116 @@
     }
 return FALSE;
 }
 
+static struct tdbRecord *closestParentInFile(struct slRef *allParentRefs, 
+	struct tdbFilePos *childPos)
+/* Return record from allParentRefs positioned nearest before childPos in same file, or NULL. */
+{
+struct slRef *parentRef;
+struct tdbRecord *closestParent = NULL;
+int closestDistance = BIGNUM;  /* Smallest positive line gap found so far. */
+for (parentRef = allParentRefs; parentRef != NULL; parentRef = parentRef->next)
+    {
+    struct tdbRecord *parent = parentRef->val;
+    struct tdbFilePos *pos;
+    for (pos = parent->posList; pos != NULL; pos = pos->next)  /* Every position of this candidate. */
+        {
+	if (sameString(pos->fileName, childPos->fileName))
+	    {
+	    int distance = childPos->lineIx - pos->lineIx;
+	    if (distance > 0)  /* Only positions on lines before the child qualify. */
+	        {
+		if (distance < closestDistance)  /* Strict compare: first candidate found wins a tie. */
+		    {
+		    closestDistance = distance;
+		    closestParent = parent;
+		    }
+		}
+	    }
+	}
+    }
+return closestParent;  /* Still NULL if no candidate qualified. */
+}
+
+static void checkChildUnderNearestParent(struct slRef *allParentRefs,
+	struct tdbRecord *parent, struct tdbRecord *child)
+/* For each file holding both child and parent, errAbort if the parent comes after the child,
+ * or if some other record in allParentRefs is the nearest one before the child in that file. */
+{
+/* Repeat the check for every file position recorded for the child. */
+struct tdbFilePos *childFp, *parentFp;
+for (childFp = child->posList; childFp != NULL; childFp = childFp->next)
+    {
+    /* Look at each parent position that is in the same file as childFp. */
+    for (parentFp = parent->posList; parentFp != NULL; parentFp = parentFp->next)
+        {
+	if (sameString(parentFp->fileName, childFp->fileName))
+	    {
+	    if (parentFp->lineIx > childFp->lineIx)
+	        errAbort("Child before parent in %s\n"
+		         "Child (%s) at line %d, parent (%s) at line %d",
+			 childFp->fileName, child->key, childFp->lineIx, 
+			 parent->key, parentFp->lineIx);
+	    struct tdbRecord *closestParent = closestParentInFile(allParentRefs, childFp);
+	    assert(closestParent != NULL);  /* NOTE(review): assumes parent is in allParentRefs, on an earlier line - confirm. */
+	    if (closestParent != parent)
+	        errAbort("%s comes between parent (%s) and child (%s) in %s\n"
+		         "Parent at line %d, child at line %d.",
+			 closestParent->key, parent->key, child->key, childFp->fileName,
+			 parentFp->lineIx, childFp->lineIx);
+	    }
+	}
+    }
+}
+
+static void doRecordChecks(struct tdbRecord *recordList, struct lm *lm)
+/* Check every record's tags against glTagTypes, then check where children sit relative to parents. */
+{
+/* Check that each field's tag is known and is allowed for the record's type. */
+struct tdbRecord *record;
+for (record = recordList; record != NULL; record = record->next)
+    {
+    struct tdbField *typeField = tdbRecordField(record, "type");
+    char *fullType = (typeField != NULL ? typeField->val : record->key);  /* Record key stands in when there is no type field. */
+    char *type = lmCloneFirstWord(lm, fullType);
+    struct tdbField *field;
+    for (field = record->fieldList; field != NULL; field = field->next)
+        {
+	struct slName *typeList = hashFindVal(glTagTypes, field->name);
+	if (typeList == NULL)  /* NOTE(review): the check below relies on recordAbort not returning - confirm. */
+	    {
+	    recordAbort(record, 
+	    	"Tag '%s' not found in %s.\nIf it's not a typo please add %s to that file.  "
+		"The tag is", 
+	    	field->name, glTagTypeFile, field->name);
+	    }
+	if (!matchAnyWild(typeList, type))
+	    {
+	    recordAbort(record, 
+	    	"Tag '%s' not allowed for tracks of type '%s'.  Please add it to supported types\n"
+		"in %s if this is not a mistake.  The tag is", 
+	    	field->name, type, glTagTypeFile);
+	    }
+	}
+    }
+
+/* Collect references to every record that has children, for use in the checks below. */
+struct slRef *parentRefList = NULL;  /* NOTE(review): this list is not freed anywhere in this function. */
+for (record = recordList; record != NULL; record = record->next)
+    {
+    if (record->children != NULL)
+        refAdd(&parentRefList, record);
+    }
+
+/* Check each record that has a parent against that parent and the collected parent list. */
+for (record = recordList; record != NULL; record = record->next)
+    {
+    if (record->parent != NULL)
+        checkChildUnderNearestParent(parentRefList, record->parent, record);
+    }
+}
+
 void tdbQuery(char *sql)
 /* tdbQuery - Query the trackDb system using SQL syntax.. */
 {
 /* Load in hash of legitimate tags. */
@@ -931,10 +999,9 @@
     {
     struct lm *lm = lmInit(0);
     struct dbPath *p = dbOrder->val;
     char *db = p->db;
-    struct hash *recordHash = hashNew(0);
-    struct tdbRecord *recordList = tdbsForDbPath(p, lm, recordHash, "subTrack", clAlpha);
+    struct tdbRecord *recordList = tdbsForDbPath(p, lm, "subTrack", clAlpha);
          
 
     verbose(2, "Composed %d records from %s\n", slCount(recordList), db);
     inheritFromParents(recordList, "subTrack", "noInherit", clAlpha, lm);
@@ -985,9 +1052,8 @@
 		}
 	    }
 	}
     lmCleanup(&lm);
-    hashFree(&recordHash);
     }
 dyStringFree(&fileString);
 
 if (sameString(rql->command, "count"))