src/hg/utils/tdbQuery/tdbQuery.c 1.8
1.8 2009/12/03 04:42:28 kent
Massaging handling of duplicate record detection to handle a tricky case in danRer6 involving inheritance and inherited releases.
Index: src/hg/utils/tdbQuery/tdbQuery.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/tdbQuery/tdbQuery.c,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 4 -r1.7 -r1.8
--- src/hg/utils/tdbQuery/tdbQuery.c 2 Dec 2009 21:02:32 -0000 1.7
+++ src/hg/utils/tdbQuery/tdbQuery.c 3 Dec 2009 04:42:28 -0000 1.8
@@ -64,8 +64,10 @@
{"alpha", OPTION_BOOLEAN},
{NULL, 0},
};
+#define glParentField "subTrack"
+
void recordLocationReport(struct tdbRecord *rec, FILE *out)
/* Write out where record ends. */
{
struct tdbFilePos *pos;
@@ -229,40 +231,96 @@
}
hashFree(&uniqHash);
}
-static void checkDupeKeys(struct tdbRecord *recordList, boolean ignoreIfRelease)
+static struct tdbField *findFieldInSelfOrParents(struct tdbRecord *record, char *fieldName)
+/* Find field if it exists in self or ancestors. */
+{
+struct tdbRecord *p;
+for (p = record; p != NULL; p = p->parent)
+ {
+ struct tdbField *field = tdbRecordField(p, fieldName);
+ if (field != NULL)
+ return field;
+ }
+return NULL;
+}
+
+static char *findFieldValInSelfOrParents(struct tdbRecord *record, char *fieldName)
+/* Find value of given field if it exists in self or ancestors. Return NULL if
+ * field does not exist. */
+{
+struct tdbField *field = findFieldInSelfOrParents(record, fieldName);
+return (field != NULL ? field->val : NULL);
+}
+
+static void checkDupeKeys(struct tdbRecord *recordList, boolean checkRelease)
/* Make sure that there are no duplicate records (with keys) */
{
struct tdbRecord *record;
struct hash *uniqHash = hashNew(0);
for (record = recordList; record != NULL; record = record->next)
{
- if (record->key != NULL)
+ char *key = record->key;
+ if (key != NULL)
{
- struct tdbRecord *oldRecord = hashFindVal(uniqHash, record->key);
- if (oldRecord != NULL)
+ struct hashEl *hel;
+ for (hel = hashLookup(uniqHash, key); hel != NULL; hel = hashLookupNext(hel))
{
+ struct tdbRecord *oldRecord = hel->val;
struct tdbFilePos *oldPos = oldRecord->posList;
struct tdbFilePos *newPos = record->posList;
boolean doAbort = TRUE;
- if (ignoreIfRelease)
+ if (checkRelease)
{
- if (tdbRecordField(record, "release"))
doAbort = FALSE;
+ char *oldRelease = findFieldValInSelfOrParents(oldRecord, "release");
+ char *newRelease = findFieldValInSelfOrParents(record, "release");
+ if (oldRelease == NULL || newRelease == NULL)
+ doAbort = TRUE;
+ else
+ {
+ if (sameString(oldRelease, newRelease))
+ doAbort = TRUE;
+ }
}
if (doAbort)
{
+ char *oldRelease = NULL;
+ struct tdbField *oldField = tdbRecordField(oldRecord, "release");
+ if (oldField) oldRelease = oldField->val;
+ char *newRelease = NULL;
+ struct tdbField *newField = tdbRecordField(record, "release");
+ if (newField) newRelease = newField->val;
+ if (newRelease == NULL && oldRelease != NULL)
+ {
+ errAbort("Have release tag for track %s at line %d of %s, but not "
+ "at line %d of %s",
+ key, oldPos->lineIx, oldPos->fileName,
+ newPos->lineIx, newPos->fileName);
+ }
+ else if (oldRelease == NULL && newRelease != NULL)
+ {
+ errAbort("Have release tag for track %s at line %d of %s, but not "
+ "at line %d of %s",
+ key, newPos->lineIx, newPos->fileName,
+ oldPos->lineIx, oldPos->fileName);
+ }
+ else
+ {
if (sameString(oldPos->fileName, newPos->fileName))
+ {
errAbort("Duplicate tracks %s ending lines %d and %d of %s",
- oldRecord->key, oldPos->lineIx, newPos->lineIx, oldPos->fileName);
+ key, oldPos->lineIx, newPos->lineIx, oldPos->fileName);
+ }
else
errAbort("Duplicate tracks %s ending lines %d of %s and %d of %s",
- oldRecord->key, oldPos->lineIx, oldPos->fileName,
+ key, oldPos->lineIx, oldPos->fileName,
newPos->lineIx, newPos->fileName);
}
}
- hashAdd(uniqHash, record->key, record);
+ }
+ hashAdd(uniqHash, key, record);
}
}
hashFree(&uniqHash);
}
@@ -321,9 +379,8 @@
recurseThroughIncludes(fileName, lm, circularHash, &recordList);
hashAdd(circularHash, fileName, NULL);
hashFree(&circularHash);
slReverse(&recordList);
-checkDupeKeys(recordList, TRUE);
return recordList;
}
static void mergeRecords(struct tdbRecord *old, struct tdbRecord *record, char *key, struct lm *lm)
@@ -348,9 +405,116 @@
}
old->posList = slCat(old->posList, record->posList);
}
-struct tdbRecord *tdbsForDbPath(struct dbPath *p, struct lm *lm, struct hash *recordHash)
+static int parentChildFileDistance(struct tdbRecord *parent, struct tdbRecord *child)
+/* Return distance of two records. If they're in different files the
+ * distance gets pretty big. Would be flaky on records split across
+ * different files, hence the ad-hoc in the name. Not worth implementing
+ * something that handles this though with the hope that the parent/child
+ * relationship will become indentation rather than ID based. */
+{
+struct tdbFilePos *parentFp = parent->posList, *childFp = child->posList;
+if (!sameString(parentFp->fileName, childFp->fileName))
+ return BIGNUM/2;
+int distance = childFp->lineIx - parentFp->lineIx;
+if (distance < 0)
+ return BIGNUM/4 - distance;
+return distance;
+}
+
+static struct tdbRecord *findParent(struct tdbRecord *rec,
+ char *parentFieldName, struct hash *hash, boolean alpha)
+/* Find parent record if possible. This is a bit complicated by wanting to
+ * match parents and children from the same release if possible. Our
+ * strategy is to just ignore records from the wrong release. */
+{
+struct tdbField *parentField = tdbRecordField(rec, parentFieldName);
+if (parentField == NULL)
+ return NULL;
+#ifdef OLD
+if (!recordMatchesRelease(rec, alpha))
+ return NULL;
+#endif /* OLD */
+char *parentLine = parentField->val;
+int len = strlen(parentLine);
+char buf[len+1];
+strcpy(buf, parentLine);
+char *parentName = firstWordInLine(buf);
+struct hashEl *hel;
+boolean gotParentSomeRelease = FALSE;
+struct tdbRecord *closestParent = NULL;
+int closestDistance = BIGNUM;
+for (hel = hashLookup(hash, parentName); hel != NULL; hel = hashLookupNext(hel))
+ {
+ gotParentSomeRelease = TRUE;
+ struct tdbRecord *parent = hel->val;
+#ifdef OLD
+ if (recordMatchesRelease(parent, alpha))
+#endif /* OLD */
+ {
+ int distance = parentChildFileDistance(parent, rec);
+ if (distance < closestDistance)
+ {
+ closestParent = parent;
+ closestDistance = distance;
+ }
+ }
+ }
+if (closestParent != NULL)
+ return closestParent;
+
+/* If we haven't matched so far, it could be that the release tag is set in the parent
+ * but not in us, and the parent is not our release parent. In this case we go ahead
+ * and return the out-of-release parent, so we can inherit the out-of-release release
+ * tag, so we get filtered out! */
+struct tdbField *releaseField = tdbRecordField(rec, "release");
+if (gotParentSomeRelease && releaseField == NULL)
+ {
+ struct tdbRecord *parent = hashFindVal(hash, parentName);
+ assert(parent != NULL);
+ return parent;
+ }
+recordWarn(rec, "%s is a subTrack of %s, but %s doesn't exist", rec->key,
+ parentField->val, parentField->val);
+return NULL;
+}
+
+static void linkUpParents(struct tdbRecord *list, char *parentField, boolean alpha)
+/* Link up records according to parent/child relationships. */
+{
+/* Zero out children, parent, and older sibling fields, since going to recalculate
+ * them and need lists to start out empty. */
+struct tdbRecord *rec;
+for (rec = list; rec != NULL; rec = rec->next)
+ rec->parent = rec->olderSibling = rec->children = NULL;
+
+/* Build up hash of records indexed by key field. */
+struct hash *hash = hashNew(0);
+for (rec = list; rec != NULL; rec = rec->next)
+ {
+ if (rec->key != NULL)
+ hashAdd(hash, rec->key, rec);
+ }
+
+/* Scan through linking up parents. */
+for (rec = list; rec != NULL; rec = rec->next)
+ {
+ struct tdbRecord *parent = findParent(rec, parentField, hash, alpha);
+ if (parent != NULL)
+ {
+ rec->parent = parent;
+ rec->olderSibling = parent->children;
+ parent->children = rec;
+ }
+ }
+
+hashFree(&hash);
+}
+
+
+struct tdbRecord *tdbsForDbPath(struct dbPath *p, struct lm *lm, struct hash *recordHash,
+ char *parentField, boolean alpha)
/* Assemble recordList for given database. This looks at the root/organism/assembly
* levels. It returns a list, and fills in a hash (which should be passed in empty)
* of the records keyed by record->key. */
{
@@ -360,8 +524,10 @@
{
char *fileName = fileLevel->name;
struct tdbRecord *fileRecords = readStartingFromFile(fileName, lm);
verbose(2, "Read %d records starting from %s\n", slCount(fileRecords), fileName);
+ linkUpParents(fileRecords, parentField, alpha);
+ checkDupeKeys(fileRecords, TRUE);
struct tdbRecord *record, *nextRecord;
for (record = fileRecords; record != NULL; record = nextRecord)
{
nextRecord = record->next;
@@ -392,50 +558,8 @@
return recordList;
}
-static struct tdbRecord *findParent(struct tdbRecord *rec,
- char *parentFieldName, struct hash *hash, boolean alpha)
-/* Find parent field if possible. This is a bit complicated by wanting to
- * match parents and children from the same release if possible. Our
- * strategy is to just ignore records from the wrond release. */
-{
-struct tdbField *parentField = tdbRecordField(rec, parentFieldName);
-if (parentField == NULL)
- return NULL;
-if (!recordMatchesRelease(rec, alpha))
- return NULL;
-char *parentLine = parentField->val;
-int len = strlen(parentLine);
-char buf[len+1];
-strcpy(buf, parentLine);
-char *parentName = firstWordInLine(buf);
-struct hashEl *hel;
-boolean gotParentSomeRelease = FALSE;
-for (hel = hashLookup(hash, parentName); hel != NULL; hel = hashLookupNext(hel))
- {
- gotParentSomeRelease = TRUE;
- struct tdbRecord *parent = hel->val;
- if (recordMatchesRelease(parent, alpha))
- return parent;
- }
-
-/* If we haven't matched so far, it could be that the release tag is set in the parent
- * but not in us, and the parent is not our release parent. In this case we go ahead
- * and return the out-of-release parent, so we can inherit the out-of-release release
- * tag, so we get filtered out! */
-struct tdbField *releaseField = tdbRecordField(rec, "release");
-if (gotParentSomeRelease && releaseField == NULL)
- {
- struct tdbRecord *parent = hashFindVal(hash, parentName);
- assert(parent != NULL);
- return parent;
- }
-recordWarn(rec, "%s is a subTrack of %s, but %s doesn't exist", rec->key,
- parentField->val, parentField->val);
-return NULL;
-}
-
static void mergeParentRecord(struct tdbRecord *record, struct tdbRecord *parent,
struct lm *lm)
/* Merge in parent record. This only updates fields that are in parent but not record. */
{
@@ -458,30 +583,12 @@
boolean alpha, struct lm *lm)
/* Go through list. If an element has a parent field, then fill in non-existent fields from
* parent and from view with settings defined in parent. */
{
-/* Build up hash of records indexed by key field. */
-struct hash *hash = hashNew(0);
-struct tdbRecord *rec;
-for (rec = list; rec != NULL; rec = rec->next)
- {
- if (rec->key != NULL)
- hashAdd(hash, rec->key, rec);
- }
-
-/* Scan through linking up parents. */
-for (rec = list; rec != NULL; rec = rec->next)
- {
- struct tdbRecord *parent = findParent(rec, parentField, hash, alpha);
- if (parent != NULL)
- {
- rec->parent = parent;
- rec->olderSibling = parent->children;
- parent->children = rec;
- }
- }
+linkUpParents(list, parentField, alpha);
/* Scan through doing inheritance. */
+struct tdbRecord *rec;
for (rec = list; rec != NULL; rec = rec->next)
{
/* First inherit from view. */
char *viewName = rec->view;
@@ -619,9 +726,9 @@
struct lm *lm = lmInit(0);
struct dbPath *p = dbOrder->val;
char *db = p->db;
struct hash *recordHash = hashNew(0);
- struct tdbRecord *recordList = tdbsForDbPath(p, lm, recordHash);
+ struct tdbRecord *recordList = tdbsForDbPath(p, lm, recordHash, "subTrack", clAlpha);
verbose(2, "Composed %d records from %s\n", slCount(recordList), db);
inheritFromParents(recordList, "subTrack", "noInherit", clAlpha, lm);
recordList = filterOnRelease(recordList, clAlpha);