src/hg/utils/tdbQuery/tdbQuery.c 1.3
1.3 2009/12/02 06:47:27 kent
Adding check for duplicate records including handling of different records for different releases. Merging together records from root/organism/assembly tree.
Index: src/hg/utils/tdbQuery/tdbQuery.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/tdbQuery/tdbQuery.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 4 -r1.2 -r1.3
--- src/hg/utils/tdbQuery/tdbQuery.c 2 Dec 2009 05:28:46 -0000 1.2
+++ src/hg/utils/tdbQuery/tdbQuery.c 2 Dec 2009 06:47:27 -0000 1.3
@@ -14,8 +14,9 @@
static char *clRoot = "~/kent/src/hg/makeDb/trackDb"; /* Root dir of trackDb system. */
static char *clFile = NULL; /* a .ra file to use instead of trackDb system. */
static boolean clCheck = FALSE; /* If set perform lots of checks on input. */
static boolean clStrict = FALSE; /* If set only return tracks with actual tables. */
+static boolean clAlpha = FALSE; /* If set include release alphas, exclude release beta. */
void usage()
/* Explain usage and exit. */
{
@@ -46,8 +47,10 @@
"Check that trackDb is internally consistent. Prints diagnostic output to stderr and aborts if \n"
"there's problems.\n"
" -strict\n"
"Mimic -strict option on hgTrackDb. Suppresses tracks where corresponding table does not exist."
+" -alpha\n"
+"Do checking on release alpha (and not release beta) tracks\n"
);
}
@@ -55,8 +58,9 @@
{"root", OPTION_STRING},
{"file", OPTION_STRING},
{"check", OPTION_BOOLEAN},
{"strict", OPTION_BOOLEAN},
+ {"alpha", OPTION_BOOLEAN},
{NULL, 0},
};
struct dbPath
@@ -142,8 +146,27 @@
break;
return p;
}
+boolean filterOnRelease(struct tdbRecord *record, boolean alpha, struct lineFile *lf)
+/* Release filter for one record: TRUE if record has no "release" field, alpha if that field is "alpha", !alpha if it is "beta"; any other value goes to errAbort, which reports lf's lineIx and fileName. */
+{
+struct tdbField *releaseField = tdbRecordField(record, "release");
+if (releaseField == NULL)
+ return TRUE; /* Untagged records are kept whatever the alpha setting. */
+char *release = releaseField->val;
+if (sameString(release, "alpha"))
+ return alpha; /* Alpha-tagged records are kept only when alpha is set. */
+else if (sameString(release, "beta"))
+ return !alpha; /* Beta-tagged records are kept only when alpha is not set. */
+else
+ {
+ errAbort("Unrecognized release value %s in stanza ending %d of %s",
+ release, lf->lineIx, lf->fileName);
+ return FALSE; /* NOTE(review): presumably never reached if errAbort does not return - confirm. */
+ }
+}
+
static void checkDupeFields(struct tdbRecord *record, struct lineFile *lf)
/* Make sure that each field in record is unique. */
{
struct hash *uniqHash = hashNew(0);
@@ -157,8 +180,36 @@
}
hashFree(&uniqHash);
}
+static void checkDupeKeys(struct tdbRecord *recordList)
+/* Walk recordList and errAbort on the first record whose key matches the key of an earlier record in the list.  Records with a NULL key are skipped. */
+{
+struct tdbRecord *record;
+struct hash *uniqHash = hashNew(0); /* Maps each key seen so far to the record that carried it. */
+for (record = recordList; record != NULL; record = record->next)
+ {
+ if (record->key != NULL)
+ {
+ struct tdbRecord *oldRecord = hashFindVal(uniqHash, record->key);
+ if (oldRecord != NULL)
+ {
+ struct tdbFilePos *oldPos = oldRecord->posList; /* Only the head of each posList is reported. */
+ struct tdbFilePos *newPos = record->posList;
+ if (sameString(oldPos->fileName, newPos->fileName)) /* Same file: name it once in the message. */
+ errAbort("Duplicate tracks %s ending lines %d and %d of %s",
+ oldRecord->key, oldPos->lineIx, newPos->lineIx, oldPos->fileName);
+ else /* Different files: give a line and file name for each record. */
+ errAbort("Duplicate tracks %s ending lines %d of %s and %d of %s",
+ oldRecord->key, oldPos->lineIx, oldPos->fileName,
+ newPos->lineIx, newPos->fileName);
+ }
+ hashAdd(uniqHash, record->key, record);
+ }
+ }
+hashFree(&uniqHash);
+}
+
static void recurseThroughIncludes(char *fileName, struct lm *lm,
struct hash *circularHash, struct tdbRecord **pRecordList)
/* Recurse through include files. */
{
@@ -192,11 +243,18 @@
}
else
{
checkDupeFields(record, lf);
+ if (record->key != NULL)
+ {
+ if (filterOnRelease(record, clAlpha, lf))
+ {
+ record->posList = tdbFilePosNew(lm, fileName, lf->lineIx);
slAddHead(pRecordList, record);
}
}
+ }
+ }
lineFileClose(&lf);
}
struct tdbRecord *readStartingFromFile(char *fileName, struct lm *lm)
@@ -207,11 +265,35 @@
recurseThroughIncludes(fileName, lm, circularHash, &recordList);
hashAdd(circularHash, fileName, NULL);
hashFree(&circularHash);
slReverse(&recordList);
+checkDupeKeys(recordList);
return recordList;
}
+static void mergeRecords(struct tdbRecord *old, struct tdbRecord *record, char *key, struct lm *lm)
+/* Merge record into old: each field of record whose name differs from key either replaces the value of the same-named field in old, or is added to the tail of old's fieldList as a new field obtained with lmAllocVar on lm.  old->posList becomes the slCat of both posLists. */
+{
+struct tdbField *field;
+for (field = record->fieldList; field != NULL; field = field->next)
+ {
+ if (!sameString(field->name, key)) /* A field whose name equals key is never merged. */
+ {
+ struct tdbField *oldField = tdbRecordField(old, field->name);
+ if (oldField != NULL)
+ oldField->val = field->val; /* Existing field: take record's value pointer. */
+ else
+ {
+ lmAllocVar(lm, oldField);
+ oldField->name = field->name; /* New field shares name and val pointers with record's field. */
+ oldField->val = field->val;
+ slAddTail(&old->fieldList, oldField);
+ }
+ }
+ }
+old->posList = slCat(old->posList, record->posList);
+}
+
void tdbQuery(char *sql)
/* tdbQuery - Query the trackDb system using SQL syntax.. */
{
/* Parse out sql statement. */
@@ -232,26 +314,53 @@
}
}
uglyf("%d databases in from clause\n", slCount(dbOrderList));
+/* Loop through each database. */
for (dbOrder = dbOrderList; dbOrder != NULL; dbOrder = dbOrder->next)
{
struct lm *lm = lmInit(0);
struct dbPath *p = dbOrder->val;
struct slName *fileLevelList = dbPathToFiles(p), *fileLevel;
+
+ /* Assemble recordList and record hash from the root/organism/assembly levels */
struct hash *recordHash = hashNew(0);
+ struct tdbRecord *recordList = NULL;
for (fileLevel = fileLevelList; fileLevel != NULL; fileLevel = fileLevel->next)
{
char *fileName = fileLevel->name;
- struct tdbRecord *recordList = readStartingFromFile(fileName, lm);
- uglyf("Read %d records starting from %s\n", slCount(recordList), fileName);
- struct tdbRecord *record;
- for (record = recordList; record != NULL; record = record->next)
+ struct tdbRecord *fileRecords = readStartingFromFile(fileName, lm);
+ uglyf("Read %d records starting from %s\n", slCount(fileRecords), fileName);
+ struct tdbRecord *record, *nextRecord;
+ for (record = fileRecords; record != NULL; record = nextRecord)
+ {
+ nextRecord = record->next;
+ char *key = record->key;
+ struct tdbRecord *oldRecord = hashFindVal(recordHash, key);
+ if (oldRecord != NULL)
+ {
+ if (!record->override)
+ {
+ oldRecord->fieldList = record->fieldList;
+ oldRecord->posList = record->posList;
+ oldRecord->settingsByView = record->settingsByView;
+ oldRecord->subGroups = record->subGroups;
+ oldRecord->view = record->view;
+ oldRecord->viewHash = record->viewHash;
+ }
+ else
+ mergeRecords(oldRecord, record, key, lm);
+ }
+ else
{
- if (record->key != NULL)
hashAdd(recordHash, record->key, record);
+ slAddHead(&recordList, record);
}
}
+ }
+ slReverse(&recordList);
+ uglyf("Composed %d records from %s\n", slCount(recordList), p->db);
+
lmCleanup(&lm);
}
@@ -267,7 +376,8 @@
clRoot = optionVal("root", clRoot);
clFile = optionVal("file", clFile);
clCheck = optionExists("check");
clStrict = optionExists("strict");
+clAlpha = optionExists("alpha");
tdbQuery(argv[1]);
return 0;
}