7a09cffe08484c6e1fc843a61edd858203872853
braney
  Thu Jan 12 15:44:14 2012 -0800
a little utility to check to see if a twoBit file has duplicated sequences
diff --git src/utils/twoBitDup/twoBitDup.c src/utils/twoBitDup/twoBitDup.c
new file mode 100644
index 0000000..cfedbf5
--- /dev/null
+++ src/utils/twoBitDup/twoBitDup.c
@@ -0,0 +1,63 @@
+/* twoBitDup - check to see if a twobit file has any identical sequences in it. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "twoBit.h"
+#include "dnaseq.h"
+#include "math.h"
+/* NOTE(review): the Id string below names newProg.c rather than this file; nothing shown here reads rcsid. */
+static char const rcsid[] = "$Id: newProg.c,v 1.30 2010/03/24 21:18:33 hiram Exp $";
+
+void usage(void)
+/* Explain usage and exit.  The text goes through errAbort(), which the
+ * caller relies on not to return.  "options:" is printed with nothing
+ * after it; the option table in this file holds no named options. */
+{
+errAbort("twoBitDup - check to see if a twobit file has any identical sequences in it\n"
+         "usage:\n"
+         "   twoBitDup file.2bit\n"
+         "options:\n");
+}
+
+static struct optionSpec options[] = {	/* option table passed to optionInit() in main() */
+   {NULL, 0},	/* sole entry: no named options; presumably the end-of-table marker - confirm */
+};
+
+void twoBitDup(char *filename)
+/* twoBitDup - check to see if a twobit file has any identical sequences in it.
+ * Prints one line to stdout per sequence whose DNA matches an earlier one. */
+{
+struct twoBitFile *tbf = twoBitOpen(filename);
+int seqCount = slCount(tbf->indexList);
+/* log2(0) is -infinity, which has no int value, so only take the log of a
+ * positive count; an empty index just gets the smallest size used here. */
+int hashSize = (seqCount > 0) ? (int)log2(seqCount) + 2 : 2;	// +2 for luck
+struct hash *seqHash = newHash(hashSize);	/* DNA text -> name of first sequence with it */
+
+verbose(2, "hash size is %d\n", hashSize);
+for (struct twoBitIndex *index = tbf->indexList; index != NULL; index = index->next)
+    {
+    int size;	/* passed by address to the read call; never read here */
+    verbose(2,"grabbing seq %s\n", index->name);
+    struct dnaSeq *seq = twoBitReadSeqFragExt(tbf, index->name, 0, 0, FALSE, &size);
+    struct hashEl *hel = hashLookup(seqHash, seq->dna);
+    if (hel != NULL)
+	printf("%s and %s are identical\n", index->name, (char *)hel->val);
+    else
+	hashAdd(seqHash, seq->dna, index->name);
+    freeDnaSeq(&seq);
+    }
+freeHash(&seqHash);	/* its values point at names in tbf's index, so free it first */
+twoBitClose(&tbf);	/* counterpart of the twoBitOpen() above */
+}
+
+int main(int argc, char *argv[])
+/* Process command line: parse options, then check the one named .2bit file. */
+{
+optionInit(&argc, argv, options);	/* takes &argc - presumably strips parsed options; confirm */
+if (argc != 2)	/* need program name plus exactly one file argument */
+    usage();	/* relied on not to return; otherwise argv[1] below may be missing */
+twoBitDup(argv[1]);
+return 0;
+}