src/hg/encode/validateFiles/validateFiles.c 1.5

1.5 2009/03/13 16:39:21 mikep
making everything use linefile so .gz/.bzip2 files handled automatically
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 4 -r1.4 -r1.5
--- src/hg/encode/validateFiles/validateFiles.c	13 Mar 2009 08:22:13 -0000	1.4
+++ src/hg/encode/validateFiles/validateFiles.c	13 Mar 2009 16:39:21 -0000	1.5
@@ -337,19 +337,19 @@
 warn("Error [file=%s, line=%d]: invalid strand '%s' (want '+','-','.') [%s]", file, line, strand, row);
 return FALSE;
 }
 
-int validateTagOrPairedTagAlign(char *file, boolean paired)
+int validateTagOrPairedTagAlign(struct lineFile *lf, char *file, boolean paired)
 {
 char *row;
 char buf[1024];
 char *words[9];
-struct lineFile *lf = lineFileOpen(file, TRUE);
 int line = 0;
 int errs = 0;
 unsigned chromSize;
+int size;
 verbose(2,"[%s %3d] paired=%d file(%s)\n", __func__, __LINE__, paired, file);
-while (lineFileNextReal(lf, &row))
+while (lineFileNext(lf, &row, &size))
     {
     ++line;
     safecpy(buf, sizeof(buf), row);
     int n = chopByWhite(buf, words, 9);
@@ -377,66 +377,58 @@
 	if (++errs >= maxErrors)
 	    errAbort("Aborting .. found %d errors\n", errs);
 	}
     }
-lineFileClose(&lf);
 return errs;
 }
 
 // tagAlign
 // chr1     6082    6117    TCTACTGGCTCTGTGTGTACCAGTCTGTCACTGAG     1000    -
 // chr1     7334    7369    AGCCAGGGGGTGACGTTGTTAGATTAGATTTCTTA     1000    +
 
-int validateTagAlign(char *file)
+int validateTagAlign(struct lineFile *lf, char *file)
 {
-return validateTagOrPairedTagAlign(file, FALSE);
+return validateTagOrPairedTagAlign(lf, file, FALSE);
 }
 
 // pairedTagAlign
 // chr10    96316360        96310862        9       1000    +       TCTCACCCGATAACGACCCCCTCCC       TGATCCTTGACTCACTTGCTAATTT
 // chr8    126727657       126721865       10      1000    +       AATTCTTCACCTCTCCTGTTCAAAG       TGTGTGAGATCCAAGAATCCTCTCT
 
-int validatePairedTagAlign(char *file)
+int validatePairedTagAlign(struct lineFile *lf, char *file)
 {
-return validateTagOrPairedTagAlign(file, TRUE);
+return validateTagOrPairedTagAlign(lf, file, TRUE);
 }
 
 // fastq:
 // @NINA_1_FC30G3VAAXX:5:1:110:908
 // ATCGTCAGGTGGGATAATCCTTACCTTTTCCTCCTC
 // +NINA_1_FC30G3VAAXX:5:1:110:908
 // aa`]`a`XQ^VQQ^`aaaaaaa^[[ZG[aXUX[[[X
 
-int validateFastq(char *file)
+int validateFastq(struct lineFile *lf, char *file)
 {
-char *seqName = NULL;
-char *seq = NULL; 
-char *qName = NULL;
-char *qual = NULL;
+char *seqName, *seq, *qName, *qual;
 int line = 0;
 int errs = 0;
 boolean startOfFile = TRUE;
-FILE *f = mustOpen(file, "rb");
 verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
-while ( (seqName = readLine(f)) )
+while ( lineFileNext(lf, &seqName, NULL))
     {
     ++line;
     if (startOfFile)
 	{
 	if (*seqName == '#')
-	    {
-	    freez(&seqName);
 	    continue;
-	    }
 	else
 	    startOfFile = FALSE;
 	}
     if (checkSeqName(file, line, seqName, '@', "sequence name")
-	&& (seq = readLine(f))
+	&& (lineFileNext(lf, &seq, NULL))
 	&& checkSeq(file, ++line, seq, seq, "sequence")
-	&& (qName = readLine(f))
+	&& (lineFileNext(lf, &qName, NULL))
 	&& checkSeqName(file, ++line, qName, '+', "quality name")
-	&& (qual = readLine(f))
+	&& (lineFileNext(lf, &qual, NULL))
 	&& checkQual(file, ++line, qual) )
 	{
 	if (printOkLines)
 	    printf("%s\n%s\n%s\n%s\n", seqName, seq, qName, qual);
@@ -447,14 +439,9 @@
 	    printf("%s\n%s\n%s\n%s\n", seqName, seq, qName, qual);
 	if (++errs >= maxErrors)
 	    errAbort("Aborting .. found %d errors\n", errs);
 	}
-    freez(&seqName);
-    freez(&seq);
-    freez(&qName);
-    freez(&qual);
     }
-carefulClose(&f);
 return errs;
 }
 
 // CS Fasta:
@@ -462,32 +449,28 @@
 // T022213002230311203200200322000
 // >920_22_656_F3,1.-152654094.1.35.35.0###,19.43558664.1.35.35.0###
 // T01301010111200210102321210100112312
 
-int validateCsfasta(char *file)
+int validateCsfasta(struct lineFile *lf, char *file)
 {
 char *seqName = NULL;
 char *seq = NULL; 
 int line = 0;
 int errs = 0;
 boolean startOfFile = TRUE;
-FILE *f = mustOpen(file, "rb");
 verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
-while ( (seqName = readLine(f)) )
+while (lineFileNext(lf, &seqName, NULL))
     {
     ++line;
     if (startOfFile)
 	{
 	if (*seqName == '#')
-	    {
-	    freez(&seqName);
 	    continue;
-	    }
 	else
 	    startOfFile = FALSE;
 	}
     if (checkCsSeqName(file, line, seqName)
-	&& (seq = readLine(f))
+	&& (lineFileNext(lf, &seq, NULL))
 	&& checkSeq(file, ++line, seq, seq, "sequence") )
 	{
 	if (printOkLines)
 	    printf("%s\n%s\n", seqName, seq);
@@ -498,31 +481,41 @@
 	    printf("%s\n%s\n", seqName, seq);
 	if (++errs >= maxErrors)
 	    errAbort("Aborting .. found %d errors\n", errs);
 	}
-    freez(&seqName);
-    freez(&seq);
     }
-carefulClose(&f);
 return errs;
 }
 
-void validateFiles(int (*validate)(char *file), int numFiles, char *files[])
+void validateFiles(int (*validate)(struct lineFile *lf, char *file), int numFiles, char *files[])
 /* validateFile - validate format of different track input files. */
 {
 int i;
 int errs = 0;
 verbose(2,"[%s %3d] numFiles=%d \n", __func__, __LINE__, numFiles);
 for (i = 0; i < numFiles ; ++i)
     {
-    errs += validate(files[i]);
+    struct lineFile *lf = lineFileOpen(files[i], TRUE);
+    errs += validate(lf, files[i]);
+    lineFileClose(&lf);
     }
 verbose(2,"[%s %3d] done loop\n", __func__, __LINE__);
 if (errs > 0) 
     errAbort("Aborting ... found %d errors in total\n", errs);
 verbose(2,"[%s %3d] done\n", __func__, __LINE__);
 }
 
+int testFunc(char *f)
+{
+char *row;
+int size;
+struct lineFile *lf = lineFileOpen(f, TRUE);
+while (lineFileNext(lf, &row, &size))
+    printf("size=%d [%s]\n", size, row);
+printf("done.\n");
+return 0;
+}
+
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 char *type;
@@ -556,8 +549,9 @@
 hashAdd(funcs, "tagAlign", &validateTagAlign);
 hashAdd(funcs, "pairedTagAlign", &validatePairedTagAlign);
 hashAdd(funcs, "fastq", &validateFastq);
 hashAdd(funcs, "csfasta", &validateCsfasta);
+//hashAdd(funcs, "test", &testFunc);
 if (!(func = hashFindVal(funcs, type)))
     errAbort("Cannot validate %s type files\n", type);
 validateFiles(func, argc, argv);
 return 0;