src/hg/encode/validateFiles/validateFiles.c 1.15

1.15 2009/04/02 22:49:46 mikep
validation of fasta files (single-line of sequence, no quality)
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -b -B -U 4 -r1.14 -r1.15
--- src/hg/encode/validateFiles/validateFiles.c	2 Apr 2009 04:27:21 -0000	1.14
+++ src/hg/encode/validateFiles/validateFiles.c	2 Apr 2009 22:49:46 -0000	1.15
@@ -48,8 +48,9 @@
   "options:\n"
   "   -type=(a value from the list below)\n"
   "         tagAlign|pairedTagAlign|broadPeak|narrowPeak|gappedPeak|bedGraph\n"
   "                   : see http://genomewiki.cse.ucsc.edu/EncodeDCC/index.php/File_Formats\n"
+  "         fasta     : Fasta files (only one line of sequence, and no quality scores)\n"
   "         fastq     : Fasta with quality scores (see http://maq.sourceforge.net/fastq.shtml)\n"
   "         csfasta   : Colorspace fasta (implies -colorSpace) (see link below)\n"
   "         csqual    : Colorspace quality (see link below)\n"
   "                     (see http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm)\n"
@@ -643,8 +644,49 @@
 {
 return validateBedVariant(lf, file, BED_GRAPH);
 }
 
+// fasta: each record is a '>' name line followed by exactly one sequence line, e.g.
+// >VHE-245683051005-13-1-2-1704
+// GTGTTAATTTTCTTGATCTTTCGTTC
+// >VHE-245683051005-13-1-2-1704
+// CTTGCTTTCTAGTTCTTTTAATTGTG
+// Returns the number of failed records; calls errAbort once that count reaches maxErrors.
+int validateFasta(struct lineFile *lf, char *file)
+{
+char *seqName, *seq;
+int line = 0;
+int errs = 0;
+boolean startOfFile = TRUE;
+verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
+while ( lineFileNext(lf, &seqName, NULL))
+    {
+    ++line;
+    seq = NULL;  // no sequence line read yet for this record
+    if (startOfFile)
+	{
+	if (*seqName == '#')  // '#' lines are skipped only before the first record
+	    continue;
+	startOfFile = FALSE;
+	}
+    if (checkSeqName(file, line, seqName, '>', "sequence name")
+	&& (wantNewLine(lf, file, ++line, &seq, "fasta sequence line"))
+	&& checkSeq(file, line, seq, seq, "sequence") )
+	{
+	if (printOkLines)
+	    printf("%s\n%s\n", seqName, seq);
+	}
+    else
+	{
+	if (printFailLines)  // seq is still NULL if the name check failed; print "" then
+	    printf("%s\n%s\n", seqName, seq ? seq : "");
+	if (++errs >= maxErrors)
+	    errAbort("Aborting .. found %d errors\n", errs);
+	}
+    }
+return errs;
+}
+
 // fastq:
 // @NINA_1_FC30G3VAAXX:5:1:110:908
 // ATCGTCAGGTGGGATAATCCTTACCTTTTCCTCCTC
 // +NINA_1_FC30G3VAAXX:5:1:110:908
@@ -854,8 +896,9 @@
 verbose(2,"[%s %3d] type=%s\n", __func__, __LINE__, type);
 // Setup the function hash keyed by type
 hashAdd(funcs, "tagAlign",       &validateTagAlign);
 hashAdd(funcs, "pairedTagAlign", &validatePairedTagAlign);
+hashAdd(funcs, "fasta",          &validateFasta);
 hashAdd(funcs, "fastq",          &validateFastq);
 hashAdd(funcs, "csfasta",        &validateCsfasta);
 hashAdd(funcs, "csqual",         &validateCsqual);
 hashAdd(funcs, "broadPeak",      &validateBroadPeak);