src/hg/encode/validateFiles/validateFiles.c 1.15
1.15 2009/04/02 22:49:46 mikep
Add validation of fasta files (a single line of sequence per record, no quality scores)
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -b -B -U 4 -r1.14 -r1.15
--- src/hg/encode/validateFiles/validateFiles.c 2 Apr 2009 04:27:21 -0000 1.14
+++ src/hg/encode/validateFiles/validateFiles.c 2 Apr 2009 22:49:46 -0000 1.15
@@ -48,8 +48,9 @@
"options:\n"
" -type=(a value from the list below)\n"
" tagAlign|pairedTagAlign|broadPeak|narrowPeak|gappedPeak|bedGraph\n"
" : see http://genomewiki.cse.ucsc.edu/EncodeDCC/index.php/File_Formats\n"
+ " fasta : Fasta files (only one line of sequence, and no quality scores)\n"
" fastq : Fasta with quality scores (see http://maq.sourceforge.net/fastq.shtml)\n"
" csfasta : Colorspace fasta (implies -colorSpace) (see link below)\n"
" csqual : Colorspace quality (see link below)\n"
" (see http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm)\n"
@@ -643,8 +644,49 @@
{
return validateBedVariant(lf, file, BED_GRAPH);
}
+// fasta:
+// >VHE-245683051005-13-1-2-1704
+// GTGTTAATTTTCTTGATCTTTCGTTC
+// >VHE-245683051005-13-1-2-1704
+// CTTGCTTTCTAGTTCTTTTAATTGTG
+
+int validateFasta(struct lineFile *lf, char *file)
+/* Validate a fasta file where each record is a '>' name line followed by one
+ * sequence line.  Lines starting with '#' at the very top of the file are skipped.
+ * Returns the number of failed records; calls errAbort once maxErrors is reached. */
+{
+char *seqName, *seq;
+int line = 0;
+int errs = 0;
+boolean startOfFile = TRUE;
+verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
+while ( lineFileNext(lf, &seqName, NULL))
+    {
+    ++line;
+    if (startOfFile && *seqName == '#')
+        continue;
+    startOfFile = FALSE;
+    seq = NULL; // reset so a failed record never prints a stale or uninitialized pointer
+    if (checkSeqName(file, line, seqName, '>', "sequence name")
+        && (wantNewLine(lf, file, ++line, &seq, "fasta sequence line"))
+        && checkSeq(file, line, seq, seq, "sequence") )
+        {
+        if (printOkLines)
+            printf("%s\n%s\n", seqName, seq);
+        }
+    else
+        {
+        if (printFailLines)
+            printf("%s\n%s\n", seqName, seq ? seq : "");
+        if (++errs >= maxErrors)
+            errAbort("Aborting .. found %d errors\n", errs);
+        }
+    }
+return errs;
+}
+
// fastq:
// @NINA_1_FC30G3VAAXX:5:1:110:908
// ATCGTCAGGTGGGATAATCCTTACCTTTTCCTCCTC
// +NINA_1_FC30G3VAAXX:5:1:110:908
@@ -854,8 +896,9 @@
verbose(2,"[%s %3d] type=%s\n", __func__, __LINE__, type);
// Setup the function hash keyed by type
hashAdd(funcs, "tagAlign", &validateTagAlign);
hashAdd(funcs, "pairedTagAlign", &validatePairedTagAlign);
+hashAdd(funcs, "fasta", &validateFasta);
hashAdd(funcs, "fastq", &validateFastq);
hashAdd(funcs, "csfasta", &validateCsfasta);
hashAdd(funcs, "csqual", &validateCsqual);
hashAdd(funcs, "broadPeak", &validateBroadPeak);