src/hg/encode/validateFiles/validateFiles.c 1.35
1.35 2010/03/19 23:16:37 braney
add a check in BAM validation to require that chroms be specified and that they are the correct chroms with the right sizes
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.34
retrieving revision 1.35
diff -b -B -U 4 -r1.34 -r1.35
--- src/hg/encode/validateFiles/validateFiles.c 17 Dec 2009 18:46:57 -0000 1.34
+++ src/hg/encode/validateFiles/validateFiles.c 19 Mar 2010 23:16:37 -0000 1.35
@@ -5,8 +5,9 @@
#include "chromInfo.h"
#include "jksql.h"
#include "twoBit.h"
#include "dnaseq.h"
+#include "bamFile.h"
static char const rcsid[] = "$Id$";
static char *version = "$Revision$";
@@ -64,10 +65,8 @@
" fastq : Fasta with quality scores (see http://maq.sourceforge.net/fastq.shtml)\n"
" csfasta : Colorspace fasta (implies -colorSpace) (see link below)\n"
" csqual : Colorspace quality (see link below)\n"
" (see http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm)\n"
- " SAM : Sequence Alignment/Map\n"
- " (see http://samtools.sourceforge.net/SAM1.pdf)\n"
" BAM : Binary Alignment/Map\n"
" (see http://samtools.sourceforge.net/SAM1.pdf)\n"
"\n"
" -chromDb=db Specify DB containing chromInfo table to validate chrom names\n"
@@ -1054,18 +1053,57 @@
}
return errs;
}
-int validateSAM(struct lineFile *lf, char *file)
+int parseBamRecord(const bam1_t *bam, void *data)
+/* bam_fetch() calls this on each bam alignment retrieved. Translate each bam
+ * into a linkedFeatures item, and add it to tg->items. */
{
-int errs = 0;
-
-return errs;
+return 0;
}
int validateBAM(struct lineFile *lf, char *file)
{
+if (chrHash == NULL)
+ errAbort("BAM validation requires the -chromInfo or -chromDb option\n");
+
int errs = 0;
+samfile_t *fh = samopen(file, "rb", NULL);
+
+if (fh == NULL)
+ errAbort("Aborting... Cannot open BAM file: %s\n", file);
+
+bam_header_t *head = fh->header;
+
+if (head == NULL)
+ errAbort("Aborting... Bad BAM header in file: %s\n", file);
+
+int ii;
+
+for(ii=0; ii < head->n_targets; ii++)
+ {
+ unsigned *size;
+
+ if ( (size = hashFindVal(chrHash, head->target_name[ii])) == NULL)
+ {
+ printf("BAM contains invalid chromosome name: %s\n",
+ head->target_name[ii]);
+ errs++;
+ }
+ else
+ {
+ if (*size != head->target_len[ii])
+ {
+ printf("BAM contains chromosome with wrong length: %s should be %d bases, not %d bases\n",
+ head->target_name[ii],
+ *size, head->target_len[ii]);
+ errs++;
+ }
+ }
+ }
+
+if (errs)
+ errAbort("Aborting... %d errors found in BAM file\n", errs);
return errs;
}
@@ -1098,8 +1136,24 @@
printf("done.\n");
return 0;
}
+static struct chromInfo *chromInfoLoadFile(char *fileName)
+{
+struct chromInfo *list = NULL, *el;
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *row[2];
+
+while (lineFileRow(lf, row))
+ {
+ el = chromInfoLoad(row);
+ slAddHead(&list, el);
+ }
+lineFileClose(&lf);
+slReverse(&list);
+return list;
+}
+
int main(int argc, char *argv[])
/* Process command line. */
{
char *type;
@@ -1144,9 +1198,9 @@
chromInfoFree(&ci);
}
else if ( (chromInfo=optionVal("chromInfo", NULL)) != NULL)
{
- if (!(ci = chromInfoLoadAll(chromInfo)))
+ if (!(ci = chromInfoLoadFile(chromInfo)))
errAbort("could not load chromInfo file %s\n", chromInfo);
chrHash = chromHash(ci);
chromInfoFree(&ci);
}
@@ -1161,9 +1215,8 @@
hashAdd(funcs, "broadPeak", &validateBroadPeak);
hashAdd(funcs, "narrowPeak", &validateNarrowPeak);
hashAdd(funcs, "gappedPeak", &validateGappedPeak);
hashAdd(funcs, "bedGraph", &validateBedGraph);
-hashAdd(funcs, "SAM", &validateSAM);
hashAdd(funcs, "BAM", &validateBAM);
//hashAdd(funcs, "test", &testFunc);
if (!(func = hashFindVal(funcs, type)))
errAbort("Cannot validate %s type files\n", type);