src/hg/encode/validateFiles/validateFiles.c 1.35

1.35 2010/03/19 23:16:37 braney
add a check in BAM validation to require that chroms be specified and that they are the correct chroms with the right sizes
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.34
retrieving revision 1.35
diff -b -B -U 4 -r1.34 -r1.35
--- src/hg/encode/validateFiles/validateFiles.c	17 Dec 2009 18:46:57 -0000	1.34
+++ src/hg/encode/validateFiles/validateFiles.c	19 Mar 2010 23:16:37 -0000	1.35
@@ -5,8 +5,9 @@
 #include "chromInfo.h"
 #include "jksql.h"
 #include "twoBit.h"
 #include "dnaseq.h"
+#include "bamFile.h"
 
 static char const rcsid[] = "$Id$";
 static char *version = "$Revision$";
 
@@ -64,10 +65,8 @@
   "         fastq     : Fasta with quality scores (see http://maq.sourceforge.net/fastq.shtml)\n"
   "         csfasta   : Colorspace fasta (implies -colorSpace) (see link below)\n"
   "         csqual    : Colorspace quality (see link below)\n"
   "                     (see http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm)\n"
-  "         SAM       : Sequence Alignment/Map\n"
-  "                     (see http://samtools.sourceforge.net/SAM1.pdf)\n"
   "         BAM       : Binary Alignment/Map\n"
   "                     (see http://samtools.sourceforge.net/SAM1.pdf)\n"
   "\n"
   "   -chromDb=db                  Specify DB containing chromInfo table to validate chrom names\n"
@@ -1054,18 +1053,57 @@
     }
 return errs;
 }
 
-int validateSAM(struct lineFile *lf, char *file)
+int parseBamRecord(const bam1_t *bam, void *data)
+/* bam_fetch() calls this on each bam alignment retrieved.  Translate each bam 
+ * into a linkedFeatures item, and add it to tg->items. */
 {
-int errs = 0;
-
-return errs;
+return 0;
 }
 
 int validateBAM(struct lineFile *lf, char *file)
 {
+if (chrHash == NULL)
+    errAbort("BAM validation requires the -chromInfo or -chromDb option\n");
+
 int errs = 0;
+samfile_t *fh = samopen(file, "rb", NULL);
+
+if (fh == NULL)
+    errAbort("Aborting... Cannot open BAM file: %s\n", file);
+
+bam_header_t *head = fh->header;
+
+if (head == NULL)
+    errAbort("Aborting... Bad BAM header in file: %s\n", file);
+
+int ii;
+
+for(ii=0; ii < head->n_targets; ii++)
+    {
+    unsigned *size;
+
+    if ( (size = hashFindVal(chrHash, head->target_name[ii])) == NULL)
+	{
+	printf("BAM contains invalid chromosome name: %s\n", 
+	    head->target_name[ii]);
+	errs++;
+	}
+    else
+	{
+	if (*size != head->target_len[ii])
+	    {
+	    printf("BAM contains chromosome with wrong length: %s should be %d bases, not %d bases\n", 
+		head->target_name[ii],
+		*size, head->target_len[ii]);
+	    errs++;
+	    }
+	}
+    }
+
+if (errs)
+    errAbort("Aborting... %d errors found in BAM file\n", errs);
 
 return errs;
 }
 
@@ -1098,8 +1136,24 @@
 printf("done.\n");
 return 0;
 }
 
+static struct chromInfo *chromInfoLoadFile(char *fileName) 
+{
+struct chromInfo *list = NULL, *el;
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *row[2];
+
+while (lineFileRow(lf, row))
+    {
+    el = chromInfoLoad(row);
+    slAddHead(&list, el);
+    }
+lineFileClose(&lf);
+slReverse(&list);
+return list;
+}
+
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 char *type;
@@ -1144,9 +1198,9 @@
     chromInfoFree(&ci);
     }
 else if ( (chromInfo=optionVal("chromInfo", NULL)) != NULL)
     {
-    if (!(ci = chromInfoLoadAll(chromInfo)))
+    if (!(ci = chromInfoLoadFile(chromInfo)))
 	errAbort("could not load chromInfo file %s\n", chromInfo);
     chrHash = chromHash(ci);
     chromInfoFree(&ci);
     }
@@ -1161,9 +1215,8 @@
 hashAdd(funcs, "broadPeak",      &validateBroadPeak);
 hashAdd(funcs, "narrowPeak",     &validateNarrowPeak);
 hashAdd(funcs, "gappedPeak",     &validateGappedPeak);
 hashAdd(funcs, "bedGraph",       &validateBedGraph);
-hashAdd(funcs, "SAM",            &validateSAM);
 hashAdd(funcs, "BAM",            &validateBAM);
 //hashAdd(funcs, "test", &testFunc);
 if (!(func = hashFindVal(funcs, type)))
     errAbort("Cannot validate %s type files\n", type);