src/hg/encode/validateFiles/validateFiles.c 1.41

1.41 2010/05/11 02:06:22 braney
handle indels
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.40
retrieving revision 1.41
diff -b -B -U 4 -r1.40 -r1.41
--- src/hg/encode/validateFiles/validateFiles.c	29 Apr 2010 23:38:50 -0000	1.40
+++ src/hg/encode/validateFiles/validateFiles.c	11 May 2010 02:06:22 -0000	1.41
@@ -592,8 +592,103 @@
     }
 return TRUE;
 }
 
+boolean checkCigarMismatches(char *file, int line, char *chrom, unsigned chromStart, char strand, char *seq, unsigned int *cigarPacked, int nCigar)
+{
+static char cacheChrom[1024];
+static struct dnaSeq *cacheSeq = NULL;
+
+if (!genome)
+    errAbort("Need to specify genome if checking BAM files");
+
+if ((line % mmCheckOneInN) != 0)
+    return TRUE; // dont check if this is not one in N
+
+// read the whole chrom
+if ((cacheChrom == NULL) || !sameString(chrom, cacheChrom))
+    {
+    freeDnaSeq(&cacheSeq);
+    int size =  twoBitSeqSize(genome, chrom);
+    cacheSeq = twoBitReadSeqFragLower(genome, chrom, 0, size);
+    strcpy(cacheChrom, chrom);
+    verbose(2, "read in chrom %s size %d\n",cacheChrom, size);
+    }
+
+
+int curPosGenome = chromStart;
+int i, j;
+int mm = 0;
+char dna[10000], *dnaPtr = dna;
+for (i = 0;   (i < nCigar);  i++)
+    {
+    char op;
+    int size = bamUnpackCigarElement(cigarPacked[i], &op);
+    switch (op)
+	{
+	case 'M': // match or mismatch (gapless aligned block)
+            for (j=0 ; j < size; ++j)
+                {
+                *dnaPtr++ = cacheSeq->dna[curPosGenome];
+                curPosGenome++;
+                }
+	    break;
+	case 'I': // inserted in query
+	case 'S': // skipped query bases at beginning or end ("soft clipping")
+            for(j=0; j < size; j++)
+                *dnaPtr++ = '-';
+	    break;
+	case 'D': // deleted from query
+	case 'N': // long deletion from query (intron as opposed to small del)
+	    curPosGenome += size;
+	    break;
+	case 'H': // skipped query bases not stored in record's query sequence ("hard clipping")
+	case 'P': // P="silent deletion from padded reference sequence" -- ignore these.
+	    break;
+	default:
+	    errAbort("unrecognized CIGAR op %c -- update me", op);
+	}
+    }
+
+int length = strlen(seq);
+if (matchFirst && (matchFirst < length))
+    length = matchFirst;
+int checkLength = length;
+
+int start, incr;
+if (strand == '-')
+    {
+    start = strlen(seq) - 1;
+    incr = -1;
+    }
+else
+    {
+    start = 0;
+    incr = 1;
+    }
+
+for (i = start; (strand == '-') ? i <= 0 : i < strlen(seq); i += incr)
+    {
+    char c = tolower(seq[i]);
+    if ((dna[i] == '-') || (checkMismatch(c,  dna[i])))
+        ++mm;
+
+    if (--length == 0)
+        break;
+    }
+
+
+if (mm > mismatches)
+    {
+    warn("Error [file=%s, line=%d]: too many mismatches (found %d/%d, maximum is %d) (%s: %d  query %s  dna %s )\n",
+         file, line, mm, checkLength, mismatches, chrom, chromStart, seq, dna);
+    return FALSE;
+    }
+
+return TRUE;
+}
+
+
 boolean checkMismatchesSeq(char *file, int line, char *chrom, unsigned chromStart, unsigned chromEnd, char strand, char *seq)
 {
 int i, mm = 0;
 struct dnaSeq *g;
@@ -1113,17 +1208,15 @@
 return errs;
 }
 
 #ifdef USE_BAM
-unsigned char *seq;
-
 struct bamCallbackData
-{
-char *file;
-char *chrom;
-int *errs;
-int count;
-};
+    {
+    char *file;
+    char *chrom;
+    int *errs;
+    int count;
+    };
 
 int parseBamRecord(const bam1_t *bam, void *data)
 /* bam_fetch() calls this on each bam alignment retrieved.  Translate each bam 
  * into a linkedFeatures item, and add it to tg->items. */
@@ -1131,22 +1224,19 @@
 struct bamCallbackData *bd = data;
 char *chrom = bd->chrom;
 char *file = bd->file;
 int *errs = bd->errs;
-unsigned char *qname = &bam->data[bam->core.n_cigar];
-if (qname == NULL)
-    errAbort("bad qname");
+const bam1_core_t *core = &bam->core;
 
-//printf("%d\n",bamIsRc(bam));
-seq = &bam->data[bam->core.l_qname + bam->core.n_cigar];
+unsigned int *cigarPacked = bam1_cigar(bam);
+char *query = bamGetQuerySequence(bam, FALSE);
+char strand = bamIsRc(bam) ? '-' : '+';
 
-if (bam->core.n_cigar != 1)
-    errAbort("validateFiles doesn't support indel alignments at the moment");
-
-   if (! checkMismatchesSeq(file, bd->count, chrom, bam->core.pos,  bam->core.pos + bam->core.l_qseq, bamIsRc(bam) ? '-' : '+', bamGetQuerySequence(bam, TRUE)))
+   if (! checkCigarMismatches(file, bd->count, chrom, bam->core.pos, strand, query, cigarPacked, core->n_cigar))
     {
-printf("align: ciglen %d qlen %d pos %d lenght %d strand %c\n",bam->core.n_cigar, bam->core.l_qname, bam->core.pos,  bam->core.l_qseq, bamIsRc(bam) ? '-' : '+');
-printf("data: %s %s\n",bam->data,  bamGetQuerySequence(bam, FALSE));
+    char *cigar = bamGetCigar(bam);
+    warn("align: ciglen %d cigar %s qlen %d pos %d lenght %d strand %c\n",bam->core.n_cigar, cigar, bam->core.l_qname, bam->core.pos,  bam->core.l_qseq, bamIsRc(bam) ? '-' : '+');
+
     if (++(*errs) >= maxErrors)
         errAbort("Aborting .. found %d errors\n", *errs);
     }
     
@@ -1175,8 +1265,9 @@
 for(ii=0; ii < head->n_targets; ii++)
     {
     unsigned *size;
 
+    verbose(2,"has chrom %s\n", head->target_name[ii]);
     if ( (size = hashFindVal(chrHash, head->target_name[ii])) == NULL)
 	{
 	if (!allowOther)
 	    {
@@ -1203,8 +1294,11 @@
 if (!genome)
     return errs; // only check sequence if 2bit file specified
 
 bam_index_t *idx = bam_index_load(file);
+
+if (idx == NULL)
+    errAbort("couldn't find index file for %s\n", file);
 struct bamCallbackData *bd;
 AllocVar(bd);
 
 bd->file = file;
@@ -1221,9 +1315,9 @@
     bd->chrom = head->target_name[ii];
     ret = bam_fetch(fh->x.bam, idx, chromId, start, end, bd, parseBamRecord);
 
     }
-//    printf("final count %d\n", bd->count);
+    verbose(2,"final count %d\n", bd->count);
 
 return errs;
 }
 #endif
@@ -1257,17 +1351,30 @@
 printf("done.\n");
 return 0;
 }
 
+struct chromInfo *ourChromInfoLoad(char **row)
+/* Load a chromInfo from row fetched with select * from chromInfo
+ * from database.  Dispose of this with chromInfoFree(). */
+{
+struct chromInfo *ret;
+
+AllocVar(ret);
+ret->chrom = cloneString(row[0]);
+ret->size = sqlUnsigned(row[1]);
+ret->fileName = NULL;
+return ret;
+}
+
 static struct chromInfo *chromInfoLoadFile(char *fileName) 
 {
 struct chromInfo *list = NULL, *el;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *row[2];
 
 while (lineFileRow(lf, row))
     {
-    el = chromInfoLoad(row);
+    el = ourChromInfoLoad(row);
     slAddHead(&list, el);
     }
 lineFileClose(&lf);
 slReverse(&list);