src/hg/encode/validateFiles/validateFiles.c 1.41
1.41 2010/05/11 02:06:22 braney
handle indels
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.40
retrieving revision 1.41
diff -b -B -U 4 -r1.40 -r1.41
--- src/hg/encode/validateFiles/validateFiles.c 29 Apr 2010 23:38:50 -0000 1.40
+++ src/hg/encode/validateFiles/validateFiles.c 11 May 2010 02:06:22 -0000 1.41
@@ -592,8 +592,103 @@
}
return TRUE;
}
+boolean checkCigarMismatches(char *file, int line, char *chrom, unsigned chromStart, char strand, char *seq, unsigned int *cigarPacked, int nCigar)
+{
+static char cacheChrom[1024];
+static struct dnaSeq *cacheSeq = NULL;
+
+if (!genome)
+ errAbort("Need to specify genome if checking BAM files");
+
+if ((line % mmCheckOneInN) != 0)
+ return TRUE; // dont check if this is not one in N
+
+// read the whole chrom
+if ((cacheChrom == NULL) || !sameString(chrom, cacheChrom))
+ {
+ freeDnaSeq(&cacheSeq);
+ int size = twoBitSeqSize(genome, chrom);
+ cacheSeq = twoBitReadSeqFragLower(genome, chrom, 0, size);
+ strcpy(cacheChrom, chrom);
+ verbose(2, "read in chrom %s size %d\n",cacheChrom, size);
+ }
+
+
+int curPosGenome = chromStart;
+int i, j;
+int mm = 0;
+char dna[10000], *dnaPtr = dna;
+for (i = 0; (i < nCigar); i++)
+ {
+ char op;
+ int size = bamUnpackCigarElement(cigarPacked[i], &op);
+ switch (op)
+ {
+ case 'M': // match or mismatch (gapless aligned block)
+ for (j=0 ; j < size; ++j)
+ {
+ *dnaPtr++ = cacheSeq->dna[curPosGenome];
+ curPosGenome++;
+ }
+ break;
+ case 'I': // inserted in query
+ case 'S': // skipped query bases at beginning or end ("soft clipping")
+ for(j=0; j < size; j++)
+ *dnaPtr++ = '-';
+ break;
+ case 'D': // deleted from query
+ case 'N': // long deletion from query (intron as opposed to small del)
+ curPosGenome += size;
+ break;
+ case 'H': // skipped query bases not stored in record's query sequence ("hard clipping")
+ case 'P': // P="silent deletion from padded reference sequence" -- ignore these.
+ break;
+ default:
+ errAbort("unrecognized CIGAR op %c -- update me", op);
+ }
+ }
+
+int length = strlen(seq);
+if (matchFirst && (matchFirst < length))
+ length = matchFirst;
+int checkLength = length;
+
+int start, incr;
+if (strand == '-')
+ {
+ start = strlen(seq) - 1;
+ incr = -1;
+ }
+else
+ {
+ start = 0;
+ incr = 1;
+ }
+
+for (i = start; (strand == '-') ? i <= 0 : i < strlen(seq); i += incr)
+ {
+ char c = tolower(seq[i]);
+ if ((dna[i] == '-') || (checkMismatch(c, dna[i])))
+ ++mm;
+
+ if (--length == 0)
+ break;
+ }
+
+
+if (mm > mismatches)
+ {
+ warn("Error [file=%s, line=%d]: too many mismatches (found %d/%d, maximum is %d) (%s: %d query %s dna %s )\n",
+ file, line, mm, checkLength, mismatches, chrom, chromStart, seq, dna);
+ return FALSE;
+ }
+
+return TRUE;
+}
+
+
boolean checkMismatchesSeq(char *file, int line, char *chrom, unsigned chromStart, unsigned chromEnd, char strand, char *seq)
{
int i, mm = 0;
struct dnaSeq *g;
@@ -1113,17 +1208,15 @@
return errs;
}
#ifdef USE_BAM
-unsigned char *seq;
-
struct bamCallbackData
-{
-char *file;
-char *chrom;
-int *errs;
-int count;
-};
+ {
+ char *file;
+ char *chrom;
+ int *errs;
+ int count;
+ };
int parseBamRecord(const bam1_t *bam, void *data)
/* bam_fetch() calls this on each bam alignment retrieved. Translate each bam
* into a linkedFeatures item, and add it to tg->items. */
@@ -1131,22 +1224,19 @@
struct bamCallbackData *bd = data;
char *chrom = bd->chrom;
char *file = bd->file;
int *errs = bd->errs;
-unsigned char *qname = &bam->data[bam->core.n_cigar];
-if (qname == NULL)
- errAbort("bad qname");
+const bam1_core_t *core = &bam->core;
-//printf("%d\n",bamIsRc(bam));
-seq = &bam->data[bam->core.l_qname + bam->core.n_cigar];
+unsigned int *cigarPacked = bam1_cigar(bam);
+char *query = bamGetQuerySequence(bam, FALSE);
+char strand = bamIsRc(bam) ? '-' : '+';
-if (bam->core.n_cigar != 1)
- errAbort("validateFiles doesn't support indel alignments at the moment");
-
- if (! checkMismatchesSeq(file, bd->count, chrom, bam->core.pos, bam->core.pos + bam->core.l_qseq, bamIsRc(bam) ? '-' : '+', bamGetQuerySequence(bam, TRUE)))
+ if (! checkCigarMismatches(file, bd->count, chrom, bam->core.pos, strand, query, cigarPacked, core->n_cigar))
{
-printf("align: ciglen %d qlen %d pos %d lenght %d strand %c\n",bam->core.n_cigar, bam->core.l_qname, bam->core.pos, bam->core.l_qseq, bamIsRc(bam) ? '-' : '+');
-printf("data: %s %s\n",bam->data, bamGetQuerySequence(bam, FALSE));
+ char *cigar = bamGetCigar(bam);
+ warn("align: ciglen %d cigar %s qlen %d pos %d lenght %d strand %c\n",bam->core.n_cigar, cigar, bam->core.l_qname, bam->core.pos, bam->core.l_qseq, bamIsRc(bam) ? '-' : '+');
+
if (++(*errs) >= maxErrors)
errAbort("Aborting .. found %d errors\n", *errs);
}
@@ -1175,8 +1265,9 @@
for(ii=0; ii < head->n_targets; ii++)
{
unsigned *size;
+ verbose(2,"has chrom %s\n", head->target_name[ii]);
if ( (size = hashFindVal(chrHash, head->target_name[ii])) == NULL)
{
if (!allowOther)
{
@@ -1203,8 +1294,11 @@
if (!genome)
return errs; // only check sequence if 2bit file specified
bam_index_t *idx = bam_index_load(file);
+
+if (idx == NULL)
+ errAbort("couldn't find index file for %s\n", file);
struct bamCallbackData *bd;
AllocVar(bd);
bd->file = file;
@@ -1221,9 +1315,9 @@
bd->chrom = head->target_name[ii];
ret = bam_fetch(fh->x.bam, idx, chromId, start, end, bd, parseBamRecord);
}
-// printf("final count %d\n", bd->count);
+ verbose(2,"final count %d\n", bd->count);
return errs;
}
#endif
@@ -1257,17 +1351,30 @@
printf("done.\n");
return 0;
}
+struct chromInfo *ourChromInfoLoad(char **row)
+/* Load a chromInfo from row fetched with select * from chromInfo
+ * from database. Dispose of this with chromInfoFree(). */
+{
+struct chromInfo *ret;
+
+AllocVar(ret);
+ret->chrom = cloneString(row[0]);
+ret->size = sqlUnsigned(row[1]);
+ret->fileName = NULL;
+return ret;
+}
+
static struct chromInfo *chromInfoLoadFile(char *fileName)
{
struct chromInfo *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[2];
while (lineFileRow(lf, row))
{
- el = chromInfoLoad(row);
+ el = ourChromInfoLoad(row);
slAddHead(&list, el);
}
lineFileClose(&lf);
slReverse(&list);