src/hg/snp/snpMask/snpMaskCutDeletions.c 1.2
1.2 2009/05/16 00:03:05 angie
Removed an old debugging message that confused the heck out of me when I saw it again today.
Index: src/hg/snp/snpMask/snpMaskCutDeletions.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/snp/snpMask/snpMaskCutDeletions.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/snp/snpMask/snpMaskCutDeletions.c 4 Feb 2008 20:16:54 -0000 1.1
+++ src/hg/snp/snpMask/snpMaskCutDeletions.c 16 May 2009 00:03:05 -0000 1.2
@@ -1,129 +1,128 @@
/* snpMaskCutDeletions -- Print genomic sequence with deletion SNPs removed. */
#include "common.h"
#include "linefile.h"
#include "dnaseq.h"
#include "twoBit.h"
static char const rcsid[] = "$Id$";
void usage()
/* Explain usage and exit. */
{
errAbort(
"snpMaskCutDeletions -- Print genomic sequence with deletion SNPs removed.\n"
"usage:\n"
" snpMaskCutDeletions snpNNN.bed db.2bit out.fa\n"
"Given a snpNNN.bed (NNN >= 125) and the corresponding 2bit assembly\n"
"sequence, write a fasta file of each assembly seq minus the sequence\n"
"from each deletion SNP.\n"
"Note1: snpNNN.bed input *must* be sorted by position.\n"
"Note2: It is a good idea to exclude SNPs with exceptions that indicate\n"
"problems, for example:\n"
"\n"
" awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \\\n"
" /cluster/data/dbSNP/NNN/human/snpNNNExceptions.bed \\\n"
" | sort -u \\\n"
" > snpNNNExcludeRsIds.txt\n"
"\n"
" grep -vFwf snpNNNExcludeRsIds.txt \\\n"
" /cluster/data/dbSNP/NNN/$organism/snpNNN.bed \\\n"
" | snpMaskCutDeletions stdin /cluster/data/$db/$db.2bit stdout \\\n"
" | faSplit byname stdin deletions/\n"
);
}
boolean isSimpleObserved(char *observed)
/* Return TRUE if observed is "-/" followed by alpha-only. */
{
int i;
if (!startsWith("-/", observed))
return FALSE;
for (i = 2; i < strlen(observed); i++)
if (!isalpha(observed[i]))
return FALSE;
return TRUE;
}
void snpMaskCutDeletions(char *snpBedFile, char *twoBitFile, char *outFile)
{
/* snpMaskCutDeletions -- Print genomic sequence with deletion SNPs removed. */
struct lineFile *lfSnp = lineFileOpen(snpBedFile, TRUE);
struct twoBitFile *tbfGenomic = twoBitOpen(twoBitFile);
FILE *outMasked = mustOpen(outFile, "w");
long long expectedTotalSize = twoBitTotalSize(tbfGenomic);
long long totalCutSnps = 0, totalCutBases = 0, totalSize = 0;
struct dnaSeq *seq = NULL;
int seqPos = 0;
char *words[17];
int wordCount;
while ((wordCount = lineFileChopTab(lfSnp, words)) > 0)
{
lineFileExpectWords(lfSnp, 17, wordCount);
char *chrom = words[0];
int chromStart = lineFileNeedFullNum(lfSnp, words, 1);
int chromEnd = lineFileNeedFullNum(lfSnp, words, 2);
char *observed = words[8];
char *class = words[10];
int weight = lineFileNeedFullNum(lfSnp, words, 16);
if (chromEnd < chromStart+1 ||
!sameString(class, "deletion") ||
weight != 1 ||
!isSimpleObserved(observed))
continue;
if (!seq || !sameString(seq->name, chrom))
{
if (seq)
{
/* Write the sequence from the last deletion point
* through the end of the chrom. */
writeSeqWithBreaks(outMasked, seq->dna+seqPos, seq->size - seqPos,
50);
totalSize += seq->size;
dnaSeqFree(&seq);
}
seq = twoBitReadSeqFrag(tbfGenomic, chrom, 0, 0);
seqPos = 0;
fprintf(outMasked, ">%s\n", chrom);
}
if (chromEnd <= seqPos)
continue;
if (chromStart > seqPos)
/* Write sequence preceding this deletion: */
writeSeqWithBreaks(outMasked, seq->dna+seqPos, chromStart - seqPos,
50);
totalCutSnps++;
/* Hop over the deleted genomic sequence: */
totalCutBases += (chromEnd - max(chromStart, seqPos));
seqPos = chromEnd;
}
if (seq)
{
/* Write the sequence from the last deletion point through the
* end of the chrom. */
writeSeqWithBreaks(outMasked, seq->dna+seqPos, seq->size - seqPos, 50);
totalSize += seq->size;
dnaSeqFree(&seq);
}
-verbose(0, "at line %d.\n", lfSnp->lineIx);
lineFileClose(&lfSnp);
twoBitClose(&tbfGenomic);
carefulClose(&outMasked);
verbose(1, "Cut %lld snps totaling %lld bases from %lld genomic bases\n",
totalCutSnps, totalCutBases, totalSize);
if (totalSize != expectedTotalSize)
verbose(0, "%s has %lld total bases, but the total number of bases "
"in sequences for which we masked snps is %lld "
"(difference is %lld)\n", twoBitFile, expectedTotalSize,
totalSize, (expectedTotalSize - totalSize));
}
int main(int argc, char *argv[])
/* Check args and call snpMaskCutDeletions. */
{
if (argc != 4)
usage();
snpMaskCutDeletions(argv[1], argv[2], argv[3]);
return 0;
}