src/hg/snp/snpMask/snpMaskSingle.c 4898794edd81be5285ea6e544acbedeaeb31bf78

4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/snp/snpMask/snpMaskSingle.c src/hg/snp/snpMask/snpMaskSingle.c
index a73a71d..b16d50f 100644
--- src/hg/snp/snpMask/snpMaskSingle.c
+++ src/hg/snp/snpMask/snpMaskSingle.c
@@ -1,270 +1,270 @@
 /* snpMaskSingle - Print sequence using IUPAC ambiguous nucleotide codes for single base substitutions. */
 
 /* Copyright (C) 2011 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "dnaseq.h"
 #include "dnautil.h"
 #include "fa.h"
 #include "twoBit.h"
 
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
 "snpMaskSingle - print sequence using IUPAC ambiguous nucleotide codes for single base substitutions\n"
 "usage:\n"
 "    snpMaskSingle snpNNN.bed db.2bit out.fa [diffObserved.txt]\n"
 "Given a snpNNN.bed (NNN >= 125) and the corresponding 2bit assembly\n"
 "sequence, write a fasta file with an IUPAC ambiguous nucleotide code\n"
 "representing the observed alleles at the position of each single-base\n"
 "SNP. If optional arg [diffOberved.txt] is given, then differing observed\n"
 "strings at the same location will be written to that file.\n"
 "Note1: This assumes that snpNNN.bed is sorted by position -- results will\n"
 "be incorrect if rows for different chroms are shuffled!\n"
 "Note2: It is a good idea to filter the snpNNN.bed input to exclude SNPs\n"
 "with exceptions that indicate problems, for example:\n"
 "    \n"
 "    awk '$5 ~ /^MultipleAlignments|ObservedWrongFormat|ObservedMismatch/ {print $4;}' \\\n"
 "      /cluster/data/dbSNP/NNN/human/snpNNNExceptions.bed \\\n"
 "      | sort -u \\\n"
 "      > snpNNNExcludeRsIds.txt\n"
 "\n"
 "    grep -vFwf snpNNNExcludeRsIds.txt \\\n"
 "      /cluster/data/dbSNP/NNN/$organism/snpNNN.bed \\\n"
 "    | snpMaskSingle stdin /cluster/data/$db/$db.2bit stdout \\\n"
 "    | faSplit byname stdin substitutions/\n"
 );
 }
 
 /* As Heather suggested, libify the IUPAC nucleotide stuff in dnaUtil? */
 
 /* This is a character array to be indexed by a number with one bit set for
  * each nucleotide found at that base -- it translates those base bitmaps
  * into IUPAC nucleotide characters. */
 char *iupacNt =  /* #  = A C G T */
 "?"              /* 0  = 0 0 0 0 (none) */
 "T"              /* 1  = 0 0 0 1 (T) */
 "G"              /* 2  = 0 0 1 0 (G) */
 "K"              /* 3  = 0 0 1 1 (G/T) */
 "C"              /* 4  = 0 1 0 0 (C) */
 "Y"              /* 5  = 0 1 0 1 (C/T) */
 "S"              /* 6  = 0 1 1 0 (C/G) */
 "B"              /* 7  = 0 1 1 1 (C/G/T) */
 "A"              /* 8  = 1 0 0 0 (A) */
 "W"              /* 9  = 1 0 0 1 (A/T) */
 "R"              /* 10 = 1 0 1 0 (A/G) */
 "D"              /* 11 = 1 0 1 1 (A/G/T) */
 "M"              /* 12 = 1 1 0 0 (A/C) */
 "H"              /* 13 = 1 1 0 1 (A/C/T) */
 "V"              /* 14 = 1 1 1 0 (A/C/G) */
 "N"              /* 15 = 1 1 1 1 (A/C/G/T) */;
 
 #define NT_BITOFFSET_A 3
 #define NT_BITOFFSET_C 2
 #define NT_BITOFFSET_G 1
 #define NT_BITOFFSET_T 0
 
 UBYTE iupacNtToBaseBits(char iupac)
 /* Translate the given character into a base bitmap -- which is the offset
  * of the character in iupacNt (or 0 if the character is not IUPAC). */
 {
 iupac = toupper(iupac);
 char *ptr = strchr(iupacNt, iupac);
 if (ptr == NULL)
     return 0;
 else
     return (UBYTE)(ptr - iupacNt);
 }
 
 char baseBitsToIupacNt(UBYTE baseBits)
 /* Translate the given base bitmap into the corresponding IUPAC code. 
  * A bitmap of 0 results in the non-IUPAC character '?'. */
 {
 if (baseBits > 0xf)
     errAbort("baseBits must be at most 15 (0xf, 0b1111).");
 return iupacNt[baseBits];
 }
 
 UBYTE baseBitsComplement(UBYTE baseBits)
 /* Complement each base included in baseBits by swizzling the bit
  * positions of baseBits: A<->T, C<->G. */
 {
 if (baseBits > 0xf)
     errAbort("baseBits must be at most 15 (0xf).");
 UBYTE baseBitsComp = 0;
 baseBitsComp |= ((baseBits >> NT_BITOFFSET_A) & 1) << NT_BITOFFSET_T;
 baseBitsComp |= ((baseBits >> NT_BITOFFSET_C) & 1) << NT_BITOFFSET_G;
 baseBitsComp |= ((baseBits >> NT_BITOFFSET_T) & 1) << NT_BITOFFSET_A;
 baseBitsComp |= ((baseBits >> NT_BITOFFSET_G) & 1) << NT_BITOFFSET_C;
 return baseBitsComp;
 }
 
 char iupacNtComplement(char iupac)
 /* Return the IUPAC nucleotide character representing the complements of
  * all bases represented by iupac -- but if iupac is not an iupac character,
  * leave it unchanged. */
 {
 UBYTE baseBits = iupacNtToBaseBits(iupac);
 if (baseBits == 0)
     return iupac;
 else
     return baseBitsToIupacNt(baseBitsComplement(baseBits));
 }
 
 boolean iupacNtIsAmbiguous(char iupac)
 /* Return TRUE if iupac is not A, C, G or T. */
 {
 iupac = toupper(iupac);
 return (iupac != 'A' && iupac != 'C' && iupac != 'G' && iupac != 'T');
 }
 
 UBYTE dnaToAggregateBaseBits(DNA *dna)
 /* Build up a base bitmap representing all bases that appear in dna by
  * OR-ing the bits for each char of dna.  Non-IUPAC characters are
  * ignored (bits=0000, no-op in OR). */
 {
 int len = strlen(dna);
 UBYTE baseBits = 0;
 int i;
 for (i = 0;  i < len;  i++)
     baseBits |= iupacNtToBaseBits(dna[i]);
 return baseBits;
 }
 
 char *bitsToObserved(UBYTE bits)
 /* Make a dbSNP-style observed string from bits. */
 {
 char observed[16];
 observed[0] = '\0';
 
 if ((bits >> NT_BITOFFSET_A) & 0x1)
     strcat(observed, "A");
 if ((bits >> NT_BITOFFSET_C) & 0x1)
     {
     if (observed[0] != '\0')
 	strcat(observed, "/");
     strcat(observed, "C");
     }
 if ((bits >> NT_BITOFFSET_G) & 0x1)
     {
     if (observed[0] != '\0')
 	strcat(observed, "/");
     strcat(observed, "G");
     }
 if ((bits >> NT_BITOFFSET_T) & 0x1)
     {
     if (observed[0] != '\0')
 	strcat(observed, "/");
     strcat(observed, "T");
     }
 return cloneString(observed);
 }
 
 char mergeBaseAndObserved(char oldIupac, char *observed, char strand,
 			  char *chrom, int chromStart, int chromEnd, FILE *diffObsOut)
 /* Return an IUPAC base encompassing the old base and all of the bases
  * contained in observed.  Alert the developer if different SNPs at the
  * same base have different bases in observed. */
 {
 boolean isAmbig = iupacNtIsAmbiguous(oldIupac);
 UBYTE oldBits = iupacNtToBaseBits(oldIupac);
 UBYTE obsBits = dnaToAggregateBaseBits(observed);
 if (strand == '-')
     obsBits = baseBitsComplement(obsBits);
 if (isAmbig && obsBits != oldBits)
     {
     if (diffObsOut != NULL)
 	fprintf(diffObsOut, "differing observed strings at %s|%d|%d: %s, %s\n",
 		chrom, chromStart, chromEnd,
 		bitsToObserved(oldBits), bitsToObserved(obsBits));
     else
 	verbose(1, "differing observed strings at %s|%d|%d: %s, %s\n",
 		chrom, chromStart, chromEnd,
 		bitsToObserved(oldBits), bitsToObserved(obsBits));
     }
 return baseBitsToIupacNt(oldBits | obsBits);
 }
 
 
 void snpMaskSingle(char *snpBedFile, char *twoBitFile, char *outFile, char *diffObsFile)
 {
 /* snpMaskSingle - Print sequence using IUPAC ambiguous nucleotide codes for single base substitutions. */
 struct lineFile *lfSnp = lineFileOpen(snpBedFile, TRUE);
 struct twoBitFile *tbfGenomic = twoBitOpen(twoBitFile);
 FILE *outMasked = mustOpen(outFile, "w");
 FILE *diffObsOut = (diffObsFile == NULL) ? NULL : mustOpen(diffObsFile, "w");
 long long expectedTotalSize = twoBitTotalSize(tbfGenomic);
 long long totalMaskedSnps = 0, totalMaskedBases = 0, totalSize = 0;
 int prevMaskedChromStart = -1;
 struct dnaSeq *seq = NULL;
 char *words[17];
 int wordCount;
 
 while ((wordCount = lineFileChopTab(lfSnp, words)) > 0)
     {
     lineFileExpectWords(lfSnp, 17, wordCount);
     char *chrom = words[0];
     int chromStart = lineFileNeedFullNum(lfSnp, words, 1);
     int chromEnd = lineFileNeedFullNum(lfSnp, words, 2);
     char strand = words[5][0];
     char *observed = words[8];
     char *class = words[10];
     int weight = lineFileNeedFullNum(lfSnp, words, 16);
     if (chromEnd != chromStart + 1 ||
 	!sameString(class, "single") ||
 	weight != 1)
 	continue;
     if (!seq || !sameString(seq->name, chrom))
 	{
 	if (seq)
 	    {
 	    faWriteNext(outMasked, seq->name, seq->dna, seq->size);
 	    totalSize += seq->size;
 	    dnaSeqFree(&seq);
 	    }
 	seq = twoBitReadSeqFrag(tbfGenomic, chrom, 0, 0);
 	}
     boolean inRep = islower(seq->dna[chromStart]);
     char oldCode = toupper(seq->dna[chromStart]);
     char newCode = mergeBaseAndObserved(oldCode, observed, strand,
 					chrom, chromStart, chromEnd, diffObsOut);
     if (oldCode != newCode)
 	{
 	if (inRep)
 	    newCode = tolower(newCode);
 	seq->dna[chromStart] = newCode;
 	totalMaskedSnps++;
 	if (chromStart != prevMaskedChromStart)
 	    totalMaskedBases++;
 	prevMaskedChromStart = chromStart;
 	}
     }
 if (seq)
     {
     faWriteNext(outMasked, seq->name, seq->dna, seq->size);
     totalSize += seq->size;
     dnaSeqFree(&seq);
     }
 lineFileClose(&lfSnp);
 twoBitClose(&tbfGenomic);
 carefulClose(&outMasked);
 verbose(1, "Masked %lld snps in %lld out of %lld genomic bases\n",
 	totalMaskedSnps, totalMaskedBases, totalSize);
 if (totalSize != expectedTotalSize)
     verbose(0, "%s has %lld total bases, but the total number of bases "
 	    "in sequences for which we masked snps is %lld "
 	    "(difference is %lld)\n", twoBitFile, expectedTotalSize,
 	    totalSize, (expectedTotalSize - totalSize));
 }
 
 int main(int argc, char *argv[])
 /* Check args and call snpMaskSingle. */
 {
 if (argc != 4 && argc != 5)
     usage();
 char *diffObsFile = (argc == 5) ? argv[4] : NULL;
 snpMaskSingle(argv[1], argv[2], argv[3], diffObsFile);
 return 0;
 }