src/hg/snp/snpLoad/snpNcbiToUcsc.c 1.12

1.12 2010/06/04 05:36:03 angie
Fixed detection of multiple mappings to handle chr*_hap correctly, e.g. if a SNP maps to one chr6 pos and one chr6_cox_hap2 pos it is not multiply mapped. Other misc changes to accommodate human snp131.
Index: src/hg/snp/snpLoad/snpNcbiToUcsc.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/snp/snpLoad/snpNcbiToUcsc.c,v
retrieving revision 1.11
retrieving revision 1.12
diff -b -B -U 4 -r1.11 -r1.12
--- src/hg/snp/snpLoad/snpNcbiToUcsc.c	31 Aug 2009 23:38:46 -0000	1.11
+++ src/hg/snp/snpLoad/snpNcbiToUcsc.c	4 Jun 2010 05:36:03 -0000	1.12
@@ -143,19 +143,20 @@
 boolean locTypeStringsUsed[MAX_LOCTYPE+1];
 
 /* These strings and their positions must correspond to the values in
  * $ftpBcp/SnpValidationCode.bcp.gz / ASN. */
-#define MAX_VALID 31
-#define VALID_BITS 5
+#define VALID_BITS 6
+#define VALID_1KG 5
+#define MAX_VALID ((1<<VALID_BITS)-1)
 char *validBitStrings[] = {
 "by-cluster",
 "by-frequency",
 "by-submitter",
 "by-2hit-2allele",
 "by-hapmap",
-"by-1000genomes", // UCSC's local addition
+"by-1000genomes",
 };
-boolean validBitStringsUsed[VALID_BITS+1];
+boolean validBitStringsUsed[VALID_BITS];
 
 /* These strings and their positions must correspond to the values in
  * $ftpBcp/SnpFunctionCode.bcp.gz / ASN (but ASN doesn't have
  * the newer encodings (> 10). */
@@ -190,9 +191,10 @@
 /* 41 */ "nonsense",
 /* 42 */ "missense",
 /* 43 */ NULL,
 /* 44 */ "frameshift",
-/*45*/NULL,/*46*/NULL,/*47*/NULL,/*48*/NULL,/*49*/NULL,
+/* 45 */ "cds-indel",
+/*46*/NULL,/*47*/NULL,/*48*/NULL,/*49*/NULL,
 /* fifties: extensions of 5 above (untranslated): */
 /*50*/NULL,/*51*/NULL,/*52*/NULL,
 /* 53 */ "untranslated-3",
 /* 54 */ NULL,
@@ -228,9 +230,9 @@
 for (i = 0;  i < MAX_CLASS+2+1;  i++)
     classStringsUsed[i] = FALSE;
 for (i = 0;  i < MAX_LOCTYPE+1;  i++)
     locTypeStringsUsed[i] = FALSE;
-for (i = 0;  i < VALID_BITS+1;  i++)
+for (i = 0;  i < VALID_BITS;  i++)
     validBitStringsUsed[i] = FALSE;
 for (i = 0;  i < MAX_FUNC+1;  i++)
     functionStringsUsed[i] = FALSE;
 /* We might never have a class=unknown, but set the bit anyway because it is the 
@@ -301,9 +303,9 @@
 writeCodes(f, classStrings, classStringsUsed, MAX_CLASS+2+1);
 fprintf(f,
 ") NOT NULL default 'unknown',\n"
 "  valid set(");
-writeCodes(f, validBitStrings, validBitStringsUsed, VALID_BITS+1+1);
+writeCodes(f, validBitStrings, validBitStringsUsed, VALID_BITS+1);
 fprintf(f,
 ") NOT NULL default 'unknown',\n"
 "  avHet float NOT NULL default '0',\n"
 "  avHetSE float NOT NULL default '0',\n"
@@ -670,10 +672,10 @@
     validList = dyStringNew(validSize);
 dyStringClear(validList);
 if (oneKGenomesRsIds != NULL && is1000Genomes(rsId))
     {
-    dyStringPrintf(validList, "%s,", validBitStrings[VALID_BITS]); // UCSC "by-1000genomes"
-    validBitStringsUsed[VALID_BITS] = TRUE;
+    dyStringPrintf(validList, "%s,", validBitStrings[VALID_1KG]); // UCSC "by-1000genomes"
+    validBitStringsUsed[VALID_1KG] = TRUE;
     }
 if (validNum > 0)
     {
     /* Mask off each bit, most significant first.  If set, append 
@@ -1211,19 +1213,19 @@
     "^(-|[AGCT]+)(\\/["IUPAC" ]+)*$";
 const char *observedHetFormat =
     "^\\(HETEROZYGOUS\\)(\\/[ACGT])*$";
 const char *observedMicrosatFormat =
-    "^\\(["IUPAC"]+\\)[0-9]+(\\/[0-9]+)*(\\/-)?(\\/[ACGT]+)*$";
+    "^(\\/?\\(["IUPAC"]+\\)[0-9]*)+(\\/[0-9]+)*(\\/-)?(\\/[ACGT]+)*$";
 const char *observedNamedFormat =
     "^\\((LARGE(INSERTION|DELETION))|"
     "[0-9]+ ?BP ((INDEL|INSERTION|DELETED))?\\)"
     "\\/-(\\/[ACGT]+)*$";
 const char *observedNamedOddballFormat =
-    "^\\([A-Z0-9 ]+\\)" /* there's all sorts of stuff in there now */
-    "(\\/-)?(\\/\\(?[A-Z0-9 ]+)*\\)?$";
+    "^\\([A-Z0-9 ]+\\)?(\\/\\([A-Z0-9 ]+\\))*" /* there's all sorts of stuff in there now */
+    "(\\/-)?(\\/\\(?[A-Z0-9 ]+)*\\)*$";
 /* class=no-var (6): no SNPs use this class (intended for null results). */
 const char *observedMixedFormat =
-    "^-\\/[ACGT]+(\\/["IUPAC"]+)+$";
+    "^-\\/[ACGTN]+(\\/["IUPAC"]+)+$";
 const char *observedMnpFormat =
     "^[ACGTN]+(\\/["IUPAC"]+)+$";
 /* there is only one instance of iupac ambiguous */
 
@@ -1557,17 +1559,47 @@
 
 void reportMultipleMappings()
 /* Print exceptions for SNPs that have multiple mappings to the genome. */
 {
-int id;
+int chromCounts[nextChrId];
+int id, i;
 for (id = 0;  id <= lastRsId;  id++)
     if (mappings && slCount(mappings[id]) > 1)
 	{
+	boolean gotMult = FALSE;
+	char chromBase[256];
+	chromBase[0] = '\0';
+	for (i = 0;  i < nextChrId;  i++)
+	    chromCounts[i] = 0;
 	struct coords *map;
 	for (map = mappings[id];  map != NULL;  map = map->next)
+	    {
+	    // Is it mapped to multiple chroms, *not* including _hap's?
+	    char *chrom = idChrs[map->chrId];
+	    if (isEmpty(chromBase))
+		{
+		safecpy(chromBase, sizeof(chromBase), chrom);
+		if (strstr(chromBase, "_hap"))
+		    {
+		    char *p = strchr(chromBase, '_');
+		    *p = '\0';
+		    }
+		}
+	    // Is it mapped more than one time to the same chrom?
+	    chromCounts[map->chrId]++;
+	    if (!startsWith(chromBase, chrom) || chromCounts[map->chrId] > 1)
+		{
+		gotMult = TRUE;
+		break;
+		}
+	    }
+	if (gotMult)
+	    {
+	    for (map = mappings[id];  map != NULL;  map = map->next)
 	    writeExceptionDetailed(idChrs[map->chrId], map->start, map->end,
 				   id, MultipleAlignments);
 	}
+	}
 }
 
 
 /* Several fields are numeric unless they have a placeholder value: */