src/hg/snp/snpLoad/snpNcbiToUcsc.c 1.10

1.10 2009/05/22 20:24:58 angie
Added -1000GenomesRsIds for adding our own by-1000genomes validation status using a flatfile of 1000Genomes IDs from dbSNP. Also, treat NULL like MISSING, for input from one big sql query instead of from little queries and join commands.
Index: src/hg/snp/snpLoad/snpNcbiToUcsc.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/snp/snpLoad/snpNcbiToUcsc.c,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 4 -r1.9 -r1.10
--- src/hg/snp/snpLoad/snpNcbiToUcsc.c	18 May 2009 17:25:09 -0000	1.9
+++ src/hg/snp/snpLoad/snpNcbiToUcsc.c	22 May 2009 20:24:58 -0000	1.10
@@ -21,12 +21,12 @@
 errAbort(
 "snpNcbiToUcsc - Reformat NCBI SNP field values into UCSC, and flag exceptions.\n"
 "usage:\n"
 "   snpNcbiToUcsc ucscNcbiSnp.bed db.2bit snpNNN\n"
-/* could add options to provide support for different dbSNP versions.
 "options:\n"
-"   -xxx=XXX\n"
-*/
+"   -1000GenomesRsIds=file      Text file with one number per line, numerically\n"
+"                               sorted, specifying RS IDs for submissions by the\n"
+"                               1000 Genomes Project.\n"
 "\n"
 "Converts NCBI's representation of SNP data into UCSC's, and checks for\n"
 "consistency.  Typically NNN is the dbSNP release number; snpNNN is the\n"
 "SNP track's main table name.  This program was written for dbSNP 128, and\n"
@@ -55,11 +55,14 @@
   );
 }
 
 static struct optionSpec options[] = {
+    {"1000GenomesRsIds", OPTION_STRING},
    {NULL, 0},
 };
 
+char *oneKGenomesRsIds = NULL;
+
 #define UPDATE_CODE "update this program and possibly hgTracks/hgc/hgTrackUi."
 
 /*************************** IMPORTANT *********************************
  * NCBI gives integer codes for several fields that we translate into
@@ -148,10 +151,11 @@
 "by-frequency",
 "by-submitter",
 "by-2hit-2allele",
 "by-hapmap",
+"by-1000genomes", // UCSC's local addition
 };
-boolean validBitStringsUsed[VALID_BITS];
+boolean validBitStringsUsed[VALID_BITS+1];
 
 /* These strings and their positions must correspond to the values in
  * $ftpBcp/SnpFunctionCode.bcp.gz / ASN (but ASN doesn't have
  * the newer encodings (> 10). */
@@ -224,17 +228,19 @@
 for (i = 0;  i < MAX_CLASS+2+1;  i++)
     classStringsUsed[i] = FALSE;
 for (i = 0;  i < MAX_LOCTYPE+1;  i++)
     locTypeStringsUsed[i] = FALSE;
-for (i = 0;  i < VALID_BITS;  i++)
+for (i = 0;  i < VALID_BITS+1;  i++)
     validBitStringsUsed[i] = FALSE;
 for (i = 0;  i < MAX_FUNC+1;  i++)
     functionStringsUsed[i] = FALSE;
 /* We might never have a class=unknown, but set the bit anyway because it is the 
  * default value: */
 classStringsUsed[0] = 1;
 }
 
+#define isMissing(val) (sameString("MISSING", val) || sameString("NULL", val))
+
 void tallyMoltype(struct lineFile *lf, char *molType)
 /* Keep track of what molTypes are actually used. */
 {
 int i;
@@ -243,9 +249,9 @@
 	{
 	molTypeStringsUsed[i] = TRUE;
 	return;
 	}
-if (sameString(molType, "MISSING"))
+if (isMissing(molType))
     molTypeStringsUsed[0] = TRUE;
 else
     lineFileAbort(lf, "Unrecognized molType %s", molType);
 }
@@ -295,9 +301,9 @@
 writeCodes(f, classStrings, classStringsUsed, MAX_CLASS+2+1);
 fprintf(f,
 ") NOT NULL default 'unknown',\n"
 "  valid set(");
-writeCodes(f, validBitStrings, validBitStringsUsed, VALID_BITS+1);
+writeCodes(f, validBitStrings, validBitStringsUsed, VALID_BITS+1+1);
 fprintf(f,
 ") NOT NULL default 'unknown',\n"
 "  avHet float NOT NULL default '0',\n"
 "  avHetSE float NOT NULL default '0',\n"
@@ -596,25 +602,80 @@
 safef(class, classSize, "%s", classStrings[classNum]);
 classStringsUsed[classNum] = TRUE;
 }
 
+static void chopTrailingComma(char *str)
+/* MySQL doesn't like trailing commas in set values, so if present, chop it. */
+{
+int last = strlen(str) - 1;
+if (last >= 0 && str[last] == ',')
+    str[last] = '\0';
+}
+
+#define ONEKG_MAXCOUNT (8 * 1024 * 1024)
+
+int *oneKGIds = NULL;
+int oneKGIdCount = 0;
+
+void read1KGIds()
+/* Slurp in file of numerically sorted rs IDs into array of ints. */
+{
+struct lineFile *lf = lineFileOpen(oneKGenomesRsIds, TRUE);
+char *words[1];
+oneKGIds = needMem(ONEKG_MAXCOUNT * sizeof(int));
+while (lineFileRow(lf, words))
+    {
+    oneKGIds[oneKGIdCount++] = lineFileNeedFullNum(lf, words, 0);
+    if (oneKGIdCount >= 2 && oneKGIds[oneKGIdCount-1] <= oneKGIds[oneKGIdCount-2])
+	lineFileAbort(lf, "Error: IDs must be sorted numerically and unique (use sort -nu).");
+    }
+}
+
+boolean binSearchInt(int *target, int targetSize, int query)
+/* Perform binary search in sorted array of integers, return TRUE if found. */
+{
+int low = 0, high = targetSize-1;
+while (high >= low)
+    {
+    int mid = (low + high) >> 1;
+    if (query < target[mid])
+	high = mid-1;
+    else if (query > target[mid])
+	low = mid+1;
+    else
+	return TRUE;
+    }
+return FALSE;
+}
+
+static boolean is1000Genomes(int rsId)
+/* Return TRUE if rsID appears in numerically sorted file named by -1000GenomesRsIds. */
+{
+if (oneKGIds == NULL)
+    read1KGIds();
+return binSearchInt(oneKGIds, oneKGIdCount, rsId);
+}
+
 void processValid(struct lineFile *lf, int validNum,
-		  char valid[], size_t validSize)
+		  char valid[], size_t validSize, int rsId)
 /* Translate and check validation status bit flags. */
 {
 static struct dyString *validList = NULL;
 if (validNum < 0)
     validNum = 0;
 if (validNum > MAX_VALID)
-    lineFileAbort(lf, "Unrecognized class number %d (max is %d).\n"
+    lineFileAbort(lf, "Unrecognized validation number %d (max is %d).\n"
 		  "If %d is valid now, "UPDATE_CODE,
 		  validNum, MAX_VALID, validNum);
 if (validList == NULL)
     validList = dyStringNew(validSize);
 dyStringClear(validList);
-if (validNum == 0)
-    dyStringAppend(validList, "unknown");
-else
+if (oneKGenomesRsIds != NULL && is1000Genomes(rsId))
+    {
+    dyStringPrintf(validList, "%s,", validBitStrings[VALID_BITS]); // UCSC "by-1000genomes"
+    validBitStringsUsed[VALID_BITS] = TRUE;
+    }
+if (validNum > 0)
     {
     /* Mask off each bit, most significant first.  If set, append 
      * string value to form a comma-separated list. */
     int bit;
@@ -626,14 +687,13 @@
 	    validBitStringsUsed[bit] = TRUE;
 	    }
 	}
     }
-safef(valid, validSize, "%s", validList->string);
-if (endsWith(valid, ","))
-    {
-    int last = strlen(valid) - 1;
-    valid[last] = '\0';
-    }
+if (isEmpty(validList->string))
+    safecpy(valid, validSize, "unknown");
+else
+    safef(valid, validSize, "%s", validList->string);
+chopTrailingComma(valid);
 }
 
 void processFunc(struct lineFile *lf, char *funcCode,
 		 char func[], size_t funcSize)
@@ -643,9 +703,9 @@
 if (funcList == NULL)
     funcList = dyStringNew(funcSize);
 dyStringClear(funcList);
 
-if (sameString("MISSING", funcCode))
+if (isMissing(funcCode))
     {
     dyStringAppend(funcList, functionStrings[0]);
     functionStringsUsed[0] = TRUE;
     }
@@ -664,13 +724,9 @@
 	functionStringsUsed[codes[i]] = TRUE;
 	}
     }
 safef(func, funcSize, "%s", funcList->string);
-if (endsWith(func, ","))
-    {
-    int last = strlen(func) - 1;
-    func[last] = '\0';
-    }
+chopTrailingComma(func);
 }
 
 void processLocType(struct lineFile *lf, int locTypeNum,
 		    char locType[], size_t locTypeSize)
@@ -790,9 +846,9 @@
     }
 
 dyStringClear(expanded);
 /* Common case: no expansion required; don't bother with regex. */
-if (sameString("MISSING", refNcbiEnc))
+if (isMissing(refNcbiEnc))
     {
     dyStringAppend(expanded, "unknown");
     writeError("Missing refNCBI");
     return expanded->string;
@@ -1323,9 +1379,9 @@
 boolean checkObserved(struct lineFile *lf, char *class, char *observed,
 		      char *refAllele)
 /* If there is a formatting problem, don't proceed with other checks. */
 {
-if (sameString(observed, "MISSING"))
+if (isMissing(observed))
     {
     writeError("Missing observed value (deleted SNP?).");
     return FALSE;
     }
@@ -1516,12 +1572,12 @@
 	}
 }
 
 
-/* Several fields are numeric unless they have the placeholder "MISSING": */
-#define missingOrInt(lf, row, i) (sameString("MISSING", row[i]) ? -1 : lineFileNeedFullNum(lf, row, i))
+/* Several fields are numeric unless they have a placeholder value: */
+#define missingOrInt(lf, row, i) (isMissing(row[i]) ? -1 : lineFileNeedFullNum(lf, row, i))
 
-#define missingOrDouble(lf, row, i) (sameString("MISSING", row[i]) ? 0.0 : lineFileNeedDouble(lf, row, i))
+#define missingOrDouble(lf, row, i) (isMissing(row[i]) ? 0.0 : lineFileNeedDouble(lf, row, i))
 
 
 void snpNcbiToUcsc(char *rawFileName, char *twoBitFileName, char *outRoot)
 /* snpNcbiToUcsc - Reformat NCBI SNP field values into UCSC, and flag exceptions.. */
@@ -1587,9 +1643,9 @@
     char func[1024];
     char locType[128];
     char refAllele[MAX_SNPSIZE+1];
     processClass(lf, classNum, class, sizeof(class));
-    processValid(lf, validNum, valid, sizeof(valid));
+    processValid(lf, validNum, valid, sizeof(valid), rsId);
     processFunc(lf, funcCode, func, sizeof(func));
     processLocType(lf, locTypeNum, locType, sizeof(locType));
     refNCBI = processRefAllele(lf, refNCBI, locType);
     checkNcbiChrStart(ncbiChrStart);
@@ -1661,8 +1717,9 @@
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
+oneKGenomesRsIds = optionVal("1000GenomesRsIds", oneKGenomesRsIds);
 if (argc != 4)
     usage();
 #ifdef DEBUG
 pushCarefulMemHandler(1024 * 1024 * 1024);