src/hg/snp/snpLoad/snpNcbiToUcsc.c 1.9

1.9 2009/05/18 17:25:09 angie
Small tweaks for dbSNP 130: broadened some formats, raised MAX_SNPID.
Index: src/hg/snp/snpLoad/snpNcbiToUcsc.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/snp/snpLoad/snpNcbiToUcsc.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -b -B -U 4 -r1.8 -r1.9
--- src/hg/snp/snpLoad/snpNcbiToUcsc.c	15 Aug 2008 17:46:49 -0000	1.8
+++ src/hg/snp/snpLoad/snpNcbiToUcsc.c	18 May 2009 17:25:09 -0000	1.9
@@ -228,8 +228,11 @@
 for (i = 0;  i < VALID_BITS;  i++)
     validBitStringsUsed[i] = FALSE;
 for (i = 0;  i < MAX_FUNC+1;  i++)
     functionStringsUsed[i] = FALSE;
+/* We might never see class=unknown, but mark it as used anyway because it is
+ * the default value: */
+classStringsUsed[0] = 1;
 }
 
 void tallyMoltype(struct lineFile *lf, char *molType)
 /* Keep track of what molTypes are actually used. */
@@ -1159,14 +1162,14 @@
     "[0-9]+ ?BP ((INDEL|INSERTION|DELETED))?\\)"
     "\\/-(\\/[ACGT]+)*$";
 const char *observedNamedOddballFormat =
     "^\\([A-Z0-9 ]+\\)" /* there's all sorts of stuff in there now */
-    "(\\/-)?(\\/[ACGT]+)*$";
+    "(\\/-)?(\\/\\(?[A-Z0-9 ]+)*\\)?$";
 /* class=no-var (6): no SNPs use this class (intended for null results). */
 const char *observedMixedFormat =
     "^-\\/[ACGT]+(\\/["IUPAC"]+)+$";
 const char *observedMnpFormat =
-    "^[ACGT]+(\\/["IUPAC"]+)+$";
+    "^[ACGTN]+(\\/["IUPAC"]+)+$";
 /* there is only one instance of iupac ambiguous */
 
 boolean checkObservedFormat(struct lineFile *lf, char *class, char *observed)
 /* For each value of class, make sure observed fits the expected pattern.
@@ -1274,10 +1277,11 @@
 	writeException(ObservedTooLong);
 	return FALSE;
 	}
     else if (regexec(&obsMixed, observed, 0, NULL, 0) != 0)
-	lineFileAbort(lf, "Encountered something that doesn't fit "
-		      "observedMixedFormat: %s", observed);
+	// Single oddball in human 130 -- not worth adding to regex and masking.
+	warn("Line %d of %s: Encountered something that doesn't fit observedMixedFormat: %s",
+	     lf->lineIx, lf->fileName, observed);
     flagIupac(observed);
     }
 else if (sameString(class, "mnp"))
     {
@@ -1444,9 +1448,10 @@
 
 /* If SNP ids exceed this, and there are <= 16M total SNP items, then 
  * hashing would be more memory efficient.  Until then, just use an array 
  * as big as the hash would need to alloc. */
-#define MAX_SNPID 64 * 1024 * 1024
+/* SNP130: now 18M items, max ID 74315166. */
+#define MAX_SNPID 80 * 1024 * 1024
 int lastRsId = 0;
 struct coords **mappings = NULL;
 
 struct hash *chrIds = NULL;