src/hg/snp/snpLoad/snpNcbiToUcsc.c 1.9
1.9 2009/05/18 17:25:09 angie
Small tweaks for dbSNP 130: broadened some formats, raised MAX_SNPID.
Index: src/hg/snp/snpLoad/snpNcbiToUcsc.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/snp/snpLoad/snpNcbiToUcsc.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -b -B -U 4 -r1.8 -r1.9
--- src/hg/snp/snpLoad/snpNcbiToUcsc.c 15 Aug 2008 17:46:49 -0000 1.8
+++ src/hg/snp/snpLoad/snpNcbiToUcsc.c 18 May 2009 17:25:09 -0000 1.9
@@ -228,8 +228,11 @@
for (i = 0; i < VALID_BITS; i++)
validBitStringsUsed[i] = FALSE;
for (i = 0; i < MAX_FUNC+1; i++)
functionStringsUsed[i] = FALSE;
+/* We might never have a class=unknown, but set the bit anyway because it is the
+ * default value: */
+classStringsUsed[0] = 1;
}
void tallyMoltype(struct lineFile *lf, char *molType)
/* Keep track of what molTypes are actually used. */
@@ -1159,14 +1162,14 @@
"[0-9]+ ?BP ((INDEL|INSERTION|DELETED))?\\)"
"\\/-(\\/[ACGT]+)*$";
const char *observedNamedOddballFormat =
"^\\([A-Z0-9 ]+\\)" /* there's all sorts of stuff in there now */
- "(\\/-)?(\\/[ACGT]+)*$";
+ "(\\/-)?(\\/\\(?[A-Z0-9 ]+)*\\)?$";
/* class=no-var (6): no SNPs use this class (intended for null results). */
const char *observedMixedFormat =
"^-\\/[ACGT]+(\\/["IUPAC"]+)+$";
const char *observedMnpFormat =
- "^[ACGT]+(\\/["IUPAC"]+)+$";
+ "^[ACGTN]+(\\/["IUPAC"]+)+$";
/* there is only one instance of iupac ambiguous */
boolean checkObservedFormat(struct lineFile *lf, char *class, char *observed)
/* For each value of class, make sure observed fits the expected pattern.
@@ -1274,10 +1277,11 @@
writeException(ObservedTooLong);
return FALSE;
}
else if (regexec(&obsMixed, observed, 0, NULL, 0) != 0)
- lineFileAbort(lf, "Encountered something that doesn't fit "
- "observedMixedFormat: %s", observed);
+ // Single oddball in human 130 -- not worth adding to regex and masking.
+ warn("Line %d of %s: Encountered something that doesn't fit observedMixedFormat: %s",
+ lf->lineIx, lf->fileName, observed);
flagIupac(observed);
}
else if (sameString(class, "mnp"))
{
@@ -1444,9 +1448,10 @@
/* If SNP ids exceed this, and there are <= 16M total SNP items, then
* hashing would be more memory efficient. Until then, just use an array
* as big as the hash would need to alloc. */
-#define MAX_SNPID 64 * 1024 * 1024
+/* SNP130: now 18M items, max ID 74315166. */
+#define MAX_SNPID 80 * 1024 * 1024
int lastRsId = 0;
struct coords **mappings = NULL;
struct hash *chrIds = NULL;