src/hg/snp/snpLoad/snpNcbiToUcsc.c 1.12
1.12 2010/06/04 05:36:03 angie
Fixed detection of multiple mappings to handle chr*_hap correctly, e.g. if a SNP maps to one chr6 pos and one chr6_cox_hap2 pos it is not multiply mapped. Other misc changes to accommodate human snp131.
Index: src/hg/snp/snpLoad/snpNcbiToUcsc.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/snp/snpLoad/snpNcbiToUcsc.c,v
retrieving revision 1.11
retrieving revision 1.12
diff -b -B -U 4 -r1.11 -r1.12
--- src/hg/snp/snpLoad/snpNcbiToUcsc.c 31 Aug 2009 23:38:46 -0000 1.11
+++ src/hg/snp/snpLoad/snpNcbiToUcsc.c 4 Jun 2010 05:36:03 -0000 1.12
@@ -143,19 +143,20 @@
boolean locTypeStringsUsed[MAX_LOCTYPE+1];
/* These strings and their positions must correspond to the values in
* $ftpBcp/SnpValidationCode.bcp.gz / ASN. */
-#define MAX_VALID 31
-#define VALID_BITS 5
+#define VALID_BITS 6
+#define VALID_1KG 5
+#define MAX_VALID ((1<<VALID_BITS)-1)
char *validBitStrings[] = {
"by-cluster",
"by-frequency",
"by-submitter",
"by-2hit-2allele",
"by-hapmap",
-"by-1000genomes", // UCSC's local addition
+"by-1000genomes",
};
-boolean validBitStringsUsed[VALID_BITS+1];
+boolean validBitStringsUsed[VALID_BITS];
/* These strings and their positions must correspond to the values in
* $ftpBcp/SnpFunctionCode.bcp.gz / ASN (but ASN doesn't have
* the newer encodings (> 10). */
@@ -190,9 +191,10 @@
/* 41 */ "nonsense",
/* 42 */ "missense",
/* 43 */ NULL,
/* 44 */ "frameshift",
-/*45*/NULL,/*46*/NULL,/*47*/NULL,/*48*/NULL,/*49*/NULL,
+/* 45 */ "cds-indel",
+/*46*/NULL,/*47*/NULL,/*48*/NULL,/*49*/NULL,
/* fifties: extensions of 5 above (untranslated): */
/*50*/NULL,/*51*/NULL,/*52*/NULL,
/* 53 */ "untranslated-3",
/* 54 */ NULL,
@@ -228,9 +230,9 @@
for (i = 0; i < MAX_CLASS+2+1; i++)
classStringsUsed[i] = FALSE;
for (i = 0; i < MAX_LOCTYPE+1; i++)
locTypeStringsUsed[i] = FALSE;
-for (i = 0; i < VALID_BITS+1; i++)
+for (i = 0; i < VALID_BITS; i++)
validBitStringsUsed[i] = FALSE;
for (i = 0; i < MAX_FUNC+1; i++)
functionStringsUsed[i] = FALSE;
/* We might never have a class=unknown, but set the bit anyway because it is the
@@ -301,9 +303,9 @@
writeCodes(f, classStrings, classStringsUsed, MAX_CLASS+2+1);
fprintf(f,
") NOT NULL default 'unknown',\n"
" valid set(");
-writeCodes(f, validBitStrings, validBitStringsUsed, VALID_BITS+1+1);
+writeCodes(f, validBitStrings, validBitStringsUsed, VALID_BITS+1);
fprintf(f,
") NOT NULL default 'unknown',\n"
" avHet float NOT NULL default '0',\n"
" avHetSE float NOT NULL default '0',\n"
@@ -670,10 +672,10 @@
validList = dyStringNew(validSize);
dyStringClear(validList);
if (oneKGenomesRsIds != NULL && is1000Genomes(rsId))
{
- dyStringPrintf(validList, "%s,", validBitStrings[VALID_BITS]); // UCSC "by-1000genomes"
- validBitStringsUsed[VALID_BITS] = TRUE;
+ dyStringPrintf(validList, "%s,", validBitStrings[VALID_1KG]); // UCSC "by-1000genomes"
+ validBitStringsUsed[VALID_1KG] = TRUE;
}
if (validNum > 0)
{
/* Mask off each bit, most significant first. If set, append
@@ -1211,19 +1213,19 @@
"^(-|[AGCT]+)(\\/["IUPAC" ]+)*$";
const char *observedHetFormat =
"^\\(HETEROZYGOUS\\)(\\/[ACGT])*$";
const char *observedMicrosatFormat =
- "^\\(["IUPAC"]+\\)[0-9]+(\\/[0-9]+)*(\\/-)?(\\/[ACGT]+)*$";
+ "^(\\/?\\(["IUPAC"]+\\)[0-9]*)+(\\/[0-9]+)*(\\/-)?(\\/[ACGT]+)*$";
const char *observedNamedFormat =
"^\\((LARGE(INSERTION|DELETION))|"
"[0-9]+ ?BP ((INDEL|INSERTION|DELETED))?\\)"
"\\/-(\\/[ACGT]+)*$";
const char *observedNamedOddballFormat =
- "^\\([A-Z0-9 ]+\\)" /* there's all sorts of stuff in there now */
- "(\\/-)?(\\/\\(?[A-Z0-9 ]+)*\\)?$";
+ "^\\([A-Z0-9 ]+\\)?(\\/\\([A-Z0-9 ]+\\))*" /* there's all sorts of stuff in there now */
+ "(\\/-)?(\\/\\(?[A-Z0-9 ]+)*\\)*$";
/* class=no-var (6): no SNPs use this class (intended for null results). */
const char *observedMixedFormat =
- "^-\\/[ACGT]+(\\/["IUPAC"]+)+$";
+ "^-\\/[ACGTN]+(\\/["IUPAC"]+)+$";
const char *observedMnpFormat =
"^[ACGTN]+(\\/["IUPAC"]+)+$";
/* there is only one instance of iupac ambiguous */
@@ -1557,17 +1559,47 @@
void reportMultipleMappings()
/* Print exceptions for SNPs that have multiple mappings to the genome. */
{
-int id;
+int chromCounts[nextChrId];
+int id, i;
for (id = 0; id <= lastRsId; id++)
if (mappings && slCount(mappings[id]) > 1)
{
+ boolean gotMult = FALSE;
+ char chromBase[256];
+ chromBase[0] = '\0';
+ for (i = 0; i < nextChrId; i++)
+ chromCounts[i] = 0;
struct coords *map;
for (map = mappings[id]; map != NULL; map = map->next)
+ {
+ // Is it mapped to multiple chroms, *not* including _hap's?
+ char *chrom = idChrs[map->chrId];
+ if (isEmpty(chromBase))
+ {
+ safecpy(chromBase, sizeof(chromBase), chrom);
+ if (strstr(chromBase, "_hap"))
+ {
+ char *p = strchr(chromBase, '_');
+ *p = '\0';
+ }
+ }
+ // Is it mapped more than one time to the same chrom?
+ chromCounts[map->chrId]++;
+ if (!startsWith(chromBase, chrom) || chromCounts[map->chrId] > 1)
+ {
+ gotMult = TRUE;
+ break;
+ }
+ }
+ if (gotMult)
+ {
+ for (map = mappings[id]; map != NULL; map = map->next)
writeExceptionDetailed(idChrs[map->chrId], map->start, map->end,
id, MultipleAlignments);
}
+ }
}
/* Several fields are numeric unless they have a placeholder value: */