c73ce8bdfcc20de7da303476b3aeee1d2d3e313f max Tue Mar 12 03:59:50 2024 -0700 making blat dupl seq ID error message readable for human beings, no redmine, email from JM de sainte agathe diff --git src/hg/hgBlat/hgBlat.c src/hg/hgBlat/hgBlat.c index 3b76af2..309b275 100644 --- src/hg/hgBlat/hgBlat.c +++ src/hg/hgBlat/hgBlat.c @@ -795,31 +795,36 @@ abbrv = nameClone; } else { abbrv = words[wordCount-1]; if (abbrv[0] == 0) abbrv = words[wordCount-2]; } if (hashLookup(hash, abbrv) == NULL) { freeMem(seq->name); seq->name = cloneString(abbrv); } freez(&nameClone); } } - hashAddUnique(hash, seq->name, hash); + if (hashLookup(hash, seq->name) != NULL) + errAbort("The sequence identifier '%s' is duplicated in the input. " + "FASTA sequence identifiers should be unique and they cannot contain spaces. " + "You can make them unique by adding a suffix such as _1, _2, ... to the duplicated names, e.g. '%s_1'.", + seq->name, seq->name); + hashAdd(hash, seq->name, hash); } freeHash(&hash); } int realSeqSize(bioSeq *seq, boolean isDna) /* Return size of sequence without N's or (for proteins) * X's. */ { char unknown = (isDna ? 'n' : 'X'); int i, size = seq->size, count = 0; char *s = seq->dna; for (i=0; i<size; ++i) if (s[i] != unknown) ++count; return count; }