c73ce8bdfcc20de7da303476b3aeee1d2d3e313f
max
  Tue Mar 12 03:59:50 2024 -0700
making blat dupl seq ID error message readable for human beings, no redmine, email from JM de sainte agathe

diff --git src/hg/hgBlat/hgBlat.c src/hg/hgBlat/hgBlat.c
index 3b76af2..309b275 100644
--- src/hg/hgBlat/hgBlat.c
+++ src/hg/hgBlat/hgBlat.c
@@ -795,31 +795,36 @@
 		    abbrv = nameClone;
 		}
 	    else
 		{
 		abbrv = words[wordCount-1];
 		if (abbrv[0] == 0) abbrv = words[wordCount-2];
 		}
 	    if (hashLookup(hash, abbrv) == NULL)
 	        {
 		freeMem(seq->name);
 		seq->name = cloneString(abbrv);
 		}
 	    freez(&nameClone);
 	    }
 	}
-    hashAddUnique(hash, seq->name, hash);
+    if (hashLookup(hash, seq->name) != NULL)
+        errAbort("The sequence identifier '%s' is duplicated in the input. "
+                "FASTA sequence identifiers should be unique and they cannot contain spaces. "
+                "You can make them unique by adding a suffix such as _1, _2, ... to the duplicated names, e.g. '%s_1'.", 
+                seq->name, seq->name);
+    hashAdd(hash, seq->name, hash);
     }
 freeHash(&hash);
 }
 
 int realSeqSize(bioSeq *seq, boolean isDna)
 /* Count the residues in seq that differ from the unknown-residue
  * placeholder: 'n' for DNA, 'X' for protein (case-sensitive). */
 {
 char *residues = seq->dna, skip = 'X';
 int remaining = seq->size, realCount = 0;
 if (isDna) skip = 'n';
 for ( ; remaining > 0; --remaining)
     realCount += (residues[remaining-1] != skip);
 return realCount;
 }