51227ee6c9fcdd3f5917826122b0b24c9fa57a9f
galt
  Fri Oct 22 15:11:27 2021 -0700
The ixIxx full-text word-length limit of 31 characters was causing it to silently discard terms that would otherwise be searchable from the .ix index. Added an option, -maxWordLength=N, to override the default of 31. Added a status report to stderr giving the number of unique words that were too long and therefore discarded and the length of the longest discarded word, plus a message that -verbose=2 will list every word that exceeded the allowed length, together with its length.

diff --git src/index/ixIxx/ixIxx.c src/index/ixIxx/ixIxx.c
index a999daa..e02ea1e 100644
--- src/index/ixIxx/ixIxx.c
+++ src/index/ixIxx/ixIxx.c
@@ -1,47 +1,54 @@
 /* ixIxx - Create indices for simple line-oriented file of format 
  * <symbol> <free text>. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "trix.h"
 
 
 /* Variables that can be set from command line. */
 int prefixSize = trixPrefixSize;
 int binSize = 64*1024;
 
+int maxFailedWordLength = 0;
+int maxWordLength = 31;
+
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "ixIxx - Create indices for simple line-oriented file of format \n"
   "<symbol> <free text>\n"
   "usage:\n"
   "   ixIxx in.text out.ix out.ixx\n"
   "Where out.ix is a word index, and out.ixx is an index into the index.\n"
   "options:\n"
   "   -prefixSize=N Size of prefix to index on in ixx.  Default is 5.\n"
   "   -binSize=N Size of bins in ixx.  Default is 64k.\n"
+  "   -maxWordLength=N Maximum allowed word length. \n"
+  "     Words with more characters than this limit are ignored and will not appear in index or be searchable.  Default is %d.\n"
+    , maxWordLength
   );
 }
 
 static struct optionSpec options[] = {
    {"prefixSize", OPTION_INT},
    {"binSize", OPTION_INT},
+   {"maxWordLength", OPTION_INT},
    {NULL, 0},
 };
 
 bool wordMiddleChars[256];  /* Characters that may be part of a word. */
 bool wordBeginChars[256];
 
 void initCharTables()
 /* Initialize tables that describe characters. */
 {
 int c;
 for (c=0; c<256; ++c)
     if (isalnum(c))
        wordBeginChars[c] = wordMiddleChars[c] = TRUE;
 wordBeginChars['_'] = wordMiddleChars['_'] = TRUE;
 wordMiddleChars['.'] = TRUE;
@@ -87,102 +94,123 @@
     int wordIx;		/* Word number within doc. */
     };
 
 int wordPosCmp(const void *va, const void *vb)
 /* Compare two wordPos by itemId. */
 {
 const struct wordPos *a = *((struct wordPos **)va);
 const struct wordPos *b = *((struct wordPos **)vb);
 int dif;
 dif = strcmp(a->itemId, b->itemId);
 if (dif == 0)
     dif = a->wordIx - b->wordIx;
 return dif;
 }
 
-void indexWords(struct hash *wordHash, 
-	char *itemId, char *text, struct hash *itemIdHash)
+int indexWords(struct hash *wordHash, 
+	char *itemId, char *text, struct hash *itemIdHash, struct hash *failedWordHash)
 /* Index words in text and store in hash. */
 {
 char *s, *e = text;
-char word[32];
+char word[maxWordLength+1];
 int len;
 struct hashEl *hel;
 struct wordPos *pos;
 int wordIx;
+int failedCount = 0;
 
 tolowers(text);
 itemId = hashStoreName(itemIdHash, itemId);
 for (wordIx=1; ; ++wordIx)
     {
     s = skipToWord(e);
     if (s == NULL)
         break;
     e = skipOutWord(s);
     len = e - s;
     if (len < ArraySize(word))
         {
 	memcpy(word, s, len);
 	word[len] = 0;
 	hel = hashLookup(wordHash, word);
 	if (hel == NULL)
 	    hel = hashAdd(wordHash, word, NULL);
 	AllocVar(pos);
 	pos->itemId = itemId;
 	pos->wordIx = wordIx;
 	pos->next = hel->val;
 	hel->val = pos;
 	}
+    else
+	{
+        char *failedWord=cloneStringZ(s,len);
+	hel = hashLookup(failedWordHash, failedWord);
+	if (hel == NULL)
+	    {
+	    hashAdd(failedWordHash, failedWord, NULL);
+	    verbose(2, "word [%s] length %d is longer than max length %d.\n", failedWord, len, maxWordLength);
+	    ++failedCount;
+            maxFailedWordLength = max(len, maxFailedWordLength);
 	    }
+        freez(&failedWord);
+	}
+    }
+return failedCount;
 }
 
 void writeIndexHash(struct hash *wordHash, char *fileName)
 /* Write index to file.  This pretty much destroys the hash in the
  * process. */
 {
 struct hashEl *el, *els = hashElListHash(wordHash);
 FILE *f = mustOpen(fileName, "w");
 slSort(&els, hashElCmp);
 
 for (el = els; el != NULL; el = el->next)
     {
     struct wordPos *pos;
     fprintf(f, "%s", el->name);
     slSort(&el->val, wordPosCmp);
     for (pos = el->val; pos != NULL; pos = pos->next)
 	fprintf(f, " %s,%d", pos->itemId, pos->wordIx);
     fprintf(f, "\n");
     }
 carefulClose(&f);
 hashElFreeList(&els);
 }
 
 void makeIx(char *inFile, char *outIndex)
 /* Create an index file. */
 {
 struct lineFile *lf = lineFileOpen(inFile, TRUE);
-struct hash *wordHash = newHash(20), *itemIdHash = newHash(20);
+struct hash *wordHash = newHash(20), *failedWordHash = newHash(20), *itemIdHash = newHash(20);
 char *line;
+int failedWordCount = 0;
 initCharTables();
 while (lineFileNextReal(lf, &line))
     {
     char *id, *text;
     id = nextWord(&line);
     text = skipLeadingSpaces(line);
-     indexWords(wordHash, id, text, itemIdHash);
+    failedWordCount += indexWords(wordHash, id, text, itemIdHash, failedWordHash);
     }
 writeIndexHash(wordHash, outIndex);
+if (failedWordCount)
+    {
+    verbose(1, "%d words were longer than limit %d length and were ignored. Run with -verbose=2 to see them.\n", failedWordCount, maxWordLength);
+    verbose(1, "The longest failed word length was %d.\n", maxFailedWordLength);
+    }
 }
 
 void setPrefix(char *word, char *prefix)
 /* Copy first part of word to prefix.  If need be end pad with spaces. */
 {
 int len = strlen(word);
 if (len >= prefixSize)
     memcpy(prefix, word, prefixSize);
 else
     {
     memset(prefix, ' ', prefixSize);
     memcpy(prefix, word, len);
     }
 }
 
@@ -243,20 +271,21 @@
 
 void ixIxx(char *inText, char *outIx, char *outIxx)
 /* ixIxx - Create indices for simple line-oriented file of format 
  * <symbol> <free text>. */
 {
 makeIx(inText, outIx);
 makeIxx(outIx, outIxx);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 prefixSize = optionInt("prefixSize", prefixSize);
 binSize = optionInt("binSize", binSize);
+maxWordLength = optionInt("maxWordLength", maxWordLength);
 if (argc != 4)
     usage();
 ixIxx(argv[1], argv[2], argv[3]);
 return 0;
 }