a5baef17b79d806a60d886cec056f0991a4eee14
hiram
  Thu May 18 11:30:12 2017 -0700
add noSequenceVerify option to speed up procedure refs #13673

diff --git src/utils/bamToPsl/bamToPsl.c src/utils/bamToPsl/bamToPsl.c
index 603c86e..f0f64d1 100644
--- src/utils/bamToPsl/bamToPsl.c
+++ src/utils/bamToPsl/bamToPsl.c
@@ -13,50 +13,55 @@
 /* Explain usage and exit. */
 {
 errAbort(
   "bamToPsl - Convert a bam file to a psl and optionally also a fasta file that contains the reads.\n"
   "usage:\n"
   "   bamToPsl [options] in.bam out.psl\n"
   "options:\n"
   "   -fasta=output.fa - output query sequences to specified file\n"
   "   -chromAlias=file - specify a two-column file: 1: alias, 2: other name\n"
   "          for target name translation from column 1 name to column 2 name\n"
   "          names not found are passed through intact\n"
   "   -nohead          - do not output the PSL header, default has header output\n"
   "   -allowDups       - for fasta output, allow duplicate query sequences output\n"
   "                    - default is to eliminate duplicate sequences\n"
   "                    - runs much faster without the duplicate check\n"
+  "   -noSequenceVerify - when checking for dups, do not verify each sequence\n"
+  "                    - when the same name is identical, assume they are\n"
+  "                    - helps speed up the dup check but not thorough\n"
   "   -dots=N          - output progress dot(.) every N alignments processed\n"
   "\n"
   "note: a chromAlias file can be obtained from a UCSC database, e.g.:\n"
   " hgsql -N -e 'select alias,chrom from chromAlias;' hg38 > hg38.chromAlias.tab"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"fasta", OPTION_STRING},
    {"chromAlias", OPTION_STRING},
    {"nohead", OPTION_BOOLEAN},
    {"allowDups", OPTION_BOOLEAN},
+   {"noSequenceVerify", OPTION_BOOLEAN},  /* skip md5 re-check of repeated read names */
    {"dots", OPTION_INT},
    {NULL, 0},
 };
 
 static int dots = 0;
 static boolean nohead = FALSE;
 static boolean allowDups = FALSE;
+static boolean noSequenceVerify = FALSE;  /* TRUE: in the dup check, do not md5-compare the sequence of an already seen name */
 
 static struct hash *hashChromAlias(char *fileName)
 /* Read two column file into hash keyed by first column */
 {
 struct hash *hash = hashNew(0);
 struct lineFile *lf = netLineFileOpen(fileName);
 char *row[2];
 while (lineFileRow(lf, row))
     hashAdd(hash, row[0], cloneString(row[1]));
 
 lineFileClose(&lf);
 return hash;
 }
 
 static void bamToPsl(char *inBam, char *outPsl, char *outFasta, char *aliasFile)
@@ -110,54 +115,57 @@
            }
         pslTabOut(psl, f);  /* no free of this psl data, memory leak */
     }
     ++processCount;
     if (dots)
        if (0 == processCount % dots)
           verbose(1,".");
     if (faF != NULL)
         {
 	char *dna = bamGetQuerySequence(&one, TRUE);
 	char *qName = bam1_qname(&one);
         if (allowDups)
 	    faWriteNext(faF, qName, dna, strlen(dna));
         else
             {
-            char *md5sum = md5HexForString(dna);
             struct hashEl *hel = NULL;
-            if ((hel = hashLookup(fastaSums, qName)) == NULL)
+            if ((hel = hashLookup(fastaSums, qName)) == NULL) // first seen
                {
+               char *md5sum = md5HexForString(dna);
                hel = hashAdd(fastaSums, qName, md5sum);
                faWriteNext(faF, qName, dna, strlen(dna));
                }
-            else
-                {  /* verify sequence is identical for same name */
+            else if (! noSequenceVerify)  // repeated md5sum calculation
+               {
+               char *md5sum = md5HexForString(dna);
+               /* verify sequence is identical for same name */
                if (differentWord((char *)hel->val, md5sum))
                   verbose(1, "# warning: different sequence found for '%s'\n",
                              qName);
                }
             }
 	freez(&dna);
 	}
     }
     if (dots)
        verbose(1,"\n");
 
 samclose(in);
 carefulClose(&f);
 carefulClose(&faF);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 char *fastaName = optionVal("fasta", NULL);
 char *aliasFile = optionVal("chromAlias", NULL);
 dots = optionInt("dots", dots);
 nohead = optionExists("nohead");
 allowDups = optionExists("allowDups");
+noSequenceVerify = optionExists("noSequenceVerify");  /* only consulted on the fasta dup-check path, i.e. when allowDups is off */
 bamToPsl(argv[1], argv[2], fastaName, aliasFile);
 return 0;
 }