2bd5caf8ce518d13cefc69de0d084228cf7712fe
hiram
  Fri Nov 4 13:36:48 2016 -0700
adding useful options refs #13673

diff --git src/utils/bamToPsl/bamToPsl.c src/utils/bamToPsl/bamToPsl.c
index 195fa2c..603c86e 100644
--- src/utils/bamToPsl/bamToPsl.c
+++ src/utils/bamToPsl/bamToPsl.c
@@ -1,80 +1,163 @@
 /* bamToPsl - Convert a bam file to a psl and optionally also a fasta file that contains the reads.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "bamFile.h"
 #include "psl.h"
 #include "fa.h"
+#include "md5.h"
+#include "net.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "bamToPsl - Convert a bam file to a psl and optionally also a fasta file that contains the reads.\n"
   "usage:\n"
-  "   bamToPsl in.bam out.psl\n"
+  "   bamToPsl [options] in.bam out.psl\n"
   "options:\n"
-  "   -fasta=output.fa\n"
+  "   -fasta=output.fa - output query sequences to specified file\n"
+  "   -chromAlias=file - specify a two-column file: 1: alias, 2: other name\n"
+  "          for target name translation from column 1 name to column 2 name\n"
+  "          names not found are passed through intact\n"
+  "   -nohead          - do not output the PSL header, default has header output\n"
+  "   -allowDups       - for fasta output, allow duplicate query sequences output\n"
+  "                    - default is to eliminate duplicate sequences\n"
+  "                    - runs much faster without the duplicate check\n"
+  "   -dots=N          - output progress dot(.) every N alignments processed\n"
+  "\n"
+  "note: a chromAlias file can be obtained from a UCSC database, e.g.:\n"
+  " hgsql -N -e 'select alias,chrom from chromAlias;' hg38 > hg38.chromAlias.tab"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"fasta", OPTION_STRING},
+   {"chromAlias", OPTION_STRING},
+   {"nohead", OPTION_BOOLEAN},
+   {"allowDups", OPTION_BOOLEAN},
+   {"dots", OPTION_INT},
    {NULL, 0},
 };
 
-void bamToPsl(char *inBam, char *outPsl, char *outFasta)
-/* bamToPsl - Convert a bam file to a psl and optionally also a fasta file that contains the reads.. */
+static int dots = 0;
+static boolean nohead = FALSE;
+static boolean allowDups = FALSE;
+
+static struct hash *hashChromAlias(char *fileName)
+/* Read two column file into hash keyed by first column */
+{
+struct hash *hash = hashNew(0);
+struct lineFile *lf = netLineFileOpen(fileName);
+char *row[2];
+while (lineFileRow(lf, row))
+    hashAdd(hash, row[0], cloneString(row[1]));
+
+lineFileClose(&lf);
+return hash;
+}
+
+static void bamToPsl(char *inBam, char *outPsl, char *outFasta, char *aliasFile)
+/* bamToPsl - Convert a bam file to a psl and optionally also a
+   fasta file that contains the reads.. */
 {
+unsigned long long processCount = 0;
+       /* record md5sums to avoid duplicate fasta output
+          key is name, value is md5sum */
+struct hash *fastaSums = NULL;  /* initialize later if needed */
 samfile_t *in = bamMustOpenLocal(inBam, "rb", NULL);
 bam_header_t *head = sam_hdr_read(in);
 if (head == NULL)
     errAbort("Aborting ... bad BAM header in %s", inBam);
 
-/* Open up psl output and write header. */
+/* Open up psl output and write header (if allowed). */
 FILE *f = mustOpen(outPsl, "w");
+if (! nohead)
     pslWriteHead(f);
 
+/* Optionally use alias file */
+struct hash *chromAlias = NULL;  /* initialize later if needed */
+if (aliasFile != NULL)
+    chromAlias = hashChromAlias(aliasFile);
+
 /* Optionally open up fasta output */
 FILE *faF = NULL;
 if (outFasta != NULL)
+    {
     faF = mustOpen(outFasta, "w");
+    fastaSums = newHashExt(20, TRUE);  /* using stack local memory */
+    }
 
 bam1_t one;
 ZeroVar(&one);	// This seems to be necessary!
 /* Write next sequence to fa file. */
 for (;;)
     {
     if (sam_read1(in, head, &one) < 0)
 	{
 	break;
 	}
     struct psl *psl = bamToPslUnscored(&one, head);
     if (psl != NULL)
-         pslTabOut(psl, f);
+       {
+       if (chromAlias)
+           {
+           struct hashEl *hel = NULL;
+           if ((hel = hashLookup(chromAlias, psl->tName)) != NULL)
+              psl->tName = cloneString((char *)hel->val); /* memory leak */
+           }
+        pslTabOut(psl, f);  /* no free of this psl data, memory leak */
+    }
+    ++processCount;
+    if (dots)
+       if (0 == processCount % dots)
+          verbose(1,".");
     if (faF != NULL)
         {
 	char *dna = bamGetQuerySequence(&one, TRUE);
 	char *qName = bam1_qname(&one);
+        if (allowDups)
 	    faWriteNext(faF, qName, dna, strlen(dna));
+        else
+            {
+            char *md5sum = md5HexForString(dna);
+            struct hashEl *hel = NULL;
+            if ((hel = hashLookup(fastaSums, qName)) == NULL)
+                {
+                hel = hashAdd(fastaSums, qName, md5sum);
+                faWriteNext(faF, qName, dna, strlen(dna));
+                }
+            else
+                {  /* verify sequence is identical for same name */
+                if (differentWord((char *)hel->val, md5sum))
+                   verbose(1, "# warning: different sequence found for '%s'\n",
+                          qName);
+                }
+            }
 	freez(&dna);
 	}
     }
+    if (dots)
+       verbose(1,"\n");
 
 samclose(in);
 carefulClose(&f);
 carefulClose(&faF);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 char *fastaName = optionVal("fasta", NULL);
-bamToPsl(argv[1], argv[2], fastaName);
+char *aliasFile = optionVal("chromAlias", NULL);
+dots = optionInt("dots", dots);
+nohead = optionExists("nohead");
+allowDups = optionExists("allowDups");
+bamToPsl(argv[1], argv[2], fastaName, aliasFile);
 return 0;
 }