bd19d0f0854ea7e27017395d96715b61071b3391
markd
  Thu Mar 7 22:49:24 2024 -0800
restructure code to make it easier to modify and then simplified by remove fasta checksum complexity checks that were added because of a missunderstanding about how supplemental alignments were represented

diff --git src/utils/bamToPsl/bamToPsl.c src/utils/bamToPsl/bamToPsl.c
index d9d8880..09cc543 100644
--- src/utils/bamToPsl/bamToPsl.c
+++ src/utils/bamToPsl/bamToPsl.c
@@ -1,200 +1,171 @@
 /* bamToPsl - Convert a bam file to a psl and optionally also a fasta file that contains the reads.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "bamFile.h"
 #include "psl.h"
 #include "fa.h"
-#include "md5.h"
 #include "net.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "bamToPsl - Convert a bam file to a psl and optionally also a fasta file that contains the reads.\n"
   "usage:\n"
   "   bamToPsl [options] in.bam out.psl\n"
   "options:\n"
   "   -fasta=output.fa - output query sequences to specified file\n"
   "   -chromAlias=file - specify a two-column file: 1: alias, 2: other name\n"
   "          for target name translation from column 1 name to column 2 name\n"
   "          names not found are passed through intact\n"
   "   -nohead          - do not output the PSL header, default has header output\n"
-  "   -allowDups       - for fasta output, allow duplicate query sequences output\n"
-  "                    - default is to eliminate duplicate sequences\n"
-  "                    - runs much faster without the duplicate check\n"
-  "   -noSequenceVerify - when checking for dups, do not verify each sequence\n"
-  "                    - when the same name is identical, assume they are\n"
-  "                    - helps speed up the dup check but not thorough\n"
   "   -dots=N          - output progress dot(.) every N alignments processed\n"
-  "   -querySizes=file - two column tab file: 'name size' to fixup the query\n"
-  "                    - sequence sizes since bam does not provide qSize\n"
   "\n"
   "Note: a chromAlias file can be obtained from a UCSC database, e.g.:\n"
   " hgsql -N -e 'select alias,chrom from chromAlias;' hg38 > hg38.chromAlias.tab\n"
   " Or from the downloads server:\n"
   "  wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/chromAlias.txt.gz\n"
   "See also our tool chromToUcsc\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"fasta", OPTION_STRING},
    {"chromAlias", OPTION_STRING},
    {"nohead", OPTION_BOOLEAN},
    {"allowDups", OPTION_BOOLEAN},
    {"noSequenceVerify", OPTION_BOOLEAN},
    {"dots", OPTION_INT},
    {"querySizes", OPTION_STRING},
    {NULL, 0},
 };
 
 static int dots = 0;
 static boolean nohead = FALSE;
-static boolean allowDups = FALSE;
-static boolean noSequenceVerify = FALSE;
-static char *querySizes = NULL;	// use chrom.sizes file to set query size
-static struct hash *qSizeHash = NULL;
 
 static struct hash *hashChromAlias(char *fileName)
 /* Read two column file into hash keyed by first column */
 {
 struct hash *hash = hashNew(0);
 struct lineFile *lf = netLineFileOpen(fileName);
 char *row[2];
 while (lineFileRow(lf, row))
     hashAdd(hash, row[0], cloneString(row[1]));
 
 lineFileClose(&lf);
 return hash;
 }
 
+static void convertToPsl(bam1_t *aln, bam_header_t *head, struct hash *chromAlias, FILE *f)
+/* convert one alignment to a psl */
+{
+struct psl *psl = bamToPslUnscored(aln, head);
+if (psl != NULL)
+    {
+    if (chromAlias)
+        {
+        struct hashEl *hel = NULL;
+        if ((hel = hashLookup(chromAlias, psl->tName)) != NULL)
+            psl->tName = cloneString((char *)hel->val); /* memory leak */
+        }
+    pslTabOut(psl, f);  /* no free of this psl data, memory leak */
+    pslFree(&psl);
+    }
+}
+
+static void convertToFasta(bam1_t *aln, struct hash *fastaDoneSeqs, FILE *faF)
+/* output a FASTA record of the query sequence */
+{
+char *dna = bamGetQuerySequence(aln, TRUE);
+char *qName = bam1_qname(aln);
+if (hashLookup(fastaDoneSeqs, qName) == NULL) // first seen
+    {
+    hashAddInt(fastaDoneSeqs, qName, TRUE);
+    faWriteNext(faF, qName, dna, strlen(dna));
+    }
+freez(&dna);
+}
+
 static void bamToPsl(char *inBam, char *outPsl, char *outFasta, char *aliasFile)
 /* bamToPsl - Convert a bam file to a psl and optionally also a
    fasta file that contains the reads.. */
 {
 unsigned long long processCount = 0;
-       /* record md5sums to avoid duplicate fasta output
-          key is name, value is md5sum */
-struct hash *fastaSums = NULL;  /* initialize later if needed */
 samfile_t *in = bamMustOpenLocal(inBam, "rb", NULL);
 bam_header_t *head = sam_hdr_read(in);
 if (head == NULL)
     errAbort("Aborting ... bad BAM header in %s", inBam);
 
 /* Open up psl output and write header (if allowed). */
 FILE *f = mustOpen(outPsl, "w");
 if (! nohead)
     pslWriteHead(f);
 
 /* Optionally use alias file */
 struct hash *chromAlias = NULL;  /* initialize later if needed */
 if (aliasFile != NULL)
     chromAlias = hashChromAlias(aliasFile);
 
 /* Optionally open up fasta output */
+struct hash *fastaDoneSeqs = NULL; // avoids duplicates
 FILE *faF = NULL;
 if (outFasta != NULL)
     {
     faF = mustOpen(outFasta, "w");
-    fastaSums = newHashExt(20, TRUE);  /* using stack local memory */
+    fastaDoneSeqs = hashNew(20);
     }
 
 bam1_t one;
 ZeroVar(&one);	// This seems to be necessary!
 /* Write next sequence to fa file. */
 for (;;)
     {
     if (sam_read1(in, head, &one) < 0)
 	{
 	break;
 	}
     if (one.core.n_cigar != 0)
         {
-        struct psl *psl = bamToPslUnscored(&one, head);
-        if (psl != NULL)
-            {
-            if (chromAlias)
-                {
-                struct hashEl *hel = NULL;
-                if ((hel = hashLookup(chromAlias, psl->tName)) != NULL)
-                    psl->tName = cloneString((char *)hel->val); /* memory leak */
+        convertToPsl(&one, head, chromAlias, f);
         }
-	    if (qSizeHash)
-		{
-		int brokenSize = psl->qSize;
-		psl->qSize = hashIntVal(qSizeHash, psl->qName);
-		int offset = psl->qSize - brokenSize;
-		/* check to see if negative qStarts need to be fixed */
-		if ((offset > 0) && (psl->strand[0] == '-'))
+    if ((faF != NULL) && ((one.core.flag & BAM_FSUPPLEMENTARY) == 0))
         {
-		    for (int i = 0; i < psl->blockCount; ++i)
-			psl->qStarts[i] += offset;
-		    }
-		}
-            pslTabOut(psl, f);  /* no free of this psl data, memory leak */
-            pslFree(&psl);
-            }
+        // supplementary are not include as they don't have full sequence
+        convertToFasta(&one, fastaDoneSeqs, faF);
         }
     ++processCount;
     if (dots)
        if (0 == processCount % dots)
           verbose(1,".");
-    if (faF != NULL)
-        {
-	char *dna = bamGetQuerySequence(&one, TRUE);
-	char *qName = bam1_qname(&one);
-        if (allowDups)
-	    faWriteNext(faF, qName, dna, strlen(dna));
-        else
-            {
-            struct hashEl *hel = NULL;
-            if ((hel = hashLookup(fastaSums, qName)) == NULL) // first seen
-               {
-               char *md5sum = md5HexForString(dna);
-               hel = hashAdd(fastaSums, qName, md5sum);
-               faWriteNext(faF, qName, dna, strlen(dna));
-               }
-            else if (! noSequenceVerify)  // repeated md5sum calculation
-               {
-               char *md5sum = md5HexForString(dna);
-               /* verify sequence is identical for same name */
-               if (differentWord((char *)hel->val, md5sum))
-                  verbose(1, "# warning: different sequence found for '%s'\n",
-                             qName);
-               }
-            }
-	freez(&dna);
-	}
     }
     if (dots)
        verbose(1,"\n");
 
 samclose(in);
 carefulClose(&f);
 carefulClose(&faF);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 char *fastaName = optionVal("fasta", NULL);
 char *aliasFile = optionVal("chromAlias", NULL);
 
-querySizes = optionVal("querySizes", NULL);
-
 dots = optionInt("dots", dots);
 nohead = optionExists("nohead");
-allowDups = optionExists("allowDups");
-noSequenceVerify = optionExists("noSequenceVerify");
-if (querySizes != NULL)
-    qSizeHash = loadSizes(querySizes);
+if (optionExists("allowDups"))
+    fprintf(stderr, "Note: -allowDups is obsolete and ignored");
+if (optionExists("noSequenceVerify"))
+    fprintf(stderr, "Note: -noSequenceVerify is obsolete and ignored");
+if (optionExists("querySizes"))
+    fprintf(stderr, "Note: -querySizes is obsolete and ignored");
 bamToPsl(argv[1], argv[2], fastaName, aliasFile);
 return 0;
 }