src/utils/pslLiftSubrangeBlat/pslLiftSubrangeBlat.c 1.1

1.1 2010/06/02 07:07:45 markd
added program to lift blat subrange queries
Index: src/utils/pslLiftSubrangeBlat/pslLiftSubrangeBlat.c
===================================================================
RCS file: src/utils/pslLiftSubrangeBlat/pslLiftSubrangeBlat.c
diff -N src/utils/pslLiftSubrangeBlat/pslLiftSubrangeBlat.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/utils/pslLiftSubrangeBlat/pslLiftSubrangeBlat.c	2 Jun 2010 07:07:45 -0000	1.1
@@ -0,0 +1,131 @@
+/* pslLiftSubrangeBlat - lift PSLs from blat subrange alignments. */
+#include "common.h"
+#include "linefile.h"
+#include "psl.h"
+#include "pslReader.h"
+#include "hash.h"
+#include "options.h"
+#include "jksql.h"
+
+static char const rcsid[] = "$Id$";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "pslLiftSubrangeBlat - lift PSLs from blat subrange alignments\n"
+  "usage:\n"
+  "   pslLiftSubrangeBlat isPsl outPsl\n"
+  "\n"
+  "Lift a PSL with target coordinates from a blat subrange query\n"
+  "(e.g. blah/hg18.2bit:chr1:1000-20000) which has subrange\n"
+  "coordinates as the target name (e.g. chr1:1000-200000) to\n"
+  "actual target coordinates.\n"
+  "\n"
+  "options:\n"
+  "  -tSizes=szfile - lift target side based on tName, using target sizes from\n"
+  "                   this tab separated file.\n"
+  "  -qSizes=szfile - lift query side based on qName, using query sizes from\n"
+  "                   this tab separated file.\n"
+  "Must specify at least on of -tSizes or -qSize or both.\n"
+  );
+}
+
+static struct optionSpec options[] = {
+    {"tSizes", OPTION_STRING},
+    {"qSizes", OPTION_STRING},
+    {NULL, 0},
+};
+
+static struct hash *loadSizes(char *szFile)
+/* load sizes into a hash */
+{
+struct hash *sizes = hashNew(0);
+struct lineFile *lf = lineFileOpen(szFile, TRUE);
+char *row[2];
+while (lineFileNextRowTab(lf, row, 2))
+    hashAddInt(sizes, row[0], sqlSigned(row[1]));
+
+return sizes;
+}
+
+static boolean parseName(char *desc, char *name, int *start, int *end)
+/* parse and validate name into it's parts. Return FALSE if name doesn't
+ * contain a subrange specification. */
+{
+char *nameEnd = strchr(name, ':');
+if (nameEnd == NULL)
+    return FALSE;
+char *p = nameEnd+1;
+char *pp = strchr(p, '-');
+if (pp == NULL)
+    errAbort("invalid %s subrange: %s", desc, name);
+*pp = '\0';
+*start = sqlUnsigned(p);
+*pp = '-';
+pp++;
+*end = sqlUnsigned(pp);
+if (*end <= *start)
+    errAbort("range zero or negative length in %s subrange: %s", desc, name);
+*nameEnd = '\0';
+return TRUE;
+}
+
+static void liftSide(char *desc, struct hash *seqSizes, struct psl *psl, char *name, char strand, unsigned *seqSize, int *start, int *end, unsigned *starts)
+/* life one side of the alignment */
+{
+int regStart, regEnd, i;
+if (parseName(desc, name, &regStart, &regEnd))
+    {
+    *seqSize = hashIntVal(seqSizes, name);
+    if (*end > *seqSize)
+        errAbort("subrange %s:%d-%d extends past sequence end %ud", name, regStart, regEnd, *seqSize);
+    *start += regStart;
+    *end += regStart;
+    if (strand == '-')
+        reverseIntRange(&regStart, &regEnd, *seqSize);
+    for (i = 0; i < psl->blockCount; i++)
+        starts[i] += regStart;
+    }
+}
+
+static void liftIt(struct psl *psl, struct hash *qSizes, struct hash *tSizes)
+/* lift one subrange PSL based on tName */
+{
+if (qSizes != NULL)
+    liftSide("qName", qSizes, psl, psl->qName, pslQStrand(psl), &psl->qSize, &psl->qStart, &psl->qEnd, psl->qStarts);
+if (tSizes != NULL)
+    liftSide("tName", tSizes, psl, psl->tName, pslTStrand(psl), &psl->tSize, &psl->tStart, &psl->tEnd, psl->tStarts);
+}
+
+static void pslLiftSubrangeBlat(char *inPsl, char *outPsl, char *qSizesFile, char *tSizesFile)
+/* pslLiftSubrangeBlat - lift PSLs from blat subrange alignments. */
+{
+struct hash *qSizes = (qSizesFile != NULL) ? loadSizes(qSizesFile) : NULL;
+struct hash *tSizes = (tSizesFile != NULL) ? loadSizes(tSizesFile) : NULL;
+struct pslReader *inRd = pslReaderFile(inPsl, NULL);
+FILE *outFh = mustOpen(outPsl, "w");
+struct psl *psl;
+while ((psl = pslReaderNext(inRd)) != NULL)
+    {
+    liftIt(psl, qSizes, tSizes);
+    pslTabOut(psl, outFh);
+    pslFree(&psl);
+    }
+carefulClose(&outFh);
+pslReaderFree(&inRd);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 3)
+    usage();
+char *qSizesFile = optionVal("qSizes", NULL);
+char *tSizesFile = optionVal("tSizes", NULL);
+if ((qSizesFile == NULL) && (tSizesFile == NULL))
+    errAbort("must specify at least one of -tSizes or -qSizes");
+pslLiftSubrangeBlat(argv[1], argv[2], qSizesFile, tSizesFile);
+return 0;
+}