src/hg/txCds/txCdsRedoUniprotPicks/txCdsRedoUniprotPicks.c 1.1

1.1 2009/11/12 19:30:41 kent
Seems to work.
Index: src/hg/txCds/txCdsRedoUniprotPicks/txCdsRedoUniprotPicks.c
===================================================================
RCS file: src/hg/txCds/txCdsRedoUniprotPicks/txCdsRedoUniprotPicks.c
diff -N src/hg/txCds/txCdsRedoUniprotPicks/txCdsRedoUniprotPicks.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/txCds/txCdsRedoUniprotPicks/txCdsRedoUniprotPicks.c	12 Nov 2009 19:30:41 -0000	1.1
@@ -0,0 +1,148 @@
+/* txCdsRedoUniprotPicks - Update uniprot columns in pick file based on protein/protein alignment 
+ * at end of pipeline vs. mrna/protein alignment at start.. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "cdsPick.h"
+#include "psl.h"
+
+static char const rcsid[] = "$Id$";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "txCdsRedoUniprotPicks - Update uniprot columns in pick file based on protein/protein alignment\n"
+  "at end of pipeline vs. mrna/protein alignment at start.\n"
+  "usage:\n"
+  "   txCdsRedoUniprotPicks old.picks ucscVsUniprot.psl uniCurated.tab new.picks\n"
+  "where:\n"
+  "    old.picks is file in format described in cdsPick.h\n"
+  "    ucscVsUniprot.psl is a protein/protein psl with ucsc in target, uniProt in query\n"
+  "    uniCurated.tab is two columns - uniProt acc, and then a 1 if SwissProt, 0 otherwise\n"
+  "    new.picks is the updated version of old.picks\n"
+  );
+}
+
+static struct optionSpec options[] = {
+   {NULL, 0},
+};
+
+double scoreAli(struct psl *psl, boolean isCurated)
+/* Make up a score.  Return 0 if it looks worthless, otherwise something that gets bigger the
+ * better the alignment. */
+{
+double aliCount = psl->match + psl->misMatch + psl->repMatch;
+double tCoverage = aliCount/psl->tSize;
+if (tCoverage < 0.5)
+    return 0;
+double qCoverage = aliCount/psl->qSize;
+if (qCoverage < 0.5)
+    return 0;
+double score = (psl->match + psl->repMatch/2) - 20 * 
+	psl->misMatch - 40 * psl->qNumInsert - 40 * psl->tNumInsert;
+score *= 10;	/* Make the actual score the most important thing */
+if (isCurated)  /* Add in curation factor. */
+    score += 1;
+score *= 10;	/* Make the coverage of UCSC the next most important */
+score += tCoverage;
+score *= 10;	/* Make the coverage of UniProt the next most important */
+score += qCoverage;
+return score;
+}
+
+struct psl *bestAliInList(struct psl *start, struct psl *end, struct hash *curatedHash)
+/* Return best scoring alignment in list. */
+{
+struct psl *psl, *bestPsl = NULL;
+double bestScore = 0;
+for (psl = start; psl != end; psl = psl->next)
+    {
+    boolean isCurated = (hashLookup(curatedHash, psl->qName) == NULL);
+    double score = scoreAli(psl, isCurated);
+    if (score > bestScore)
+        {
+	bestScore = score;
+	bestPsl = psl;
+	}
+    }
+return bestPsl;
+}
+
+void txCdsRedoUniprotPicks(char *oldPickFile, char *pslFile, char *curatedFile, char *newPickFile)
+/* txCdsRedoUniprotPicks - Update uniprot columns in pick file based on protein/protein alignment 
+ * at end of pipeline vs. mrna/protein alignment at start.. */
+{
+/* Read in curated file to hash with values just where there are ones. */
+struct hash *curatedHash = hashNew(20);
+    {
+    struct lineFile *lf = lineFileOpen(curatedFile, TRUE);
+    char *row[2];
+    while (lineFileRow(lf, row))
+	{
+	char c = row[1][0];
+	switch (c)
+	     {
+	     case '1':
+		hashAdd(curatedHash, row[0], NULL);
+		break;
+	     case '0':
+	        break;
+	     default:
+	        errAbort("Expecting 0 or 1 in second column, line %d of %s", 
+			lf->lineIx, lf->fileName);
+		break;
+	     }
+	}
+    lineFileClose(&lf);
+    }
+
+/* Read in PSL and make hash that just contains the best good alignment for each. */
+struct hash *bestHash = hashNew(19);
+    {
+    struct psl *pslList = pslLoadAll(pslFile);
+    slSort(&pslList, pslCmpTarget);
+    struct psl *start, *end;
+    for (start = pslList; start != NULL; start = end)
+	{
+	char *tName = start->tName;
+	for (end = start->next; end != NULL; end = end->next)
+	    if (!sameString(tName, end->tName))
+		break;
+	struct psl *bestAli = bestAliInList(start, end, curatedHash);
+	if (bestAli != NULL)
+	    hashAdd(bestHash, tName, bestAli);
+	}
+    }
+
+/* Loop through old file and replace values. */
+FILE *f = mustOpen(newPickFile, "w");
+struct lineFile *lf = lineFileOpen(oldPickFile, TRUE);
+char *row[CDSPICK_NUM_COLS];
+while (lineFileRowTab(lf, row))
+    {
+    struct cdsPick pick;
+    cdsPickStaticLoad(row, &pick);
+    pick.uniProt = pick.swissProt = "";
+    struct psl *psl = hashFindVal(bestHash, pick.name);
+    if (psl != NULL)
+        {
+	pick.uniProt = psl->qName;
+	if (hashLookup(curatedHash, pick.uniProt))
+	    pick.swissProt = pick.uniProt;
+	}
+    cdsPickTabOut(&pick, f);
+    }
+carefulClose(&f);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 5)
+    usage();
+txCdsRedoUniprotPicks(argv[1], argv[2], argv[3], argv[4]);
+return 0;
+}