src/hg/txCds/txCdsRedoUniprotPicks/txCdsRedoUniprotPicks.c 1.1
1.1 2009/11/12 19:30:41 kent
Seems to work.
Index: src/hg/txCds/txCdsRedoUniprotPicks/txCdsRedoUniprotPicks.c
===================================================================
RCS file: src/hg/txCds/txCdsRedoUniprotPicks/txCdsRedoUniprotPicks.c
diff -N src/hg/txCds/txCdsRedoUniprotPicks/txCdsRedoUniprotPicks.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/txCds/txCdsRedoUniprotPicks/txCdsRedoUniprotPicks.c 12 Nov 2009 19:30:41 -0000 1.1
@@ -0,0 +1,148 @@
+/* txCdsRedoUniprotPicks - Update uniprot columns in pick file based on protein/protein alignment
+ * at end of pipeline vs. mrna/protein alignment at start.. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "cdsPick.h"
+#include "psl.h"
+
+static char const rcsid[] = "$Id$";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+ "txCdsRedoUniprotPicks - Update uniprot columns in pick file based on protein/protein alignment\n"
+ "at end of pipeline vs. mrna/protein alignment at start.\n"
+ "usage:\n"
+ " txCdsRedoUniprotPicks old.picks ucscVsUniprot.psl uniCurated.tab new.picks\n"
+ "where:\n"
+ " old.picks is file in format described in cdsPick.h\n"
+ " ucscVsUniprot.psl is a protein/protein psl with ucsc in target, uniProt in query\n"
+ " uniCurated.tab is two columns - uniProt acc, and then a 1 if SwissProt, 0 otherwise\n"
+ " new.picks is the updated version of old.picks\n"
+ );
+}
+
+static struct optionSpec options[] = {
+ {NULL, 0},
+};
+
+double scoreAli(struct psl *psl, boolean isCurated)
+/* Make up a score. Return 0 if it looks worthless, otherwise something that gets bigger the
+ * better the alignment. */
+{
+double aliCount = psl->match + psl->misMatch + psl->repMatch;
+double tCoverage = aliCount/psl->tSize;
+if (tCoverage < 0.5)
+ return 0;
+double qCoverage = aliCount/psl->qSize;
+if (qCoverage < 0.5)
+ return 0;
+double score = (psl->match + psl->repMatch/2) - 20 *
+ psl->misMatch - 40 * psl->qNumInsert - 40 * psl->tNumInsert;
+score *= 10; /* Make the actual score the most important thing */
+if (isCurated) /* Add in curation factor. */
+ score += 1;
+score *= 10; /* Make the coverage of UCSC the next most important */
+score += tCoverage;
+score *= 10; /* Make the coverage of UniProt the next most important */
+score += qCoverage;
+return score;
+}
+
+struct psl *bestAliInList(struct psl *start, struct psl *end, struct hash *curatedHash)
+/* Return best scoring alignment in list. */
+{
+struct psl *psl, *bestPsl = NULL;
+double bestScore = 0;
+for (psl = start; psl != end; psl = psl->next)
+ {
+ boolean isCurated = (hashLookup(curatedHash, psl->qName) == NULL);
+ double score = scoreAli(psl, isCurated);
+ if (score > bestScore)
+ {
+ bestScore = score;
+ bestPsl = psl;
+ }
+ }
+return bestPsl;
+}
+
+void txCdsRedoUniprotPicks(char *oldPickFile, char *pslFile, char *curatedFile, char *newPickFile)
+/* txCdsRedoUniprotPicks - Update uniprot columns in pick file based on protein/protein alignment
+ * at end of pipeline vs. mrna/protein alignment at start.. */
+{
+/* Read in curated file to hash with values just where there are ones. */
+struct hash *curatedHash = hashNew(20);
+ {
+ struct lineFile *lf = lineFileOpen(curatedFile, TRUE);
+ char *row[2];
+ while (lineFileRow(lf, row))
+ {
+ char c = row[1][0];
+ switch (c)
+ {
+ case '1':
+ hashAdd(curatedHash, row[0], NULL);
+ break;
+ case '0':
+ break;
+ default:
+ errAbort("Expecting 0 or 1 in second column, line %d of %s",
+ lf->lineIx, lf->fileName);
+ break;
+ }
+ }
+ lineFileClose(&lf);
+ }
+
+/* Read in PSL and make hash that just contains the best good alignment for each. */
+struct hash *bestHash = hashNew(19);
+ {
+ struct psl *pslList = pslLoadAll(pslFile);
+ slSort(&pslList, pslCmpTarget);
+ struct psl *start, *end;
+ for (start = pslList; start != NULL; start = end)
+ {
+ char *tName = start->tName;
+ for (end = start->next; end != NULL; end = end->next)
+ if (!sameString(tName, end->tName))
+ break;
+ struct psl *bestAli = bestAliInList(start, end, curatedHash);
+ if (bestAli != NULL)
+ hashAdd(bestHash, tName, bestAli);
+ }
+ }
+
+/* Loop through old file and replace values. */
+FILE *f = mustOpen(newPickFile, "w");
+struct lineFile *lf = lineFileOpen(oldPickFile, TRUE);
+char *row[CDSPICK_NUM_COLS];
+while (lineFileRowTab(lf, row))
+ {
+ struct cdsPick pick;
+ cdsPickStaticLoad(row, &pick);
+ pick.uniProt = pick.swissProt = "";
+ struct psl *psl = hashFindVal(bestHash, pick.name);
+ if (psl != NULL)
+ {
+ pick.uniProt = psl->qName;
+ if (hashLookup(curatedHash, pick.uniProt))
+ pick.swissProt = pick.uniProt;
+ }
+ cdsPickTabOut(&pick, f);
+ }
+carefulClose(&f);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 5)
+ usage();
+txCdsRedoUniprotPicks(argv[1], argv[2], argv[3], argv[4]);
+return 0;
+}