e70152e44cc66cc599ff6b699eb8adc07f3e656a
kent
  Sat May 24 21:09:34 2014 -0700
Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment.
diff --git src/hg/pslPairs/pslPairs.c src/hg/pslPairs/pslPairs.c
index 0caf0fe..a3ba3b1 100644
--- src/hg/pslPairs/pslPairs.c
+++ src/hg/pslPairs/pslPairs.c
@@ -1,601 +1,604 @@
 /* pslPairs - join paired ends in psl alignments */
 /*
   File: pslPairs.c
   Author: Terry Furey
   Date: 10/1/2003
   Description: Create end sequence pairs from psl files
 */
 
+/* Copyright (C) 2011 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
+
 #include "common.h"
 #include "linefile.h"
 #include "memalloc.h"
 #include "hash.h"
 #include "jksql.h"
 #include "hdb.h"
 #include "fuzzyFind.h"
 #include "portable.h"
 #include "psl.h"
 #include "options.h"
 #include "binRange.h"
 
 static boolean VERBOSE = FALSE;
 static boolean SLOP = FALSE;
 static boolean SHORT = FALSE;
 static boolean LONG = FALSE;
 static boolean MISMATCH = FALSE;
 static boolean ORPHAN = FALSE;
 static boolean NORANDOM = FALSE;
 static boolean NOBIN = FALSE;
 static int MIN = 32000;
 static int MAX = 47000;
 static int SLOPVAL = 5000;
 static float NEARTOP = 0.001;
 static int TINSERT = 500;
 static int HARDMAX = 75000;
 static float MIN_ID = 0.96;
 static float MIN_ORPHAN_ID = 0.96;
 
 static void usage()
 /* Explain usage and exit */
 {
 errAbort(
 "pslPairs - join paired ends in psl alignments\n"
 "usage: pslPairs <pslFile> <pairFile> <pslTableName> <outFilePrefix>\n"
 "  creates: <outFilePrefix>.pairs file\n"
 "  pslFile\t- filtered psl alignments of ends from kluster run\n"
 "  pairFile\t- three column tab separated: forward reverse cloneId\n"
 "\t\t- forward and reverse columns can be comma separated end ids\n"
 "  pslTableName\t- table name the psl alignments have been loaded into\n"
 "  outFilePrefix\t- prefix used for each output file name\n"
 "Options:\n"
 "  -max=N\t- maximum length of clone sequence (default=%d)\n"
 "  -min=N\t- minimum length of clone sequence (default=%d)\n"
 "  -slopval=N\t- deviation from max/min clone lengths allowed for slop report\n\t\t- (default=%d)\n"
 "  -nearTop=N\t- maximium deviation from best match allowed (default=%0.3f)\n"
 "  -minId=N\t- minimum pct ID of at least one end (default=%0.2f)\n"
 "  -minOrphanId=N - minimum pct ID for orphan alignment (default=%0.2f)\n"
 "  -tInsert=N\t- maximum insert bases allowed in sequence alignment\n\t\t- (default=%d)\n"
 "  -hardMax=N\t- absolute maximum clone length for long report (default=%d)\n"
 "  -verbose\t- display all informational messages\n"
 "  -noBin\t- do not include bin column in output file\n"
 "  -noRandom\t- do not include placements on random portions\n\t\t- {length(chr name) < 7}\n"
 "  -slop\t\t- create <outFilePrefix>.slop file of pairs that fall within\n\t\t- slop length\n"
 "  -short\t- create <outFilePrefix>.short file of pairs shorter than\n\t\t- min size\n"
 "  -long\t\t- create <outFilePrefix>.long file of pairs longer than\n\t\t- max size, but less than hardMax size\n"
 "  -mismatch\t- create <outFilePrefix>.mismatch file of pairs with\n\t\t- bad orientation of ends\n"
 "  -orphan\t- create <outFilePrefix>.orphan file of unmatched end sequences",
 MAX, MIN, SLOPVAL, NEARTOP, MIN_ID, MIN_ORPHAN_ID, TINSERT, HARDMAX
 );
 }
 
 /* command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"verbose", OPTION_BOOLEAN},
     {"slop", OPTION_BOOLEAN},
     {"short", OPTION_BOOLEAN},
     {"long", OPTION_BOOLEAN},
     {"mismatch", OPTION_BOOLEAN},
     {"orphan", OPTION_BOOLEAN},
     {"noRandom", OPTION_BOOLEAN},
     {"noBin", OPTION_BOOLEAN},
     {"min", OPTION_INT},
     {"max", OPTION_INT},
     {"slopval", OPTION_INT},
     {"nearTop", OPTION_FLOAT},
     {"minId", OPTION_FLOAT},
     {"minOrphanId", OPTION_FLOAT},
     {"tInsert", OPTION_INT},
     {"hardMax", OPTION_INT},
     {NULL, 0}
 };
 
 FILE *of, *orf, *sf, *mf, *esf, *elf;
 
 struct cloneName
 {
   char name[32];
 };
 
 struct pslAli
 {
   struct pslAli *next;
   int score;
   float id;
   float cov;
   struct psl *psl;
 };
 
 struct pslPair
 {
   struct pslPair *next;
   struct pslAli *f;
   struct pslAli *r;
   int score;
   int distance;
   boolean orien;
   boolean random;
 };
 
 struct clone
 {
   struct clone *next;
   char name[32];
   struct pslAli *end1;
   struct pslAli *end2;
   struct pslPair *pairs;
   struct pslPair *pairsRandom;
   struct pslPair *pairsSlop;
   struct pslPair *pairsExtremeS;
   struct pslPair *pairsExtremeL;
   struct pslPair *pairsMM;
   boolean orphan;
 } *cloneList = NULL;
 
 struct hash *clones = NULL;
 struct hash *leftNames = NULL;
 struct hash *rightNames = NULL;
 
 void pslAliFree(struct pslAli **pa)
 {
   struct pslAli *el;
 
   if ((el = *pa) == NULL) return;
   pslFree(&(el->psl));
   freez(pa);
 }
 
 void pslAliListFree(struct pslAli **pList)
 {
   struct pslAli *i, *next;
 
   for (i = *pList; i != NULL; i = next)
     {
       next = i->next;
       pslAliFree(&i);
     }
   *pList = NULL;
 }
 
 void pslPairFree(struct pslPair **pp)
 {
   freez(pp);
 }
 
 void pslPairListFree(struct pslPair **pList)
 {
   struct pslPair *i, *next;
 
   for (i = *pList; i != NULL; i = next)
     {
       next = i->next;
       pslPairFree(&i);
     }
   *pList = NULL;
 }
 
 void cloneFree(struct clone **c)
 {
   struct clone *el;
 
   if ((el = *c) == NULL) return;
   pslAliListFree(&(el->end1));
   pslAliListFree(&(el->end2));
   pslPairListFree(&(el->pairs));
   freez(c);
 }
 
 struct pslAli *createPslAli(struct psl *psl)
 /* Create a pslAli element from a psl record */
 {
   struct pslAli *pa;
   
   AllocVar(pa);
   pa->next = NULL;
   pa->psl = psl;
   pa->score = pslScore(psl);
   pa->id = (float)(psl->match + psl->repMatch + psl->nCount)/(psl->match + psl->misMatch + psl->repMatch + psl->nCount);
   pa->cov = (float)(psl->match + psl->misMatch + psl->repMatch + psl->nCount)/psl->qSize;
   return(pa);
 }
 
 struct clone *createClone(char *name, char *eName1, char *eName2)
 /* Create a clone element from the names */
 {
   struct clone *clone;
 
   AllocVar(clone);
   clone->next = NULL;
   sprintf(clone->name,"%s",name);
   clone->end1 = NULL;
   clone->end2 = NULL;
   clone->pairs = NULL;
   clone->orphan = FALSE;
 
   return(clone);
 }
 struct pslPair *createPslPair(struct pslAli *e1, struct pslAli *e2)
 {
   struct pslPair *pp;
 
   if (strcmp(e1->psl->tName, e2->psl->tName)) return NULL;
   if ((e1->id < MIN_ID) && (e2->id < MIN_ID)) return NULL;
   AllocVar(pp);
   pp->next = NULL;
   if ((e1->psl->tStart) < (e2->psl->tStart))
     {
       pp->f = e1;
       pp->r = e2;
     }
   else
     {
        pp->f = e2;
       pp->r = e1;
     }
   pp->distance = pp->r->psl->tEnd - pp->f->psl->tStart;
   if ((pp->f->psl->strand[0] == '+') && (pp->r->psl->strand[0] == '-'))
     pp->orien = TRUE;
   else 
     pp->orien = FALSE;
   if (strlen(e1->psl->tName) > 6)
     pp->random = TRUE;
   else 
     pp->random = FALSE;
   pp->score = e1->score + e2->score;
 
   return(pp);
 }
 
 void readPairFile(struct lineFile *prf)
 /* Read in pairs and initialize clone list */
 {
   int lineSize, i;
   char *line;
   char *words[4];
   char *names[16];
   int wordCount, nameCount;
   struct clone *clone;
   struct cloneName *cloneName;
   
   while (lineFileNext(prf, &line, &lineSize))
     {
       wordCount = chopTabs(line,words);
       if (wordCount != 3)
 	errAbort("Bad line %d of %s\n", prf->lineIx, prf->fileName);
       if (!hashLookup(clones, words[2])) 
 	{
 	  clone = createClone(words[2],NULL,NULL);
 	  hashAdd(clones, words[2], clone);
 	  slAddHead(&cloneList,clone);
 	}
       AllocVar(cloneName);
       sprintf(cloneName->name, "%s", words[2]);
       nameCount = chopCommas(words[0],names);
       for (i = 0; i < nameCount; i++) 
 	hashAdd(leftNames, names[i], cloneName);
       nameCount = chopCommas(words[1],names);
       for (i = 0; i < nameCount; i++) 
 	hashAdd(rightNames, names[i], cloneName);
     }     
 }
 
 void readPslFile(struct lineFile *pf)
 /* Process all records in a psl file of mRNA alignments */
 {
  int lineSize;
  char *line;
  char *words[32];
  int  wordCount;
  struct psl *psl;
  struct clone *clone;
  struct pslAli *pa = NULL;
  struct cloneName *cloneName;
  
  while (lineFileNext(pf, &line, &lineSize))
    {
      wordCount = chopTabs(line, words);
      if (wordCount != 21)
        errAbort("Bad line %d of %s\n", pf->lineIx, pf->fileName);
      psl = pslLoad(words);
      if (hashLookup(leftNames, psl->qName))
        cloneName = hashMustFindVal(leftNames, psl->qName);
      else if (hashLookup(rightNames, psl->qName))
        cloneName = hashMustFindVal(rightNames, psl->qName);
      else
        continue;
      clone = hashMustFindVal(clones, cloneName->name);
      if ((psl->tBaseInsert < TINSERT) && ((!NORANDOM) || (strlen(psl->tName) < 7))) 
        {
 	 pa = createPslAli(psl);
 	 if (hashLookup(leftNames, psl->qName))
 	   slAddHead(&(clone->end1), pa);
 	 else
 	   slAddHead(&(clone->end2), pa);
        }
    }
 }
 
 void pslPairs()
 {
   struct clone *clone;
   struct pslAli *e1, *e2;
   struct pslPair *pp;
 
   for (clone = cloneList; clone != NULL; clone = clone->next)
     {
       for (e1 = clone->end1; e1 != NULL; e1 = e1->next)
 	for (e2 = clone->end2; e2 != NULL; e2 = e2->next)
 	  {
 	    if ((pp = createPslPair(e1, e2)) != NULL)
 	      {		
 		if ((pp->distance >= MIN) && (pp->distance <= MAX))
 		  {
 		    if ((!pp->orien) && (!pp->random) && (MISMATCH)) 
 		      slAddHead(&(clone->pairsMM),pp);
 		    else if ((pp->orien) && (pp->random) && (!NORANDOM))
 		      slAddHead(&(clone->pairsRandom),pp);
 		    else if ((pp->orien) && (!pp->random))
 		      slAddHead(&(clone->pairs),pp);
 		    else 
 		      pslPairFree(&pp);
 		  }
 		else if (!pp->random && pp->orien)
 		  {
 		    if ((pp->distance >= (MIN-SLOPVAL)) && (pp->distance <= (MAX+SLOPVAL)) && (SLOP))
 		      slAddHead(&(clone->pairsSlop),pp);
 		    else if ((pp->distance < MIN) && (SHORT))
 		      slAddHead(&(clone->pairsExtremeS),pp);
 		    else if ((pp->distance > MAX) && (pp->distance <= HARDMAX) && (LONG))
 		      slAddHead(&(clone->pairsExtremeL),pp);
 		    else 
 		      pslPairFree(&pp);
 		  }
 		else 
 		  pslPairFree(&pp);
 	      }
 	  }
       if (!(clone->pairs) && !(clone->pairsRandom) && !(clone->pairsSlop) 
 	  && !(clone->pairsExtremeS) && !(clone->pairsExtremeL) && !(clone->pairsMM))
 	{
 	  clone->orphan = TRUE;
 	}
     }
 }
 
 void printBed(FILE *out, struct pslPair *ppList, struct clone *clone, char *pslTable)
 {
   struct pslPair *pp, *ppPrev;
   int count = 0, best = 0;
 
   for (pp = ppList; pp != NULL; pp=pp->next)
     if (pp->score > best)
       best = pp->score;
   while (((ppList->score)/best) < NEARTOP)
     {
       struct pslPair *temp = ppList;
       ppList = ppList->next;
       pslPairFree(&temp);
     }
   ppPrev = ppList;
   for (pp = ppList; pp != NULL; pp=pp->next)
     {
       if (((pp->score)/best) >= NEARTOP)
 	count++;
       else
 	{
 	  ppPrev->next = pp->next;
 	  pslPairFree(&pp);
 	  pp = ppPrev;
 	}
       ppPrev = pp;
     }
   for (pp = ppList; pp != NULL; pp=pp->next)
     {
       int bin = binFromRange(pp->f->psl->tStart,pp->r->psl->tEnd);
       int score = 1000;
       char *strand;
       int d1, d2;
 
       if (count != 1)
 	score = 1500/count;
       if (hashLookup(leftNames, pp->f->psl->qName))
 	strand = "+";
       else 
 	strand = "-";
       d1 = pp->f->psl->tEnd - pp->f->psl->tStart;
       d2 = pp->r->psl->tEnd - pp->r->psl->tStart;
       
       if (!NOBIN) 
 	fprintf(out, "%d\t",bin);
       fprintf(out, "%s\t%d\t%d\t%s\t%d\t%s\t%s\t2\t%d,%d\t%d,%d\t%s,%s\n",
 	      pp->f->psl->tName,pp->f->psl->tStart,pp->r->psl->tEnd,clone->name,
 	      score, strand, pslTable, pp->f->psl->tStart, pp->r->psl->tStart, d1, d2, 
 	      pp->f->psl->qName, pp->r->psl->qName);
     }
 }
 
 void printOrphan(FILE *out, struct pslAli *paList, struct clone *clone, char *pslTable)
 {
   struct pslAli *pa;
   int best = 0, count = 0;
 
   for (pa = paList; pa != NULL; pa=pa->next)
     if (pa->score > best)
       best = pa->score;
   for (pa = paList; pa != NULL; pa=pa->next)
     if ((((pa->score)/best) > NEARTOP) && (pa->id >= MIN_ORPHAN_ID))
       count++;
   for (pa = paList; pa != NULL; pa=pa->next)
     if ((((pa->score)/best) > NEARTOP) && (pa->id >= MIN_ORPHAN_ID))
       {
       int bin = binFromRange(pa->psl->tStart,pa->psl->tEnd);
       int score = 1000;
       char *strand;
       int d1, genStart = 0, genEnd = 0;
 
       if (count != 1)
 	score = 1500/count;
       if (hashLookup(leftNames,pa->psl->qName)) 
 	{
 	  if (pa->psl->strand[0] == '+')
 	    {
 	      genStart = pa->psl->tStart;
 	      genEnd = pa->psl->tEnd+(MIN/2);
 	    } 
 	  else
 	    {
 	      genStart = pa->psl->tStart-(MIN/2);
 	      genEnd = pa->psl->tEnd;
 	    }
 	}
       else
 	{
 	  if (pa->psl->strand[0] == '-')
 	    {
 	      genStart = pa->psl->tStart-(MIN/2);
 	      genEnd = pa->psl->tEnd;
 	    }
 	  else
 	    {
 	      genStart = pa->psl->tStart;
 	      genEnd = pa->psl->tEnd+(MIN/2);
 	    }
 	}
       if (genStart < 0)
 	genStart = 0;
       if (((hashLookup(leftNames,pa->psl->qName)) && (pa->psl->strand[0] == '+')) ||
 	  ((hashLookup(rightNames,pa->psl->qName)) && (pa->psl->strand[0] == '-')))
 	strand = "+";
       else 
 	strand = "-";
       d1 = pa->psl->tEnd - pa->psl->tStart;
 
       if (!NOBIN) 
 	fprintf(out, "%d\t",bin);
       fprintf(out, "%s\t%d\t%d\t%s\t%d\t%s\t%s\t1\t%d\t%d\t%s\n",
 	      pa->psl->tName,genStart,genEnd,clone->name,
 	      score, strand, pslTable, pa->psl->tStart, d1, pa->psl->qName);        
       }
 }
 
 void printOut(char *pslTable)
 {
   struct clone *clone = NULL;
 
   for (clone = cloneList; clone != NULL; clone = clone->next)
     {
       if (clone->pairs)
 	printBed(of, clone->pairs, clone, pslTable);
       else if (clone->pairsRandom)
 	printBed(of, clone->pairsRandom, clone, pslTable);
       else if ((clone->pairsSlop) && (SLOP))
 	printBed(sf, clone->pairsSlop, clone, pslTable);
       else if ((clone->pairsMM) && (MISMATCH))
 	printBed(mf, clone->pairsMM, clone, pslTable);
       else if ((clone->pairsExtremeS) && (SHORT))
 	printBed(esf, clone->pairsExtremeS, clone, pslTable);
       else if ((clone->pairsExtremeL) && (LONG))
 	printBed(elf, clone->pairsExtremeL, clone, pslTable);
       else if ((clone->orphan) && (ORPHAN))
 	{
 	printOrphan(orf, clone->end1, clone, pslTable);
 	printOrphan(orf, clone->end2, clone, pslTable);
 	}
     }
 }
 
 int main(int argc, char *argv[])
 {
   struct lineFile *pf, *prf;
   char filename[64], *pslTable;
 
   optionInit(&argc, argv, optionSpecs);
   if (argc < 4)
     usage();
 
   VERBOSE = optionExists("verbose");
   SLOP = optionExists("slop");
   SHORT = optionExists("short");
   LONG = optionExists("long");
   MISMATCH = optionExists("mismatch");
   ORPHAN = optionExists("orphan");
   NORANDOM = optionExists("noRandom");
   NOBIN = optionExists("noBin");
   MIN = optionInt("min",MIN);
   MAX = optionInt("max",MAX);
   TINSERT = optionInt("tInsert",TINSERT);
   HARDMAX = optionInt("hardMax",HARDMAX);
   if (SLOP)
       SLOPVAL = optionInt("slopval",SLOPVAL);
   else
       SLOPVAL = 0;
   NEARTOP = optionFloat("nearTop",NEARTOP);
   NEARTOP = 1 - NEARTOP;
   MIN_ID = optionFloat("minId",MIN_ID);
   MIN_ORPHAN_ID = optionFloat("minOrphanId",MIN_ORPHAN_ID);
 
   pf = pslFileOpen(argv[1]);
   prf = lineFileOpen(argv[2],TRUE);
   pslTable = cloneString(argv[3]);
 
   sprintf(filename, "%s.pairs", argv[4]);
   of = mustOpen(filename, "w");
   if (SLOP)
     {
       sprintf(filename, "%s.slop", argv[4]);
       sf = mustOpen(filename, "w");
     }
   if (ORPHAN)
     {
       sprintf(filename, "%s.orphan", argv[4]);
       orf = mustOpen(filename, "w");
     }
   if (MISMATCH)
     {
       sprintf(filename, "%s.mismatch", argv[4]);
       mf = mustOpen(filename, "w");
     }
   if (SHORT)
     {
       sprintf(filename, "%s.short", argv[4]);
       esf = mustOpen(filename, "w");
     }
   if (LONG)
     {
       sprintf(filename, "%s.long", argv[4]);
       elf = mustOpen(filename, "w");
     }
       
   if (VERBOSE)
     printf("Reading pair file\n");
   clones = newHash(23);
   leftNames = newHash(23);
   rightNames = newHash(23);
   readPairFile(prf);
   if (VERBOSE)
     printf("Reading psl file\n");
   readPslFile(pf);
   if (VERBOSE)
     printf("Creating Pairs\n");
   pslPairs();
   if (VERBOSE) 
     printf("Writing to files\n");
   printOut(pslTable);
   
   lineFileClose(&pf);
   lineFileClose(&prf);
   fclose(of);
   if (ORPHAN)
     fclose(orf);
   if (SLOP)
     fclose(sf);
   if (MISMATCH)
     fclose(mf);
   if (SHORT)
     fclose(esf);
   if (LONG)
     fclose(elf);
  
   return 0;
 }