e70152e44cc66cc599ff6b699eb8adc07f3e656a
kent
  Sat May 24 21:09:34 2014 -0700
Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment.
diff --git src/hg/agpSangerUnfinished/agpSangerUnfinished.c src/hg/agpSangerUnfinished/agpSangerUnfinished.c
index f9e8aa9..8de8828 100644
--- src/hg/agpSangerUnfinished/agpSangerUnfinished.c
+++ src/hg/agpSangerUnfinished/agpSangerUnfinished.c
@@ -1,220 +1,223 @@
+/* Copyright (C) 2011 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
+
 /* agpSangerUnfinished - Fixes chrom.agp or scaffold.agp 
  *  Galt Barber 2009-06-22
  *  Produces an agp file that matches the unfinished contig names 
  *  that are found in contigs.fa.
  *  Apparently Sanger wants to use these unfinished ones
  *  but they are not allowed to submit them to Genbank.
  *  So the agp matches the genbank record, but the fasta
  *  file does not.  
  *  The needed information is actually in the files,
  *  but it is stored in a completely non-standard way.
  *  So, we change the frag names in the agp to match
  *  the fasta, and we use the frag start and end 
  *  also in the matching.  The final agp frag start and end
  *  for these must be modified to be 1, size-of-frag
  *  This is being developed for Zv8 (and the same structure was in Zv7).
 */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "cheapcgi.h"
 #include "fa.h"
 #include "agpFrag.h"
 #include "agpGap.h"
 #include "options.h"
 
 
 static void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "agpSangerUnfinished - Make agp file match the given contigs fasta file\n"
   "usage:\n"
   "   agpSangerUnfinished in.agp contigs.fa out.agp\n"
   "\n"
   "options:\n"
   "   -verbose=N - N=2 display extra information\n"
   );
 }
 
 /* global vars */
 
 /* command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {NULL, 0}
 };
 
 
 struct unfinishedContig
 {
     char *frag;      /* unfinished contig frag name */
     int fragStart;   /* unfinished contig frag start 1-based */
     int fragEnd;     /* unfinished contig frag end */
 };
 
 
 struct hash *hashFasta(char *fileName)
 /* Read names in fasta and put them in a hash. */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 struct hash *hash = newHash(0);
 verbose(1, "reading contig fasta file %s ...", fileName);
 verbose(2, "\n");
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     if (line[0] == '>')
         {
         char *sp = strchr(line, ' ');
         if (sp)
 	    {
 	    ++sp;
 	    char *under = strchr(line, '_');
 	    if (under && (under < sp))
 		{
 		sp = strchr(sp, ' ');
 		if (sp)
 		    {
 		    ++sp;
 		    ++line;
 		    struct unfinishedContig *u;
 		    AllocVar(u);
 		    u->frag = cloneString(nextWord(&line));
 		    nextWord(&line);
 		    char *key = nextWord(&line);
 		    u->fragStart = atoi(nextWord(&line)) - 1; // -1 to adjust to our internal coordintes
 		    u->fragEnd = atoi(nextWord(&line));
 		    hashAdd(hash, key, u);
 		    verbose(2, "key=[%s] %s %d %d\n", key, u->frag, u->fragStart, u->fragEnd);
 		    }
 		}
 	    }
         }
     }
 lineFileClose(&lf);
 verbose(1, " done.\n");
 return hash;
 }
 
 
 static void agpSangerUnfinished(char *agpFile, char *contigFasta, char *agpOut)
 /* Fix agp to match unfinished contigs in fasta */
 {
 struct lineFile *lf = lineFileOpen(agpFile, TRUE);
 char *line, *words[16];
 int lineSize, wordCount;
 unsigned lastPos = 0;
 struct agpFrag *agp;
 struct agpGap *gap;
 FILE *f;
 char *lastObj = NULL;
 f = mustOpen(agpOut, "w");
 char *newChrom = NULL;
 struct hash *hash = hashFasta(contigFasta);
 
 verbose(2,"#\tprocessing AGP file: %s\n", agpFile);
 while (lineFileNext(lf, &line, &lineSize))
     {
     if (line[0] == 0 || line[0] == '#' || line[0] == '\n')
         continue;
     //verbose(2,"#\tline: %d\n", lf->lineIx);
     wordCount = chopLine(line, words);
     if (wordCount < 5)
         errAbort("Bad line %d of %s: need at least 5 words, got %d\n",
 		 lf->lineIx, lf->fileName, wordCount);
 
     if (!lastObj || !sameString(words[0],lastObj))
 	{
 	freez(&newChrom);
 	newChrom = cloneString(words[0]);
 	lastPos = 0;
 	}
 
     	
 		 
     if (words[4][0] != 'N')
 	{
 	lineFileExpectAtLeast(lf, 9, wordCount);
 	agp = agpFragLoad(words);
 	/* agp is 1-based but agp loaders do not adjust for 0-based: */
     	agp->chromStart -= 1;
 	agp->fragStart  -= 1;
 	if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart)
 	    errAbort("Sizes don't match in %s and %s line %d of %s\n",
 		agp->chrom, agp->frag, lf->lineIx, lf->fileName);
 
 	char *root = cloneString(agp->frag);
 	chopSuffixAt(root, '.');
 
 	struct hashEl *e, *elist = hashLookup(hash, root);
 	for (e = elist; e; e = hashLookupNext(e))
 	    {
 	    struct unfinishedContig *u = e->val;
             if ((u->fragStart <= agp->fragStart) && (u->fragEnd >= agp->fragEnd))
 		{
 		agp->frag = cloneString(u->frag);
 		agp->fragEnd -= u->fragStart;
 		agp->fragStart -= u->fragStart;
 		}
 	    }
 	freeMem(root);
 	}
     else
         {
 	lineFileExpectAtLeast(lf, 8, wordCount);
 	gap = agpGapLoad(words);
 	/* to be consistent with agpFrag */
 	gap->chromStart -= 1;
 	agp = (struct agpFrag*)gap;
 	}
 
     if (agp->chromStart != lastPos)
 	errAbort("Start doesn't match previous end line %d of %s\n"
 	    "agp->chromStart: %u\n" 
 	    "agp->chromEnd: %u\n" 
 	    "lastPos: %u\n" 
 	    ,lf->lineIx, lf->fileName
 	    ,agp->chromStart
 	    ,agp->chromEnd
 	    ,lastPos
 	    );
 
     lastPos = agp->chromEnd;
     freez(&lastObj);
     lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */
 	
     if (words[4][0] != 'N')
 	{
 	/* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */
 	agpFragOutput(agp, f, '\t', '\n');
 	agpFragFree(&agp);
 	}
     else
         {
 	/* restore back to 1-based for agp 
 	 * because agpGapOutput doesn't compensate */
 	gap->chromStart += 1;
 	agpGapOutput(gap, f, '\t', '\n');
 	agpGapFree(&gap);
 	}
 	
     }
 
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, optionSpecs);
 
 if (argc != 4)
     usage();
 //if (optionExists("agpSeq"))
   //  agpSeq = optionVal("agpSeq", NULL);
 
 
 agpSangerUnfinished(argv[1], argv[2], argv[3]);
 
 return 0;
 }