4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/ratStuff/mafToAnc/mafToAnc.c src/hg/ratStuff/mafToAnc/mafToAnc.c
index dc213ac..444c908 100644
--- src/hg/ratStuff/mafToAnc/mafToAnc.c
+++ src/hg/ratStuff/mafToAnc/mafToAnc.c
@@ -1,774 +1,774 @@
 /* mafToAnc - Find and report anchors in a maf file.  Anchors are defined
  *   as ungapped alignments with a minimum length of 100bp.  The user
  *   can utilize the -minLen paramater to change the minimum ungapped
  *   length required to define an anchor. */
 
 /* Copyright (C) 2011 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "errAbort.h"
 #include "hash.h"
 #include "linefile.h"
 #include "maf.h"
 #include "obscure.h"
 #include "options.h"
 #include "seg.h"
 
 #include <limits.h>	/* For CHAR_BIT */
 
 
 
 struct aliCont
 /* A container for an alignment block. */
 {
 	struct aliCont *next;
 	struct mafAli *ali;		/* The alignment block. */
 	int compCount;			/* Count of components in the alignment. */
 };
 
 struct speciesInfo {
 /* Species information used for determining breaks. */
 	char *chrom;	/* Name of chromosome. */
 	int start;		/* Start of segment. Zero based. If strand is - it is
 					   relative to source end. */
 	int size;		/* Size of segment. */
 	char strand;	/* Strand of segment. Either + or -. */
 	int srcSize;	/* Size of segment chromosome. */
 	char order;		/* Relative order. Either + or - or 0. */
 };
 
 static struct optionSpec options[] = {
 	{"coreSpecies", OPTION_STRING},
 	{"minLen", OPTION_INT},
 	{"noCheckSrc", OPTION_BOOLEAN},
 	{"noCheckStrand", OPTION_BOOLEAN},
 	{"noCheckCoverage", OPTION_BOOLEAN},
 	{NULL, 0},
 };
 
 static char    *coreSpecies    = NULL;
 static int     minLen          = 100;
 static boolean noCheckSrc      = FALSE;
 static boolean noCheckStrand   = FALSE;
 static boolean noCheckCoverage = FALSE;
 
 static struct hash *coreSpeciesHash = NULL;
 
 static unsigned int uint_bits = sizeof(unsigned int) * CHAR_BIT;
 static unsigned int uint_div  = 0U;
 static unsigned int uint_rem  = 0U;
 static unsigned int int_bits  = sizeof(int) * CHAR_BIT;
 static unsigned int *bitMap   = NULL;
 static unsigned int emptyUInt = 0U;
 static unsigned int fullUInt  = ~0U;
 
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "mafToAnc - Find anchors in a maf file\n"
   "usage:\n"
   "   mafToAnc in.maf out.anc\n"
   "arguments:\n"
   "   in.maf   name of the input maf file\n"
   "   out.anc  name of the output anchor file\n"
   "options:\n"
   "   -coreSpecies=spe1,...,speN\n"
   "                     Use only the listed species when building anchors\n"
   "   -minLen=<D>       Minimum length of an ungapped alignment to define an\n"
   "                     anchor.  (default = 100)\n"
   "   -noCheckSrc       Don't check that the src of the first component of\n"
   "                     every alignment block is the same.\n"
   "   -noCheckStrand    Don't check that the first component of every\n"
   "                     alignment block is on the + strand.\n"
   "   -noCheckCoverage  Don't check for single coverage of the first\n"
   "                     component.\n"
   );
 }
 
 static void freeSpeciesInfo(void **pObj)
 /* Free a speciesInfo struct. */
 {
 struct speciesInfo *si = *pObj;
 
 freeMem(si->chrom);
 freeMem(si);
 
 *pObj = NULL;
 }
 
 static struct segBlock *buildAnchorBlock(int len, struct mafAli *ma, int *cp)
 /* Build and anchor from the argument data. */
 {
 struct mafComp *mc;
 struct segBlock *sb;
 struct segComp *sc, *tail = NULL;
 int compIdx;
 
 /* Initialize the anchor block. */
 AllocVar(sb);
 sb->val = len;
 
 for (mc = ma->components, compIdx = 0; mc != NULL; mc = mc->next, compIdx++)
 	{
 	/* Initialize a new anchor component. */
 	AllocVar(sc);
 	sc->src     = cloneString(mc->src);
 	sc->start   = cp[compIdx] - len;
 	sc->size    = len;
 	sc->strand  = mc->strand;
 	sc->srcSize = mc->srcSize;
 
 	/* Add the new component to the current list. */
 	if (sb->components == NULL)
 		sb->components = sc;
 	else
 		tail->next = sc;
 	tail = sc;
 	}
 
 return(sb);
 }
 
 
 static struct segBlock *buildBreakBlock(struct mafAli *ma)
 /* Build a break block. */
 {
 struct mafComp *mc;
 struct segBlock *sb;
 struct segComp *sc, *tail = NULL;
 
 /* Initialize the break block. */
 AllocVar(sb);
 sb->name = cloneString("chromBreak");
 
 for (mc = ma->components; mc != NULL; mc = mc->next)
 	{
 	/* Initialize a new segment component. */
 	AllocVar(sc);
 	sc->src     = cloneString(mc->src);
 	sc->start   = mc->start;
 	sc->size    = mc->size;
 	sc->strand  = mc->strand;
 	sc->srcSize = mc->srcSize;
 
 	/* Add the new segment component to the current list. */
 	if (sb->components == NULL)
 		sb->components = sc;
 	else
 		tail->next = sc;
 	tail = sc;
 	}
 
 return(sb);
 }
 
 
 int hashPOTSize(int count)
 /* Return the power of two size needed for a hash with count elements. */
 {
 int left = count - 1, right = count - 1, shifts = 0;
 
 if (count <= 0)
 	return(0);
 
 NEXT_SHIFT:
 if (left < 0)
 	return(int_bits - shifts);
 
 if (right == 0)
 	return(shifts);
 
 shifts  += 1;
 left   <<= 1;
 right  >>= 1;
 
 goto NEXT_SHIFT;
 }
 
 static int checkForBreak(struct mafComp *mc, struct hash *speciesHash)
 /* Check for a break in chromosome.  I may expand this in the future to
    check fo breaks in order and orientation as well. */
 {
 char *p = NULL;
 struct hashEl *hel;
 struct speciesInfo *si;
 int foundBreak = 0, copyData = 0;
 
 /* Temporarily split src into species and chrom. */
 if ((p = strchr(mc->src, '.')) != NULL)
 	*p = '\0';
 
 if ((hel = hashLookup(speciesHash, mc->src)) == NULL)
 	{
 	AllocVar(si);
 	hel = hashAdd(speciesHash, mc->src, (void *) si);
 	copyData = 1;
 	}
 else
 	{
 	si = (struct speciesInfo *) hel->val;
 
 	/* Chromosomal break. */
 	if ((p && !sameString(si->chrom, p + 1)) || (!p && si->chrom != NULL))
 		{
 		freeMem(si->chrom);
 		si->chrom = NULL;
 		foundBreak = 1;
 		}
 	}
 
 if (foundBreak || copyData)
 	{
 	if (p)
 		si->chrom = cloneString(p + 1);
 	si->start   = mc->start;
 	si->size    = mc->size;
 	si->strand  = mc->strand;
 	si->srcSize = mc->srcSize;
 	}
 
 /* Restore src. */
 if (p)
 	*p = '.';
 
 return(foundBreak);
 }
 
 
 static void findAnchors(struct aliCont **acList, char *outAnc, int maxComps)
 /* Find and report anchors. */
 {
 struct aliCont *ac, *next;
 struct mafAli *ma;
 struct mafComp *mc;
 int idx, len, compIdx, alignBreak, gapCol = 0;
 int *compPos;
 struct segBlock *block;
 struct hash *speciesHash = newHash( hashPOTSize(maxComps) );
 FILE *f = mustOpen(outAnc, "w");
 
 /* Allocate the compPos array. */
 AllocArray(compPos, maxComps);
 
 segWriteStart(f);
 
 /* Find and report anchors. */
 for (ac = *acList; ac != NULL; ac = next)
 	{
 	next = ac->next;
 	ma = ac->ali;
 	idx = len = 0;
 
 	/* Initialize positions for each component; check for chromosomal
 	   breaks. */
 	alignBreak = 0;
 	for (mc = ma->components, compIdx = 0; mc != NULL; mc = mc->next)
 		{
 		compPos[compIdx++] = mc->start;
 		if (alignBreak == 0)
 			alignBreak = checkForBreak(mc, speciesHash);
 		}
 
 	/* */
 	if (alignBreak)
 		{
 		block = buildBreakBlock(ma);
 		segWrite(f, block);
 		segBlockFree(&block);
 		}
 
 	/* Check this container's alignment for anchors. */
 	while (idx < ma->textSize)
 		{
 		/* Check to see if there is a gap in this column. */
 		gapCol = 0;
 		for (mc = ma->components; mc != NULL; mc = mc->next)
 			if (mc->text[idx] == '-')
 				{
 				gapCol = 1;
 				break;
 				}
 
 		if (! gapCol)
 			len++;
 		else
 			{
 			/* Possibly report an anchor. */
 			if (len >= minLen)
 				{
 				block = buildAnchorBlock(len, ma, compPos);
 				segWrite(f, block);
 				segBlockFree(&block);
 				}
 			len = 0;
 			}
 
 		/* Update the positions for each component. */
 		for (mc = ma->components, compIdx = 0; mc != NULL; mc = mc->next)
 			{
 			if (mc->text[idx] != '-')
 				compPos[compIdx] += 1;
 			compIdx++;
 			}
 
 		idx++;
 		}
 
 	/* Is the final region of this alignment an anchor? */
 	if ((! gapCol) && (len >= minLen))
 		{
 		block = buildAnchorBlock(len, ma, compPos);
 		segWrite(f, block);
 		segBlockFree(&block);
 		}
 
 	mafAliFree(&ma);
 	freeMem(ac);
 	}
 
 segWriteEnd(f);
 carefulClose(&f);
 freeMem(compPos);
 hashFreeWithVals(&speciesHash, freeSpeciesInfo);
 }
 
 
 static void aliContFuse(struct aliCont *ac, int maxComps)
 /* Fuse adjacent alignment containers. */
 {
 struct aliCont *ac1, *ac2;
 struct mafComp *mc1, *mc2, **compArray;
 char *text;
 int compIdx;
 
 /* Used to put the components of two alignments in the same order. */
 AllocArray(compArray, maxComps);
 
 ac1 = ac;
 while (ac1->next != NULL)
 	{
 	ac2 = ac1->next;
 	compIdx = 0;
 
 	/* Prepare to fuse the two alignment containers. */
 	if (ac1->compCount == ac2->compCount)
 		{
 		for (mc1 = ac1->ali->components; mc1 != NULL; mc1 = mc1->next)
 			{
 			if ((mc2 = mafMayFindComponent(ac2->ali, mc1->src)) == NULL)
 				break;
 
 			if (mc1->start + mc1->size != mc2->start
 				|| mc1->strand != mc2->strand
 				|| !sameString(mc1->src, mc2->src))
 				break;
 
 			compArray[compIdx++] = mc2;
 			}
 		}
 
 	/* Fuse the alignment containers if it's possible. */
 	if (ac1->compCount == ac2->compCount && compIdx == ac1->compCount)
 		{
 		compIdx = 0;
 		for (mc1 = ac1->ali->components; mc1 != NULL; mc1 = mc1->next)
 			{
 			mc2 = compArray[compIdx++];
 			mc1->size += mc2->size;
 			AllocArray(text, ac1->ali->textSize + ac2->ali->textSize + 1);
 			strcpy(text, mc1->text);
 			strcat(text, mc2->text);
 			freeMem(mc1->text);
 			mc1->text = text;
 			}
 		ac1->ali->score += ac2->ali->score;
 		ac1->ali->textSize += ac2->ali->textSize;
 		ac1->next = ac2->next;
 		mafAliFree(&ac2->ali);
 		freeMem(ac2);
 		}
 	else
 		{
 		ac1 = ac2;
 		}
 	}
 
 freeMem(compArray);
 }
 
 
 static int cmpOrder(const void *a, const void *b)
 /* Comparison function for sorting alignment containers. */
 {
 struct mafComp *p, *q;
 
 p = (*((struct aliCont **) a))->ali->components;
 q = (*((struct aliCont **) b))->ali->components;
 
 if (p->start != q->start)
 	return(p->start - q->start);
 
 return(p->start + p->size - q->start - q->size);
 }
 
 
 static void aliContSortByPos(struct aliCont **acList, int aliCount)
 /* Sort the alignment containers by the position of the first component
  * in the contained alignment. */
 {
 struct aliCont **contArr, *ac;
 int i;
 
 /* Initialize the alignment container array. */
 AllocArray(contArr, aliCount);
 for (ac = *acList, i = 0; ac != NULL; ac = ac->next, i++)
 	contArr[i] = ac;
 
 qsort((void *) contArr, (size_t) aliCount, sizeof(struct aliCont *), cmpOrder);
 
 /* Update the alignment countainer array next pointers. */
 *acList = contArr[0];
 for (i = 0; i < aliCount - 1; i++)
 	contArr[i]->next = contArr[i + 1];
 contArr[aliCount - 1]->next = NULL;
 
 freeMem(contArr);
 }
 
 
 static int foundOverlap(struct mafComp *mc)
 {
 int start = mc->start;
 int stop  = mc->start + mc->size;
 unsigned int start_idx, start_rem, stop_idx, stop_rem, pattern, i;
 
 if (bitMap == NULL)
 	AllocArray(bitMap, ((mc->srcSize + 7) >> uint_div));
 
 start_idx = start >> uint_div;
 start_rem = start  & uint_rem;
 stop_idx  = stop  >> uint_div;
 stop_rem  = stop   & uint_rem;
 
 /* Special case if on same chunk. */
 if (start_idx == stop_idx)
 	{
 	pattern = (fullUInt >> start_rem) & (fullUInt << (uint_bits - stop_rem));
 	if ((pattern & bitMap[start_idx]) != emptyUInt)
 		return(1);
 	bitMap[start_idx] |= pattern;
 	return(0);
 	}
 
 /* Check and set the start of the range. */
 if (start_rem == 0)
 	start_idx--;
 else
 	{
 	pattern = fullUInt >> start_rem;
 	if ((pattern & bitMap[start_idx]) != emptyUInt)
 		return(1);
 	bitMap[start_idx] |= pattern;
 	}
 
 /* Check and set the end of the range. */
 if (stop_rem != 0)
 	{
 	pattern = fullUInt << (uint_bits - stop_rem);
 	if ((pattern & bitMap[stop_idx]) != emptyUInt)
 		printf("Stop Collision!\n");
 	bitMap[stop_idx] |= pattern;
 	}
 
 /* Check and set the middle of the range. */
 for (i = start_idx + 1; i < stop_idx; i++)
 	{
 	if (bitMap[i] != emptyUInt)
 		return(1);
 	bitMap[i] = fullUInt;
 	}
 
 return(0);
 }
 
 
 static int stripAndCountComponents(struct mafAli *ma)
 /* Strip out data that results in "e", "q", and "i" lines. Return the
  * number of components in the alignment. */
 {
 struct mafComp *prev, *curr;
 int compCount = 0;
 int keep;
 int dash = 0;
 int i,j;
 struct mafAli *a;
 struct mafComp *c;
 char *p;
 
 prev = curr = ma->components;
 while (curr != NULL)
 	{
 	keep = 1;
 
 	/* Strip out non-core species. */
 	if (coreSpeciesHash != NULL)
 		{
 		if ((p = strchr(curr->src, '.')) != NULL)
 			*p = '\0';
 
 		if (hashLookup(coreSpeciesHash, curr->src) == NULL)
 			{
 			keep = 0;
 			dash = 1;
 			}
 
 		if (p != NULL)
 			*p = '.';
 		}
 
 	/* Strip out "e" components. */
 	if ((curr->size == 0) && (curr->leftStatus))
 		keep = 0;
 
 	if (keep)
 		{
 		compCount++;
 
 		/* Strip out "q" data. */
 		if (curr->quality)
 			freeMem(curr->quality);
 
 		/* Strip out "i" data. */
 		curr->leftStatus = 0;
 
 		prev = curr;
 		curr = curr->next;
 		}
 	else
 		{
 		if (curr == ma->components)
 			{
 			ma->components = curr->next;
 			mafCompFree(&curr);
 			prev = curr = ma->components;
 			}
 		else
 			{
 			prev->next = curr->next;
 			mafCompFree(&curr);
 			curr = prev->next;
 			}
 		}
 	}
 
 /* remove columns containing only dashes */
 if (dash)
 	{
 	a = ma;
 
 	for (i = j = 0; i < a->textSize; i++)
 		{
 		for (c = a->components; c != NULL; c = c->next)
 			if (c->text[i] != '-')
 				break;
 
 		if (c != NULL)
 			{
 			if (j != i)
 				for (c = a->components; c != NULL; c = c->next)
 					c->text[j] = c->text[i];
 			j++;
 			}
 		}
 
 	if (j != a->textSize)
 		{
 		a->textSize = j;
 
 		for (c = a->components; c != NULL; c = c->next)
 			c->text[j] = '\0';
 		}
 
 	}
 
 return(compCount);
 }
 
 void mafToAnc(char *inMaf, char *outAnc)
 /* mafToAnc - Find anchors in a maf file. */
 {
 struct mafFile *mf = mafOpen(inMaf);
 struct mafAli *ma;
 struct aliCont *ac, *acList = NULL, *acListTail = NULL;
 int aliCount = 0, maxComps = 0, preSorted = 1, lastEnd = 0, compCount;
 char *firstCmpSrc = NULL;
 
 /* Read in and count the alignment blocks from the maf. Determine the
  * maximum number of components in an alignment. Make sure that the
  * source of the first component of every alignment is the same. */
 while ((ma = mafNext(mf)) != NULL)
 	{
 	/* Strip and count the components in the alignment. */
 	compCount = stripAndCountComponents(ma);
 
 	/* Skip alignments that contain only zero or one component. */
 	if (ma->components == NULL || ma->components->next == NULL)
 		{
 		mafAliFree(&ma);
 		continue;
 		}
 
 	/* Check that the source of the first components are all the same. */
 	if (!noCheckSrc)
 		{
 		if (firstCmpSrc == NULL)
 			firstCmpSrc = cloneString(ma->components->src);
 		else if (!sameString(firstCmpSrc, ma->components->src))
 			{
 			warn("Expecting the first component source to be %s.", firstCmpSrc);
 			mafWrite(stderr, ma);
 			exit(EXIT_FAILURE);
 			}
 		}
 	
 	/* Make sure the first component is on the + strand. */
 	if (!noCheckStrand && ma->components->strand != '+')
 		{
 		warn("Expecting the first component to be on the + strand:");
 		mafWrite(stderr, ma);
 		exit(EXIT_FAILURE);
 		}
 
 	/* Check for single coverage of bases in the first component. */
 	if (!noCheckCoverage && foundOverlap(ma->components))
 		{
 		warn("Found overlap:");
 		mafWrite(stderr, ma);
 		exit(EXIT_FAILURE);
 		}
 	
 	/* Check to see if we'll need to sort later. */
 	if (preSorted && lastEnd > ma->components->start)
 		preSorted = 0;
 	lastEnd = ma->components->start + ma->components->size;
 
 	/* Update maximum component count. */
 	if (compCount > maxComps)
 		maxComps = compCount;
 
 	/* Update the alignment count. */
 	aliCount++;
 
 	/* Create a new container for this alignment. */
 	AllocVar(ac);
 	ac->ali = ma;
 	ac->compCount = compCount;
 
 	/* Add this new container to our list */
 	if (acList == NULL)
 		acList = ac;
 	else
 		acListTail->next = ac;
 	acListTail = ac;
 	}
 mafFileFree(&mf);
 
 if (!noCheckCoverage)
 	freeMem(bitMap);
 
 freeMem(firstCmpSrc);
 
 /* Sort the alignment containers by the position of the first component. */
 if (!preSorted)
 	aliContSortByPos(&acList, aliCount);
 
 /* Combine adjacent alignment blocks that can be fused. */
 aliContFuse(acList, maxComps);
 
 /* Find and report anchors. */
 findAnchors(&acList, outAnc, maxComps);
 }
 
 static void buildCoreSpeciesHash(char *speciesList)
 {
 char delim = ',';
 char *p, *d;
 int speciesCount=1;
 struct hashEl *hel;
 
 /* We don't need a hash if there's nothing to store. */
 if (speciesList == NULL)
 	return;
 
 /* Count the number of species in the list. */
 for (p = speciesList; *p != '\0'; p++)
 	if (*p == delim)
 		speciesCount++;
 
 /* Create a hash with enough space. */
 coreSpeciesHash = newHash( hashPOTSize(speciesCount) );
 
 /* Populate the hash. */
 p = speciesList;
 while (1)
 	{
 	if ((d = strchr(p, delim)) != NULL)
 		*d = '\0';
 
 	if ((hel = hashLookup(coreSpeciesHash, p)) == NULL)
 		hashAddInt(coreSpeciesHash, p, 1);
 
 	if (d == NULL)
 		break;
 
 	p = d + 1;
 	}
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. Search maf file for anchors. */
 {
 optionInit(&argc, argv, options);
 
 if (argc != 3)
     usage();
 
 coreSpecies     = optionVal("coreSpecies", NULL);
 minLen          = optionInt("minLen", minLen);
 noCheckSrc      = optionExists("noCheckSrc");
 noCheckStrand   = optionExists("noCheckStrand");
 noCheckCoverage = optionExists("noCheckCoverage");
 
 if (!noCheckSrc)
 	noCheckStrand = noCheckCoverage = TRUE;
 
 if (!noCheckStrand)
 	noCheckCoverage = TRUE;
 
 if (!noCheckCoverage)
 	{
 	/* Is uint_bits a power of two? */
 	if ((uint_bits & (uint_bits - 1)) != 0)
 		{
 		warn("Turning off checking the first component for single coverage.\n"
 		     "unsigned int is not represented by a power of two bits.");
 		noCheckCoverage = TRUE;
 		}
 	else
 		{
 		/* Calculate some values we'll need later. */
 		uint_div = hashPOTSize(uint_bits);
 		uint_rem = uint_bits - 1;
 		}
 	}
 
 buildCoreSpeciesHash(coreSpecies);
 mafToAnc(argv[1], argv[2]);
 freeHash(&coreSpeciesHash);
 
 return(EXIT_SUCCESS);
 }