4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/utils/repeatSelect/repeatSelect.c src/hg/utils/repeatSelect/repeatSelect.c
index 5cf724e..2ea2567 100644
--- src/hg/utils/repeatSelect/repeatSelect.c
+++ src/hg/utils/repeatSelect/repeatSelect.c
@@ -1,246 +1,246 @@
 /* repeatSelect - Grab x % more repeats in addition to what we already got. */
 
 /* Copyright (C) 2006 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "bits.h"
 #include "binRange.h"
 #include "linefile.h"
 #include "rmskOutExtra.h"
 #include "bitHash.h"
 #include "repeatGroup.h"
 #include "options.h"
 
 #define REPEAT_NAME_HASH_SIZE 12
 
 /* command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"select", OPTION_STRING},
     {"rmskOverlap", OPTION_DOUBLE},
     {NULL, 0}
 };
 
 /* Other globals.  Maybe I'll make these local later. */
 int totalBases = 0;
 double desiredAddPerc = 0.2;
 double rmskOverlap;
 enum choiceType {copyNumber=0, byTotalBases, highScore} theChoice = copyNumber;
 
 static void usage(char *error)
 {
 if (error)
     errAbort("error: %s", error);
 else
     errAbort("repeatSelect - Filter a set of repeat predictions based on an already established\n"
 	     "   set of repeats.  This program's original intent was to add to a set of repeat-\n"
 	     "   masker repeats that is suspected of being too small.  Suppose RepeatMasker masks\n"
 	     "   5%% of bases and denovo predictions mask 25%%.  If another 5%% coverage is desired\n"
 	     "   then the correct amount of the \"best\" denovo predictions that don't coincide\n"
 	     "   with a RepeatMasker repeat are selected so the masking is 10%%.  The \"best\"\n"
 	     "   denovo predictions can be the ones with the highest scores, the highest number of\n"
 	     "   copies.\n"
 	     "usage:\n"
 	     "   repeatSelect percentMore chrom.sizes rmsk.out denovo.out\n"
 	     "options:\n"
 	     "   -select=choice     where choice is \"totalBases\", \"copyNumber\" (default),\n"
 	     "                      \"highScore\", or \"rmskThenCopyNumber\".  See README for an\n"
 	     "                      explanation.\n"
 	     "   -rmskOverlap=X     where X is in (0,1.0): percentage of a denovo repeat\n"
 	     "                      for a denovo repeat to be discarded.  ");
 }
 
 struct repeatGroup *repeatBaseCounts(struct hash *repHash)
 /* Count up how many bases each set of repeats is covering along with */
 /* some other basic stats and return all the groups. Then sort and return it. */
 {
 struct repeatGroup *rgList = NULL;
 struct hashEl *elList = hashElListHash(repHash), *el;
 for (el = elList; el != NULL; el = el->next)
     {
     struct rmskOutExtra *reList = el->val, *reEl;
     int total = 0;
     int copies = 0;
     double meanScore = 0;
     double meanLength = 0;
     for (reEl = reList; reEl != NULL; reEl = reEl->next)
 	{
 	total += reEl->numMasked;
 	copies++;
 	meanScore += reEl->rmsk->swScore;
 	meanLength += (reEl->rmsk->genoEnd - reEl->rmsk->genoStart);
 	}
     if (copies > 0)
 	{
 	meanScore /= copies;
 	meanLength /= copies;
 	}
     if (reList != NULL)
 	{
 	struct repeatGroup *one = NULL;
 	AllocVar(one);
 	one->name = cloneString(reList->rmsk->repName);
 	one->bases = total;
 	one->copies = copies;
 	one->meanScore = meanScore;
 	one->meanLength = meanLength;
 	slAddHead(&rgList, one);
 	}
     }
 switch (theChoice)
     {
     case highScore:
 	slSort(&rgList, repeatGroupScoreCmp);
 	break;
     case byTotalBases:
 	slSort(&rgList, repeatGroupBasesCmp);
 	break;
     default: 
 	slSort(&rgList, repeatGroupCopyCmp);
     }
 hashElFreeList(&elList);
 return rgList;
 }
 
 struct hash *repeatBigHash(char *repeatFile, struct hash *chromBits)
 /* Read repeats one-by-one. */
 /* Remove ones that intersect with preexisting repeats by x%. */
 /* Return a hash of rmskOutExtra lists indexed on 
 /* rmskOut->repName. */
 {
 boolean rmskRet;
 struct lineFile *rmskF = NULL;
 struct rmskOut *rmsk;
 struct hash *keepers = newHash(REPEAT_NAME_HASH_SIZE);
 rmskOutOpenVerify(repeatFile, &rmskF, &rmskRet);
 while ((rmsk = rmskOutReadNext(rmskF)) != NULL)
     {
     char *chrom = rmsk->genoName;
     Bits *cBits = hashFindVal(chromBits, chrom);
     int size = rmsk->genoEnd - rmsk->genoStart;
     int baseReps = bitCountRange(cBits, rmsk->genoStart, size);
     float maskPerc = (float)baseReps/size;
     if (maskPerc < rmskOverlap) 
 	{
 	struct rmskOutExtra *addme = NULL;
 	AllocVar(addme);
 	addme->rmsk = rmsk;
 	/* Keep track of the novel masking. */
 	addme->numMasked = size - baseReps;
 	addToRmskOutExtraHash(keepers, addme);
 	}
     else 
 	rmskOutFree(&rmsk);
     }
 lineFileClose(&rmskF);
 return keepers;
 }
 
 void repeatBits(char *repeatFile, struct hash *chromBits)
 /* Set bits on chroms based on repeats. */
 {
 boolean rmskRet;
 struct lineFile *rmskF = NULL;
 struct rmskOut *rmsk;
 rmskOutOpenVerify(repeatFile, &rmskF, &rmskRet);
 while ((rmsk = rmskOutReadNext(rmskF)) != NULL)
     {
     char *chrom = rmsk->genoName;
     Bits *cBits = hashFindVal(chromBits, chrom);
     bitSetRange(cBits, rmsk->genoStart, rmsk->genoEnd - rmsk->genoStart);
     }
 lineFileClose(&rmskF);
 }
 
 struct hash *chromHash(char *chromDotSizes)
 /* Return a hash of the chrom sizes. */
 {
 struct lineFile *lf = lineFileOpen(chromDotSizes, TRUE);
 struct hash *ret = newHash(12);
 char *strings[2];
 while (lineFileRowTab(lf, strings))
     {
     int size = atoi(strings[1]);
     totalBases += size;
     hashAddInt(ret, strings[0], size);
     }
 lineFileClose(&lf);
 return ret;
 }
 
 void selectPercRepeats(struct repeatGroup **pRGList)
 /* Select enough groups of repeats until there's too many. */
 {
 struct repeatGroup *newList = NULL, *cutOff, *prev = NULL;
 int runningTotal = 0;
 int desiredBases = (int)(desiredAddPerc * totalBases);
 if (!pRGList || !(*pRGList))
     return;
 cutOff = *pRGList;
 while (cutOff != NULL)
     {
     runningTotal += cutOff->bases;
     if (runningTotal > desiredBases)
 	{
 	break;
 	}
     prev = cutOff;
     cutOff = cutOff->next;
     }
 if (prev == NULL)
     repeatGroupFreeList(pRGList);
 else
     {
     prev->next = NULL;
     repeatGroupFreeList(&cutOff);
     }
 }
 
 void outputRepeats(struct repeatGroup *rgList, struct hash *repHash)
 {
 /* Simply output the denovo repeats that */
 struct repeatGroup *one;
 for (one = rgList; one != NULL; one = one->next)
     {
     struct rmskOutExtra *reList = hashFindVal(repHash, one->name), *re;
     for (re = reList; re != NULL; re = re->next)
 	rmskOutTabOut(re->rmsk, stdout);
     }
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 struct hash *cHash = NULL;
 struct hash *bHash = NULL;
 struct hash *repHash = NULL;
 struct repeatGroup *rgList = NULL;
 char *selection;
 optionInit(&argc, argv, optionSpecs);
 if (argc != 5)
     usage(NULL);
 selection = optionVal("select", "copyNumber");
 desiredAddPerc = atof(argv[1]);
 if (sameWord(selection, "copyNumber"))
     theChoice = copyNumber;
 else if (sameWord(selection, "totalBases"))
     theChoice = byTotalBases;
 else if (sameWord(selection, "highScore"))
     theChoice = highScore;
 else
     usage("-select must be \"copyNumber\", \"totalBases\", or \"highScore\".");
 rmskOverlap = optionDouble("rmskOverlap", 0.8);
 if ((rmskOverlap < 0) || (rmskOverlap > 1))
     usage("-rmskOverlap must be in range [0,1]");
 cHash = chromHash(argv[2]);
 bHash = newBitHash(cHash);
 repeatBits(argv[3], bHash);
 repHash = repeatBigHash(argv[4], bHash);
 rgList = repeatBaseCounts(repHash);
 selectPercRepeats(&rgList);
 outputRepeats(rgList, repHash);
 freeBitHash(&bHash);
 freeRmskOutExtraHash(&repHash);
 repeatGroupFreeList(&rgList);
 return 0;
 }