4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/makeDb/hgGcPercent/hgGcPercent.c src/hg/makeDb/hgGcPercent/hgGcPercent.c
index f6aef60..0e76a0d 100644
--- src/hg/makeDb/hgGcPercent/hgGcPercent.c
+++ src/hg/makeDb/hgGcPercent/hgGcPercent.c
@@ -1,449 +1,449 @@
 /* hgGcPercent - Calculate GC Percentage in 20kb windows. */
 
 /* Copyright (C) 2013 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "portable.h"
 #include "dnautil.h"
 #include "dnaseq.h"
 #include "nib.h"
 #include "jksql.h"
 #include "hdb.h"
 #include "cheapcgi.h"
 #include "options.h"
 #include "twoBit.h"
 #include "bed.h"
 
 
 /* Command line switches. */
 int winSize = 20000;            /* window size */
 boolean noLoad = FALSE;		/* Suppress loading mysql table . */
 char *file = (char *)NULL;	/* file name for output */
 char *chr = (char *)NULL;	/* process only chromosome listed */
 boolean noRandom = FALSE;	/* process only non-random chromosomes  */
 boolean noDots = FALSE;	        /* TRUE == do not display ... progress */
 boolean doGaps = FALSE;	        /* TRUE == process gaps correctly */
 boolean wigOut = FALSE;	        /* TRUE == output wiggle ascii data */
 int overlap = 0;                /* overlap size */
 char *bedRegionInName = NULL;   /* file of regions for calculating gc content */
 char *bedRegionOutName = NULL;  /* output file of regions for gc content */
 FILE *bedRegionOutFile = NULL;
 
 /* command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"win", OPTION_INT},
     {"noLoad", OPTION_BOOLEAN},
     {"file", OPTION_STRING},
     {"chr", OPTION_STRING},
     {"noRandom", OPTION_BOOLEAN},
     {"noDots", OPTION_BOOLEAN},
     {"doGaps", OPTION_BOOLEAN},
     {"overlap", OPTION_INT},
     {"wigOut", OPTION_BOOLEAN},
     {"bedRegionIn", OPTION_STRING},
     {"bedRegionOut", OPTION_STRING},
     {NULL, 0}
 };
 
 static char * previousChrom = (char *) NULL;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "hgGcPercent - Calculate GC Percentage in 20kb windows\n"
   "usage:\n"
   "   hgGcPercent [options] database nibDir\n"
   "     nibDir can be a .2bit file, a directory that contains a\n"
   "     database.2bit file, or a directory that contains *.nib files.\n"
   "     Loads gcPercent table with counts from sequence.\n"
   "options:\n"
   "   -win=<size> - change windows size (default %d)\n"
   "   -noLoad - do not load mysql table - create bed file\n"
   "   -file=<filename> - output to <filename> (stdout OK) (implies -noLoad)\n"
   "   -chr=<chrN> - process only chrN from the nibDir\n"
   "   -noRandom - ignore randome chromosomes from the nibDir\n"
   "   -noDots - do not display ... progress during processing\n"
   "   -doGaps - process gaps correctly (default: gaps are not counted as GC)\n"
   "   -wigOut - output wiggle ascii data ready to pipe to wigEncode\n"
   "   -overlap=N - overlap windows by N bases (default 0)\n"
   "   -verbose=N - display details to stderr during processing\n"
   "   -bedRegionIn=input.bed   Read in a bed file for GC content in specific regions and write to bedRegionsOut\n"
   "   -bedRegionOut=output.bed Write a bed file of GC content in specific regions from bedRegionIn\n\n"
   "example:\n"
   "  calculate GC percent in 5 base windows using a 2bit assembly (dp2):\n"
   "    hgGcPercent -wigOut -doGaps -win=5 -file=stdout -verbose=0 \\\n"
   "      dp2 /cluster/data/dp2 \\\n"
   "    | wigEncode stdin gc5Base.wig gc5Base.wib"
       , winSize);
 }
 
 char *createTable = 
 "#Displays GC percentage in 20Kb blocks for genome\n"
 "CREATE TABLE gcPercent (\n"
     "chrom varchar(255) not null,	# Human chromosome number\n"
     "chromStart int unsigned not null,	# Start position in genoSeq\n"
     "chromEnd int unsigned not null,	# End position in genoSeq\n"
     "name varchar(255) not null,	# Constant string GCpct\n"
     "gcPpt int unsigned not null,	# GC percentage for 20Kb block\n"
               "#Indices\n"
     "UNIQUE(chrom(%d),chromStart)\n"
 ");\n";
 
 static void wigOutLine(FILE *f, char *chrom, int start, int end, int ppt)
 {
 /*	only full winSize spans are valid	*/
 if ((end - start) < winSize)
     return;
 
 /*	see if we are starting on a new chrom	*/
 if (! (previousChrom && (sameWord(previousChrom, chrom))))
     {
     freeMem(previousChrom);
     previousChrom = cloneString(chrom);
     fprintf(f, "variableStep chrom=%s span=%d\n", chrom, winSize-overlap);
     }
 fprintf(f, "%d\t%g\n", start+1, ppt/10.0);
 }
 
 void makeGcLineFromSeq(DNA *dna, char *chrom, int start, int end, FILE *f)
 /* Given a sequence and window within the sequence, print out a line of
  * BED 5 with the GC parts per thousand (not percent) in the window (or 
  * ascii-wiggle if -wigOut). */
 {
 static int dotMod = 0;
 int minCount = winSize/4;
 int i, count, gcCount, val, ppt, gapCount;
 
 if ((++dotMod&127) == 0)
     {
     if (!noDots)
 	{
 	verbose(1, ".");
 	if ((dotMod & 8191) == 0)
 	    verbose(1, "\n");
 	fflush(stdout);
 	}
     }
 
 gapCount = count = gcCount = 0;
 for (i = start;  i < end;  ++i)
     {
     if ((val = ntVal[(int)dna[i]]) >= 0)
 	{
 	++count;
 	if (val == G_BASE_VAL || val == C_BASE_VAL)
 	    {
 	    ++gcCount;
 	    }
 	}
     else
 	{
 	++gapCount;
 	}
     }
 if (count >= minCount)
     ppt = round(1000.0*(double)gcCount/(double)count);
 else
     ppt = 0;
 
 
 if (!doGaps || gapCount < (end - start))
     {
     if (wigOut)
 	wigOutLine(f, chrom, start, end, ppt);
     else
 	fprintf(f, "%s\t%d\t%d\t%s\t%d\n", chrom, start, end, "GC", ppt);
     }
 }
 
 
 void writeBedLineCountFromSeq(struct dnaSeq *seq, char *chrom, int start, int end,
 		       FILE *f)
 /* Given a window of sequence, print out a line of BED 5 with the GC counts
  * in the window. (not percents or parts per thousand)*/
 {
 static int dotMod = 0;
 DNA *dna = seq->dna;
 int i, count, gcCount, val, gapCount;
 
 if ((++dotMod&127) == 0 && !noDots)
 	{
 	verbose(1, ".");
 	if ((dotMod & 8191) == 0)
 	    verbose(1, "\n");
 	fflush(stdout);
 	}
 gapCount = count = gcCount = 0;
 for (i=0; i < seq->size; ++i)
     if ((val = ntVal[(int)dna[i]]) >= 0)
 	{
 	++count;
 	if (val == G_BASE_VAL || val == C_BASE_VAL)
 	    ++gcCount;
 	}
     else
 	++gapCount;
 
 if (doGaps)
     {
     if (gapCount < seq->size)
 	fprintf(f, "%s\t%d\t%d\t%d\t%s\n", chrom, start, end, gcCount, "gcCount");
     }
 else
     fprintf(f, "%s\t%d\t%d\t%d\t%s\n", chrom, start, end, gcCount, "gcCount");
 }
 
 
 void makeGcTabFromNib(char *nibFile, char *chrom, FILE *f, struct bed *bedRegions)
 /* Scan through nib file and write out GC percentage info. */
 {
 int start = 0, end = 0;
 int chromSize;
 FILE *nf = NULL;
 struct dnaSeq *seq = NULL;
 struct bed *bed = NULL;
     
 nibOpenVerify(nibFile, &nf, &chromSize);
 seq = nibLdPart(nibFile, nf, chromSize, 0, chromSize);
 
 if (f==NULL)
     errAbort("No output file");
 
 if ((bedRegionInName != NULL))
     for (bed=bedRegions; bed!=NULL; bed=bed->next)
 	{
 	if(sameString(bed->chrom, chrom))
 	    {
 	    struct dnaSeq *subSeq = NULL;
 	    DNA *subSeqDNA = NULL;
 	    if (bed->chromEnd > chromSize)
 		bed->chromEnd = chromSize;
 	    subSeqDNA = cloneStringZ(seq->dna + bed->chromStart, 
 				     bed->chromEnd - bed->chromStart);
 	    subSeq = newDnaSeq(subSeqDNA, bed->chromEnd - bed->chromStart, NULL);
 	    writeBedLineCountFromSeq(subSeq, chrom, bed->chromStart, bed->chromEnd, 
 				     bedRegionOutFile);
 	    freeDnaSeq(&subSeq);
 	    }
 	}
 else
     for (start=0, end=0;  start < chromSize && end < chromSize;  
 	 start = end - overlap)
 	{
 	end = start + winSize;
 	if (end > chromSize)
 	    end = chromSize;
 	makeGcLineFromSeq(seq->dna, chrom, start, end, f);
 	}
 freeDnaSeq(&seq);
 carefulClose(&nf);
 if (!noDots)
     verbose(1, "\n");
 }
 
 
 void makeGcTabFromTwoBit(char *twoBitFileName, FILE *f, struct bed *bedRegions)
 /* Scan through all sequenes in .2bit file and write out GC percentage info. */
 {
 struct slName *twoBitNames = twoBitSeqNames(twoBitFileName);
 struct slName *el = NULL;
 struct twoBitFile *tbf = twoBitOpen(twoBitFileName);
 struct bed *bed = NULL;
 
 if ((bedRegionInName != NULL))
     for (bed=bedRegions; bed!=NULL; bed=bed->next)
 	{
 	for (el = twoBitNames; el != NULL; el = el->next)
 	    {
 	    int chromSize = twoBitSeqSize(tbf, el->name);
 	    char *chrom = el->name;
 	    struct dnaSeq *seq = NULL;
 	    
 	    if (chr)
 		{
 		verbose(2, "#\tchecking name: %s =? %s\n", chrom, chr);
 		if (! sameString(chrom, chr))
 		    continue;
 		}
 	    if (! sameString(chrom, bed->chrom))
 		continue;
 	    verbose(2, "#\tProcessing twoBit sequence %s for bed item %s:%d-%d\n", chrom, bed->chrom, bed->chromStart, bed->chromEnd);
 	    if (bed->chromEnd > chromSize)
 		bed->chromEnd = chromSize;
 	    seq = twoBitReadSeqFrag(tbf, chrom, bed->chromStart, bed->chromEnd);
 	    writeBedLineCountFromSeq(seq, chrom, bed->chromStart, bed->chromEnd, 
 				     bedRegionOutFile);
 	    freeDnaSeq(&seq);
 	    }
 	}
 else
 	{
 	for (el = twoBitNames; el != NULL; el = el->next)
 	    {
 	    int start = 0, end = 0;
 	    int chromSize = twoBitSeqSize(tbf, el->name);
 	    char *chrom = el->name;
 	    struct dnaSeq *seq = NULL;
 	    
 	    if (chr)
 		{
 		verbose(2, "#\tchecking name: %s =? %s\n", chrom, chr);
 		if (! sameString(chrom, chr))
 		    continue;
 		}
 	    verbose(2, "#\tProcessing twoBit sequence %s\n", chrom);
 	    seq = twoBitReadSeqFrag(tbf, chrom, 0, chromSize);
 	    for (start=0, end=0;  start < chromSize && end < chromSize;  
 		 start = end - overlap)
 		{
 		end = start + winSize;
 		if (end > chromSize)
 		    end = chromSize;
 		makeGcLineFromSeq(seq->dna, chrom, start, end, f);
 		}
 	    freeDnaSeq(&seq);
 	    }
 	}
 if (!noDots)
     verbose(1, "\n");
 }
 
 
 void hgGcPercent(char *database, char *nibDir)
 /* hgGcPercent - Calculate GC Percentage on all chromosomes, load db table. */
 {
 char *tabFileName = file ? file : "gcPercent.bed";
 FILE *tabFile = mustOpen(tabFileName, "w");
 char twoBitFile[512];
 struct bed *bedRegionList = NULL;
 
 verbose(1, "#\tCalculating gcPercent with window size %d\n", winSize);
 verbose(2, "#\tWriting to tab file %s\n", tabFileName);
 
 if (bedRegionInName)
     {
     struct lineFile *lf = lineFileOpen(bedRegionInName, TRUE);
     struct bed *bed;
     char *row[3];
     
     while (lineFileRow(lf, row))
 	{
 	if (startsWith(row[0],"#"))
 	    continue;
 	bed = bedLoad3(row);
 	slAddHead(&bedRegionList, bed);
 	}
     lineFileClose(&lf);
     slReverse(&bedRegionList);
     }
 if (twoBitIsFile(nibDir))
     safef(twoBitFile, sizeof(twoBitFile), "%s", nibDir);
 else
     safef(twoBitFile, sizeof(twoBitFile), "%s/%s.2bit", nibDir, database);
 if (fileExists(twoBitFile))
     {
     verbose(1, "#\tUsing twoBit: %s\n", twoBitFile);
     makeGcTabFromTwoBit(twoBitFile, tabFile, bedRegionList);
     }  
 else
     {
     struct fileInfo *nibList, *nibEl;
     boolean gotNib = FALSE;
     nibList = listDirX(nibDir, "*.nib", TRUE);
     for (nibEl = nibList; nibEl != NULL; nibEl = nibEl->next)
         {
 	char dir[256], chrom[128], ext[64];
         splitPath(nibEl->name, dir, chrom, ext);
 	if (noRandom && endsWith(chrom,"random"))
 	    continue;
         if (chr)
 	    {
 	    verbose(2, "#\tchecking name: %s =? %s\n", chrom, chr);
 	    if (! sameString(chrom, chr))
 		continue;
 	    }
 	verbose(1, "#\tProcessing %s\n", nibEl->name);
 	makeGcTabFromNib(nibEl->name, chrom, tabFile, bedRegionList);
 	gotNib = TRUE;
         }
     slFreeList(&nibList);
     if (! gotNib)
 	warn("Warning: nibDir argument (%s) did not expand to any sequence "
 	     "files.", nibDir);
     }
 carefulClose(&tabFile);
 verbose(1, "#\tFile %s created\n", tabFileName);
 
 /* Load that file in database. */
 if (!noLoad)
     {
     struct sqlConnection *conn = sqlConnect(database);
     int indexLen = hGetMinIndexLength(database);
     char query[1024];
     sqlSafef(query, sizeof(query), createTable, indexLen);
     verbose(1, "#\tLoading gcPercent table\n");
     sqlRemakeTable(conn, "gcPercent", query);
     sqlUpdate(conn, NOSQLINJ "DELETE from gcPercent");
     sqlSafef(query, sizeof(query),
 	  "LOAD data local infile '%s' into table gcPercent", tabFileName);
     sqlUpdate(conn, query);
     sqlDisconnect(&conn);
     }
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, optionSpecs);
 
 if (argc <3)
     usage();
 
 dnaUtilOpen();
 winSize = optionInt("win", 20000);
 noLoad = optionExists("noLoad");
 noDots = optionExists("noDots");
 doGaps = optionExists("doGaps");
 wigOut = optionExists("wigOut");
 overlap = optionInt("overlap", 0);
 file = optionVal("file", NULL);
 chr = optionVal("chr", NULL);
 noRandom = optionExists("noRandom");
 file = optionVal("file", NULL);
 bedRegionInName = optionVal("bedRegionIn", NULL);
 bedRegionOutName = optionVal("bedRegionOut", NULL);
 
 /*	sending output to file implies no loading of DB, and also if the
  *	stated file is "stdout" we should not do ... progress reports
  */
 if (file)
     {
     noLoad = TRUE;
     if (sameString(file,"stdout")) noDots = TRUE;
     }
 
 if (bedRegionOutName)
     bedRegionOutFile = mustOpen(bedRegionOutName, "w");
 
 if ((bedRegionInName && !bedRegionOutName) || (!bedRegionInName && bedRegionOutName))
     errAbort("bedRegionIn and bedRegionOut must both be specified");
 
 if (verboseLevel() >= 2)
     {
     fprintf(stderr, "#\thgGcPercent -win=%d", winSize);
     if (file) fprintf(stderr, " -file=%s", file);
     if (noLoad) fprintf(stderr, " -noLoad");
     if (noDots) fprintf(stderr, " -noDots");
     if (doGaps) fprintf(stderr, " -doGaps");
     if (wigOut) fprintf(stderr, " -wigOut");
     if (chr) fprintf(stderr, " -chr=%s", chr);
     if (bedRegionInName) fprintf(stderr, " -bedRegionInName=%s", bedRegionInName);
     if (bedRegionOutName) fprintf(stderr, " -bedRegionOutName=%s", bedRegionOutName);
     fprintf(stderr, "\n");
     }
 
 hgGcPercent(argv[1], argv[2]);
 return 0;
 }