4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/ratStuff/mafAddIRows/mafAddIRows.c src/hg/ratStuff/mafAddIRows/mafAddIRows.c
index a672ab9..3875bf7 100644
--- src/hg/ratStuff/mafAddIRows/mafAddIRows.c
+++ src/hg/ratStuff/mafAddIRows/mafAddIRows.c
@@ -1,638 +1,638 @@
 /* mafAddIRows - Filter out maf files. */
 
 /* Copyright (C) 2011 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "errAbort.h"
 #include "linefile.h"
 #include "hash.h"
 #include "chain.h"
 #include "options.h"
 #include "maf.h"
 #include "bed.h"
 #include "twoBit.h"
 #include "binRange.h"
 
 
 char *masterSpecies;
 char *masterChrom;
 struct hash *speciesHash;
 struct subSpecies *speciesList;
 struct strandHead *strandHeads;
 
 boolean addN = FALSE;
 boolean addDash = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "mafAddIRows - add 'i' rows to a maf\n"
   "usage:\n"
   "   mafAddIRows mafIn twoBitFile mafOut\n"
   "WARNING:  requires a maf with only a single target sequence\n"
   "options:\n"
   "   -nBeds=listOfBedFiles\n"
   "       reads in list of bed files, one per species, with N locations\n"
   "   -addN\n"
   "       adds rows of N's into maf blocks (rather than just annotating them)\n"
   "   -addDash\n"
   "       adds rows of -'s into maf blocks (rather than just annotating them)\n"
   );
 }
 
 static struct optionSpec options[] = {
    {"nBeds", OPTION_STRING},
    {"addN", OPTION_BOOLEAN},
    {"addDash", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 struct bedHead
 {
     struct bed *list;
 };
 
 struct blockStatus
 {
     char *chrom;
     int start, end;
     char strand;
     struct mafComp *mc;
     int masterStart, masterEnd;
 };
 
 struct subSpecies
 {
     struct subSpecies *next;
     char *name;
     struct hash *hash;
     struct blockStatus blockStatus;
 };
 
 struct linkBlock
 {
     struct linkBlock *next;
     struct cBlock cb;
     struct mafComp *mc;
 };
 
 struct strandHead
 {
     struct strandHead *next;
     char strand;
     char *name;
     char *qName;
     int qSize;
     char *species;
     struct linkBlock *links;
 };
 
 struct mafAli *readMafs(struct mafFile *mf)
 {
 struct mafAli *maf;
 char buffer[2048];
 char buffer2[2048];
 struct strandHead *strandHead;
 struct mafAli *mafList = NULL;
 char *ourChrom = NULL;
 
 while((maf = mafNext(mf)) != NULL)
     {
     struct mafComp *mc, *masterMc = maf->components;
     char *species = buffer;
     char *chrom;
 
     if (ourChrom == NULL)
 	ourChrom = masterMc->src;
     else 
 	{
 	if (differentString(masterMc->src, ourChrom))
 	    errAbort("ERROR: mafAddIrows requires maf have only one target sequence.\n"
 		"Use mafSplit -byTarget -useFullSequenceName to split maf");
 	}
 
     strcpy(species, masterMc->src);
     chrom = strchr(species,'.');
     if (chrom)
 	*chrom++ = 0;
     else
 	errAbort("reference species has no chrom name\n");
 
     if (masterSpecies == NULL)
 	{
 	masterSpecies = cloneString(species);
 	masterChrom = cloneString(chrom);
 	//printf("master %s %s\n",masterSpecies,masterChrom);
 	}
     else
 	{
 	if (!sameString(masterSpecies, species))
 	    errAbort("first species (%s) not master species (%s)\n",species,masterSpecies);
 	}
 
     for(mc= masterMc->next; mc; mc = mc->next)
 	{
 	struct linkBlock *linkBlock;
 	struct subSpecies *subSpecies = NULL;
 
 	strcpy(species, mc->src);
 	chrom = strchr(species,'.');
 	*chrom++ = 0;
 
 	if ((subSpecies = hashFindVal(speciesHash, species)) == NULL)
 	    {
 	    //printf("new species %s\n",species);
 	    AllocVar(subSpecies);
 	    subSpecies->name = cloneString(species);
 	    subSpecies->hash = newHash(6);
 	    subSpecies->blockStatus.strand = '+';
 	    subSpecies->blockStatus.masterStart = masterMc->start;
 	    slAddHead(&speciesList, subSpecies);
 	    hashAdd(speciesHash, species, subSpecies);
 	    }
 	subSpecies->blockStatus.masterEnd = masterMc->start + masterMc->size ;
 	sprintf(buffer2, "%s%c%s", masterChrom,mc->strand,chrom);
 	if ((strandHead = hashFindVal(subSpecies->hash, buffer2)) == NULL)
 	    {
 	    //printf("new strand %s for species %s\n",buffer2, species);
 	    AllocVar(strandHead);
 	    hashAdd(subSpecies->hash, buffer2, strandHead);
 	    strandHead->name = cloneString(buffer2);
 	    strandHead->species = cloneString(species);
 	    strandHead->qName = cloneString(chrom);
 	    strandHead->qSize = mc->srcSize;
 	    strandHead->strand = mc->strand;
 	    slAddHead(&strandHeads, strandHead);
 	    }
 	AllocVar(linkBlock);
 	linkBlock->mc = mc;
 	linkBlock->cb.qStart = mc->start;
 	linkBlock->cb.qEnd = mc->start + mc->size;
 	linkBlock->cb.tStart = masterMc->start;
 	linkBlock->cb.tEnd = masterMc->start + masterMc->size;
 
 
 	slAddHead(&strandHead->links, linkBlock);
 	}
     slAddHead(&mafList, maf);
     }
 slReverse(&mafList);
 
 return mafList;
 }
 
 void chainStrands(struct strandHead *strandHead, struct hash *bedHash)
 {
 for(; strandHead ; strandHead = strandHead->next)
     {
     struct linkBlock *link, *prevLink;
     struct hashEl *hel = hashLookup(bedHash, strandHead->species);
     struct hash *chromHash = (hel != NULL) ? hel->val : NULL;
     struct bedHead *bedHead = (chromHash != NULL) ? 
 	hashFindVal(chromHash, strandHead->qName): NULL;
 
     slReverse(&strandHead->links);
 
     prevLink = strandHead->links;
     prevLink->mc->leftStatus = MAF_NEW_STATUS;
     for(link = prevLink->next; link; prevLink = link, link = link->next)
 	{
 	int tDiff = link->cb.tStart - prevLink->cb.tEnd;
 	int qDiff = link->cb.qStart - prevLink->cb.qEnd;
 	int nCount = 0;
 	int nStart, nEnd;
 	struct bed *bed;
 
 	if (strandHead->strand == '+')
 	    {
 	    nStart = prevLink->cb.qEnd;
 	    nEnd = link->cb.qStart;
 	    }
 	else
 	    {
 	    nEnd = strandHead->qSize - prevLink->cb.qEnd;
 	    nStart = strandHead->qSize - link->cb.qStart;
 	    }
 
 	/* a very inefficient search for an N bed */
 	if ((nEnd - nStart > 0) && (bedHead))
 	    {
 	    for(bed = bedHead->list; bed; bed = bed->next)
 		{
 		if (bed->chromStart >= nEnd)
 		    break;
 
 		if ( bed->chromEnd > nStart)
 		    {
 		    nCount += positiveRangeIntersection(nStart, nEnd, 
 			bed->chromStart, bed->chromEnd);
 		    }
 		}
 	    }
 
 
 	if ((qDiff && (100 * nCount / qDiff > 95))
 		&& (tDiff && (100 * nCount / tDiff > 10)))
 	    {
 	    prevLink->mc->rightStatus = link->mc->leftStatus = MAF_MISSING_STATUS;
 	    prevLink->mc->rightLen = link->mc->leftLen = nCount;
 	    }
 	else if  ((tDiff > 100000) ||  
 		  (qDiff > 100000) || (qDiff < -100000))
 	    {
 	    prevLink->mc->rightStatus = link->mc->leftStatus = MAF_NEW_STATUS;
 	    prevLink->mc->rightLen = link->mc->leftLen = 0;
 	    }
 	else if  (qDiff < 0)
 	    {
 	    prevLink->mc->rightStatus = link->mc->leftStatus = MAF_TANDEM_STATUS;
 	    prevLink->mc->rightLen = link->mc->leftLen = -qDiff;
 	    }
 	else if (qDiff == 0)
 	    {
 	    prevLink->mc->rightStatus = link->mc->leftStatus = MAF_CONTIG_STATUS;
 	    prevLink->mc->rightLen = link->mc->leftLen = 0;
 	    }
 	else
 	    {
 	    prevLink->mc->rightStatus = link->mc->leftStatus = MAF_INSERT_STATUS;
 	    prevLink->mc->rightLen = link->mc->leftLen = qDiff;
 	    }
 	}
     prevLink->mc->rightStatus = MAF_NEW_STATUS;
     }
 }
 
 void bridgeSpecies(struct mafAli *mafList, struct subSpecies *subSpecies)
 {
 struct mafAli *maf;
 int pushState, leftLen;
 struct mafComp *mc = NULL;
 // struct mafComp *masterMc = NULL, *prevMc = NULL: unused
 
 for(; subSpecies; subSpecies = subSpecies->next)
     {
     //printf("bridging %s\n",subSpecies->name);
     pushState = 0;
     leftLen = 0;
     for(maf = mafList; maf ;  maf = maf->next)
 	{
 	// masterMc = maf->components;
 	// prevMc = mc;  unused
 	if ((mc = mafMayFindCompPrefix(maf, subSpecies->name,NULL)) == NULL)
 	    {
 	    continue;
 	    }
 	if (mc->leftStatus == 0) 
 	    errAbort("zero left status %s:%d\n",mc->src, mc->start);
 #ifdef NOTNOW
 	if (pushState && (mc->leftStatus == MAF_INSERT_STATUS))
 	    {
 	    if (prevMc && 
 		!((prevMc->rightStatus == mc->leftStatus) &&
 		(prevMc->rightLen == mc->leftLen)))
 		{
 		printf("boo\n");
 		mc->leftStatus = MAF_NEW_NESTED_STATUS;
 		mc->leftLen = 0;
 		}
 	    }
 #endif
 	if (mc->leftStatus == MAF_NEW_STATUS)
 	    {
 	    if (pushState)
 		{
 		//printf("bridged on %s: %s:%d-%d\n",mc->src,"chr22",masterMc->start, masterMc->start + masterMc->size);
 		mc->leftStatus = MAF_NEW_NESTED_STATUS;
 		mc->leftLen = leftLen;
 		}
 	    pushState++;
 	    }
 
 	if (mc->rightStatus == 0)
 	    {
 	    errAbort("zero right status\n");
 	    }
 	else if (isContigOrTandem(mc->rightStatus) || (mc->rightStatus == MAF_INSERT_STATUS))
 	    leftLen = mc->rightLen;
 	else if (mc->rightStatus == MAF_NEW_STATUS)
 	    {
 	    pushState--;
 	    if (pushState)
 		{
 		//printf("arched on %s: %s:%d-%d\n",mc->src,"chr22",masterMc->start, masterMc->start + masterMc->size);
 		mc->rightStatus = MAF_NEW_NESTED_STATUS;
 		mc->rightLen = leftLen;
 		}
 	    }
 	}
     //printf("pushState %d\n",pushState);
     }
 }
 
 
 void fillHoles(struct mafAli *mafList, struct subSpecies *speciesList, struct twoBitFile *twoBit)
 {
 int lastEnd = 100000000;
 struct mafAli *prevMaf = NULL, *maf, *nextMaf;
 struct subSpecies *species;
 
 /*
 for(species = speciesList; species; species = species->next)
     {
     blockStatus = &species->blockStatus;
     blockStatus->mc->rightStatus = MAF_NEW_STATUS;
     blockStatus->mc->rightLen = 0;
     }
     */
 
 for(maf = mafList; maf ; prevMaf = maf, maf = nextMaf)
     {
     struct mafComp *mc = NULL, *masterMc, *lastMc = NULL;
     struct mafAli *newMaf = NULL;
     struct blockStatus *blockStatus;
 
     nextMaf = maf->next;
 
     masterMc=maf->components;
     if (masterMc->start > lastEnd)
 	{
 	struct subSpecies *species;
 
 	for(species = speciesList; species; species = species->next)
 	    {
 	    mc = NULL;
 //	    printf("looking at %s\n",species->name);
 	    blockStatus = &species->blockStatus;
 	    if (blockStatus->mc)
 		{
 //	    printf("should match at %s\n",blockStatus->mc->src);
 		switch (blockStatus->mc->rightStatus)
 		    {
 		    case MAF_MISSING_STATUS:
 		    //printf("missing right\n");
 		    case MAF_NEW_NESTED_STATUS:
 		    case MAF_MAYBE_NEW_NESTED_STATUS:
 		    case MAF_CONTIG_STATUS:
 		    case MAF_TANDEM_STATUS:
 		    case MAF_INSERT_STATUS:
 			AllocVar(mc);
 			mc->rightStatus = mc->leftStatus = blockStatus->mc->rightStatus;
 			mc->rightLen = mc->leftLen = blockStatus->mc->rightLen;
 			mc->src = blockStatus->mc->src;
 			mc->srcSize = blockStatus->mc->srcSize;
 			mc->strand = blockStatus->mc->strand;
 			mc->start = blockStatus->mc->start + blockStatus->mc->size;
 			if (lastMc == NULL)
 			    {
 			    struct mafComp *miniMasterMc = NULL;
 			    char *seqName;
 			    struct dnaSeq *seq;
 
 			    AllocVar(miniMasterMc);
 			    miniMasterMc->next = mc;
 			    miniMasterMc->strand = '+';
 			    miniMasterMc->srcSize = masterMc->srcSize;
 			    miniMasterMc->src = masterMc->src;
 			    miniMasterMc->start = lastEnd;
 			    miniMasterMc->size =  masterMc->start - lastEnd;
 
 			    if ((seqName = strchr(miniMasterMc->src, '.')) != NULL)
 				seqName++;
 			    else 
 			    	seqName = miniMasterMc->src;
 
 //			    printf("hole filled from %d to %d\n",lastEnd, masterMc->start);
 			    seq = twoBitReadSeqFrag(twoBit, seqName, lastEnd, masterMc->start);
 			    miniMasterMc->text = seq->dna;
 
 			    AllocVar(newMaf);
 			    newMaf->textSize = maf->textSize;
 			    newMaf->components = miniMasterMc;
 			    newMaf->next = maf;
 			    if (prevMaf)
 				prevMaf->next = newMaf;
 			    else
 				mafList = newMaf;
 			    //masterMc = miniMasterMc;
 			    }
 			else
 			    {
 			    lastMc->next = mc;
 			    }
 			lastMc = mc;
 			if  (blockStatus->mc->rightStatus ==  MAF_MISSING_STATUS)
 			    {
 			    if (addN)
 				{
 				char buffer[256];
 
 				safef(buffer, sizeof(buffer), "%s.N",species->name);
 				mc->src = cloneString(buffer);
 				mc->start = 0;
 				mc->srcSize = 200000;
 				mc->size =  masterMc->start - lastEnd;
 				mc->text = needMem(mc->size + 1);
 				memset(mc->text, 'N', mc->size);
 				}
 			    }
 			else
 			    {
 			    if (addDash)
 				{
 				mc->size = masterMc->size;
 				mc->srcSize = blockStatus->mc->srcSize;
 				mc->text = needMem(mc->size + 1);
 				memset(mc->text, '-', mc->size);
 				mc->text[mc->size] = 0;
 				if (mc->size == 0)
 				    errAbort("bad dash add");
 				mc->size = 0;
 				}
 			    }
 			break;
 		    }
 		}
 	    }
 	}
     lastEnd = masterMc->start + masterMc->size;
     for(lastMc = masterMc; lastMc->next; lastMc = lastMc->next)
 	;
 
     for(species = speciesList; species; species = species->next)
 	{
 	blockStatus = &species->blockStatus;
 	
 	mc = NULL;
 	if ((blockStatus->masterStart <= masterMc->start) && 
 	    (blockStatus->masterEnd > masterMc->start) && 
 	 ((mc = mafMayFindCompPrefix(maf, species->name,NULL)) == NULL))
 	    {
 	    if (blockStatus->mc != NULL)
 		{
 		switch (blockStatus->mc->rightStatus)
 		    {
 		    case MAF_MISSING_STATUS:
 		    case MAF_CONTIG_STATUS:
 		    case MAF_TANDEM_STATUS:
 		    case MAF_INSERT_STATUS:
 		    case MAF_NEW_NESTED_STATUS:
 		    case MAF_MAYBE_NEW_NESTED_STATUS:
 			AllocVar(mc);
 			mc->rightStatus = mc->leftStatus = blockStatus->mc->rightStatus;
 			if (mc->rightStatus == MAF_NEW_NESTED_STATUS)
 			    mc->rightStatus = MAF_INSERT_STATUS;
 			if (mc->leftStatus == MAF_NEW_NESTED_STATUS)
 			    mc->leftStatus = MAF_INSERT_STATUS;
 			mc->rightLen = mc->leftLen = blockStatus->mc->rightLen;
 			mc->src = blockStatus->mc->src;
 			mc->strand = blockStatus->mc->strand;
 			mc->srcSize = blockStatus->mc->srcSize;
 			mc->start = blockStatus->mc->start + blockStatus->mc->size ;
 			lastMc->next = mc;
 			lastMc = mc;
 			if  (addN && (blockStatus->mc->rightStatus ==  MAF_MISSING_STATUS))
 			    {
 			    char buffer[256];
 
 			    safef(buffer, sizeof(buffer), "%s.N",species->name);
 			    mc->src = cloneString(buffer);
 			    mc->start = 0;
 			    mc->srcSize = 200000;
 			    mc->size = maf->textSize;
 			    mc->text = needMem(mc->size + 1);
 			    memset(mc->text, 'N', mc->size);
 			    }
 			else if (addDash)
 			    {
 				mc->size = masterMc->size;
 			    mc->text = needMem(mc->size + 1);
 			    if (mc->size == 0)
 				errAbort("bad dash add");
 			    memset(mc->text, '-', mc->size);
 			    mc->text[mc->size] = 0;
 			    mc->size = 0;
 			    }
 			    
 			break;
 		    default:
 			break;
 		    }
 		}
 	    }
 	if (mc)
 	    {
 	    blockStatus->mc = mc;
 	    }
 	}
     }
 }
 
 struct hash *readBed(char *fileName)
 /* Read bed and return it as a hash keyed by chromName */
 {
 char *row[3];
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 struct bedHead *bedHead = NULL;
 struct hash *hash = newHash(6);
 struct hashEl *hel, *lastHel = NULL;
 struct bed3 *bed;
 
 while (lineFileRow(lf, row))
     {
     hel = hashLookup(hash, row[0]);
     if ((lastHel) && (hel != lastHel))
 	{
 	assert(bedHead != NULL);
 	slReverse(&bedHead->list);
 	}
 
     if (hel == NULL)
        {
 	char *ptr;
 
 	AllocVar(bedHead);
 	if ((ptr = strchr(row[0], '.')) != NULL)
 	    ptr++;
 	else 
 	    ptr = row[0];
 	hel = hashAdd(hash, ptr, bedHead);
 	}
     bedHead = hel->val;
     AllocVar(bed);
     bed->chrom = hel->name;
     bed->chromStart = lineFileNeedNum(lf, row, 1);
     bed->chromEnd = lineFileNeedNum(lf, row, 2);
     if (bed->chromStart > bed->chromEnd)
         errAbort("start after end line %d of %s", lf->lineIx, lf->fileName);
     slAddHead(&bedHead->list, (struct bed *)bed);
     lastHel = hel;
     }
 
 if (bedHead != NULL)
     slReverse(&bedHead->list);
 lineFileClose(&lf);
 return hash;
 }
 
 void addBed(char *file, struct hash *fileHash)
 {
 char name[128];
 
 if (!endsWith(file, ".bed"))
     errAbort("filenames in bed list must end in '.bed'");
 
 splitPath(file, NULL, name, NULL);
 hashAdd(fileHash, name, readBed(file));
 }
 
 void mafAddIRows(char *mafIn, char *twoBitIn,  char *mafOut, char *nBedFile)
 /* mafAddIRows - Filter out maf files. */
 {
 FILE *f = mustOpen(mafOut, "w");
 struct twoBitFile *twoBit = twoBitOpen(twoBitIn);
 struct mafAli *mafList, *maf;
 struct mafFile *mf = mafOpen(mafIn);
 struct hash *bedHash = newHash(6); 
 
 if (nBedFile != NULL)
     {
     struct lineFile *lf = lineFileOpen(nBedFile, TRUE);
     char *row[1];
     while (lineFileRow(lf, row))
 	{
 	addBed(row[0], bedHash);
 	}
     lineFileClose(&lf);
     }
 
 speciesHash = newHash(6);
 mafList = readMafs(mf);
 mafWriteStart(f, mf->scoring);
 mafFileFree(&mf);
 
 chainStrands(strandHeads, bedHash);
 bridgeSpecies(mafList, speciesList);
 fillHoles(mafList, speciesList, twoBit);
 
 for(maf = mafList; maf ; maf = maf->next)
     mafWrite(f, maf);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 char *nBedFile;
 
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 
 nBedFile = optionVal("nBeds", NULL);
 addN = optionExists("addN");
 addDash = optionExists("addDash");
 
 mafAddIRows(argv[1], argv[2], argv[3], nBedFile);
 return 0;
 }