e70152e44cc66cc599ff6b699eb8adc07f3e656a
kent
  Sat May 24 21:09:34 2014 -0700
Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment.
diff --git src/hg/affyGnf/affyPslAndAtlasToBed.c src/hg/affyGnf/affyPslAndAtlasToBed.c
index 8e18594..613dfa3 100644
--- src/hg/affyGnf/affyPslAndAtlasToBed.c
+++ src/hg/affyGnf/affyPslAndAtlasToBed.c
@@ -1,691 +1,694 @@
 /* program to fit affy data into multiple score bed format and associated
    expRecord file from the affy file and the pslFile */
+
+/* Copyright (C) 2011 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "options.h"
 #include "affyAtlas.h"
 #include "psl.h"
 #include "hash.h"
 #include "bed.h"
 #include "dystring.h"
 #include "expRecord.h"
 
 
 
 #define DEBUG 0
 
 void usage() 
 {
 errAbort("affyPslAndAtlasToBed - Takes an alignment of affy probe target\n"
 	 "sequences and the human atlas results file to create an msBed file.\n"
 	 "usage:\n"
 	 "    affyPslAndAtlasToBed pslFile atlasFile out.bed out.expRecord\n"
 	 "options:\n"
 	 "    -shortOut Do short output format (no mapping only spots)\n"
 	 "    -newType Use for newer type including mouse\n"
 	 "    -chip=XXXX (default HG-U95Av2)\n");
 }
 
 boolean shortOut = FALSE;
 boolean newType = FALSE;
 char *chip = "HG-U95Av2";
 
 FILE *scores = NULL;      /* output all scores here, used to generate a histogram of all results */
 int missingExpsCount = 0; /* how many missing data points are there */
 int noExpCount = 0;       /* how many points contain no experimental data */
 int missingVal = -10000;  /* use this value for missing data */
 
 static struct optionSpec options[] = {
    {"shortOut", OPTION_BOOLEAN},
    {"newType", OPTION_BOOLEAN},
    {"suffix", OPTION_STRING},
    {"chip", OPTION_STRING},
    {NULL, 0},
 };
 
 
 struct idVal
 /* generic structure with float val */
     {
     struct idVal *next;  /* Next in singly linked list. */
     char id[33];	/* identifier */
     float val;	/* the value of interest */
     };
 
 int statSortable(const void *e1, const void *e2)
 {
 const struct idVal *v1 = *((struct idVal**)e1);
 const struct idVal *v2 = *((struct idVal**)e2);
 float val = v1->val - v2->val;
 if(val > 0) return 1;
 if(val < 0) return -1;
 if(val == 0) return 0;
 }
 
 
 void statSort(struct idVal **statList) 
 /* sorts a stat in increasing order */
 {
 slSort(statList,statSortable);
 }
 
 float statMedian(struct idVal **statList)
 /* finds the median of the list, will sort list to do this */
 {
 boolean odd = slCount(*statList)%2;
 int half = slCount(*statList)/2;
 struct idVal *s=NULL, *s2=NULL;
 float median = -1;
 statSort(statList);
 
 /* if odd the median is the middle value */
 if(odd)
     {
     s = slElementFromIx(*statList,half);
     if(s != NULL)
 	median = s->val;
     }
 /* if even the median is the average of the two middle values */
 else 
     {
     s = slElementFromIx(*statList,half);
     s2 = slElementFromIx(*statList,half-1);
     if(s != NULL && s2 != NULL)
 	median = (s->val + s2->val)/2;
     }
 return median;
 }
 
 void statFree(struct idVal **pEl)
 /* Free a single dynamically allocated stat such as created
  * with statLoad(). */
 {
 struct idVal *el;
 
 if ((el = *pEl) == NULL) return;
 freez(pEl);
 }
 
 void statFreeList(struct idVal **pList)
 /* Free a list of dynamically allocated stat's */
 {
 struct idVal *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     statFree(&el);
     }
 *pList = NULL;
 }
 
 char *parseNameFromHgc(char *string)
 /** parses out the name of the probe set. free returned string
 with freez() */
 {
 char startPat[64];
 char *toRet = NULL;
 char *tmp = NULL;
 
 safef(startPat, sizeof(startPat), "%s:", chip);
 toRet = strstr(string, startPat);
 if(toRet == NULL)
     {
     toRet = string;
     }
 else
     {
     toRet++;
     toRet = strstr(toRet, ":");
     if(toRet == NULL)
 	errAbort("Can't parse target name from %s.", string);
     toRet++;
     tmp = strstr(toRet, ";");
     if(tmp != NULL)
         *tmp = '\0';
     }
 toRet = cloneString(toRet);
 return toRet;
 }
 
 struct bed *createBedsFromPsls(char *pslFile, int expCount)
 /** creates a list of beds from a pslfile, allocates memory for
 arrays as determined by expCount */
 {
 struct psl *pslList = NULL, *psl = NULL;
 struct bed *bedList = NULL, *bed = NULL;
 pslList = pslLoadAll(pslFile);
 for(psl = pslList; psl != NULL; psl = psl->next)
     {
 
     bed = bedFromPsl(psl);
     freez(&bed->name);
     bed->name=parseNameFromHgc(psl->qName);
     bed->score = 0;
     bed->expCount = 0;
     bed->expIds = needMem(sizeof(int)*expCount);
     bed->expScores = needMem(sizeof(float)*expCount);
     slAddHead(&bedList,bed);
     }
 slReverse(&bedList);
 pslFreeList(&pslList);
 return bedList;
 }
 
 struct hash *createBedHash(struct bed *bedList)
 /** takes a list of beds and puts them in a hash with duplicates
     numbered as name_1, name_2 */
 {
 struct hash *bedHash = newHash(5);
 struct bed *bed = NULL;
 struct dyString *ds = newDyString(1024);
 char *name = NULL;
 for(bed = bedList; bed != NULL; bed = bed->next)
     {
     int count = 0;
     char *targetName = NULL;
     struct bed *tmp = NULL;
 
     dyStringClear(ds);
     dyStringPrintf(ds,"%s_%d", bed->name, count);
     /* since we may have duplications, look for an empty slot in the hash */
     while(TRUE && (count < 1000))
 	{
 	tmp = hashFindVal(bedHash, ds->string);
 	if(tmp == NULL)
 	    {
 	    hashAddUnique(bedHash, ds->string, bed);
 	    break;
 	    }
 	else 
 	    {     
 	    dyStringClear(ds);
 	    dyStringPrintf(ds, "%s_%d", bed->name, ++count);
 	    }
 	}
     }
 return bedHash;
 }
 
 
 struct expRecord *expRecordFromAffyAtlas(struct affyAtlas *aa)
 /** constructs a simple experiment record from information in an affyAtlas record */
 {
 static int id = 0;
 struct expRecord *er = NULL;
 char name[256];
 AllocVar(er);
 sprintf(name, "%s_%s", aa->annName, aa->tissue);
 er->id = id++;
 er->name = cloneString(name);
 er->description = cloneString(aa->tissue);
 er->url = cloneString("http://www.affymetrix.com/analysis/index.affx");
 er->ref = cloneString("http://www.gnf.org/");
 er->credit = cloneString("http://www.gnf.org/");
 er->numExtras = 3;
 er->extras = needMem(sizeof(char*) * er->numExtras);
 er->extras[0] = cloneString(chip);
 er->extras[1] = cloneString(aa->annName);
 er->extras[2] = cloneString(aa->tissue);
 return er;
 }
 
 boolean differentExperiment(struct expRecord *er, struct affyAtlas *aa) 
 /* return TRUE if this expRecord was derived from an affyAtlas record similar
    to this one FALSE otherwise */
 {
 if(er == NULL && aa == NULL)
     return TRUE;
 if(er == NULL)
     return FALSE;
 if(aa == NULL)
     return FALSE;
 if(strstr(er->name, aa->annName) && strstr(er->name, aa->tissue))
     return FALSE;
 else
     return TRUE;
 }
 
 boolean differentAA(struct affyAtlas *aa1,struct affyAtlas *aa2)
 /** returns TRUE if from different chips FALSE otherwise */
 {
 return differentString(aa1->annName, aa2->annName);
 }
 
 int countExperiments(struct affyAtlas *aaList)
 /** takes an affy atlas file and counts the different experiments in it */
 {
 struct affyAtlas *aa=NULL, *aaMark = NULL;
 int count =1;
 aaMark = aaList;
 for(aa = aaList; aa != NULL; aa = aa->next)
     {
     if(differentAA(aaMark, aa))
 	{
 	count++;
 	aaMark = aa;
 	}
     }
 return count;
 }
 
 void addExperimentToBed(struct bed *bed, struct affyAtlas *aa, struct expRecord *er)
 /** add one affy atlas record to a ms bed record */
 {
 bed->expIds[bed->expCount] = er->id;
 bed->score += aa->signal;
 bed->expScores[bed->expCount] = aa->signal;
 bed->expCount++;
 }
 
 void appendNewExperiment(struct hash *bedHash, struct affyAtlas *aa, struct expRecord *er)
 /** finds the correct beds from the beds hash and adds the data in the affyAtlas
     record to them. */
 {
 char buff[256];
 int count =0;
 struct bed *bed = NULL;
 snprintf(buff, sizeof(buff), "%s_%d", aa->probeSet, count);
 while(TRUE)
     {
     bed = hashFindVal(bedHash, buff);
     if(bed == NULL)
 	break;
     else
 	{
 	addExperimentToBed(bed, aa, er);
 	snprintf(buff, sizeof(buff), "%s_%d", aa->probeSet, ++count);
 	}
     }
 }
 
 void appendExperiments(struct hash *bedHash, struct affyAtlas *aaList, struct expRecord **erList)
 /** loop through the altas file creating new experiments and appending them
    to the beds in the bed hash */
 {
 struct affyAtlas *aa=NULL;
 struct expRecord *er = NULL;
 
 assert(aaList);
 er = expRecordFromAffyAtlas(aaList);
 
 for(aa = aaList; aa != NULL; aa = aa->next)
     {
     if(differentExperiment(er, aa))
 	{
 	slAddHead(erList, er);
 	er = expRecordFromAffyAtlas(aa);
 	}
     appendNewExperiment(bedHash, aa, er);
     }
 /* add last experiment */
 slAddHead(erList,er);
 }
 
 void checkAllBeds(struct bed **bedList, int expCount)
 /** check to make sure that all the beds have the same
     number of experiments associated with them */
 {
 struct bed *bed = NULL;
 for(bed = *bedList; bed != NULL; )
     {
     if(bed->expCount != expCount)
 	{
 	struct bed *tmp = NULL;
 	if(bed->expCount != 0)
 	    {
 	    warn("Bed %s at %d has only %d exps, mark is %d", bed->name, slIxFromElement(*bedList, bed), bed->expCount, expCount);
 	    missingExpsCount++;
 	    }
 	else
 	    noExpCount++;
 	tmp = bed->next;
 	slRemoveEl(bedList, bed);
 	bed = tmp;
 	}
     else
 	bed = bed->next;
     }
 }
 
 
 float safeLog2(float num)
 /* returns log base 2 of num if num > 0, else
    returns 0 */
 {
 if(num > 0)
     return logBase2(num);
 else 
     return 0;
 }
 
 void convertIntensitiesToRatios(struct bed *bedList)
 /* for each bed calculate the median intensity not including missing 
 data and calcute scores as ratio of each intensity to median 
 intensity. */
 {
 FILE *mediansOut = NULL, *valuesOut = NULL;
 float median = 0;
 struct idVal *statList = NULL, *stat = NULL;
 struct bed *bed = NULL;
 int i = 0;
 #ifdef DEBUG
 valuesOut = mustOpen("values.debug", "w");
 mediansOut = mustOpen("medians.debug", "w");
 #endif
 for(bed = bedList; bed != NULL; bed = bed->next)
     {
     float ratio = 0;
     float logRatio = 0;
 #ifdef DEBUG
     fprintf(mediansOut, "%s", bed->name);
     fprintf(valuesOut, "%s", bed->name);
 #endif
     /* find the median value by sorting */
     for(i=0; i<bed->expCount; i++)
 	{
 	if(bed->expScores[i] != missingVal)
 	    {
 	    AllocVar(stat);
 	    stat->val = bed->expScores[i];
 	    slAddHead(&statList, stat);
 
 	    }
 	}
     median = statMedian(&statList);
 #ifdef DEBUG
     for(stat = statList; stat != NULL; stat = stat->next)
 	{
 	fprintf(valuesOut, "\t%f", stat->val);
 	}
     fprintf(valuesOut, "\n");
     fprintf(mediansOut, "\t%f\n", median);
 #endif
     statFreeList(&statList);
     for(i=0; i<bed->expCount; i++)
 	{
 	if(bed->expScores[i] != missingVal)
 	    {
 	    ratio = bed->expScores[i] / median;
 	    logRatio = safeLog2(ratio);
 	    bed->expScores[i] = logRatio;
 	    fprintf(scores, "%f\n", logRatio);
 	    }
 	}
     }
 #ifdef DEBUG
 carefulClose(&mediansOut);
 carefulClose(&valuesOut);
 #endif
 }
 
 void calculateAverages(struct bed *bedList)
 /** divide the sum of the intensities stored in the score by
     number of experiments */
 {
 struct bed *bed = NULL;
 for(bed = bedList; bed != NULL; bed = bed->next)
     {
     float tmp;
     assert(bed->expCount);
     tmp =  bed->score / bed->expCount;
     if(tmp <= 0)
 	bed->score = missingVal;
     else 
 	{
 	tmp = 10 * tmp;
 	bed->score = (int)tmp;
 	}
     }
 }
 
 void affyPslAndAtlasToBedOld(char *pslFile, char *atlasFile, char *bedOut, char *expRecOut)
 /** Main function that does all the work for old-style*/
 {
 struct hash *bedHash = NULL;
 struct affyAtlas *aaList=NULL, *aa=NULL;
 struct expRecord *erList=NULL, *er=NULL;
 struct bed *bedList=NULL, *bed=NULL;
 int expCount = 0;
 FILE *erOut = NULL, *bOut=NULL;
 warn("loading atlas file");
 aaList = affyAtlasLoadAll(atlasFile);
 expCount = countExperiments(aaList);
 warn("creating list of beds from alignments");
 bedList = createBedsFromPsls(pslFile, expCount);
 warn("creating hash from list of beds");
 bedHash = createBedHash(bedList);
 warn("appending experiments to beds in hash");
 appendExperiments(bedHash, aaList, &erList);
 warn("Running sanity Checks");
 checkAllBeds(&bedList, expCount);
 warn("%d beds were missing experiments." , missingExpsCount);
 warn("%d beds had no experiments.", noExpCount);
 warn("Calculating average intensities");
 convertIntensitiesToRatios(bedList);
 calculateAverages(bedList);
 
 warn("writing expRecords out");
 erOut = mustOpen(expRecOut, "w");
 for(er = erList; er != NULL; er = er->next)
     expRecordTabOut(er, erOut);
 carefulClose(&erOut);
 
 warn("writing beds out");
 bOut = mustOpen(bedOut, "w");
 for(bed = bedList; bed != NULL; bed = bed->next)
     bedTabOutN(bed, 15, bOut);
 carefulClose(&bOut);
 
 warn("cleaning up..");
 freeHash(&bedHash);
 bedFreeList(&bedList);
 
 warn("Done.");
 }
 
 struct expCounter
 /* Keep track when have multiple experiments on same tissue. */
     {
     char *name;	/* Not allocated here. */
     int count;	/* Count of times seen. */
     };
 
 int lineToExp(char *line, char *fileName)
 /* Convert line to an expression record file. 
  * Return number of expression records. */
 {
 FILE *f = mustOpen(fileName, "w");
 struct hash *hash = newHash(10);	/* Integer valued hash */
 char *word;
 int wordCount = 0;
 struct expCounter *ec;
 char *spaced;
 char name[128];
 
 while ((word = nextTabWord(&line)) != NULL)
     {
     if ((ec = hashFindVal(hash, word)) == NULL)
         {
 	AllocVar(ec);
 	hashAddSaveName(hash, word, ec, &ec->name);
 	}
     spaced = cloneString(word);
     subChar(spaced, '_', ' ');
     ec->count += 1;
     if (ec->count > 1)
         safef(name, sizeof(name), "%s %d", spaced, ec->count);
     else
         safef(name, sizeof(name), "%s", spaced);
     fprintf(f, "%d\t", wordCount);
     fprintf(f, "%s\t", name);
     fprintf(f, "%s\t", name);
     fprintf(f, "%s\t", "http://www.affymetrix.com/analysis/index.affx");
     fprintf(f, "%s\t", "http://expression.gnf.org");
     fprintf(f, "%s\t", "http://www.gnf.org");
     fprintf(f, "3\t");
     fprintf(f, "%s,%s,%s,\n", chip, "n/a", spaced);
     ++wordCount;
     }
 carefulClose(&f);
 return wordCount;
 }
 
 int findPositiveMedian(double *data, int count, double minVal)
 /* Find median of positive numbers in data. */
 {
 double *sorted;
 int i, realCount = 0;
 double median = -1;
 AllocArray(sorted, count);
 for (i=0; i<count; ++i)
     {
     if (data[i] >= minVal)
         {
 	sorted[realCount] = data[i];
 	++realCount;
 	}
     }
 if (realCount > 2)
     {
     median = doubleMedian(realCount, sorted);
     }
 freez(&sorted);
 return median;
 }
 
 void shortDataOut(FILE *f, char *name, int count, double *scores)
 /* Do short type output. */
 {
 int i;
 fprintf(f, "%s\t%d\t", name, count);
 for (i=0; i<count; ++i)
     fprintf(f, "%0.3f,", scores[i]);
 fprintf(f, "\n");
 }
 
 void affyPslAndAtlasToBedNew(char *pslFile, char *atlasFile, char *bedOut, 
 	char *expRecOut)
 /** Main function that does all the work for new-style*/
 {
 struct lineFile *lf = lineFileOpen(atlasFile, TRUE);
 char *line, *name;
 int i, wordCount, expCount;
 char **row;
 double *data, median;
 double invMedian, ratio, logRatio;
 char *affyId;
 struct hash *hash = newHash(17);
 struct psl *psl;
 struct bed *bed;
 FILE *f = NULL;
 int dataCount = 0, pslCount = 0, bedCount = 0;
 int minExpVal = 20;
 
 /* Open Atlas file and use first line to create experiment table. */
 if (!lineFileNextReal(lf, &line))
     errAbort("%s is empty", lf->fileName);
 if (startsWith("Affy", line))
     line += 4;
 if (line[0] != '\t')
     errAbort("%s doesn't seem to be a new format atlas file", lf->fileName);
 expCount = lineToExp(line+1, expRecOut);
 if (expCount <= 0)
     errAbort("No experiments in %s it seems", lf->fileName);
 warn("%d experiments\n", expCount);
 
 f = mustOpen(bedOut, "w");
 
 /* Build up a hash keyed by affyID with an int array of data
  * for value.  Do output in short case. */
 AllocArray(row, expCount);
 while (lineFileNextReal(lf, &line))
     {
     affyId = nextWord(&line);
 
     wordCount = chopByWhite(line, row, expCount);
     if (wordCount != expCount)
         errAbort("Expecting %d data points, got %d line %d of %s", 
 		expCount, wordCount, lf->lineIx, lf->fileName);
     if (hashLookup(hash, affyId))
 	{
         warn("Duplicate %s, skipping all but first.", affyId);
 	continue;
 	}
     AllocArray(data, expCount);
     for (i=0; i<expCount; ++i)
 	{
         data[i] = atof(row[i]);
         if (data[i] < minExpVal)
 	    data[i] = minExpVal;
 	}
     median = findPositiveMedian(data, expCount, minExpVal);
     if (median >= 0)
 	{
 	invMedian = 1.0/median;
 	for (i=0; i<expCount; ++i)
 	    {
 	    double val = data[i];
 	    val = safeLog2(invMedian*val);
 	    data[i] = val;
 	    }
 	if (shortOut)
 	    shortDataOut(f, affyId, expCount, data);
 	else
 	    hashAdd(hash, affyId, data);
         }
     data = NULL;
     ++dataCount;
     }
 lineFileClose(&lf);
 warn("%d rows of expression data\n", dataCount);
 
 /* Stream through psl file, converting it to bed with expression data. */
 if (!shortOut)
     {
     lf = pslFileOpen(pslFile);
     while ((psl = pslNext(lf)) != NULL)
 	{
 	++pslCount;
         /* get probe id from sequence name */
         name=parseNameFromHgc(psl->qName);
 	data = hashFindVal(hash, name);
         if (data != NULL)
 	    {
             struct bed *bed = bedFromPsl(psl);
 	    bed->expCount = expCount;
 	    AllocArray(bed->expIds, expCount);
 	    AllocArray(bed->expScores, expCount);
 	    for (i=0; i<expCount; ++i)
 		{
 		bed->expScores[i] = data[i];
 		bed->expIds[i] = i;
 		}
 	    bedTabOutN(bed, 15, f);
 	    ++bedCount;
 
 	    bedFree(&bed);
 	    }
 	pslFree(&psl);
 	}
     warn("%d records in %s", pslCount, pslFile);
     warn("%d records written to %s", bedCount, bedOut);
     }
 lineFileClose(&lf);
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 {
 optionInit(&argc, argv, options);
 shortOut = optionExists("shortOut");
 newType = optionExists("newType");
 chip = optionVal("chip", chip);
 if(argc != 5)
     usage();
 scores = mustOpen("scores.tab", "w");
 if (newType)
     affyPslAndAtlasToBedNew(argv[1], argv[2], argv[3], argv[4]);
 else
     affyPslAndAtlasToBedOld(argv[1], argv[2], argv[3], argv[4]);
 return 0;
 }