44ccfacbe3a3d4b300f80d48651c77837a4b571e galt Tue Apr 26 11:12:02 2022 -0700 SQL INJECTION Prevention Version 2 - this improves our methods by making subclauses of SQL that get passed around be both easy and correct to use. The way that was achieved was by getting rid of the obscure and not well used functions sqlSafefFrag and sqlDyStringPrintfFrag and replacing them with the plain versions of those functions, since these are not needed anymore. The new version checks for NOSQLINJ in unquoted %-s which is used to include SQL clauses, and will give an error the NOSQLINJ clause is not present, and this will automatically require the correct behavior by developers. sqlDyStringPrint is a very useful function, however because it was not enforced, users could use various other dyString functions and they operated without any awareness or checking for SQL correct use. Now those dyString functions are prohibited and it will produce an error if you try to use a dyString function on a SQL string, which is simply detected by the presence of the NOSQLINJ prefix. diff --git src/hg/affyGnf/affyPslAndAtlasToBed.c src/hg/affyGnf/affyPslAndAtlasToBed.c index 5adf2f2..b5748a6 100644 --- src/hg/affyGnf/affyPslAndAtlasToBed.c +++ src/hg/affyGnf/affyPslAndAtlasToBed.c @@ -1,694 +1,694 @@ /* program to fit affy data into multiple score bed format and associated expRecord file from the affy file and the pslFile */ /* Copyright (C) 2011 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "options.h" #include "affyAtlas.h" #include "psl.h" #include "hash.h" #include "bed.h" #include "dystring.h" #include "expRecord.h" #define DEBUG 0 void usage() { errAbort("affyPslAndAtlasToBed - Takes an alignment of affy probe target\n" "sequences and the human atlas results file to create an msBed file.\n" "usage:\n" " affyPslAndAtlasToBed pslFile atlasFile out.bed out.expRecord\n" "options:\n" " -shortOut Do short output format (no mapping only spots)\n" " -newType Use for newer type including mouse\n" " -chip=XXXX (default HG-U95Av2)\n"); } boolean shortOut = FALSE; boolean newType = FALSE; char *chip = "HG-U95Av2"; FILE *scores = NULL; /* output all scores here, used to generate a histogram of all results */ int missingExpsCount = 0; /* how many missing data points are there */ int noExpCount = 0; /* how many points contain no experimental data */ int missingVal = -10000; /* use this value for missing data */ static struct optionSpec options[] = { {"shortOut", OPTION_BOOLEAN}, {"newType", OPTION_BOOLEAN}, {"suffix", OPTION_STRING}, {"chip", OPTION_STRING}, {NULL, 0}, }; struct idVal /* generic structure with float val */ { struct idVal *next; /* Next in singly linked list. */ char id[33]; /* identifier */ float val; /* the value of interest */ }; int statSortable(const void *e1, const void *e2) { const struct idVal *v1 = *((struct idVal**)e1); const struct idVal *v2 = *((struct idVal**)e2); float val = v1->val - v2->val; if(val > 0) return 1; if(val < 0) return -1; if(val == 0) return 0; } void statSort(struct idVal **statList) /* sorts a stat in increasing order */ { slSort(statList,statSortable); } float statMedian(struct idVal **statList) /* finds the median of the list, will sort list to do this */ { boolean odd = slCount(*statList)%2; int half = slCount(*statList)/2; struct idVal *s=NULL, *s2=NULL; float median = -1; statSort(statList); /* if odd the median is the middle value */ if(odd) { s = slElementFromIx(*statList,half); if(s != NULL) median = s->val; } /* if even the median is the average of the two middle values */ else { s = slElementFromIx(*statList,half); s2 = slElementFromIx(*statList,half-1); if(s != NULL && s2 != NULL) median = (s->val + s2->val)/2; } return median; } void statFree(struct idVal **pEl) /* Free a single dynamically allocated stat such as created * with statLoad(). */ { struct idVal *el; if ((el = *pEl) == NULL) return; freez(pEl); } void statFreeList(struct idVal **pList) /* Free a list of dynamically allocated stat's */ { struct idVal *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; statFree(&el); } *pList = NULL; } char *parseNameFromHgc(char *string) /** parses out the name of the probe set. free returned string with freez() */ { char startPat[64]; char *toRet = NULL; char *tmp = NULL; safef(startPat, sizeof(startPat), "%s:", chip); toRet = strstr(string, startPat); if(toRet == NULL) { toRet = string; } else { toRet++; toRet = strstr(toRet, ":"); if(toRet == NULL) errAbort("Can't parse target name from %s.", string); toRet++; tmp = strstr(toRet, ";"); if(tmp != NULL) *tmp = '\0'; } toRet = cloneString(toRet); return toRet; } struct bed *createBedsFromPsls(char *pslFile, int expCount) /** creates a list of beds from a pslfile, allocates memory for arrays as determined by expCount */ { struct psl *pslList = NULL, *psl = NULL; struct bed *bedList = NULL, *bed = NULL; pslList = pslLoadAll(pslFile); for(psl = pslList; psl != NULL; psl = psl->next) { bed = bedFromPsl(psl); freez(&bed->name); bed->name=parseNameFromHgc(psl->qName); bed->score = 0; bed->expCount = 0; bed->expIds = needMem(sizeof(int)*expCount); bed->expScores = needMem(sizeof(float)*expCount); slAddHead(&bedList,bed); } slReverse(&bedList); pslFreeList(&pslList); return bedList; } struct hash *createBedHash(struct bed *bedList) /** takes a list of beds and puts them in a hash with duplicates numbered as name_1, name_2 */ { struct hash *bedHash = newHash(5); struct bed *bed = NULL; -struct dyString *ds = newDyString(1024); +struct dyString *ds = dyStringNew(1024); char *name = NULL; for(bed = bedList; bed != NULL; bed = bed->next) { int count = 0; char *targetName = NULL; struct bed *tmp = NULL; dyStringClear(ds); dyStringPrintf(ds,"%s_%d", bed->name, count); /* since we may have duplications, look for an empty slot in the hash */ while(TRUE && (count < 1000)) { tmp = hashFindVal(bedHash, ds->string); if(tmp == NULL) { hashAddUnique(bedHash, ds->string, bed); break; } else { dyStringClear(ds); dyStringPrintf(ds, "%s_%d", bed->name, ++count); } } } return bedHash; } struct expRecord *expRecordFromAffyAtlas(struct affyAtlas *aa) /** constructs a simple experiment record from information in an affyAtlas record */ { static int id = 0; struct expRecord *er = NULL; char name[256]; AllocVar(er); sprintf(name, "%s_%s", aa->annName, aa->tissue); er->id = id++; er->name = cloneString(name); er->description = cloneString(aa->tissue); er->url = cloneString("http://www.affymetrix.com/analysis/index.affx"); er->ref = cloneString("http://www.gnf.org/"); er->credit = cloneString("http://www.gnf.org/"); er->numExtras = 3; er->extras = needMem(sizeof(char*) * er->numExtras); er->extras[0] = cloneString(chip); er->extras[1] = cloneString(aa->annName); er->extras[2] = cloneString(aa->tissue); return er; } boolean differentExperiment(struct expRecord *er, struct affyAtlas *aa) /* return TRUE if this expRecord was derived from an affyAtlas record similar to this one FALSE otherwise */ { if(er == NULL && aa == NULL) return TRUE; if(er == NULL) return FALSE; if(aa == NULL) return FALSE; if(strstr(er->name, aa->annName) && strstr(er->name, aa->tissue)) return FALSE; else return TRUE; } boolean differentAA(struct affyAtlas *aa1,struct affyAtlas *aa2) /** returns TRUE if from different chips FALSE otherwise */ { return differentString(aa1->annName, aa2->annName); } int countExperiments(struct affyAtlas *aaList) /** takes an affy atlas file and counts the different experiments in it */ { struct affyAtlas *aa=NULL, *aaMark = NULL; int count =1; aaMark = aaList; for(aa = aaList; aa != NULL; aa = aa->next) { if(differentAA(aaMark, aa)) { count++; aaMark = aa; } } return count; } void addExperimentToBed(struct bed *bed, struct affyAtlas *aa, struct expRecord *er) /** add one affy atlas record to a ms bed record */ { bed->expIds[bed->expCount] = er->id; bed->score += aa->signal; bed->expScores[bed->expCount] = aa->signal; bed->expCount++; } void appendNewExperiment(struct hash *bedHash, struct affyAtlas *aa, struct expRecord *er) /** finds the correct beds from the beds hash and adds the data in the affyAtlas record to them. */ { char buff[256]; int count =0; struct bed *bed = NULL; snprintf(buff, sizeof(buff), "%s_%d", aa->probeSet, count); while(TRUE) { bed = hashFindVal(bedHash, buff); if(bed == NULL) break; else { addExperimentToBed(bed, aa, er); snprintf(buff, sizeof(buff), "%s_%d", aa->probeSet, ++count); } } } void appendExperiments(struct hash *bedHash, struct affyAtlas *aaList, struct expRecord **erList) /** loop through the altas file creating new experiments and appending them to the beds in the bed hash */ { struct affyAtlas *aa=NULL; struct expRecord *er = NULL; assert(aaList); er = expRecordFromAffyAtlas(aaList); for(aa = aaList; aa != NULL; aa = aa->next) { if(differentExperiment(er, aa)) { slAddHead(erList, er); er = expRecordFromAffyAtlas(aa); } appendNewExperiment(bedHash, aa, er); } /* add last experiment */ slAddHead(erList,er); } void checkAllBeds(struct bed **bedList, int expCount) /** check to make sure that all the beds have the same number of experiments associated with them */ { struct bed *bed = NULL; for(bed = *bedList; bed != NULL; ) { if(bed->expCount != expCount) { struct bed *tmp = NULL; if(bed->expCount != 0) { warn("Bed %s at %d has only %d exps, mark is %d", bed->name, slIxFromElement(*bedList, bed), bed->expCount, expCount); missingExpsCount++; } else noExpCount++; tmp = bed->next; slRemoveEl(bedList, bed); bed = tmp; } else bed = bed->next; } } float safeLog2(float num) /* returns log base 2 of num if num > 0, else returns 0 */ { if(num > 0) return logBase2(num); else return 0; } void convertIntensitiesToRatios(struct bed *bedList) /* for each bed calculate the median intensity not including missing data and calcute scores as ratio of each intensity to median intensity. */ { FILE *mediansOut = NULL, *valuesOut = NULL; float median = 0; struct idVal *statList = NULL, *stat = NULL; struct bed *bed = NULL; int i = 0; #ifdef DEBUG valuesOut = mustOpen("values.debug", "w"); mediansOut = mustOpen("medians.debug", "w"); #endif for(bed = bedList; bed != NULL; bed = bed->next) { float ratio = 0; float logRatio = 0; #ifdef DEBUG fprintf(mediansOut, "%s", bed->name); fprintf(valuesOut, "%s", bed->name); #endif /* find the median value by sorting */ for(i=0; i<bed->expCount; i++) { if(bed->expScores[i] != missingVal) { AllocVar(stat); stat->val = bed->expScores[i]; slAddHead(&statList, stat); } } median = statMedian(&statList); #ifdef DEBUG for(stat = statList; stat != NULL; stat = stat->next) { fprintf(valuesOut, "\t%f", stat->val); } fprintf(valuesOut, "\n"); fprintf(mediansOut, "\t%f\n", median); #endif statFreeList(&statList); for(i=0; i<bed->expCount; i++) { if(bed->expScores[i] != missingVal) { ratio = bed->expScores[i] / median; logRatio = safeLog2(ratio); bed->expScores[i] = logRatio; fprintf(scores, "%f\n", logRatio); } } } #ifdef DEBUG carefulClose(&mediansOut); carefulClose(&valuesOut); #endif } void calculateAverages(struct bed *bedList) /** divide the sum of the intensities stored in the score by number of experiments */ { struct bed *bed = NULL; for(bed = bedList; bed != NULL; bed = bed->next) { float tmp; assert(bed->expCount); tmp = bed->score / bed->expCount; if(tmp <= 0) bed->score = missingVal; else { tmp = 10 * tmp; bed->score = (int)tmp; } } } void affyPslAndAtlasToBedOld(char *pslFile, char *atlasFile, char *bedOut, char *expRecOut) /** Main function that does all the work for old-style*/ { struct hash *bedHash = NULL; struct affyAtlas *aaList=NULL, *aa=NULL; struct expRecord *erList=NULL, *er=NULL; struct bed *bedList=NULL, *bed=NULL; int expCount = 0; FILE *erOut = NULL, *bOut=NULL; warn("loading atlas file"); aaList = affyAtlasLoadAll(atlasFile); expCount = countExperiments(aaList); warn("creating list of beds from alignments"); bedList = createBedsFromPsls(pslFile, expCount); warn("creating hash from list of beds"); bedHash = createBedHash(bedList); warn("appending experiments to beds in hash"); appendExperiments(bedHash, aaList, &erList); warn("Running sanity Checks"); checkAllBeds(&bedList, expCount); warn("%d beds were missing experiments." , missingExpsCount); warn("%d beds had no experiments.", noExpCount); warn("Calculating average intensities"); convertIntensitiesToRatios(bedList); calculateAverages(bedList); warn("writing expRecords out"); erOut = mustOpen(expRecOut, "w"); for(er = erList; er != NULL; er = er->next) expRecordTabOut(er, erOut); carefulClose(&erOut); warn("writing beds out"); bOut = mustOpen(bedOut, "w"); for(bed = bedList; bed != NULL; bed = bed->next) bedTabOutN(bed, 15, bOut); carefulClose(&bOut); warn("cleaning up.."); freeHash(&bedHash); bedFreeList(&bedList); warn("Done."); } struct expCounter /* Keep track when have multiple experiments on same tissue. */ { char *name; /* Not allocated here. */ int count; /* Count of times seen. */ }; int lineToExp(char *line, char *fileName) /* Convert line to an expression record file. * Return number of expression records. */ { FILE *f = mustOpen(fileName, "w"); struct hash *hash = newHash(10); /* Integer valued hash */ char *word; int wordCount = 0; struct expCounter *ec; char *spaced; char name[128]; while ((word = nextTabWord(&line)) != NULL) { if ((ec = hashFindVal(hash, word)) == NULL) { AllocVar(ec); hashAddSaveName(hash, word, ec, &ec->name); } spaced = cloneString(word); subChar(spaced, '_', ' '); ec->count += 1; if (ec->count > 1) safef(name, sizeof(name), "%s %d", spaced, ec->count); else safef(name, sizeof(name), "%s", spaced); fprintf(f, "%d\t", wordCount); fprintf(f, "%s\t", name); fprintf(f, "%s\t", name); fprintf(f, "%s\t", "http://www.affymetrix.com/analysis/index.affx"); fprintf(f, "%s\t", "http://expression.gnf.org"); fprintf(f, "%s\t", "http://www.gnf.org"); fprintf(f, "3\t"); fprintf(f, "%s,%s,%s,\n", chip, "n/a", spaced); ++wordCount; } carefulClose(&f); return wordCount; } int findPositiveMedian(double *data, int count, double minVal) /* Find median of positive numbers in data. */ { double *sorted; int i, realCount = 0; double median = -1; AllocArray(sorted, count); for (i=0; i<count; ++i) { if (data[i] >= minVal) { sorted[realCount] = data[i]; ++realCount; } } if (realCount > 2) { median = doubleMedian(realCount, sorted); } freez(&sorted); return median; } void shortDataOut(FILE *f, char *name, int count, double *scores) /* Do short type output. */ { int i; fprintf(f, "%s\t%d\t", name, count); for (i=0; i<count; ++i) fprintf(f, "%0.3f,", scores[i]); fprintf(f, "\n"); } void affyPslAndAtlasToBedNew(char *pslFile, char *atlasFile, char *bedOut, char *expRecOut) /** Main function that does all the work for new-style*/ { struct lineFile *lf = lineFileOpen(atlasFile, TRUE); char *line, *name; int i, wordCount, expCount; char **row; double *data, median; double invMedian, ratio, logRatio; char *affyId; struct hash *hash = newHash(17); struct psl *psl; struct bed *bed; FILE *f = NULL; int dataCount = 0, pslCount = 0, bedCount = 0; int minExpVal = 20; /* Open Atlas file and use first line to create experiment table. */ if (!lineFileNextReal(lf, &line)) errAbort("%s is empty", lf->fileName); if (startsWith("Affy", line)) line += 4; if (line[0] != '\t') errAbort("%s doesn't seem to be a new format atlas file", lf->fileName); expCount = lineToExp(line+1, expRecOut); if (expCount <= 0) errAbort("No experiments in %s it seems", lf->fileName); warn("%d experiments\n", expCount); f = mustOpen(bedOut, "w"); /* Build up a hash keyed by affyID with an int array of data * for value. Do output in short case. */ AllocArray(row, expCount); while (lineFileNextReal(lf, &line)) { affyId = nextWord(&line); wordCount = chopByWhite(line, row, expCount); if (wordCount != expCount) errAbort("Expecting %d data points, got %d line %d of %s", expCount, wordCount, lf->lineIx, lf->fileName); if (hashLookup(hash, affyId)) { warn("Duplicate %s, skipping all but first.", affyId); continue; } AllocArray(data, expCount); for (i=0; i<expCount; ++i) { data[i] = atof(row[i]); if (data[i] < minExpVal) data[i] = minExpVal; } median = findPositiveMedian(data, expCount, minExpVal); if (median >= 0) { invMedian = 1.0/median; for (i=0; i<expCount; ++i) { double val = data[i]; val = safeLog2(invMedian*val); data[i] = val; } if (shortOut) shortDataOut(f, affyId, expCount, data); else hashAdd(hash, affyId, data); } data = NULL; ++dataCount; } lineFileClose(&lf); warn("%d rows of expression data\n", dataCount); /* Stream through psl file, converting it to bed with expression data. */ if (!shortOut) { lf = pslFileOpen(pslFile); while ((psl = pslNext(lf)) != NULL) { ++pslCount; /* get probe id from sequence name */ name=parseNameFromHgc(psl->qName); data = hashFindVal(hash, name); if (data != NULL) { struct bed *bed = bedFromPsl(psl); bed->expCount = expCount; AllocArray(bed->expIds, expCount); AllocArray(bed->expScores, expCount); for (i=0; i<expCount; ++i) { bed->expScores[i] = data[i]; bed->expIds[i] = i; } bedTabOutN(bed, 15, f); ++bedCount; bedFree(&bed); } pslFree(&psl); } warn("%d records in %s", pslCount, pslFile); warn("%d records written to %s", bedCount, bedOut); } lineFileClose(&lf); carefulClose(&f); } int main(int argc, char *argv[]) { optionInit(&argc, argv, options); shortOut = optionExists("shortOut"); newType = optionExists("newType"); chip = optionVal("chip", chip); if(argc != 5) usage(); scores = mustOpen("scores.tab", "w"); if (newType) affyPslAndAtlasToBedNew(argv[1], argv[2], argv[3], argv[4]); else affyPslAndAtlasToBedOld(argv[1], argv[2], argv[3], argv[4]); return 0; }