f952b14fca3833e9d8fef0ae8a086ad15485b552 galt Wed Jan 1 02:11:38 2025 -0800 fix various errors for utils compiled on Rocky 9 hgwdev-new. diff --git src/hg/makeDb/outside/hgGtex/hgGtex.c src/hg/makeDb/outside/hgGtex/hgGtex.c index d6c9ca5..220b0d9 100644 --- src/hg/makeDb/outside/hgGtex/hgGtex.c +++ src/hg/makeDb/outside/hgGtex/hgGtex.c @@ -1,920 +1,920 @@ /* hgGtex - Load data from NIH Common Fund Gene Tissue Expression (GTEX) portal. In the style of hgGnfMicroarray */ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "portable.h" #include "hgRelate.h" #include "gtexInfo.h" #include "gtexDonor.h" #include "gtexSample.h" #include "gtexTissue.h" #include "gtexSampleData.h" #include "gtexTissueData.h" #include "gtexTissueMedian.h" /* globals */ char *database = "hgFixed"; char *tabDir = "."; boolean doLoad = FALSE; boolean doData = FALSE; boolean doRound = FALSE; boolean median = FALSE; boolean exon = FALSE; boolean dropZeros = FALSE; char *releaseDate = NULL; int limit = 0; int dataSampleCount = 0; FILE *sampleDataFile, *tissueDataFile; FILE *tissueMedianAllFile, *tissueMedianFemaleFile, *tissueMedianMaleFile; struct hash *donorHash; #define DATA_FILE_VERSION "#1.2" void usage() /* Explain usage and exit. */ { errAbort( "hgGtex - Load GTEX data and sample files\n" "usage:\n" " hgGtex dataFile samplesFile outTissuesFile\n" " or\n" " hgGtex [options] tableRoot version dataFile samplesFile subjectsFile tissuesFile\n" "\n" "The first syntax generates a tissues file, with ids and candidate short names,\n" "intended for manual editing for clarity and conciseness.\n" "\n" "The second syntax creates tables in hgFixed:\n" " 1. All data (rootSampleData) : a row for each gene+sample, with RPKM expression level\n" " 2. Tissue data (rootTissueData): a row for each gene+tissue, with min/max/q1/q2/median\n" " 3. Median data (rootTissueMedian[All|Female|Male]): a row for each gene with a list of\n" " median RPKM expression levels by tissue. There are 3 of these\n" " 4. Sample (rootSample): a row per sample, with metadata from GTEX\n" " 5. Donor (rootDonor): a row per subject, with metadata from GTEX\n" " 6. Info: Info: version, release date, and max median score (merge this into existing\n" " file if any)\n" "\n" "options:\n" " -database=XXX (default %s)\n" " -tab=dir - Output tab-separated files to directory.\n" " -noLoad - Don't load database and don't clean up tab files\n" " -noData - Don't create data files/tables (just metadata)\n" " -doRound - Round data values\n" " -dropZeros - Ignore zero-valued data rows (not recommended)\n" " -limit=N - Only do limit rows of data table, for testing\n" " -exon - Create exon tables instead of gene tables\n" " 1. All data (rootSampleExonData)\n" " 2. Tissue data (rootTissueExonData)\n" " 3. Median data (rootTissueExonMedian)\n" " -releaseDate=YY-MM-DD - Set release date (o/w use 'now')\n" , database); } static struct optionSpec options[] = { {"database", OPTION_STRING}, {"tab", OPTION_STRING}, {"noLoad", OPTION_BOOLEAN}, {"noData", OPTION_BOOLEAN}, {"doRound", OPTION_BOOLEAN}, {"dropZeros", OPTION_BOOLEAN}, {"exon", OPTION_BOOLEAN}, {"limit", OPTION_INT}, {NULL, 0}, }; /****************************/ /* Deal with donors */ #define SUBJECT_FIRST_FIELD_LABEL "SUBJID" #define SUBJECT_NAME_FIELD 0 // GTEX-XXXX #define SUBJECT_GENDER_FIELD 1 // 1=Male (hmmf), 2=Female #define donorGetGender(x) (sqlUnsigned(x) == 1 ? "M" : "F") #define donorIsFemale(x) (sameString(x, "F")) #define SUBJECT_AGE_FIELD 2 // e.g. 60-69 years #define SUBJECT_DEATH_FIELD 3 // Hardy scale 0-4 or empty (unknown?). See .as for scale definitions. #define donorGetDeathClass(x) (isEmpty(x) ? -1 : sqlUnsigned(x)) #define SUBJECT_LAST_FIELD SUBJECT_DEATH_FIELD int donorGetAge(char *age) /* Change '60-69 yrs' to numeric 60 */ { char *pos; char *ageBuf = cloneString(age); pos = stringIn("-", ageBuf); if (pos == NULL) return 0; *pos = '\0'; return sqlUnsigned(ageBuf); } char *donorFromSampleId(char *sampleId) /* Parse donorId from sampleId */ /* donor is first 2 components of sampleId: GTEX-XXXX */ { char *donor = cloneString(sampleId); *strchr(strchr(donor, '-')+1, '-') = 0; return donor; } boolean sampleIsFemale(char *sampleId) /* Return TRUE if sample is from a female donor */ { char *donorId = donorFromSampleId(sampleId); struct gtexDonor *donor = hashMustFindVal(donorHash, donorId); return donorIsFemale(donor->gender); } struct gtexDonor *parseSubjectFile(struct lineFile *lf) { char *line; if (!lineFileNext(lf, &line, NULL)) errAbort("%s is empty", lf->fileName); if (!startsWith(SUBJECT_FIRST_FIELD_LABEL, line)) errAbort("unrecognized format - expecting subject file header in %s first line", lf->fileName); char *words[100]; int wordCount; struct gtexDonor *donor=NULL, *donors = NULL; while (lineFileNext(lf, &line, NULL)) { /* Convert line to donor record */ wordCount = chopTabs(line, words); lineFileExpectWords(lf, SUBJECT_LAST_FIELD+1, wordCount); AllocVar(donor); char *subject = cloneString(words[SUBJECT_NAME_FIELD]); char *gender = cloneString(words[SUBJECT_GENDER_FIELD]); char *age = cloneString(words[SUBJECT_AGE_FIELD]); char *deathClass = cloneString(words[SUBJECT_DEATH_FIELD]); verbose(3, "subject: %s %s %s %s\n", subject, gender, age, deathClass); donor->name = subject; donor->gender = donorGetGender(gender); donor->age = donorGetAge(age); donor->deathClass = donorGetDeathClass(deathClass); slAddTail(&donors, donor); //slAddHead(&donors, donor); //slReverse(&donors); } verbose(2, "Found %d donors\n", slCount(donors)); return(donors); } /****************************/ /* Process sample file */ #define SAMPLE_FIRST_FIELD_LABEL "SAMPID" #define SAMPLE_TISSUE_FIELD_LABEL "SMTSD" // NOTE: more robust to include map of GTEX field names (do this if we include RNA-seqC metrics) #define SAMPLE_NAME_FIELD_INDEX 0 #define SAMPLE_AUTOLYSIS_FIELD_INDEX 1 #define SAMPLE_CENTERS_FIELD_INDEX 2 #define SAMPLE_PATHOLOGY_FIELD_INDEX 3 #define SAMPLE_RIN_FIELD_INDEX 4 #define SAMPLE_ORGAN_FIELD_INDEX 5 #define SAMPLE_TISSUE_FIELD_INDEX 6 #define SAMPLE_ISCHEMIC_FIELD_INDEX 7 #define V6 /* Sample file changed format between V4 and V6 */ #ifdef V6 #define SAMPLE_BATCH_FIELD_INDEX 10 #define SAMPLE_ISOLATION_FIELD_INDEX 11 #define SAMPLE_DATE_FIELD_INDEX 12 #else #define SAMPLE_BATCH_FIELD_INDEX 8 #define SAMPLE_ISOLATION_FIELD_INDEX 9 #define SAMPLE_DATE_FIELD_INDEX 10 #endif int parseSampleFileHeader(struct lineFile *lf) /* Parse GTEX sample file header. Return number of columns */ /* TODO: return column headers in array */ { char *line; if (!lineFileNext(lf, &line, NULL)) errAbort("%s is empty", lf->fileName); if (!startsWith(SAMPLE_FIRST_FIELD_LABEL, line)) errAbort("unrecognized format - expecting sample file header in %s first line", lf->fileName); char *words[100]; int sampleCols = chopTabs(line, words); if (sampleCols < SAMPLE_TISSUE_FIELD_INDEX+1 || differentString(words[SAMPLE_TISSUE_FIELD_INDEX], SAMPLE_TISSUE_FIELD_LABEL)) errAbort("unrecognized format - expecting sample file header in %s first line", lf->fileName); return sampleCols; } struct sampleTissue { struct sampleTissue *next; char *name; char *tissue; char *organ; }; struct hash *parseSampleTissues(struct lineFile *lf, int expectedCols) /* Parse sample descriptions. Return hash of samples with tissue info */ { char *line; int wordCount; char *words[100]; struct sampleTissue *sample; struct hash *sampleHash = hashNew(0); while (lineFileNext(lf, &line, NULL)) { /* Convert line to sample record */ wordCount = chopTabs(line, words); lineFileExpectWords(lf, expectedCols, wordCount); AllocVar(sample); sample->name = cloneString(words[SAMPLE_NAME_FIELD_INDEX]); sample->organ = cloneString(words[SAMPLE_ORGAN_FIELD_INDEX]); // Handle missing tissue and organ if (!*sample->organ) sample->organ = "Unannotated"; sample->tissue = cloneString(words[SAMPLE_TISSUE_FIELD_INDEX]); if (!*sample->tissue) sample->tissue = "Unannotated"; hashAdd(sampleHash, sample->name, sample); } verbose(2, "Found %d samples in sample file\n", hashNumEntries(sampleHash)); return sampleHash; } struct hash *parseSamples(struct lineFile *lf, struct slName *sampleIds, int expectedCols, struct hash *tissueNameHash) /* Parse sample descriptions and populate sample objects using tissue name from hash. Limit to samples present in data file (in sampleId list). Return hash keyed on sample name */ { char *line; int wordCount; char *words[100]; struct hash *hash = hashNew(0); struct gtexSample *sample = NULL; struct hash *sampleNameHash = hashNew(0); struct slName *sampleId; for (sampleId = sampleIds; sampleId != NULL; sampleId = sampleId->next) { hashAdd(sampleNameHash, sampleId->name, NULL); } int i = 0; while (lineFileNext(lf, &line, NULL)) { /* Convert line to sample record */ wordCount = chopTabs(line, words); lineFileExpectWords(lf, expectedCols, wordCount); i++; char *sampleId = cloneString(words[SAMPLE_NAME_FIELD_INDEX]); if (!hashLookup(sampleNameHash, sampleId)) continue; AllocVar(sample); sample->sampleId = sampleId; sample->donor = donorFromSampleId(sampleId); verbose(4, "parseSamples: lookup %s in tissueNameHash\n", words[SAMPLE_TISSUE_FIELD_INDEX]); sample->tissue = hashMustFindVal(tissueNameHash, words[SAMPLE_TISSUE_FIELD_INDEX]); verbose(4, "autolysis=%s, ischemic=%s, rin=%s, pathNotes=%s, sites=%s, batch=%s, isolation=%s, date=%s\n", words[SAMPLE_AUTOLYSIS_FIELD_INDEX], words[SAMPLE_ISCHEMIC_FIELD_INDEX], words[SAMPLE_RIN_FIELD_INDEX], words[SAMPLE_PATHOLOGY_FIELD_INDEX], words[SAMPLE_CENTERS_FIELD_INDEX], words[SAMPLE_BATCH_FIELD_INDEX], words[SAMPLE_ISOLATION_FIELD_INDEX], words[SAMPLE_DATE_FIELD_INDEX]); char *word = words[SAMPLE_AUTOLYSIS_FIELD_INDEX]; sample->autolysisScore = (isNotEmpty(word) ? sqlSigned(word) : -1); //word = words[SAMPLE_ISCHEMIC_FIELD_INDEX]; //sample->ischemicTime = (word ? cloneString(word) : "unknown"); #ifdef V6 // Feb 2016: this field is not in posted V6 sample file. Broad has been notified. sample->ischemicTime = "n/a"; #else sample->ischemicTime = cloneString(words[SAMPLE_ISCHEMIC_FIELD_INDEX]); #endif word = words[SAMPLE_RIN_FIELD_INDEX]; sample->rin = (isNotEmpty(word) ? sqlFloat(word) : 0); //word = words[SAMPLE_PATHOLOGY_FIELD_INDEX]; sample->pathNotes = cloneString(words[SAMPLE_PATHOLOGY_FIELD_INDEX]); // Sites may be comma-sep list with embedded spaces. Strip the spaces word = cloneString(words[SAMPLE_CENTERS_FIELD_INDEX]); stripChar(word, ' '); sample->collectionSites = word; // These are always populated sample->batchId = cloneString(words[SAMPLE_BATCH_FIELD_INDEX]); // Another field with embedded spaces -- strip them out word = cloneString(words[SAMPLE_ISOLATION_FIELD_INDEX]); subChar(word, ' ', '_'); sample->isolationType = word; sample->isolationDate = cloneString(words[SAMPLE_DATE_FIELD_INDEX]); verbose(4, "Adding sample: \'%s'\n", sampleId); hashAdd(hash, sampleId, sample); } verbose(2, "Found %d data samples out of %d in sample file\n", hashNumEntries(hash), i); return hash; } struct sampleOffset { struct sampleOffset *next; char *sample; unsigned int offset; }; struct hash *groupSamplesByTissue(struct hash *sampleHash, struct slName *sampleIds, int sampleCount) /* Group samples by tissue for median option */ { struct hash *tissueOffsetHash = hashNew(0); struct sampleOffset *offset; struct hashEl *el; struct gtexSample *sample; int i; struct slName *sampleId; for (i=0, sampleId = sampleIds; sampleId != NULL; sampleId = sampleId->next, i++) { verbose(4, "groupSamplesByTissue: lookup %s in sampleHash\n", sampleId->name); sample = hashMustFindVal(sampleHash, sampleId->name); AllocVar(offset); offset->offset = i; offset->sample = cloneString(sampleId->name); el = hashLookup(tissueOffsetHash, sample->tissue); if (el) slAddHead((struct sampleOffset *)el->val, offset); else hashAdd(tissueOffsetHash, sample->tissue, offset); } //#define DEBUG 1 #ifdef DEBUG uglyf("tissue count: %d\n", slCount(tissueOffsets)); for (el = tissueOffsets; el != NULL; el = el->next) { uglyf("%s\t", el->name); for (offset = (struct slUnsigned *)el->val; offset->next; offset = offset->next) uglyf("%d,", offset->val); uglyf("\n"); } #endif return tissueOffsetHash; } /****************************/ /* Process data file */ #define DATA_GENE_COUNT_FIELD_INDEX 0 #define DATA_SAMPLE_COUNT_FIELD_INDEX 1 struct slName *parseDataFileHeader(struct lineFile *lf, int sampleCount, int *dataSampleCountRet) /* Parse version, info, and header lines. Return array of sample Ids in order from header */ { char *line; if (!lineFileNext(lf, &line, NULL)) errAbort("%s is empty", lf->fileName); if (!startsWith(DATA_FILE_VERSION, line)) errAbort("unrecognized format - expecting %s in %s first line", DATA_FILE_VERSION, lf->fileName); if (!lineFileNext(lf, &line, NULL)) errAbort("%s is truncated", lf->fileName); /* Parse #genes #samples */ char *words[100]; int wordCount = chopLine(line, words); if (wordCount != 2) errAbort("%s is truncated: expecting <#genes> <#samples>", lf->fileName); int geneCount = sqlUnsigned(words[DATA_GENE_COUNT_FIELD_INDEX]); int headerSampleCount = sqlUnsigned(words[DATA_SAMPLE_COUNT_FIELD_INDEX]); if (headerSampleCount > sampleCount) errAbort("data file has more samples than sample file"); verbose(2, "GTEX data file: %d genes, %d samples\n", geneCount, headerSampleCount); /* Parse header line containing sample names */ if (!lineFileNext(lf, &line, NULL)) errAbort("%s is truncated", lf->fileName); if (!startsWith("Name\tDescription", line)) errAbort("%s unrecognized format", lf->fileName); char *sampleIds[sampleCount+3]; dataSampleCount = chopTabs(line, sampleIds) - 2; if (headerSampleCount != dataSampleCount) warn("Sample count mismatch in data file: header=%d, columns=%d\n", headerSampleCount, dataSampleCount); verbose(3, "dataSampleCount=%d\n", dataSampleCount); if (dataSampleCountRet) *dataSampleCountRet = dataSampleCount; char **samples = &sampleIds[2]; struct slName *idList = slNameListFromStringArray(samples, sampleCount+3); return idList; } struct slName *parseExonDataFileHeader(struct lineFile *lf, int sampleCount, int *dataSampleCountRet) /* Parse header line. Return array of sample Ids in order from header */ { char *line; if (!lineFileNext(lf, &line, NULL)) errAbort("%s is truncated", lf->fileName); if (!startsWith("Id", line)) errAbort("%s unrecognized format", lf->fileName); char *sampleIds[sampleCount+2]; int dataSampleCount = chopTabs(line, sampleIds) - 1; verbose(3, "dataSampleCount=%d\n", dataSampleCount); if (dataSampleCountRet) *dataSampleCountRet = dataSampleCount; char **samples = &sampleIds[1]; // skip over Id column struct slName *idList = slNameListFromStringArray(samples, dataSampleCount+2); return idList; } void dataRowsOut(char **row, int tissueCount, char *tissueOrder[], struct hash *tissueOffsets, double *maxScoreRet, double *maxMedianRet) /* Output expression levels per sample and tissue for one gene. Return max score, median computed */ { int i=0, j=0; struct sampleOffset *sampleOffset, *sampleOffsets; double *sampleVals, *femaleSampleVals, *maleSampleVals; double maxMedian = 0; double maxScore = 0; /* Print geneId and tissue count to median table files */ char *gene = row[0]; fprintf(tissueMedianAllFile, "%s\t%d\t", gene, tissueCount); fprintf(tissueMedianFemaleFile, "%s\t%d\t", gene, tissueCount); fprintf(tissueMedianMaleFile, "%s\t%d\t", gene, tissueCount); verbose(3, "%s\n", gene); for (i=0; i<tissueCount; i++) { char *tissue = tissueOrder[i]; /* Get values for all samples for each tissue */ sampleOffsets = (struct sampleOffset *)hashMustFindVal(tissueOffsets, tissue); int tissueSampleCount = slCount(sampleOffsets); verbose(3, "%s\t%s\t%d samples\t", gene, tissue, tissueSampleCount); verbose(3, "\n"); AllocArray(sampleVals, tissueSampleCount); AllocArray(femaleSampleVals, tissueSampleCount); AllocArray(maleSampleVals, tissueSampleCount); int mj =0, fj = 0; for (j = 0, sampleOffset = sampleOffsets; j < tissueSampleCount; sampleOffset = sampleOffset->next, j++) { // skip over Name and Description fields to find first score for this gene // WARNING: row parsing should be handled in parse routines int skip = (exon ? 1 : 2); double val = sqlDouble(row[(sampleOffset->offset) + skip]); if (dropZeros && val == 0.0) continue; // Output to sample data file // TODO: use gtexSampleDataOut verbose(3, " %s\t%s\t%s\t%0.3f\n", gene, sampleOffset->sample, tissue, val); fprintf(sampleDataFile, "%s\t%s\t%s\t", gene, sampleOffset->sample, tissue); if (doRound) fprintf(sampleDataFile, "%d", round(val)); else fprintf(sampleDataFile, "%0.3f", val); fprintf(sampleDataFile, "\n"); sampleVals[j] = val; maxScore = max(val, maxScore); // create gender subsets if (sampleIsFemale(sampleOffset->sample)) femaleSampleVals[fj++] = val; else maleSampleVals[mj++] = val; } /* Compute stats for all samples */ double min, q1, median, q3, max; doubleBoxWhiskerCalc(j, sampleVals, &min, &q1, &median, &q3, &max); //medianVal = (float)doubleMedian(tissueSampleCount, sampleVals); maxMedian = max(median, maxMedian); verbose(3, "median %s %0.3f\n", tissue, median); /* If no rounding, then print as float, otherwise round */ if (doRound) fprintf(tissueMedianAllFile, "%d,", round(median)); else fprintf(tissueMedianAllFile, "%0.3f,", median); /* Compute stats for gender subsets */ median = 0.0; if (fj) doubleBoxWhiskerCalc(fj, femaleSampleVals, &min, &q1, &median, &q3, &max); if (doRound) fprintf(tissueMedianFemaleFile, "%d,", round(median)); else fprintf(tissueMedianFemaleFile, "%0.3f,", median); median = 0.0; if (mj) doubleBoxWhiskerCalc(mj, maleSampleVals, &min, &q1, &median, &q3, &max); if (doRound) fprintf(tissueMedianMaleFile, "%d,", round(median)); else fprintf(tissueMedianMaleFile, "%0.3f,", median); // calculate other stats // print row in tissue data file fprintf(tissueDataFile, "%s\t%s\t%0.3f\t%0.3f\t%0.3f\t%0.3f\t%0.3f\n", gene, tissue, min, q1, median, q3, max); freez(&sampleVals); } fprintf(tissueMedianAllFile, "\n"); fprintf(tissueMedianFemaleFile, "\n"); fprintf(tissueMedianMaleFile, "\n"); verbose(3, "max median: %0.3f\n", maxMedian); if (maxScoreRet) *maxScoreRet = maxScore; if (maxMedianRet) *maxMedianRet = maxMedian; } /****************************/ /* Deal with tissues */ char *makeTissueName(char *description) /* Create a single word camel-case name from a tissue description */ { char *words[10]; int count = chopByWhite(cloneString(description), words, sizeof(words)); struct dyString *ds = dyStringNew(0); int i; for (i=0; i<count; i++) { char *word = words[i]; if (!isalpha(word[0]) || !isalpha(word[strlen(word)-1])) continue; dyStringAppend(ds, word); } char *newName = dyStringCannibalize(&ds); newName[0] = tolower(newName[0]); return newName; } void tissuesOut(char *outFile, struct hash *tissueSampleHash) /* Write tissues file */ { struct hashEl *helList = hashElListHash(tissueSampleHash); slSort(&helList, hashElCmp); FILE *f = mustOpen(outFile, "w"); struct hashEl *el; struct gtexTissue *tissue; int i; for (i=0, el = helList; el != NULL; el = el->next, i++) { struct sampleTissue *sample = (struct sampleTissue *)el->val; AllocVar(tissue); tissue->id = i; tissue->description = sample->tissue; tissue->organ = sample->organ; tissue->name = makeTissueName(tissue->description); tissue->color = 0; // TODO: get GTEX colors from file gtexTissueOutput(tissue, f, '\t', '\n'); } carefulClose(&f); } /****************************/ /* Main functions */ void hgGtexTissues(char *dataFile, char *sampleFile, char *outFile) /* Create a tissues file with aliases for use in other tables. Limit to tissues * in the data file, since these will be indexes into comma-sep data values. * This is a separate step to support manual editing of aliases for clarity & conciseness */ { struct lineFile *lf; int i; /* Parse tissue info from samples file */ lf = lineFileOpen(sampleFile, TRUE); int sampleCols = parseSampleFileHeader(lf); struct hash *sampleHash = parseSampleTissues(lf, sampleCols); verbose(2, "%d samples in samples file\n", hashNumEntries(sampleHash)); lineFileClose(&lf); /* Get sample IDs from header in data file */ lf = lineFileOpen(dataFile, TRUE); int dataSampleCount = 0; struct slName *sampleIds = NULL; sampleIds = parseDataFileHeader(lf, hashNumEntries(sampleHash), &dataSampleCount); verbose(2, "%d samples in data file\n", dataSampleCount); lineFileClose(&lf); /* Gather tissues from samples */ struct hash *tissueSampleHash = hashNew(0); struct sampleTissue *sample; struct slName *sampleId; for (i=0, sampleId = sampleIds; sampleId != NULL; sampleId = sampleId->next, i++) { verbose(4, "hgGtexTissues: lookup %s in sampleHash\n", sampleId->name); sample = hashMustFindVal(sampleHash, sampleId->name); if (!hashLookup(tissueSampleHash, sample->tissue)) hashAdd(tissueSampleHash, sample->tissue, sample); } verbose(3, "%d tissues\n", hashNumEntries(tissueSampleHash)); /* Write tissues file */ tissuesOut(outFile, tissueSampleHash); } void hgGtex(char *tableRoot, char *version, char *dataFile, char *sampleFile, char *subjectFile, char *tissuesFile) /* Main function to load tables*/ { char *line; int wordCount; FILE *f = NULL; FILE *infoFile = NULL; int dataCount = 0; struct lineFile *lf; int dataSampleCount = 0; struct hash *sampleHash; /* Load tissue file as we will use short tissue names, not long descriptions as in sample file */ struct gtexTissue *tissue, *tissues; tissues = gtexTissueLoadAllByChar(tissuesFile, '\t'); struct hash *tissueNameHash = hashNew(0); donorHash = hashNew(0); char **tissueOrder = NULL; AllocArray(tissueOrder, slCount(tissues)); for (tissue = tissues; tissue != NULL; tissue = tissue->next) { verbose(4, "Adding to tissueNameHash: id=%d, key=%s, val=%s, group=%s\n", tissue->id, tissue->description, tissue->name, tissue->organ); hashAdd(tissueNameHash, tissue->description, tissue->name); tissueOrder[tissue->id] = tissue->name; } verbose(3, "tissues in hash: %d\n", hashNumEntries(tissueNameHash)); /* Count samples in sample file */ lf = lineFileOpen(sampleFile, TRUE); int sampleCols = parseSampleFileHeader(lf); sampleHash = parseSampleTissues(lf, sampleCols); int sampleCount = hashNumEntries(sampleHash); verbose(2, "%d samples in samples file\n", sampleCount); lineFileClose(&lf); /* Open GTEX expression data file, and read header lines. Return list of sample IDs */ lf = lineFileOpen(dataFile, TRUE); struct slName *sampleIds; if (exon) sampleIds = parseExonDataFileHeader(lf, hashNumEntries(sampleHash), &dataSampleCount); else sampleIds = parseDataFileHeader(lf, hashNumEntries(sampleHash), &dataSampleCount); verbose(3, "%d samples in data file\n", dataSampleCount); lineFileClose(&lf); /* Parse sample file, creating sample objects for all samples in data file */ lf = lineFileOpen(sampleFile, TRUE); parseSampleFileHeader(lf); sampleHash = parseSamples(lf, sampleIds, sampleCols, tissueNameHash); lineFileClose(&lf); /* Get offsets in data file for samples by tissue */ struct hash *tissueOffsets = groupSamplesByTissue(sampleHash, sampleIds, dataSampleCount); int tissueCount = hashNumEntries(tissueOffsets); verbose(2, "tissue count: %d\n", tissueCount); if (!exon) { /* Create sample table with samples ordered as in data file */ char sampleTable[64]; safef(sampleTable, sizeof(sampleTable), "%sSample", tableRoot); f = hgCreateTabFile(tabDir, sampleTable); struct gtexSample *sample; struct slName *sampleId; for (sampleId = sampleIds; sampleId != NULL; sampleId = sampleId->next) { verbose(4, "hgGtex: lookup %s in sampleHash\n", sampleId->name); sample = hashMustFindVal(sampleHash, sampleId->name); gtexSampleOutput(sample, f, '\t', '\n'); } /* Load subjects (donors) file and write to table file */ struct gtexDonor *donor, *donors; lf = lineFileOpen(subjectFile, TRUE); donors = parseSubjectFile(lf); verbose(2, "%d donors in subjects file\n", slCount(donors)); lineFileClose(&lf); char donorTable[64]; safef(donorTable, sizeof(donorTable), "%sDonor", tableRoot); FILE *donorFile = hgCreateTabFile(tabDir, donorTable); for (donor = donors; donor != NULL; donor = donor->next) { gtexDonorOutput(donor, donorFile, '\t', '\n'); hashAdd(donorHash, donor->name, donor); } if (doLoad) { struct sqlConnection *conn = sqlConnect(database); /* Load sample table */ verbose(2, "Creating sample table\n"); gtexSampleCreateTable(conn, sampleTable); verbose(3, "Load table %s from %s/%s\n", sampleTable, tabDir, sampleTable); hgLoadTabFile(conn, tabDir, sampleTable, &f); hgRemoveTabFile(tabDir, sampleTable); /* Load tissue table */ char tissueTable[64]; verbose(2, "Creating tissue table\n"); safef(tissueTable, sizeof(tissueTable), "%sTissue", tableRoot); gtexTissueCreateTable(conn, tissueTable); - char dir[128]; - char fileName[64]; + char dir[PATH_LEN]; + char fileName[FILENAME_LEN]; splitPath(tissuesFile, dir, fileName, NULL); if (dir[0] == 0) dir[0]= '.'; verbose(3, "Load table %s from %s/%s\n", tissueTable, dir, fileName); hgLoadNamedTabFile(conn, dir, tissueTable, fileName, NULL); /* Load donor table */ verbose(2, "Creating donor table\n"); gtexDonorCreateTable(conn, donorTable); verbose(3, "Load table %s from %s/%s\n", donorTable, dir, donorTable); hgLoadTabFile(conn, tabDir, donorTable, &donorFile); hgRemoveTabFile(tabDir, donorTable); sqlDisconnect(&conn); } else { carefulClose(&f); carefulClose(&donorFile); } } if (!doData) return; /* Ready to process data items */ /* Create sample data file */ char sampleDataTable[64]; safef(sampleDataTable, sizeof(sampleDataTable), "%s%sSampleData", tableRoot, exon ? "Exon": ""); sampleDataFile = hgCreateTabFile(tabDir,sampleDataTable); /* Create tissue median files */ char tissueMedianAllTable[64], tissueMedianFemaleTable[64], tissueMedianMaleTable[64]; safef(tissueMedianAllTable, sizeof(tissueMedianAllTable), "%s%sTissueMedianAll", tableRoot, exon ? "Exon": ""); tissueMedianAllFile = hgCreateTabFile(tabDir,tissueMedianAllTable); safef(tissueMedianFemaleTable, sizeof(tissueMedianFemaleTable), "%s%sTissueMedianFemale", tableRoot, exon ? "Exon": ""); tissueMedianFemaleFile= hgCreateTabFile(tabDir,tissueMedianFemaleTable); safef(tissueMedianMaleTable, sizeof(tissueMedianMaleTable), "%s%sTissueMedianMale", tableRoot, exon ? "Exon": ""); tissueMedianMaleFile = hgCreateTabFile(tabDir,tissueMedianMaleTable); /* Create tissue summary data table */ char tissueDataTable[64]; safef(tissueDataTable, sizeof(tissueDataTable), "%s%sTissueData", tableRoot, exon ? "Exon": ""); tissueDataFile = hgCreateTabFile(tabDir,tissueDataTable); /* Open GTEX expression data file, and read header lines. Return list of sample IDs */ lf = lineFileOpen(dataFile, TRUE); if (exon) parseExonDataFileHeader(lf, sampleCount, NULL); else parseDataFileHeader(lf, sampleCount, NULL); /* Parse expression values in data file. Each row is a gene (or exon)*/ char **row; AllocArray(row, dataSampleCount+2); double maxMedian = 0, maxScore = 0; double rowMaxMedian, rowMaxScore; while (lineFileNext(lf, &line, NULL)) { // WARNING: header parsing should be managed in one place wordCount = chopByChar(line, '\t', row, dataSampleCount+2); int expected = wordCount - (exon ? 1 : 2); if (expected != dataSampleCount) errAbort("Expecting %d data points, got %d line %d of %s", dataSampleCount, wordCount-2, lf->lineIx, lf->fileName); dataRowsOut(row, tissueCount, tissueOrder, tissueOffsets, &rowMaxMedian, &rowMaxScore); maxMedian = max(rowMaxMedian, maxMedian); maxScore = max(rowMaxScore, maxScore); dataCount++; if (limit != 0 && dataCount >= limit) break; } lineFileClose(&lf); /* Create info file */ char infoTable[64]; safef(infoTable, sizeof(infoTable), "%s%sInfo", tableRoot, exon ? "Exon": ""); infoFile = hgCreateTabFile(tabDir,infoTable); struct gtexInfo *info; AllocVar(info); info->version = version; info->releaseDate = releaseDate; info->maxMedianScore = maxMedian; gtexInfoOutput(info, infoFile, '\t', '\n'); if (doLoad) { struct sqlConnection *conn = sqlConnect(database); // Load info table verbose(2, "Creating info table\n"); gtexInfoCreateTable(conn, infoTable); hgLoadTabFile(conn, tabDir, infoTable, &infoFile); hgRemoveTabFile(tabDir, infoTable); // Load tissue median tables verbose(2, "Creating all tissue medians table\n"); gtexTissueMedianCreateTable(conn, tissueMedianAllTable); hgLoadTabFile(conn, tabDir, tissueMedianAllTable, &tissueMedianAllFile); hgRemoveTabFile(tabDir, tissueMedianAllTable); verbose(2, "Creating female tissue medians table\n"); gtexTissueMedianCreateTable(conn, tissueMedianFemaleTable); hgLoadTabFile(conn, tabDir, tissueMedianFemaleTable, &tissueMedianFemaleFile); hgRemoveTabFile(tabDir, tissueMedianFemaleTable); // Load tissue median tables verbose(2, "Creating male tissue medians table\n"); gtexTissueMedianCreateTable(conn, tissueMedianMaleTable); hgLoadTabFile(conn, tabDir, tissueMedianMaleTable, &tissueMedianMaleFile); hgRemoveTabFile(tabDir, tissueMedianMaleTable); // Load tissue data table #ifdef FAST_STATS // Finish implementation of this if we want to add mean+whiskers to hgTracks verbose(2, "Creating tissue data table\n"); gtexTissueDataCreateTable(conn, tissueDataTable); hgLoadTabFile(conn, tabDir, tissueDataTable, &tissueDataFile); #endif // Load sample data table verbose(2, "Creating sample data table\n"); gtexSampleDataCreateTable(conn, sampleDataTable); hgLoadTabFile(conn, tabDir, sampleDataTable, &sampleDataFile); hgRemoveTabFile(tabDir, sampleDataTable); sqlDisconnect(&conn); } else { carefulClose(&sampleDataFile); carefulClose(&tissueDataFile); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); database = optionVal("database", database); doLoad = !optionExists("noLoad"); doData = !optionExists("noData"); doRound = optionExists("doRound"); dropZeros = optionExists("dropZeros"); releaseDate = optionVal("releaseDate", "0"); exon = optionExists("exon"); if (exon) verbose(2, "Parsing exon data file\n"); if (optionExists("tab")) { tabDir = optionVal("tab", tabDir); makeDir(tabDir); } limit = optionInt("limit", limit); if (argc == 4) hgGtexTissues(argv[1], argv[2], argv[3]); else if (argc == 7) hgGtex(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]); else usage(); return 0; }