f0c88ba99715e71d8868232507ddf166270e1e33 kent Fri Dec 18 18:19:57 2020 -0800 Allowing a gene name to be multiply mapped. diff --git src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c index b757bdb..ea28755 100644 --- src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c +++ src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c @@ -47,31 +47,31 @@ while (lineFileNextReal(lf, &line)) { if (colCount == 0) { *retColCount = colCount = chopByChar(line, '\t', NULL, 0); verbose(2, "Got %d columns in first real line\n", colCount); colAlloc = colCount + 1; // +1 so we can detect unexpected input and complain lmAllocArray(hash->lm, row, colAlloc); } int count = chopByChar(line, '\t', row, colAlloc); if (count != colCount) { errAbort("Expecting %d words, got more than that line %d of %s", colCount, lf->lineIx, lf->fileName); } - hashAddUnique(hash, row[keyColIx], lmCloneRow(hash->lm, row, colCount) ); + hashAdd(hash, row[keyColIx], lmCloneRow(hash->lm, row, colCount) ); } lineFileClose(&lf); return hash; } void hashSamplesAndClusters(char *tsvFile, struct hash **retSampleHash, struct hash **retClusterHash) /* Read two column tsv file into a hash keyed by first column */ { struct hash *sampleHash = hashNew(0); struct hash *clusterHash = hashNew(0); char *row[2]; struct lineFile *lf = lineFileOpen(tsvFile, TRUE); while (lineFileNextRowTab(lf, row, ArraySize(row)) ) { @@ -192,40 +192,45 @@ * start with # character. */ int lineLength = 0; char *line; if (!lineFileNext(lf, &line, &lineLength)) break; char *s = skipLeadingSpaces(line); char c = s[0]; if (c == 0 || c == '#') continue; /* Chop it into tabs */ int rowSize = chopByChar(line, '\t', matrixRow, colAlloc); lineFileExpectWords(lf, colCount, rowSize); char *geneName = matrixRow[0]; - char **geneBedVal = hashFindVal(geneHash, geneName); - if (geneBedVal == NULL) + struct hashEl *onePos = hashLookup(geneHash, geneName); + if (onePos == NULL) { warn("Can't find gene %s in %s", geneName, geneBed); ++missCount; continue; } else + { ++hitCount; + } + for (; onePos != NULL; onePos = hashLookupNext(onePos)) + { + char **geneBedVal = onePos->val; /* Zero out cluster histogram */ int i; for (i=0; i<clusterCount; ++i) { clusterTotal[i] = 0.0; clusterElements[i] = 0; } zeroBytes(&clusterTotal, sizeof(clusterTotal)); zeroBytes(&clusterElements, sizeof(clusterElements)); /* Loop through rest of row filling in histogram */ for (i=1; i<colCount; ++i) { @@ -247,30 +252,33 @@ { if (i != 0) fprintf(f, ","); if (clMean) fprintf(f, "%g", clusterTotal[i]/clusterElements[i]); else fprintf(f, "%g", doubleMedian(clusterElements[i], clusterSamples[i])); } /* Data file offset info */ if (clDataOffset) fprintf(f, "\t%lld\t%lld", (long long)lineFileTell(lf), (long long)lineLength); fprintf(f, "\n"); } + + + } verbose(1, "%d genes found, %d missed\n", hitCount, missCount); carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 5) usage(); clDataOffset = (optionExists("_dataOffset") || optionExists("dataOffset")); clMean = optionExists("mean"); clusterMatrixToBarchartBed(argv[1], argv[2], argv[3], argv[4]); return 0; }