1c36e8ebf0e5ddd3d5c3ee9fa2fd492bef5c41a0 chmalee Fri Jan 22 11:00:09 2021 -0800 Allow bedJoinTabOffset to index on a field other than the name field, refs #26837 diff --git src/utils/bedJoinTabOffset/bedJoinTabOffset.c src/utils/bedJoinTabOffset/bedJoinTabOffset.c index e95e5b9..20fb7b3 100644 --- src/utils/bedJoinTabOffset/bedJoinTabOffset.c +++ src/utils/bedJoinTabOffset/bedJoinTabOffset.c @@ -1,41 +1,44 @@ /* bedJoinTabOffset - Add file offset and length of line in a text file with the same name * as the BED name to each row of BED. */ #include "common.h" #include "linefile.h" #include "localmem.h" #include "hash.h" #include "obscure.h" #include "options.h" void usage() /* Explain usage and exit. */ { errAbort("bedJoinTabOffset - Add file offset and length of line in a text file with the same name as the BED name to each row of BED.\n" "usage:\n" " bedJoinTabOffset inTabFile inBedFile outBedFile\n" - "Given a bed file and tab file where each have a column with matching values:\n" - "first get the value of column0, the offset and line length from inTabFile.\n" - "Then go over the bed file, use the name field and append its offset and length\n" - "to the bed file as two separate fields. Write the new bed file to outBed.\n" -// "options:\n" -// " -xxx=XXX\n" + "\nGiven a bed file and tab file where each have a column with matching values:\n" + "1. first get the value of column0, the offset and line length from inTabFile.\n" + "2. Then go over the bed file, use the -bedKey (defaults to the name field)\n" + " field and append its offset and length to the bed file as two separate\n" + " fields. Write the new bed file to outBed.\n" + "options:\n" + " -bedKey=integer 0-based index key of the bed file to use to match up with\n" + " the tab file. Default is 3 for the name field.\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { + {"bedKey", OPTION_INT}, {NULL, 0}, }; struct offsetLen /* File offset and length of a line; value of hash keyed by name/keyword in inTabFile. */ { off_t offset; // Line offset within inTabFile size_t len; // Length of line }; struct nameOffsetLen /* List of name/keyword, offset and length for each line in inTabFile as we read it in. */ { struct nameOffsetLen *next; char *name; // name/keyword from line of inTabFile @@ -68,67 +71,68 @@ verboseTime(2, "Done reading %s; %d items, hash size %d", lf->fileName, itemCount, hashSize); struct hash *nameToOffsetLen = hashNew(hashSize); struct nameOffsetLen *nol; for (nol = nolList; nol != NULL; nol = nol->next) { struct offsetLen *ol; AllocVar(ol); ol->offset = nol->offLen.offset; ol->len = nol->offLen.len; hashAdd(nameToOffsetLen, nol->name, ol); } lmCleanup(&lm); return nameToOffsetLen; } -void bedJoinTabOffset(char *inTabFile, char *inBedFile, char *outBedFile) +void bedJoinTabOffset(char *inTabFile, char *inBedFile, char *outBedFile, int bedKey) /* bedJoinTabOffset - Add file offset and length of line in a text file with the same name * as the BED name to each row of BED. */ { verboseTimeInit(); struct lineFile *tLf = lineFileOpen(inTabFile, TRUE); struct lineFile *bLf = lineFileOpen(inBedFile, TRUE); FILE *outF = mustOpen(outBedFile, "w"); struct hash *nameToOffsetLen = parseTabFile(tLf); lineFileClose(&tLf); struct dyString *dy = dyStringNew(0); verboseTime(2, "Done making hash; reading bed input %s", inBedFile); char *line; int size; while (lineFileNext(bLf, &line, &size)) { char *postSpace = skipLeadingSpaces(line); if (isEmpty(postSpace) || postSpace[0] == '#') fprintf(outF, "%s\n", line); else { dyStringClear(dy); dyStringAppend(dy, line); - char *words[5]; + char *words[bedKey + 2]; int wordCount = chopTabs(dy->string, words); - if (wordCount < 4) - lineFileAbort(bLf, "Expected at least 4 words but got %d (%s).", wordCount, line); - char *name = words[3]; - struct offsetLen *ol = hashFindVal(nameToOffsetLen, name); + if (wordCount < bedKey + 1) + lineFileAbort(bLf, "Expected at least %d words but got %d (%s).", bedKey + 1, wordCount, line); + char *indexField = words[bedKey]; + struct offsetLen *ol = hashFindVal(nameToOffsetLen, indexField); if (ol == NULL) - lineFileAbort(bLf, "Unable to find corresponding line in %s for name '%s'", - inTabFile, name); + lineFileAbort(bLf, "Unable to find corresponding line in %s for field '%s'", + inTabFile, indexField); fprintf(outF, "%s\t%lld\t%lld\n", line, (long long)ol->offset, (long long)ol->len); } } lineFileClose(&bLf); carefulClose(&outF); freeHashAndVals(&nameToOffsetLen); dyStringFree(&dy); verboseTime(2, "Done writing bed output %s.", outBedFile); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); +int bedKey = optionInt("bedKey", 3); if (argc != 4) usage(); -bedJoinTabOffset(argv[1], argv[2], argv[3]); +bedJoinTabOffset(argv[1], argv[2], argv[3], bedKey); return 0; }