05a94d31f6ff2235ef85a349732500e8b6b40564 kent Mon Nov 29 10:48:06 2021 -0800 First cut at a join utility that is a bit slower but more careful than Unix's. diff --git src/tabFile/tabJoin/tabJoin.c src/tabFile/tabJoin/tabJoin.c new file mode 100644 index 0000000..e6e35e4 --- /dev/null +++ src/tabFile/tabJoin/tabJoin.c @@ -0,0 +1,98 @@ +/* tabJoin - Join together two tab-separated files based on a common field. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "fieldedTable.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "tabJoin - Join together two tab-separated files based on a common field\n" + "usage:\n" + " tabJoin file1.tsv field1 file2.tsv field2 output.tsv\n" + "The file1.tsv and file2.tsv should both be tab-separated files that have the\n" + "same number of lines and the same items in the columns specified by field1\n" + "and field2. The output is file in the same order as file1.tsv with the matching line from\n" + "file2 appended to the line from file1.\n" + "options:\n" + " -xxx=XXX\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + +void tabJoin(char *fileName1, char *fieldName1, char *fileName2, char *fieldName2, char *outFile) +/* tabJoin - Join together two tab-separated files based on a common field. */ +{ +struct fieldedTable *table1 = fieldedTableFromTabFile(fileName1, fileName1, &fieldName1, 1); +struct fieldedTable *table2 = fieldedTableFromTabFile(fileName2, fileName2, &fieldName2, 1); +if (table1->rowCount != table2->rowCount) + errAbort("%s has %d rows but %s has %d. They must have same number of rows.\n", + fileName1, table1->rowCount, fileName2, table2->rowCount); +struct hash *hash1 = fieldedTableUniqueIndex(table1, fieldName1); +struct hash *hash2 = fieldedTableUniqueIndex(table2, fieldName2); +int fieldIx1 = fieldedTableMustFindFieldIx(table1, fieldName1); + +/* Open output and write out header */ +FILE *f = mustOpen(outFile, "w"); +if (table1->startsSharp) + fputc('#', f); +fputs(table1->fields[0], f); +int i; +for (i=1; ifieldCount; ++i) + { + fputc('\t', f); + fputs(table1->fields[i], f); + } +for (i=0; ifieldCount; ++i) + { + fputc('\t', f); + fputs(table2->fields[i], f); + } +fputc('\n', f); + +/* Write out rest of fields */ +struct fieldedRow *row1; +for (row1 = table1->rowList; row1 != NULL; row1 = row1->next) + { + char *key = row1->row[fieldIx1]; + struct fieldedRow *row2 = hashFindVal(hash2, key); + if (row2 == NULL) + errAbort("%s is found in %s.%s but not %s.%s", key, fileName1, fieldName1, + fileName2, fieldName2); + + /* Write out data from table1 */ + for (i=0; ifieldCount; ++i) + { + fputs(row1->row[i], f); + fputc('\t', f); + } + /* Write out data from table 2 */ + int lastField = table2->fieldCount-1; + for (i=0; irow[i], f); + fputc('\t', f); + } + fputs(row2->row[lastField], f); + fputc('\n', f); + } +carefulClose(&f); +hashFree(&hash1); +hashFree(&hash2); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 6) + usage(); +tabJoin(argv[1], argv[2], argv[3], argv[4], argv[5]); +return 0; +}