2fe474a25e247a186d1ba8890c9c70028fbad0e2 kent Wed Sep 18 20:19:17 2019 -0700 Adding contributors to contact in hopes of unrolling them all and avoiding interns cutting and pasting long author lists. diff --git src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c index 86325d9..b91f730 100644 --- src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c +++ src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c @@ -141,53 +141,55 @@ /* Make contributor output table. The first row of it will be seeded with the contact. * We can fill out names, but not other info on the other contributors, who will make * up the rest of the rows. */ struct fieldedTable *contributors = fieldedTableNew("contributor", contactFields, realFieldCount); contributors->startsSharp = inProject->startsSharp; /* Make up first row from contacts */ char *outVals[realFieldCount]; int outIx; struct dyString *scratch = dyStringNew(0); for (outIx=0; outIx<realFieldCount-1; ++outIx) { char *inTsv = projectRow[contactIx[outIx]]; char *inVal = emptyForNull(cloneString(csvParseNext(&inTsv, scratch))); - outVals[outIx] = cloneString(csvEscapeToDyString(csvScratch, inVal)); + outVals[outIx] = inVal; } outVals[outIx] = "lab contact"; char *contactName = cloneString(outVals[0]); fieldedTableAdd(contributors, outVals, realFieldCount, 1); -/* Unroll the contributors field into further rows*/ +/* Unroll the contributors field into further rows if it exists. */ +int inContribIx = fieldedTableFindFieldIx(inProject, "contributors"); +if (inContribIx >= 0) + { for (outIx=0; outIx<realFieldCount; ++outIx) outVals[outIx] = ""; // Empty out all rows. -int inContribIx = fieldedTableMustFindFieldIx(inProject, "contributors"); int outContribIx = fieldedTableMustFindFieldIx(contributors, "?name"); char *inTsv = projectRow[inContribIx]; char *oneVal; while ((oneVal = csvParseNext(&inTsv, scratch)) != NULL) { - char *escaped = csvEscapeToDyString(csvScratch, oneVal); - if (differentString(escaped, contactName)) // We already got the contact as a contributor + if (differentString(oneVal, contactName)) // We already got the contact as a contributor { - outVals[outContribIx] = escaped; + outVals[outContribIx] = oneVal; outVals[realFieldCount-1] = "contributor"; fieldedTableAdd(contributors, outVals, realFieldCount, contributors->rowCount+1); } } + } dyStringFree(&csvScratch); return contributors; } char *lookupSpecies(char *taxon) /* Some day we may query a decent database, for now * just have some of the most common */ { if (sameString(taxon, "9606")) return "human"; if (sameString(taxon, "10090")) return "mouse"; if (sameString(taxon, "10116")) return "rat"; if (sameString(taxon, "7955")) return "zebrafish"; if (sameString(taxon, "7227")) return "fly"; if (sameString(taxon, "6239")) return "worm"; if (sameString(taxon, "4932")) return "yeast"; @@ -322,75 +324,80 @@ if (sameString("state_reached", inName) || sameString("cur_state", inName)) { safef(nameBuf, sizeof(nameBuf), "@%s_id@hcat_projectstate@state@id", inName); inName = cloneString(nameBuf); } else if (sameString("consent", inName)) { safef(nameBuf, sizeof(nameBuf), "@%s_id@hcat_%s@short_name@id", inName, inName); inName = cloneString(nameBuf); } else if (sameString("effort", inName)) { safef(nameBuf, sizeof(nameBuf), "@%s_id@hcat_efforttype@short_name@id", inName); inName = cloneString(nameBuf); } +#ifdef TOO_FLAKEY else if (sameString("lab", inName)) { safef(nameBuf, sizeof(nameBuf), "%s", "@@lab@id@hcat_project_labs@project_id@lab_id@hcat_lab@short_name@id"); inName = cloneString(nameBuf); } +#endif /* TOO_FLAKEY */ else if (sameString("publications", inName)) { safef(nameBuf, sizeof(nameBuf), "%s", "@@publications@id@hcat_project_publications@project_id@publication_id@hcat_publication@short_name@id"); inName = cloneString(nameBuf); } /* Output all the ones we haven't dealt with already or will deal with later */ if (!startsWith("contact_", inName) && !sameString("contributors", inName)) { outFields[outFieldCount] = inName; outRow[outFieldCount] = inVal; ++outFieldCount; } } /* Add in contributors as a multi to multi field */ +char *contributors = fieldedTableLookupNamedFieldInRow(inProject, "contributors", inRow); +if (contributors != NULL) + { outFields[outFieldCount] = "@@contributors@id@hcat_project_contributors@project_id@contributor_id@hcat_contributor@name@id"; -outRow[outFieldCount] = fieldedTableLookupNamedFieldInRow(inProject, "contributors", inRow); + outRow[outFieldCount] = contributors; outFieldCount += 1; + } /* Add in contacts as a multi to multi field too */ outFields[outFieldCount] = "@@contacts@id@hcat_project_contacts@project_id@contributor_id@hcat_contributor@name@id"; outRow[outFieldCount] = fieldedTableLookupNamedFieldInRow(inProject, "contact_name", inRow); outFieldCount += 1; /* Add the fields we scan and merge from sample at end */ projectVocabField(inProject, inSample, "organ", outDir, outFields, outRow, outFieldMax, &outFieldCount); projectVocabField(inProject, inSample, "organ_part", outDir, outFields, outRow, outFieldMax, &outFieldCount); projectVocabField(inProject, inSample, "assay_type", outDir, outFields, outRow, outFieldMax, &outFieldCount); projectVocabField(inProject, inSample, "assay_tech", outDir, outFields, outRow, outFieldMax, &outFieldCount); projectVocabField(inProject, inSample, "disease", outDir, outFields, outRow, outFieldMax, &outFieldCount); -uglyf("making project table with %d fields\n", outFieldCount); struct fieldedTable *outTable = fieldedTableNew("project", outFields, outFieldCount); outTable->startsSharp = inProject->startsSharp; fieldedTableAdd(outTable, outRow, outFieldCount, 2); dyStringFree(&scratch); return outTable; } struct fieldedTable *makePublication(struct fieldedTable *inProject) /* If there's a publication field we make a publication table and seed it with the pmid * and stuff. */ { int pubIx = fieldedTableFindFieldIx(inProject, "publications"); if (pubIx >= 0) { @@ -406,102 +413,107 @@ { char name[64]; safef(name, sizeof(name), "pmid: %s", pmid); char *outRow[2] = {name, pmid}; fieldedTableAdd(pubTable, outRow, ArraySize(outRow), 0); csvEscapeAndAppend(newPubNames, name); } inRow[pubIx] = dyStringCannibalize(&newPubNames); // Other people need to use new value too dyStringFree(&csvScratch); return pubTable; } else return NULL; } +#ifdef TOO_FLAKEY struct fieldedTable *makeLab(struct fieldedTable *inProject) /* If there's a lab field we make a lab table and seed it with the contacts. */ { int labIx = fieldedTableFindFieldIx(inProject, "lab"); if (labIx >= 0) { char **inRow = inProject->rowList->row; char *short_name = inRow[labIx]; - char *contributors = fieldedTableLookupNamedFieldInRow(inProject, "contributors", inRow); + char *contributors = emptyForNull(fieldedTableLookupNamedFieldInRow( + inProject, "contributors", inRow)); char *institute = fieldedTableLookupNamedFieldInRow(inProject, "contact_institute", inRow); char labName[256]; if (strlen(short_name) < 20) // Unlikely to be unique, may cause trouble safef(labName, sizeof(labName), "%s %s", short_name, emptyForNull(institute)); else safef(labName, sizeof(labName), "%s", short_name); labName[50] = 0; // not too long inRow[labIx] = cloneString(labName); /* Other people need to know about this too. */ char *outFields[3] = {"?short_name", "institution", "@@contributors@id@hcat_lab_contributors@lab_id@contributor_id@hcat_contributor@name@id"}; struct fieldedTable *labTable = fieldedTableNew("lab", outFields, ArraySize(outFields)); char *outRow[3] = {labName, institute, contributors}; fieldedTableAdd(labTable, outRow, ArraySize(outRow), 1); return labTable; } else return NULL; } +#endif /* TOO_FLAKEY */ void hcatTabUpdate(char *inDir, char *outDir) /* hcatTabUpdate - take the tabToTabDir result of the geo/sra import. * Put results in an output dir in a format sqlUpdateRelated understands. */ { // We are actually just looking for specific files in inDir. */ /* Load up input projects table */ char *projectFile = "hcat_project.tsv"; char inPath[PATH_LEN]; safef(inPath, sizeof(inPath), "%s/%s", inDir, projectFile); -char *projectRequired[] = {"short_name", "contact_name", "contributors"}; +char *projectRequired[] = {"short_name", "contact_name", }; struct fieldedTable *inProject = fieldedTableFromTabFile(inPath, inPath, projectRequired, ArraySize(projectRequired)); /* Load up samples table */ char *sampleFile = "hcat_sample.tsv"; safef(inPath, sizeof(inPath), "%s/%s", inDir, sampleFile); char *sampleRequired[] = {"short_name",}; struct fieldedTable *inSample = fieldedTableFromTabFile(inPath, inPath, sampleRequired, ArraySize(sampleRequired)); - +verbose(2, "Got %d fields %d rows in %s\n", inSample->fieldCount, inSample->rowCount, inPath); /* Make sure inProject table makes sense by having exactly one row */ if (inProject->rowCount != 1) errAbort("Expected one row in %s, got %d\n", projectFile, inProject->rowCount); /* Write output from lowest level to highest level tables. */ makeDirsOnPath(outDir); /* Contributors table - it's always there */ struct fieldedTable *outContributor = makeContributors(inProject); char outPath[PATH_LEN]; safef(outPath, sizeof(outPath), "%s/hcat_%s", outDir, "contributor.tsv"); fieldedTableToTabFile(outContributor, outPath); +#ifdef TOO_FLAKEY /* Make lab table if there is a lab field */ struct fieldedTable *outLab = makeLab(inProject); if (outLab != NULL) { safef(outPath, sizeof(outPath), "%s/hcat_%s", outDir, "lab.tsv"); fieldedTableToTabFile(outLab, outPath); } +#endif /* TOO_FLAKEY */ /* Make pubs table if there are pubs fields */ struct fieldedTable *outPub = makePublication(inProject); if (outPub != NULL) { safef(outPath, sizeof(outPath), "%s/hcat_%s", outDir, "publication.tsv"); fieldedTableToTabFile(outPub, outPath); } struct fieldedTable *outProject = makeProject(inProject, inSample, outDir); safef(outPath, sizeof(outPath), "%s/hcat_%s", outDir, "project.tsv"); fieldedTableToTabFile(outProject, outPath); }