5f2b7d5a6a6942d6676b0a98427ad1b91dbee4c9 kent Wed Oct 9 15:46:07 2019 -0700

Adjusting for them turning everything into lists.

diff --git src/hca/hcat/hcatParseParth/hcatParseParth.c src/hca/hcat/hcatParseParth/hcatParseParth.c
index 634d193..5020df6 100644
--- src/hca/hcat/hcatParseParth/hcatParseParth.c
+++ src/hca/hcat/hcatParseParth/hcatParseParth.c
@@ -60,36 +60,40 @@
 char *lookupAssayType(char *ingestSym)
 /* Convert ingest's representation of library_construction to something we put
  * in assayTech. */
 {
 if (sameWord(ingestSym, "10x v2 sequencing")
     || sameWord(ingestSym, "10x 3' v2 sequencing")
     || sameWord(ingestSym, "10x 5' v2 sequencing"))  // this one Paris's typo most likely
     return "10X Chromium V2";
 if (sameWord(ingestSym, "smart-seq2")
     || sameWord(ingestSym, "smart-seq")
     || sameWord(ingestSym, "full length single cell rna sequencing")  // Quake lab set I seen before
     || sameWord(ingestSym, "smart-seq2 paired-end"))
     return "smart-seq2 PE";
 if (sameWord(ingestSym, "cite-seq")) return "cite-seq";
+if (sameWord(ingestSym, "cel-seq2")) return "cel-seq2";
+if (sameWord(ingestSym, "sci-rna-seq")) return "sci-rna-seq";
+if (sameWord(ingestSym, "seq-well")) return "seq-well";
 if (sameWord(ingestSym, "indrop")) return "inDrop";
 if (sameWord(ingestSym, "mars-seq")) return "mars-seq";
 if (sameWord(ingestSym, "drop-seq")) return "drop-seq";
 if (sameWord(ingestSym, "dronc-seq")) return "dronc-seq";
 if (sameWord(ingestSym, "10x 3' v1 sequencing")) return "10X Chromium V1";
 if (sameWord(ingestSym, "10x 3' v3 sequencing")) return "10X Chromium V3";
+if (sameWord(ingestSym, "10x v3 sequencing")) return "10X Chromium V3";
 // if (sameWord(ingestSym, "xyz")) return "xyz";
 errAbort("Unknown ingest-info.library_construction_method %s", ingestSym);
 return NULL;
 }
 
 char *ingestConstructionRefsToAssayTech(struct slRef *refList, struct dyString *scratch)
 /* Convert a comma separated list of taxons in inVal to a
  * comma separated species list, using scratch. */
 {
 struct dyString *result = dyStringNew(64);
 struct slRef *ref;
 for (ref = refList; ref != NULL; ref = ref->next)
     {
     char *assayType = jsonStringVal(ref->val, "library_construction_methods");
     csvEscapeAndAppend(result, lookupAssayType(assayType));
@@ -153,66 +157,81 @@
 {
 char *dupe = cloneString(semiList);  // because of destructive parsing
 char *p = dupe;
 char *contrib;
 while ((contrib = nextContrib(&p)) != NULL)
     {
     if (!sameWord("n/a", contrib))
         {
         fprintf(f, "%s\t%s\n", type, csvEscapeToDyString(scratch, contrib));
         csvEscapeAndAppend(csvOut, contrib);
         }
     }
 freeMem(dupe);
 }
 
+struct jsonElement *bypassListOfOne(struct jsonElement *el, char *name)
+/* Check that element is a list with just one member, and then return that one member */
+{
+if (el->type != jsonList)
+    errAbort("Expected %s to be a list", name);
+struct slRef *list = el->val.jeList;
+int listSize = slCount(list);
+if (listSize != 1)
+    errAbort("Expected %s to be a list with just one member", name);
+return list->val;
+}
+
 void outputTracker(FILE *f, char *projectName, char *submissionId, char *uuid,
     struct hash *projectHash, int subBunCount, struct dyString *scratch)
 /* Look for interesting state info from parth's tracker outside of ingest and output it */
 {
 /* Grab the dss-info and lots of pieces of it.  It sees both primary and analysis,
  * not sure about matrix */
-struct jsonElement *dssEl = hashMustFindVal(projectHash, "dss-info");
+struct jsonElement *dssEl = bypassListOfOne(hashMustFindVal(projectHash, "dss-info"), "dss-info");
 struct jsonElement *awsPrimary = jsonMustFindNamedField(dssEl, "dss-info",
     "aws_primary_bundle_count");
 int awsPrimaryCount = jsonDoubleVal(awsPrimary, "aws_primary_bundle_count");
 struct jsonElement *gcpPrimary = jsonMustFindNamedField(dssEl, "dss-info",
     "gcp_primary_bundle_count");
 int gcpPrimaryCount = jsonDoubleVal(gcpPrimary, "gcp_primary_bundle_count");
 struct jsonElement *awsAnalysis = jsonMustFindNamedField(dssEl, "dss-info",
     "aws_analysis_bundle_count");
 int awsAnalysisCount = jsonDoubleVal(awsAnalysis, "aws_analysis_bundle_count");
 struct jsonElement *gcpAnalysis = jsonMustFindNamedField(dssEl, "dss-info",
     "gcp_analysis_bundle_count");
 int gcpAnalysisCount = jsonDoubleVal(gcpAnalysis, "gcp_analysis_bundle_count");
 
 /* Grab just a little bit from azul-info */
-struct jsonElement *azulEl = hashMustFindVal(projectHash, "azul-info");
+struct jsonElement *azulEl = bypassListOfOne(hashMustFindVal(projectHash, "azul-info"),
+    "azul-info");
 struct jsonElement *azulBundle = jsonMustFindNamedField(azulEl, "azul-info",
     "analysis_bundle_count");
 int azulBundleCount = jsonDoubleVal(azulBundle, "azul-info.analysis_bundle_count");
 
 /* Grab a little bit from analysis-info */
-struct jsonElement *analysisEl = hashMustFindVal(projectHash, "analysis-info");
+struct jsonElement *analysisEl = bypassListOfOne(hashMustFindVal(projectHash, "analysis-info"),
+    "analysis-info");
 struct jsonElement *succeededEl = jsonFindNamedField(analysisEl, "analysis-info",
     "succeeded_workflows");
 int succeededWorkflows = 0;
 if (succeededEl != NULL)
     succeededWorkflows = jsonDoubleVal(succeededEl, "succeeded_workflows");
 
 /* And get the matrix stuff! */
-struct jsonElement *matrixEl = hashMustFindVal(projectHash, "matrix-info");
+struct jsonElement *matrixEl = bypassListOfOne(hashMustFindVal(projectHash, "matrix-info"),
+    "matrix-info");
 struct jsonElement *matBundle = jsonMustFindNamedField(matrixEl, "matrix-info",
     "analysis_bundle_count");
 int matrixBundleCount = jsonDoubleVal(matBundle, "matrix-info.analysis_bundle_count");
 struct jsonElement *cellCountEl = jsonMustFindNamedField(matrixEl, "matrix-info", "cell_count");
 int cellCount = jsonDoubleVal(cellCountEl, "cell_count");
 
 fprintf(f, "%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n",
     projectName, uuid, submissionId,
     subBunCount, awsPrimaryCount, gcpPrimaryCount, awsAnalysisCount, gcpAnalysisCount,
     azulBundleCount, succeededWorkflows, matrixBundleCount, cellCount);
 }
 
 void hcatParseParth(char *inFile, char *outDir)
 /* hcatParseParth - Parse through Parth Shah's Project Tracker json and turn it into fodder
@@ -245,106 +264,123 @@
 FILE *fContrib = mustOpen(contribPath, "w");
 
 char projectPath[PATH_LEN];
 safef(projectPath, sizeof(projectPath), "%s/hcat_project.tsv", outDir);
 FILE *fProject = mustOpen(projectPath, "w");
 
 char postPath[PATH_LEN];
 safef(postPath, sizeof(postPath), "%s/hcat_tracker.tsv", outDir);
 FILE *fTracker = mustOpen(postPath, "w");
 
 /* Write out file headers */
 fprintf(fContrib, "#@type_id@hcat_contributortype@short_name@id ?name\n");
 fprintf(fProject, "#?short_name\ttitle\t@@species@id@hcat_project_species@project_id@species_id@hcat_species@common_name@id\t@@assay_tech@id@hcat_project_assay_tech@project_id@assaytech_id@hcat_assaytech@short_name@id\t@@contributors@id@hcat_project_contributors@project_id@contributor_id@hcat_contributor@name@id\tsubmit_date\n");
 fprintf(fTracker, "#@project_id@hcat_project@short_name@id\t?uuid\tsubmission_id\tsubmission_bundles_exported_count\taws_primary_bundle_count\tgcp_primary_bundle_count\taws_analysis_bundle_count\tgcp_analysis_bundle_count\tazul_analysis_bundle_count\tsucceeded_workflows\tmatrix_bundle_count\tmatrix_cell_count\n");
 
 /* Main loop - once through for each project (or in some cases project fragment */
 struct slRef *projectRef;
+verbose(1, "Got %d high level objects\n", slCount(rootEl->val.jeList));
 for (projectRef = rootEl->val.jeList; projectRef != NULL; projectRef = projectRef->next)
     {
     /* Each project is an object/hash/dictionary depending on your fave language.
      * Here we get that level object into a C hash, and extract the project_uuid */
     struct hash *projectHash = jsonObjectVal(projectRef->val, "project");
     struct jsonElement *uuidEl = hashMustFindVal(projectHash, "project_uuid");
     char *projectUuid = uuidEl->val.jeString;
 
     /* Get the ingest-info subobject and make sure it's complete. */
-    struct jsonElement *ingestEl = hashFindVal(projectHash, "ingest-info");
-    if (ingestEl == NULL)
+    struct jsonElement *ingestList = hashFindVal(projectHash, "ingest-info");
+    if (ingestList == NULL)
         errAbort("Can't find ingest-info for project_uuid %s", projectUuid);
+    if (ingestList->type != jsonList)
+        errAbort("Expecting list value for ingest-info");
+    int ingestListSize = slCount(ingestList->val.jeList);
+    if (ingestListSize != 1)
+        verbose(2, "ingest-info[] has %d members\n", ingestListSize);
+
+    int subBunCount = 0;
+    struct slRef *ingestRef;
+    char *submissionId = NULL;
+    char *shortName = NULL;
+    for (ingestRef = ingestList->val.jeList; ingestRef != NULL; ingestRef = ingestRef->next)
+        {
+        struct jsonElement *ingestEl = ingestRef->val;
     char *primaryState = jsonStringField(ingestEl, "primary_state");
     if (!isComplete(primaryState))
         continue;
 
     /* God help us even among the completes there are multiple projects associated
      * with the same thing.  So far project_short_name is unique.  We'll just take
      * the first (complete) one and warn about the rest.  Some of the dupes have the
      * same uuid, some different.  Yes, it's a little messy this input . */
-    char *shortName = jsonStringField(ingestEl, "project_short_name");
+    shortName = jsonStringField(ingestEl, "project_short_name");
     // Abbreviate what is really and truly not a short name!
     if (startsWith("Single cell RNAseq characterization of cell types produced over time in an in ",
         shortName))
         {
         shortName = "Single cell RNAseq characterization of cell types produced over time";
         verbose(2, "Abbreviated shortName to %s\n", shortName);
         }
     if (hashLookup(uniqShortNameHash, shortName))
         {
         verbose(2, "Skipping duplicate project named '%s'\n", shortName);
         continue;
         }
     hashAdd(uniqShortNameHash, shortName, NULL);
 
     /* Grab more string fields we like from ingest-info. */
-    char *submissionId = jsonStringField(ingestEl, "submission_id");
+    submissionId = jsonStringField(ingestEl, "submission_id");
     char *title = jsonStringField(ingestEl, "project_title");
     char *wrangler = jsonStringField(ingestEl, "data_curator");
     char *contributors = jsonStringField(ingestEl, "primary_investigator");
     char *submissionDateTime = jsonStringField(ingestEl, "submission_date");
 
     /* Turn dateTime into just date */
     char *tStart = strchr(submissionDateTime, 'T');
     if (tStart == NULL)
         errAbort("No T separator in submission_date %s", submissionDateTime);
     char *submissionDate = cloneStringZ(submissionDateTime, tStart - submissionDateTime);
 
     /* Get species list, maybe.... */
     struct jsonElement *speciesEl = jsonMustFindNamedField(ingestEl, "ingest-info", "species");
     struct slRef *speciesRefList = jsonListVal(speciesEl, "species");
     char *species = sciNameRefsToSpecies(speciesRefList, scratch);
 
     struct jsonElement *subBunCountEl = jsonMustFindNamedField(ingestEl, "ingest-info",
         "submission_bundles_exported_count");
-    int subBunCount = jsonDoubleVal(subBunCountEl, "submission_bundles_exported_count");
+    subBunCount = jsonDoubleVal(subBunCountEl, "submission_bundles_exported_count");
 
     /* Get assay techs maybe */
     struct jsonElement *constructEl = jsonMustFindNamedField(ingestEl, "ingest-info",
         "library_construction_methods");
     struct slRef *constructList = jsonListVal(constructEl, "library_construction_methods");
     char *techs = ingestConstructionRefsToAssayTech(constructList, scratch);
 
     /* Still more error checking */
     hashAddUnique(uniqHash, projectUuid, NULL);
     hashAddUnique(uniqTitleHash, title, NULL);
 
     /* Update contributors table */
     dyStringClear(contribCsv);
     outputContributors(fContrib, contributors, "contributor", contribCsv, scratch);
     outputContributors(fContrib, wrangler, "wrangler", contribCsv, scratch);
 
     /* Update project table */
     fprintf(fProject, "%s\t%s\t", shortName, title);
     fprintf(fProject, "%s\t%s\t%s\t", species, techs, contribCsv->string);
     fprintf(fProject, "%s\n", submissionDate);
+        break;  // Still figuring out if this loop is here to stay
+        }
+
     /* We processed the heck out of the ingest-info, and this routine is so long,
      * pass along what we parsed out that goes into the tracker table, and have it
      * deal with the azul-info, matrix-info, etc, which are read-only to wranglers. */
     outputTracker(fTracker, shortName, submissionId, projectUuid, projectHash,
         subBunCount, scratch);
     }
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 hcatParseParth(argv[1], argv[2]);
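Note for readers of this log: judging from the commit message and the new checks, the tracker JSON now wraps each per-project section (ingest-info, dss-info, azul-info, analysis-info, matrix-info) in a JSON list, usually with a single member, where it previously supplied a bare object. bypassListOfOne() peels that wrapper off for the read-only sections, while ingest-info gets an explicit loop (with a provisional break) because it can apparently carry more than one member. The sketch below is a hypothetical illustration of the unwrapping step, not code from the patch: the input string is made up, demoListOfOne() is an invented name, and it assumes bypassListOfOne() above is in scope along with the kent jsonParse.h routines already used in this file.

#include "common.h"
#include "hash.h"
#include "jsonParse.h"

void demoListOfOne()
/* Sketch: parse a made-up record whose dss-info arrives as a list of one object,
 * unwrap it with bypassListOfOne, and confirm the usual field lookups can proceed. */
{
char *newShape = "{\"dss-info\": [ {\"aws_primary_bundle_count\": 3} ]}";
struct hash *projectHash = jsonObjectVal(jsonParse(newShape), "demo");

/* Before this commit the element from hashMustFindVal was used directly;
 * now the one-element list layer has to be stepped through first. */
struct jsonElement *dssEl = bypassListOfOne(hashMustFindVal(projectHash, "dss-info"), "dss-info");
if (dssEl->type == jsonObject)
    printf("dss-info unwrapped to an object, field lookups work as before\n");
}

This is the same pattern the four bypassListOfOne() call sites in outputTracker() now apply before their jsonMustFindNamedField lookups.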