57ad409a6278ef6a75e2c082988ed1ee419a5f78 hiram Fri Apr 7 11:35:09 2023 -0700 now allowing argument format=text on list files output refs #23589 diff --git src/hg/hubApi/list.c src/hg/hubApi/list.c index 0fc9cf8..d9e6e5b 100644 --- src/hg/hubApi/list.c +++ src/hg/hubApi/list.c @@ -573,115 +573,173 @@ /* typical rsync return columns: 0 1 2 3 4 drwxrwxr-x 162 2022/10/18 16:58:16 . drwxrwxr-x 4,096 2023/03/27 16:01:41 bigZips -r--rw-r-- 3,455 2022/08/11 03:26:26 bigZips/GCA_009914755.4_assembly_report.txt -rw-rw-r-- 0 2022/07/18 12:06:00 bigZips/THIS_IS_GENOME_ASSEMBLY_T2T-CHM13v2.0 -rw-rw-r-- 812,327,608 2022/07/16 14:27:39 bigZips/hs1.2bit appears to be a consistent set of columns */ /* might be variable depending upon which server request is coming from */ #define DOWNLOAD_HOST "hgdownload.soe.ucsc.edu" -static long long rsyncList(struct jsonWrite *jw, char *db, char *downPath) +static long long rsyncList(struct jsonWrite *jw, char *db, char *downPath, long long *itemsDone, boolean textOut) /* rsync listing from hgdownload on the given downPath/db * returning total bytes in the files listing */ { long long totalBytes = 0; +if (*itemsDone >= maxItemsOutput) + return totalBytes; +boolean reachedMaxItems = FALSE; int index = 3; /* rsyncCmd[3] == starts out at NULL, will become the * hgdownload path */ char *rsyncCmd[] = {"/usr/bin/rsync", "-a", "--list-only", NULL, NULL}; /* rsyncCmd[4] will remain NULL to terminate the list */ struct dyString *tmpDy = dyStringNew(128); dyStringPrintf(tmpDy, "%s::%s/%s/", DOWNLOAD_HOST, downPath, db); rsyncCmd[index++] = dyStringCannibalize(&tmpDy); struct pipeline *dataPipe = pipelineOpen1(rsyncCmd, pipelineRead, "/dev/null", NULL, 0); FILE *readingLines = pipelineFile(dataPipe); char lineBuf[PATH_MAX + 1024]; -while (fgets(lineBuf, sizeof(lineBuf), readingLines) != NULL) +while (! reachedMaxItems && fgets(lineBuf, sizeof(lineBuf), readingLines) != NULL) { if (startsWith("d", lineBuf)) continue; + *itemsDone += 1; + if (*itemsDone > maxItemsOutput) + { + reachedMaxItems = TRUE; + } + else + { char *columns[5]; (void) chopByWhite(lineBuf, columns, ArraySize(columns)); stripChar(columns[1], ','); long long bytes = sqlLongLong(columns[1]); totalBytes += bytes; char outString[PATH_MAX + 1024]; + if (textOut) + { + safef(outString, sizeof(outString), "https://%s/%s/%s/%s", + DOWNLOAD_HOST, downPath, db, columns[4]); + textLineOut(outString); + } + else + { jsonWriteObjectStart(jw, NULL); - jsonWriteNumber(jw, "b", sqlLongLong(columns[1])); + jsonWriteNumber(jw, "sizeBytes", sqlLongLong(columns[1])); safef(outString, sizeof(outString), "%sT%s", columns[2], columns[3]); - jsonWriteString(jw, "d", outString); + jsonWriteString(jw, "dateTime", outString); safef(outString, sizeof(outString), "%s/%s/%s", downPath, db, columns[4]); - jsonWriteString(jw, "u", outString); + jsonWriteString(jw, "url", outString); jsonWriteObjectEnd(jw); } + } + } pipelineClose(&dataPipe); pipelineFree(&dataPipe); return totalBytes; } -static void filesJsonOutput(FILE *f, char *genome) +static void filesJsonOutput(FILE *f, char *genome, boolean textOut) /* for given genome, output the URLs to files available on hgdownload * can be a UCSC database genome, or a GenArk hub genome name */ { +long long itemsReturned = 0; boolean genArkHub = FALSE; char genArkUrl[PATH_MAX + 1024]; if ( isGenArk(genome) ) { genArkHub = TRUE; safef(genArkUrl, sizeof(genArkUrl), "hubs/%s/", genArkPath(genome)); } /* if UCSC genome database, it has already been proven to exist */ -struct jsonWrite *jw = apiStartOutput(); +struct jsonWrite *jw = NULL; +if (textOut) + { + char outString[PATH_MAX + 1024]; + safef(outString, sizeof(outString), "# genome: %s", genome); + textLineOut(outString); + safef(outString, sizeof(outString), "# rsyncHost: rsync://%s", DOWNLOAD_HOST); + textLineOut(outString); + } +else + { + jw = apiStartOutput(); jsonWriteString(jw, "genome", genome); jsonWriteString(jw, "rsyncHost", "rsync://" DOWNLOAD_HOST); -/* describe schema of items in the output array */ -jsonWriteListStart(jw, "urlListArraySchema"); -jsonWriteObjectStart(jw, NULL); -jsonWriteString(jw, "b", "sizeBytes"); -jsonWriteString(jw, "d", "fileDateTime"); -jsonWriteString(jw, "u", "urlPathName"); -jsonWriteObjectEnd(jw); -jsonWriteListEnd(jw); - jsonWriteListStart(jw, "urlList"); + } + long long totalBytes = 0; if (genArkHub) { - totalBytes = rsyncList(jw, genome, genArkUrl); + totalBytes = rsyncList(jw, genome, genArkUrl, &itemsReturned, textOut); + } +else + { + totalBytes = rsyncList(jw, genome, "goldenPath", &itemsReturned, textOut); + if (itemsReturned < maxItemsOutput) + totalBytes += rsyncList(jw, genome, "gbdb", &itemsReturned, textOut); + if (itemsReturned < maxItemsOutput) + totalBytes += rsyncList(jw, genome, "mysql", &itemsReturned, textOut); + } + +if (textOut) + { + char outString[1024]; + safef(outString, sizeof(outString), "# totalBytes: %lld", totalBytes); + textLineOut(outString); + if (itemsReturned > maxItemsOutput) + { + safef(outString, sizeof(outString), "# maxItemLimit: TRUE"); + textLineOut(outString); + safef(outString, sizeof(outString), "# itemsReturned: %d", maxItemsOutput); + textLineOut(outString); } else { - totalBytes = rsyncList(jw, genome, "goldenPath"); - totalBytes += rsyncList(jw, genome, "gbdb"); + safef(outString, sizeof(outString), "# itemsReturned: %lld", itemsReturned); + textLineOut(outString); } + textFinishOutput(); + } +else + { jsonWriteListEnd(jw); jsonWriteNumber(jw, "totalBytes", totalBytes); + if (itemsReturned > maxItemsOutput) + { + jsonWriteBoolean(jw, "maxItemsLimit", TRUE); + jsonWriteNumber(jw, "itemsReturned", maxItemsOutput); + } + else + jsonWriteNumber(jw, "itemsReturned", itemsReturned); apiFinishOutput(0, NULL, jw); } +} static void chromInfoJsonOutput(FILE *f, char *db) /* for given db, if there is a track, list the chromosomes in that track, * for no track, simply list the chromosomes in the sequence */ { char *splitSqlTable = NULL; struct hTableInfo *tableInfo = NULL; char *chromName = NULL; char *table = cgiOptionalString("track"); char *bigDataUrl = NULL; struct trackDb *thisTrack = NULL; struct sqlConnection *conn = hAllocConnMaybe(db); if (NULL == conn) apiErrAbort(err400, err400Msg, "can not find 'genome=%s' for endpoint '/list/chromosomes", db); @@ -1070,27 +1128,36 @@ apiErrAbort(err400, err400Msg, "must supply hubUrl or genome name for endpoint '/list/schema", hubUrl, db); if (isEmpty(hubUrl)) // missing hubUrl implies UCSC database { schemaJsonOutput(stdout, db, track); return; } else { hubSchemaJsonOutput(stdout, hubUrl, genome, track); return; } } else if (sameWord("files", words[1])) { + boolean textOut = FALSE; char *extraArgs = verifyLegalArgs(argListFiles); if (extraArgs) apiErrAbort(err400, err400Msg, "extraneous arguments found for function /list/files '%s', only 'genome' is allowed.", extraArgs); char *genome = cgiOptionalString("genome"); + char *format = cgiOptionalString("format"); if (isEmpty(genome)) - apiErrAbort(err400, err400Msg, "must supply a genome name for endpoint '/list/files (a database name or GenArk genome name, e.g.: 'hg38' or 'GCA_021951015.1'"); - filesJsonOutput(stdout, genome); + apiErrAbort(err400, err400Msg, "must supply a genome name for endpoint '/list/files' (a database name or GenArk genome name, e.g.: 'hg38' or 'GCA_021951015.1'"); + if (isNotEmpty(format)) + { + if (sameWord("text", format)) + textOut = TRUE; + else + apiErrAbort(err400, err400Msg, "only format=text allowed for endpoint '/list/files', found: format=%s", format); + } + filesJsonOutput(stdout, genome, textOut); } else apiErrAbort(err400, err400Msg, "do not recognize endpoint function: '/%s/%s'", words[0], words[1]); } /* void apiList(char *words[MAX_PATH_INFO]) */