ab05b32b4dadff4da3aacc911b59d9eafb36489d
galt
  Fri Jan 14 01:44:22 2011 -0800
adding ability to recursively process FTP directories with FTP LIST output
diff --git src/utils/paraSync/paraSync.c src/utils/paraSync/paraSync.c
index 2f05beb..f33e9c7 100644
--- src/utils/paraSync/paraSync.c
+++ src/utils/paraSync/paraSync.c
@@ -1,165 +1,205 @@
 /* paraSync - uses paraFetch to recursively mirror url to given path. */
 #include "common.h"
 #include "options.h"
 #include "dystring.h"
 #include "obscure.h"
 #include "portable.h"
 #include "net.h"
 
 void usage()
 /* Explain usage and exit */
 {
 errAbort(
     "paraSync 1.0\n"
     "paraSync - uses paraFetch to recursively mirror url to given path\n"
     "usage:\n"
     "   paraSync {options} N R URL outPath\n"
     "   where N is the number of connections to use\n"
     "         R is the number of retries\n"
     "   Options:\n"
     "    -A='ext1,ext2'  means accept only files with ext1 or ext2\n" 
     );
 }
 
 static struct optionSpec options[] = {
    {"A", OPTION_STRING},
    {NULL, 0},
 };
 
 char *acceptString = NULL;
 char **acceptExtensions = NULL; 
 int acceptExtensionsCount = 0;
 
 boolean paraSync(int numConnections, int numRetries, struct dyString *url, struct dyString *outPath)
 /* Fetch given URL, send to stdout. */
 {
 // requirements:
 //   URL must end in / slash
 //   foo must end in / slash
 if (!endsWith(url->string,"/"))
     errAbort("URL must end in slash /");
 if (!endsWith(outPath->string,"/"))
     errAbort("outPath must end in slash /");
 // create subdir if it does not exist
 makeDir(outPath->string);
 struct dyString *dy = netSlurpUrl(url->string);
 char *p = dy->string;
 
 verbose(2,"response=[%s]\n", dy->string);
 
 boolean result = TRUE;
 
 char *pattern = "<a href=\"";
 while (TRUE)
     {
-    char *q = strstr(p,pattern);
+    char *q = NULL;
+    boolean isDirectory = FALSE;
+    if (startsWith("ftp:", url->string))
+	{
+	char ftype = p[0];
+	if (!ftype)
+	    break;
+	char *peol = strchr(p,'\n'); 
+	if (!peol)
+	    break;
+        *peol = 0;
+        if (*(peol-1) == '\r')
+	    *(peol-1) = 0;
+	q = strrchr(p,' ');
+	if (!q)
+	    break;  // should not happen
+	++q;
+        p = peol+1;
+	if (ftype == 'l')
+            {
+	    //skip symlinks
+	    continue;
+	    }
+	if (ftype == 'd')
+            {
+	    isDirectory = TRUE;
+	    }
+	}
+    else  // http(s)
+	{
+	q = strstr(p,pattern);
     if (!q)
 	break;
     q += strlen(pattern);
     p = strchr(q,'"');
     if (!p)
 	errAbort("unmatched \" in URL");
     *p = 0;
     ++p; // get past the terminator that we added earlier.   
 
     // We want to skip several kinds of links
     if (q[0] == '?') continue;
     if (q[0] == '/') continue;
-    if (startsWith(q, "ftp:")) continue;
-    if (startsWith(q, "http:")) continue;
-    if (startsWith(q, "https:")) continue;
-    if (startsWith(q, "./")) continue;
-    if (startsWith(q, "../")) continue;
+	if (startsWith("ftp:"  ,q)) continue;
+	if (startsWith("http:" ,q)) continue;
+	if (startsWith("https:",q)) continue;
+	if (startsWith("./"    ,q)) continue;
+	if (startsWith("../"   ,q)) continue;
+
+	if (endsWith(q, "/")) 
+	    isDirectory = TRUE;
+	}
 
     verbose(1, "%s\n", q);
 
     int saveUrlSize = url->stringSize;
     int saveOutPathSize = outPath->stringSize;
 
     dyStringAppend(url, q);
     dyStringAppend(outPath, q);
+    if (startsWith("ftp:", url->string) && isDirectory)
+	{
+	dyStringAppend(url, "/");
+    	dyStringAppend(outPath, "/");
+	}
+    
  
     // URL found
-    if (endsWith(q, "/")) // directory
+    if (isDirectory) 
 	{   
 	// recursive
 	if (!paraSync(numConnections, numRetries, url, outPath))
 	    result = FALSE;
 	}
     else    // file
 	{
 	// Test accepted extensions if applicable.
         boolean accepted = (acceptExtensionsCount == 0);
         int i = 0;
         for(i=0; i<acceptExtensionsCount; ++i) 
 	    {
 	    if (endsWith(q, acceptExtensions[i]))
 		{
 		accepted = TRUE;
 		break;
 		}
 	    }
 	if (accepted)
 	    {
 	    // check to see if it needs download, i.e. file does not exist, or it needs a resume
 	    int restoreSize = outPath->stringSize;
 	    dyStringAppend(outPath, ".paraFetchStatus");
 	    boolean needsDownload = fileExists(outPath->string);
 	    dyStringResize(outPath, restoreSize);
 	    if (!fileExists(outPath->string))
 		needsDownload = TRUE;
 	    if (needsDownload)
 		{
 		if (!parallelFetch(url->string, outPath->string, numConnections, numRetries))
 		    {
 		    warn("failed to download %s\n", url->string);
 		    // write to a log that this one failed
 		    // and try to continue
 		    result = FALSE;
 		    }
 		else
 		    {
 		    verbose(1,"%s downloaded successfully\n", url->string);
 		    }
 		}
 	    }
 	}
     
     dyStringResize(url, saveUrlSize);
     dyStringResize(outPath, saveOutPathSize);
 
     }
 
 
 return result;
 
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 5)
     usage();
 acceptString = optionVal("A", NULL);
 if (acceptString)
     {
     acceptExtensionsCount = chopByChar(acceptString, ',', NULL, 0);
     AllocArray(acceptExtensions, acceptExtensionsCount);
     chopByChar(acceptString, ',', acceptExtensions, acceptExtensionsCount);
     verbose(1, "accept-option count: %d\n", acceptExtensionsCount);
     int i = 0;
     for(i=0; i<acceptExtensionsCount; ++i) 
 	{
 	verbose(2, "accept-option: %s\n", acceptExtensions[i]);
 	}
     }
 struct dyString *url = dyStringNew(4096);
 struct dyString *outPath = dyStringNew(4096);
 dyStringAppend(url, argv[3]);
 dyStringAppend(outPath, argv[4]);
 if (!paraSync(atoi(argv[1]), atoi(argv[2]), url, outPath))
     exit(1);
 return 0;
 }