94383c9c3192d68eee122deb4effb0313f13c773
kuhn
  Mon Dec 10 18:59:20 2012 -0800
added logic to get the link name for each url, and a provision for backup in genecats when the script is run more than once in a day. still needs some work.
diff --git src/utils/qa/checkStaticLinks.csh src/utils/qa/checkStaticLinks.csh
index 4dee5cd..c925ed0 100755
--- src/utils/qa/checkStaticLinks.csh
+++ src/utils/qa/checkStaticLinks.csh
@@ -1,96 +1,156 @@
 #!/bin/tcsh
 source `which qaConfig.csh`
 
 ###############################################
-#  05-10-04
+#  05-10-2004
 # 
 #  checks the links in all the files in a directory
 #  Robert Kuhn
 # 
 ###############################################
 
 set filePath=""
 set out=""
+set url=""
 set exclude=""
 set excludeList=""
 set baseUrl="http://hgwbeta.cse.ucsc.edu"
 set errs=""
 
 if ( $#argv < 1 || $#argv > 2 ) then
   # wrong number of command-line args
   echo
   echo "  checks the links in all the static pages in a directory."
   echo "  operates on pages on hgwbeta"
   echo "  writes a file called dir.dir.err"
   echo
   echo "    usage:  pathInHtdocs [excludeList]"
   echo '      where:'
   echo '        pathInHtdocs = path in htdocs (0 for htdocs root)'
   echo "        excludeList = filename for list of files not to check"
   echo
   exit
 endif
 
 if ($argv[1] == 0) then
   # filePath is already htdocs root level
 else
-  # strip trailing backslash"
-  set filePath=`echo $argv[1] | sed -e 's/\/$//'`
+  # strip trailing slash
+  set filePath=`echo $argv[1] | sed 's@/$@@'`
 endif
 
 if ( $#argv == 2 ) then
   set excludeList=$argv[2]
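+  # use file(1) to confirm the exclude list is a plain text file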
   file $excludeList | grep -q "ASCII text"
   if ( $status ) then
     echo "\nexclude file $excludeList does not exist\n"
     exit 1
   endif
   set exclude=`cat $excludeList`
 endif
 
 # get list of active files from beta
 # and strip off the pathname from list leaving only filenames
 
 set origlist=`ssh hgwbeta 'ls /usr/local/apache/htdocs/'${filePath}'/*html' \
       | sed "s/.*\///g"`
-echo
 
 # strip out any files in exclude list
 foreach excl ( $exclude )
   set origlist=`echo $origlist | sed "s/ /\n/g" | egrep -wv $excl`
 end
 
+# echo $origlist
+
+# set up outfile for all the files in the dir
 set i=0
 set errs=0
 rm -f outfile
 echo "\nfiles checked in htdocs/${filePath}" >> outfile
 echo $origlist | sed "s/ /\n/g" >> outfile
 echo >> outfile
 
 foreach file ( $origlist )
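+  # check links on one page at a time, capturing all htmlCheck output (stdout and stderr) in tmp0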
+  rm -f tmp0
+  htmlCheck checkLinks $baseUrl/$filePath/$file  >>& tmp0
+  if ( ! -z tmp0 ) then
+    # htmlCheck reported problems (>>& creates tmp0 even when empty, so test size, not existence)
+    # clean out things we don't care about
+    rm -f tmp
+    grep -v "403" tmp0 \
+      | grep -v "doesn't exist" \
+      | grep -v "Cancelling" \
+      | grep -v "service not known" \
+      | grep -v "than directories in" \
+      | grep -v "Connection refused" \
+      | egrep "."  > tmp
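+    # the final egrep "." drops blank lines, so tmp holds only real error lines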
+    rm -f tmp0
+
+    if ( `wc -l tmp | awk '{print $1}'` > 0 ) then
+      # there were errors worth looking at
+      # get the link names for any broken urls
+      @ errs = $errs + 1                    # counts files with errors
+      set j=1
+      set errors=`wc -l tmp | awk '{print $1}'`  # counts errs in file
-  rm -f outfile$file
+      rm -f outfile$file
-  echo $file                                    >>& outfile$file
-  echo                 $baseUrl/$filePath/$file >>& outfile$file
-  htmlCheck checkLinks $baseUrl/$filePath/$file >>& outfile$file
-  if ( `cat outfile$file | grep -v "doesn't exist" | wc -l` > 2 ) then
-    # there are errors
-    cat outfile$file | grep -v "doesn't exist"  >> outfile
-    echo                       >> outfile
-    @ errs = $errs + 1
+      rm -f err$file                               # remove any leftover from a previous run
+      echo                                           >> err$file
+      while ( $j <= $errors )
+        set errLine=`sed -n "${j}p" tmp`
+        set url=`sed -n "${j}p" tmp | awk '{print $NF}'`
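+        # the broken url is the last field of the htmlCheck error line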
+        set xfile=$baseUrl/$filePath/$file
+        # set xfile=http://genome.ucsc.edu/goldenPath/credits.html
+        # set url=http://www.genome.washington.edu/UWGC
+
+        # grab the matching line plus the next 4 from the html page, then trim
+        # at the closing </A> tag (note: egrep -q would suppress output, leaving link empty)
+        set link=`htmlCheck getHtml $xfile | egrep -i -A 4 "$url" \
+          | sed -n "1,/<\/A>/p"`
+        set link=`echo $link \
+          | awk -F'</A>' '{print $1}' \
+          | awk -F'>' '{print $NF}'`
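+        # keep the text between the last ">" and "</A>", i.e., the visible link name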
+
+        echo "link  = $link"                         >> err$file
+        echo "error = $errLine"                      >> err$file
+        echo                                         >> err$file
+        @ j = $j + 1
+      end
+      @ j = $j - 1
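+      # j now equals the number of error lines processed for this file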
+      if ( $j > 0 ) then
+        echo $file                                >> outfile$file
+        echo $baseUrl/$filePath/$file             >> outfile$file
+        cat err$file                              >> outfile$file
+        echo " found $j errors in $file"          >> outfile$file
+        echo "---------------------------"        >> outfile$file
+        echo                                      >> outfile$file
+        cat outfile$file                       >> outfile
-  endif
+      endif
-  @ i = $i + 1
+      rm -f err$file
+    endif
+    rm -f tmp
-  rm -f outfile$file
+    rm -f outfile$file
+  endif
+  @ i = $i + 1
 end
-echo "\n directory" = $filePath >> outfile
+
+echo "\n directory = htdocs/$filePath"         >> outfile
 echo " checked $i files" >> outfile
 # note:  if you change the line below the wrapper script will break
 echo " found errors in $errs files\n" >> outfile
+echo                                           >> outfile
 
 # cat outfile
 if ( $filePath == "" ) then
   set out=htdocs.err
 else
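+  # turn the path into a dotted name, e.g. goldenPath/help -> goldenPath.help.err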
   set out=`echo $filePath | sed s@/@.@g`.err
 endif
 mv outfile $out