904412b7994f145e262ec606b47a725750b7edce
jcasper
  Wed Mar 6 10:58:24 2024 -0800
Some updates for the knownGene pipeline, refs #32414

diff --git src/hg/utils/otto/knownGene/buildTo.sh src/hg/utils/otto/knownGene/buildTo.sh
index c163074..aab8c8c 100755
--- src/hg/utils/otto/knownGene/buildTo.sh
+++ src/hg/utils/otto/knownGene/buildTo.sh
@@ -1,23 +1,23 @@
 #!/bin/sh -ex
 
 {
 . ./buildEnv.sh
 
 # knownToLocusLink
 #hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from hgFixed.refLink" $db > refToLl.txt
-hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from ncbiRefSeqLink where mrnaAcc != ''" $db > refToLl.txt
+hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from ncbiRefSeqLink where mrnaAcc != '' and locusLinkId != ''" $db > refToLl.txt
 hgMapToGene -geneTableType=genePred -tempDb=$tempDb $db ncbiRefSeq knownGene knownToLocusLink -lookup=refToLl.txt
 rm refToLl.txt
 
 if test "$gtexGeneModel" != ""
 then
     hgMapToGene -geneTableType=genePred $db -tempDb=$tempDb -all -type=genePred $gtexGeneModel knownGene knownToGtex
 fi
 
 # knownToEnsembl and knownToGencode${GENCODE_VERSION}
 awk '{OFS="\t"} {print $4,$4}' ucscGenes.bed | sort | uniq > knownToEnsembl.tab
 cp knownToEnsembl.tab knownToGencode${GENCODE_VERSION}.tab
 hgLoadSqlTab -notOnServer $tempDb  knownToEnsembl  $kent/src/hg/lib/knownTo.sql  knownToEnsembl.tab
 hgLoadSqlTab -notOnServer $tempDb  knownToGencode${GENCODE_VERSION}  $kent/src/hg/lib/knownTo.sql  knownToGencode${GENCODE_VERSION}.tab
 
 # make knownToLynx
@@ -62,32 +62,36 @@
                         -lookup=$gnfAtlasTableLookup
     fi
 fi
 
 if test "$gnfU95TableLookup" != ""
 then
     hgMapToGene -geneTableType=genePred -tempDb=$tempDb $db affyU95 knownGene $gnfU95TableLookup
     if test  "$gnfU95TablesFixed" != ""
     then
         time hgExpDistance $tempDb $gnfU95TablesFixed gnfU95Distance  -lookup=$gnfU95TableLookup
     fi
 fi
 
 if test "$hprd_file" != ""
 then
-    wget "$hprd_website/$hprd_tar"
-    tar xvf $hprd_tar
+# The fetch lines below are commented out until the hprd.org website is back online.  Until then,
+# the files are copied from the previous build instead (they haven't changed in a while).
+#    wget "$hprd_website/$hprd_tar"
+#    tar xvf $hprd_tar
+    cp ${oldGeneDir}/${hprd_tar} .
+    cp -R `dirname ${oldGeneDir}/${hprd_file}` ./`dirname ${hprd_file}`
     knownToHprd $tempDb $hprd_file
     hgLoadNetDist $genomes/hg19/p2p/hprd/hprd.pathLengths $tempDb humanHprdP2P \
         -sqlRemap="select distinct value, name from knownToHprd"
 
     # these should be under a different test, but...
     hgLoadNetDist $genomes/hg19/p2p/vidal/humanVidal.pathLengths $tempDb humanVidalP2P -sqlRemap="select distinct locusLinkID, kgID from hgFixed.refLink,kgXref where hgFixed.refLink.mrnaAcc = kgXref.refSeq"
     hgLoadNetDist $genomes/hg19/p2p/wanker/humanWanker.pathLengths $tempDb humanWankerP2P -sqlRemap="select distinct locusLinkID, kgID from hgFixed.refLink,kgXref where hgFixed.refLink.mrnaAcc = kgXref.refSeq"
 
 fi
 
 # mupit-pdbids.txt was emailed from Kyle Moad (kmoad@insilico.us.com)
 # wc -l 
 cp /hive/data/outside/mupit/mupit-pdbids.txt .
 # get knownGene IDs and associated PDB IDS
 # the extDb{Ref} parts come from hg/hgGene/domains.c:domainsPrint()