904412b7994f145e262ec606b47a725750b7edce jcasper Wed Mar 6 10:58:24 2024 -0800 Some updates for the knownGene pipeline, refs #32414 diff --git src/hg/utils/otto/knownGene/buildTo.sh src/hg/utils/otto/knownGene/buildTo.sh index c163074..aab8c8c 100755 --- src/hg/utils/otto/knownGene/buildTo.sh +++ src/hg/utils/otto/knownGene/buildTo.sh @@ -1,23 +1,23 @@ #!/bin/sh -ex { . ./buildEnv.sh # knownToLocusLink #hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from hgFixed.refLink" $db > refToLl.txt -hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from ncbiRefSeqLink where mrnaAcc != ''" $db > refToLl.txt +hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from ncbiRefSeqLink where mrnaAcc != '' and locusLinkId != ''" $db > refToLl.txt hgMapToGene -geneTableType=genePred -tempDb=$tempDb $db ncbiRefSeq knownGene knownToLocusLink -lookup=refToLl.txt rm refToLl.txt if test "$gtexGeneModel" != "" then hgMapToGene -geneTableType=genePred $db -tempDb=$tempDb -all -type=genePred $gtexGeneModel knownGene knownToGtex fi # knownToEnsembl and knownToGencode${GENCODE_VERSION} awk '{OFS="\t"} {print $4,$4}' ucscGenes.bed | sort | uniq > knownToEnsembl.tab cp knownToEnsembl.tab knownToGencode${GENCODE_VERSION}.tab hgLoadSqlTab -notOnServer $tempDb knownToEnsembl $kent/src/hg/lib/knownTo.sql knownToEnsembl.tab hgLoadSqlTab -notOnServer $tempDb knownToGencode${GENCODE_VERSION} $kent/src/hg/lib/knownTo.sql knownToGencode${GENCODE_VERSION}.tab # make knownToLynx @@ -62,32 +62,36 @@ -lookup=$gnfAtlasTableLookup fi fi if test "$gnfU95TableLookup" != "" then hgMapToGene -geneTableType=genePred -tempDb=$tempDb $db affyU95 knownGene $gnfU95TableLookup if test "$gnfU95TablesFixed" != "" then time hgExpDistance $tempDb $gnfU95TablesFixed gnfU95Distance -lookup=$gnfU95TableLookup fi fi if test "$hprd_file" != "" then - wget "$hprd_website/$hprd_tar" - tar xvf $hprd_tar +# Commented out these fetch lines until the hprd.org website returns. In the meantime, +# you can copy the files from the previous build (they haven't changed in a while). +# wget "$hprd_website/$hprd_tar" +# tar xvf $hprd_tar + cp ${oldGeneDir}/${hprd_tar} . + cp -R `dirname ${oldGeneDir}/${hprd_file}` ./`dirname ${hprd_file}` knownToHprd $tempDb $hprd_file hgLoadNetDist $genomes/hg19/p2p/hprd/hprd.pathLengths $tempDb humanHprdP2P \ -sqlRemap="select distinct value, name from knownToHprd" # these should be under a different test, but... hgLoadNetDist $genomes/hg19/p2p/vidal/humanVidal.pathLengths $tempDb humanVidalP2P -sqlRemap="select distinct locusLinkID, kgID from hgFixed.refLink,kgXref where hgFixed.refLink.mrnaAcc = kgXref.refSeq" hgLoadNetDist $genomes/hg19/p2p/wanker/humanWanker.pathLengths $tempDb humanWankerP2P -sqlRemap="select distinct locusLinkID, kgID from hgFixed.refLink,kgXref where hgFixed.refLink.mrnaAcc = kgXref.refSeq" fi # mupit-pdbids.txt was emailed from Kyle Moad (kmoad@insilico.us.com) # wc -l cp /hive/data/outside/mupit/mupit-pdbids.txt . # get knownGene IDs and associated PDB IDS # the extDb{Ref} parts come from hg/hgGene/domains.c:domainsPrint()