a58598681583d3b1637c0a9fcd25610bdf2b6566 braney Sat Sep 11 07:27:43 2021 -0700 some cleanup of this old doc file diff --git src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh index f284599..14741f5 100644 --- src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh +++ src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh @@ -347,32 +347,32 @@ hgLoadRnaFold $tempDb foldUtr5 fold cd ../utr3 hgLoadRnaFold -warnEmpty $tempDb foldUtr3 fold # Clean up rm -r split fold err batch.bak cd ../utr5 rm -r split fold err batch.bak hgKgGetText $tempDb tempSearch.txt sort tempSearch.txt > tempSearch2.txt tawk '{split($2,a,"."); printf "%s\t", $1;for(ii = 1; ii <= a[2]; ii++) printf "%s ",a[1] "." ii; printf "\n" }' txToAcc.tab | sort > tempSearch3.txt join tempSearch2.txt tempSearch3.txt | sort > knownGene.txt ixIxx knownGene.txt knownGene${GENCODE_VERSION}.ix knownGene${GENCODE_VERSION}.ixx rm -rf /gbdb/$db/knownGene${GENCODE_VERSION}.ix /gbdb/$db/knownGene${GENCODE_VERSION}.ixx -ln -s $dir/knownGene${GENCODE_VERSION}.ix /gbdb/$db/knownGene${GENCODE_VERSION}.ix -ln -s $dir/knownGene${GENCODE_VERSION}.ixx /gbdb/$db/knownGene${GENCODE_VERSION}.ixx +ln -s `pwd`/knownGene${GENCODE_VERSION}.ix /gbdb/$db/knownGene${GENCODE_VERSION}.ix +ln -s `pwd`/knownGene${GENCODE_VERSION}.ixx /gbdb/$db/knownGene${GENCODE_VERSION}.ixx tawk '{print $5}' knownCanonical.tab | sort > knownCanonicalId.txt join knownCanonicalId.txt knownGene.txt | join -v 1 /dev/stdin pseudo.txt > knownGeneFast.txt ixIxx knownGeneFast.txt knownGeneFast${GENCODE_VERSION}.ix knownGeneFast${GENCODE_VERSION}.ixx rm -rf /gbdb/$db/knownGeneFast${GENCODE_VERSION}.ix /gbdb/$db/knownGeneFast${GENCODE_VERSION}.ixx ln -s $dir/knownGeneFast${GENCODE_VERSION}.ix /gbdb/$db/knownGeneFast${GENCODE_VERSION}.ix ln -s $dir/knownGeneFast${GENCODE_VERSION}.ixx /gbdb/$db/knownGeneFast${GENCODE_VERSION}.ixx #zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed #jtwoBitToFa -noMask /cluster/data/$db/$db.2bit -bed=ucscGenes.bed stdout | faFilter -uniq stdin ucscGenes.fa #jhgPepPred $tempDb generic knownGeneMrna ucscGenes.fa bedToPsl /cluster/data/$db/chrom.sizes ucscGenes.bed ucscGenes.psl pslRecalcMatch ucscGenes.psl /cluster/data/$db/$db.2bit ucscGenes.fa kgTargetAli.psl # should be zero awk '$11 != $1 + $3+$4' kgTargetAli.psl hgLoadPsl $tempDb kgTargetAli.psl @@ -380,45 +380,45 @@ cd $dir # Make PCR target for UCSC Genes, Part 1. # 1. Get a set of IDs that consist of the UCSC Gene accession concatenated with the # gene symbol, e.g. uc010nxr.1__DDX11L1 hgsql $tempDb -N -e 'select kgId,geneSymbol from kgXref' \ | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \ | sort -u > idSub.txt # 2. Get a file of per-transcript fasta sequences that contain the sequences of each UCSC Genes transcript, with this new ID in the place of the UCSC Genes accession. Convert that file to TwoBit format and soft-link it into /gbdb/hg38/targetDb/ awk '{if (!found[$4]) print; found[$4]=1 }' ucscGenes.bed > nodups.bed subColumn 4 nodups.bed idSub.txt ucscGenesIdSubbed.bed sequenceForBed -keepName -db=$db -bedIn=ucscGenesIdSubbed.bed -fastaOut=stdout | faToTwoBit stdin ${db}KgSeq${curVer}.2bit mkdir -p /gbdb/$db/targetDb/ rm -f /gbdb/$db/targetDb/${db}KgSeq${curVer}.2bit ln -s $dir/${db}KgSeq${curVer}.2bit /gbdb/$db/targetDb/ # Load the table kgTargetAli, which shows where in the genome these targets are. -cut -f 1-10 knownGene.gp | genePredToFakePsl $tempDb stdin kgTargetAli.psl /dev/null +#cut -f 1-10 knownGene.gp | genePredToFakePsl $tempDb stdin kgTargetAli.psl /dev/null hgLoadPsl $tempDb kgTargetAli.psl # 3. Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on # /gbdb/$db/targetDb/${db}KgSeq${curVer}.2bit # 4. On hgwdev, insert new records into blatServers and targetDb, using the # host (field 2) and port (field 3) specified by cluster-admin. Identify the # blatServer by the keyword "$db"Kg with the version number appended # untrans gfServer for hg38KgSeq12 on host blat1b, port 17897 hgsql hgcentraltest -e \ - 'INSERT into blatServers values ("hg38KgSeq13", "blat1b", 1909, 0, 1);' + 'INSERT into blatServers values ("hg38KgSeq13", "blat1b", 17909, 0, 1,"");' hgsql hgcentraltest -e \ 'INSERT into targetDb values("hg38KgSeq13", "GENCODE Genes", \ - "hg38", "knownGeneV35.kgTargetAli", "", "", \ + "hg38", "kgTargetAli", "", "", \ "/gbdb/hg38/targetDb/hg38KgSeq13.2bit", 1, now(), "");' for i in $tempFa $xdbFa $ratFa $fishFa $flyFa $wormFa $yeastFa do if test ! -f $i then echo $i not found fi done rm -rf $dir/hgNearBlastp mkdir $dir/hgNearBlastp cd $dir/hgNearBlastp tcsh cat << _EOF_ > config.ra # Latest human vs. other Gene Sorter orgs: