a3411a622fcf54100e80fd388cbda49afed17612 braney Mon Aug 3 09:31:04 2020 -0700 ongoing work on merged gene set diff --git src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh index 0917f8e..28cd813 100644 --- src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh +++ src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh @@ -1,24 +1,25 @@ # This doc assumes that the gencode* tables have been built on $db db=hg38 GENCODE_VERSION=V35 dir=/hive/data/genomes/$db/bed/gencode$GENCODE_VERSION/build genomes=/hive/data/genomes tempDb=knownGeneV35 kent=$HOME/kent spDb=sp180404 cpuFarm=ku +export curVer=13 mkdir -p $dir cd $dir echo "create database $tempDb" | hgsql "" echo "create table chromInfo like $db.chromInfo" | hgsql $tempDb echo "insert into chromInfo select * from $db.chromInfo" | hgsql $tempDb hgsql -e "select * from gencodeAnnot$GENCODE_VERSION" --skip-column-names $db | cut -f 2-16 | genePredToBed stdin stdout | sort -k1,1 -k2,2n | gzip -c > gencode${GENCODE_VERSION}.bed.gz touch oldToNew.tab zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed txBedToGraph ucscGenes.bed ucscGenes ucscGenes.txg txgAnalyze ucscGenes.txg $genomes/$db/$db.2bit stdout | sort | uniq | bedClip stdin /cluster/data/hg38/chrom.sizes ucscSplice.bed @@ -275,15 +276,56 @@ hgLoadRnaFold -warnEmpty $tempDb foldUtr3 fold # Clean up rm -r split fold err batch.bak cd ../utr5 rm -r split fold err batch.bak hgKgGetText $tempDb tempSearch.txt sort tempSearch.txt > tempSearch2.txt tawk '{split($2,a,"."); printf "%s\t", $1;for(ii = 1; ii <= a[2]; ii++) printf "%s ",a[1] "." ii; printf "\n" }' txToAcc.tab | sort > tempSearch3.txt join tempSearch2.txt tempSearch3.txt | sort > knownGene.txt ixIxx knownGene.txt knownGene${GENCODE_VERSION}.ix knownGene${GENCODE_VERSION}.ixx rm -rf /gbdb/$db/knownGene${GENCODE_VERSION}.ix /gbdb/$db/knownGene${GENCODE_VERSION}.ixx ln -s $dir/knownGene${GENCODE_VERSION}.ix /gbdb/$db/knownGene${GENCODE_VERSION}.ix ln -s $dir/knownGene${GENCODE_VERSION}.ixx /gbdb/$db/knownGene${GENCODE_VERSION}.ixx + +#zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed +#jtwoBitToFa -noMask /cluster/data/$db/$db.2bit -bed=ucscGenes.bed stdout | faFilter -uniq stdin ucscGenes.fa +#jhgPepPred $tempDb generic knownGeneMrna ucscGenes.fa +bedToPsl /cluster/data/$db/chrom.sizes ucscGenes.bed ucscGenes.psl +pslRecalcMatch ucscGenes.psl /cluster/data/$db/$db.2bit ucscGenes.fa kgTargetAli.psl +# should be zero +awk '$11 != $1 + $3+$4' kgTargetAli.psl +hgLoadPsl $tempDb kgTargetAli.psl + +cd $dir +# Make PCR target for UCSC Genes, Part 1. +# 1. Get a set of IDs that consist of the UCSC Gene accession concatenated with the +# gene symbol, e.g. uc010nxr.1__DDX11L1 +hgsql $tempDb -N -e 'select kgId,geneSymbol from kgXref' \ + | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \ + | sort -u > idSub.txt +# 2. Get a file of per-transcript fasta sequences that contain the sequences of each UCSC Genes transcript, with this new ID in the place of the UCSC Genes accession. Convert that file to TwoBit format and soft-link it into /gbdb/hg38/targetDb/ +awk '{if (!found[$4]) print; found[$4]=1 }' ucscGenes.bed > nodups.bed +subColumn 4 nodups.bed idSub.txt ucscGenesIdSubbed.bed +sequenceForBed -keepName -db=$db -bedIn=ucscGenesIdSubbed.bed -fastaOut=stdout | faToTwoBit stdin ${db}KgSeq${curVer}.2bit +mkdir -p /gbdb/$db/targetDb/ +rm -f /gbdb/$db/targetDb/${db}KgSeq${curVer}.2bit +ln -s $dir/${db}KgSeq${curVer}.2bit /gbdb/$db/targetDb/ +# Load the table kgTargetAli, which shows where in the genome these targets are. +cut -f 1-10 knownGene.gp | genePredToFakePsl $tempDb stdin kgTargetAli.psl /dev/null +hgLoadPsl $tempDb kgTargetAli.psl + +# 3. Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on +# /gbdb/$db/targetDb/${db}KgSeq${curVer}.2bit + +# 4. On hgwdev, insert new records into blatServers and targetDb, using the +# host (field 2) and port (field 3) specified by cluster-admin. Identify the +# blatServer by the keyword "$db"Kg with the version number appended +# untrans gfServer for hg38KgSeq12 on host blat1b, port 17897 +hgsql hgcentraltest -e \ + 'INSERT into blatServers values ("hg38KgSeq13", "blat1b", 1909, 0, 1);' +hgsql hgcentraltest -e \ + 'INSERT into targetDb values("hg38KgSeq13", "GENCODE Genes", \ + "hg38", "knownGeneV35.kgTargetAli", "", "", \ + "/gbdb/hg38/targetDb/hg38KgSeq13.2bit", 1, now(), "");'