src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh a3411a622fcf54100e80fd388cbda49afed17612

a3411a622fcf54100e80fd388cbda49afed17612
braney
  Mon Aug 3 09:31:04 2020 -0700
ongoing work on merged gene set

diff --git src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh
index 0917f8e..28cd813 100644
--- src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh
+++ src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh
@@ -1,24 +1,25 @@
 # This doc assumes that the gencode* tables have been built on $db
 db=hg38
 GENCODE_VERSION=V35
 dir=/hive/data/genomes/$db/bed/gencode$GENCODE_VERSION/build
 genomes=/hive/data/genomes
 tempDb=knownGeneV35
 kent=$HOME/kent
 spDb=sp180404
 cpuFarm=ku
+export curVer=13
 
 mkdir -p $dir
 cd $dir
 
 echo "create database $tempDb" | hgsql ""
 echo "create table chromInfo like  $db.chromInfo" | hgsql $tempDb
 echo "insert into chromInfo select * from   $db.chromInfo" | hgsql $tempDb
 
 hgsql -e "select * from gencodeAnnot$GENCODE_VERSION" --skip-column-names $db | cut -f 2-16 |  genePredToBed stdin stdout | sort -k1,1 -k2,2n | gzip -c > gencode${GENCODE_VERSION}.bed.gz
 
 touch oldToNew.tab
 
 zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed
 txBedToGraph ucscGenes.bed ucscGenes ucscGenes.txg
 txgAnalyze ucscGenes.txg $genomes/$db/$db.2bit stdout | sort | uniq | bedClip stdin /cluster/data/hg38/chrom.sizes  ucscSplice.bed
@@ -275,15 +276,56 @@
     hgLoadRnaFold -warnEmpty $tempDb foldUtr3 fold
 
 # Clean up
     rm -r split fold err batch.bak
     cd ../utr5
     rm -r split fold err batch.bak
 
 hgKgGetText $tempDb tempSearch.txt
 sort tempSearch.txt > tempSearch2.txt
 tawk '{split($2,a,"."); printf "%s\t", $1;for(ii = 1; ii <= a[2]; ii++) printf "%s ",a[1] "." ii; printf "\n" }' txToAcc.tab | sort > tempSearch3.txt
 join tempSearch2.txt tempSearch3.txt | sort > knownGene.txt
 ixIxx knownGene.txt knownGene${GENCODE_VERSION}.ix knownGene${GENCODE_VERSION}.ixx
  rm -rf /gbdb/$db/knownGene${GENCODE_VERSION}.ix /gbdb/$db/knownGene${GENCODE_VERSION}.ixx
 ln -s $dir/knownGene${GENCODE_VERSION}.ix  /gbdb/$db/knownGene${GENCODE_VERSION}.ix
 ln -s $dir/knownGene${GENCODE_VERSION}.ixx /gbdb/$db/knownGene${GENCODE_VERSION}.ixx  
+
+#zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed
+#jtwoBitToFa -noMask /cluster/data/$db/$db.2bit -bed=ucscGenes.bed stdout | faFilter -uniq stdin  ucscGenes.fa
+#jhgPepPred $tempDb generic knownGeneMrna ucscGenes.fa
+bedToPsl /cluster/data/$db/chrom.sizes ucscGenes.bed ucscGenes.psl
+pslRecalcMatch ucscGenes.psl /cluster/data/$db/$db.2bit ucscGenes.fa kgTargetAli.psl
+# should be zero
+awk '$11 != $1 + $3+$4' kgTargetAli.psl
+hgLoadPsl $tempDb kgTargetAli.psl
+
+cd $dir
+# Make PCR target for UCSC Genes, Part 1.
+# 1. Get a set of IDs that consist of the UCSC Gene accession concatenated with the
+#    gene symbol, e.g. uc010nxr.1__DDX11L1
+hgsql $tempDb -N -e 'select kgId,geneSymbol from kgXref' \
+    | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
+      | sort -u > idSub.txt 
+# 2. Get a file of per-transcript fasta sequences that contain the sequences of each UCSC Genes transcript, with this new ID in the place of the UCSC Genes accession.   Convert that file to TwoBit format and soft-link it into /gbdb/hg38/targetDb/ 
+awk '{if (!found[$4]) print; found[$4]=1 }' ucscGenes.bed > nodups.bed
+subColumn 4 nodups.bed idSub.txt ucscGenesIdSubbed.bed 
+sequenceForBed -keepName -db=$db -bedIn=ucscGenesIdSubbed.bed -fastaOut=stdout  | faToTwoBit stdin ${db}KgSeq${curVer}.2bit
+mkdir -p /gbdb/$db/targetDb/ 
+rm -f /gbdb/$db/targetDb/${db}KgSeq${curVer}.2bit
+ln -s $dir/${db}KgSeq${curVer}.2bit /gbdb/$db/targetDb/
+# Load the table kgTargetAli, which shows where in the genome these targets are.
+cut -f 1-10 knownGene.gp | genePredToFakePsl $tempDb stdin kgTargetAli.psl /dev/null
+hgLoadPsl $tempDb kgTargetAli.psl
+
+# 3. Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on       
+# /gbdb/$db/targetDb/${db}KgSeq${curVer}.2bit 
+
+# 4. On hgwdev, insert new records into blatServers and targetDb, using the 
+# host (field 2) and port (field 3) specified by cluster-admin.  Identify the
+# blatServer by the keyword "$db"Kg with the version number appended
+# untrans gfServer for hg38KgSeq12 on host blat1b, port 17897
+hgsql hgcentraltest -e \
+      'INSERT into blatServers values ("hg38KgSeq13", "blat1b", 1909, 0, 1);'
+hgsql hgcentraltest -e \
+            'INSERT into targetDb values("hg38KgSeq13", "GENCODE Genes", \
+                     "hg38", "knownGeneV35.kgTargetAli", "", "", \
+                              "/gbdb/hg38/targetDb/hg38KgSeq13.2bit", 1, now(), "");'