c67f70fced286da481cf4a3c73013d1d2fca909b max Fri Jul 1 07:07:13 2022 -0700 adding lrg ncbi accession to search specs, refs #29689 diff --git src/hg/makeDb/doc/hg38/hg38.txt src/hg/makeDb/doc/hg38/hg38.txt index 5a38b61..3e3eaaf 100644 --- src/hg/makeDb/doc/hg38/hg38.txt +++ src/hg/makeDb/doc/hg38/hg38.txt @@ -3636,30 +3636,31 @@ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 102m41.443s cat fb.monDom5.chainHg38Link.txt # 420069915 bases of 3501660299 (11.996%) in intersection time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \ -dbHost=hgwdev -workhorse=hgwdev monDom5 hg38) > rbest.log 2>&1 # real 90m56.189s _EOF_ ############################################################################# # LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 10/25/19 angie) # Redmine #13359, #24285 -- otto-mate To Do #17877 # previously done 7/7/14, 9/9/16, 5/30/18 +# THIS IS NOW AN OTTO JOB !! set today = `date +%Y_%m_%d` mkdir -p /hive/data/genomes/hg38/bed/lrg/$today cd /hive/data/genomes/hg38/bed/lrg/$today wget ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_public_xml_files.zip unzip LRG_public_xml_files.zip # Run script to convert LRG*.xml files to BED+ for regions and genePredExt+fa for transcripts: # parseLrgXml.pl updated 2020-09-16 to add four new fields to the gp output # the four extra fields are identifiers for: # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein ~/kent/src/hg/utils/automation/parseLrgXml.pl GRCh38 genePredCheck lrgTranscriptsUnmapped.gp #Error: lrgTranscriptsUnmapped.gp:765: LRG_7t1 no exonFrame on CDS exon 46 #checked: 1029 failed: 1 @@ -3674,32 +3675,36 @@ # and we need the transcript plus gene name later: cut -f1,12 lrgTranscriptsUnmapped.gp | sort > transcript.gene.name.txt # five extra columns have been added to the genePred (2020-10-05 - Hiram) # extract them so they can be added to the psl: awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s %s %s %s\n", $1,$16,$17,$18,$19, $16,$18,$17,$19}' lrgTranscriptsUnmapped.gp | sort \ | join -t$'\t' - transcript.gene.name.txt \ | awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s\t%s %s\n", $1,$2,$3,$4,$5,$7,$6,$7}' > lrgTransExtraFields.tsv # the five extra fields are identifiers for: # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein, # Gene name # Load LRG regions: + #bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \ + #-tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name + # after ML #29689, added ncbiAcc field, Max, July 1, 2022 + # changed to: bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \ - -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name + -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name,ncbiAcc ln -sf `pwd`/lrg.bb /gbdb/hg38/bbi/lrg.bb hgBbiDbLink hg38 lrg /gbdb/hg38/bbi/lrg.bb # Map LRG fixed_annotation transcripts from LRG coords to hg38 coords (HT MarkD): lrgToPsl lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.psl pslCheck lrg.psl #checked: 919 failed: 0 errors: 0 awk '{print $10 "\t" $11;}' lrg.psl > lrg.sizes genePredToFakePsl -chromSize=lrg.sizes placeholder \ lrgTranscriptsUnmapped.gp lrgTranscriptsFakePsl.psl lrgTranscripts.cds pslMap lrgTranscriptsFakePsl.psl lrg.psl lrgTranscriptsHg38.psl mrnaToGene -genePredExt -cdsFile=lrgTranscripts.cds -keepInvalid \ lrgTranscriptsHg38.psl lrgTranscriptsHg38NoName2.gp #Warning: no CDS for LRG_163t1 #Warning: no CDS for LRG_347t1