src/hg/makeDb/doc/hg38/hg38.txt 17748af09f0add98009e2c4c181cb4e75d068d26

17748af09f0add98009e2c4c181cb4e75d068d26
hiram
  Wed Sep 16 14:17:15 2020 -0700
modified procedure to make a new style LRG transcript track refs #24672

diff --git src/hg/makeDb/doc/hg38/hg38.txt src/hg/makeDb/doc/hg38/hg38.txt
index 88a1e90..6b29194 100644
--- src/hg/makeDb/doc/hg38/hg38.txt
+++ src/hg/makeDb/doc/hg38/hg38.txt
@@ -3643,66 +3643,94 @@
         -dbHost=hgwdev -workhorse=hgwdev monDom5 hg38) > rbest.log 2>&1
     #  real    90m56.189s
 
 _EOF_
 #############################################################################
 # LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 10/25/19 angie)
 # Redmine #13359, #24285 -- otto-mate To Do #17877
 # previously done 7/7/14, 9/9/16, 5/30/18
     set today = `date +%Y_%m_%d`
     mkdir -p /hive/data/genomes/hg38/bed/lrg/$today
     cd /hive/data/genomes/hg38/bed/lrg/$today
     wget ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_public_xml_files.zip
     unzip LRG_public_xml_files.zip
 
     # Run script to convert LRG*.xml files to BED+ for regions and genePredExt+fa for transcripts:
+    # parseLrgXml.pl updated 2020-09-16 to add four new fields to the gp output
+    # the four extra fields are identifiers for:
+    # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein
+
     ~/kent/src/hg/utils/automation/parseLrgXml.pl GRCh38
     genePredCheck lrgTranscriptsUnmapped.gp
 #Error: lrgTranscriptsUnmapped.gp:765: LRG_7t1 no exonFrame on CDS exon 46
 #checked: 1029 failed: 1
     # If there are complaints e.g. about exonFrame, look for inconsistencies in the
     # affected transcript's coding_region/coordinates vs. exon/intron info in xml.
     # Contact Variation team leader Fiona Cunningham @EBI to resolve in the background
     # (missing exonFrame info doesn't affect our track representation because we end up using
     # psl).  We agreed to disagree about exon 46 of LRG_7t1 because that last coding exon
     # portion is only the stop codon.
 
     # No longer necessary to filter out alt and fix patches since they have been added to hg38.
 
+    # four extra columns have been added to the genePred (2020-09-16 - Hiram)
+    # extract them so they can be added to the psl:
+    awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s %s %s %s\n", $1,$16,$17,$18,$19, $16,$18,$17,$19}' lrgTranscriptsUnmapped.gp | sort > lrgTransExtraFields.tsv
+
+    # the four extra fields are identifiers for:
+    # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein
+
     # Load LRG regions:
     bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \
       -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name
     ln -sf `pwd`/lrg.bb /gbdb/hg38/bbi/lrg.bb
     hgBbiDbLink hg38 lrg /gbdb/hg38/bbi/lrg.bb
 
     # Map LRG fixed_annotation transcripts from LRG coords to hg38 coords (HT MarkD):
     lrgToPsl lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.psl
     pslCheck lrg.psl
 #checked: 919 failed: 0 errors: 0
     awk '{print $10 "\t" $11;}' lrg.psl > lrg.sizes
     genePredToFakePsl -chromSize=lrg.sizes placeholder \
       lrgTranscriptsUnmapped.gp lrgTranscriptsFakePsl.psl lrgTranscripts.cds
     pslMap lrgTranscriptsFakePsl.psl lrg.psl lrgTranscriptsHg38.psl
     mrnaToGene -genePredExt -cdsFile=lrgTranscripts.cds -keepInvalid \
       lrgTranscriptsHg38.psl lrgTranscriptsHg38NoName2.gp
 #Warning: no CDS for LRG_163t1
 #Warning: no CDS for LRG_347t1
     # It's OK if mrnaToGene complains about "no CDS" for a non-coding tx (RefSeq accession NR_*).
     grep -l NR_ LRG_163.xml LRG_347.xml
 #LRG_163.xml
 #LRG_347.xml
 
+    # construct bigPsl with four extra fields
+    pslToBigPsl lrgTranscriptsHg38.psl bigPsl.txt
+
+    # add the four extra identifiers to the bigPsl file:
+    join -t$'\t' -1 4 \
+       -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15\
+,1.16,1.17,1.18,1.19,1.20,1.21,1.22,1.23,1.24,1.25,2.2,2.3,2.4,2.5,2.6 \
+       <(sort -k4 bigPsl.txt) lrgTransExtraFields.tsv \
+         | sort -k1,1 -k2,2n > lrgExtraTranscriptsHg38.bigPsl.bed
+
+    bedToBigBed -as=bigPsl+5.as -type=bed12+17 -tab \
+       lrgExtraTranscriptsHg38.bigPsl.bed ../../../chrom.sizes lrgBigPsl.bb
+    bigBedInfo lrgBigPsl.bb
+    ln -sf `pwd`/lrgBigPsl.bb /gbdb/hg38/bbi
+    hgBbiDbLink hg38 lrgBigPsl /gbdb/hg38/bbi/lrgBigPsl.bb
+
+
     # Load PSL, CDS and sequences.
     hgLoadPsl hg38 -table=lrgTranscriptAli lrgTranscriptsHg38.psl
     hgLoadSqlTab hg38 lrgCds ~/kent/src/hg/lib/cdsSpec.sql lrgTranscripts.cds
     hgPepPred hg38 tab lrgCdna lrgCdna.tab
     hgPepPred hg38 tab lrgPep lrgPep.tab
 
 
 #############################################################################
 ## 7-Way Multiz (DONE - 2014-06-02 - Hiram)
     ssh hgwdev
     mkdir /hive/data/genomes/hg38/bed/multiz7way
     cd /hive/data/genomes/hg38/bed/multiz7way
 
     # from the 63-way in the source tree, select out the 7 used here:
     /cluster/bin/phast/tree_doctor \