c67f70fced286da481cf4a3c73013d1d2fca909b
max
  Fri Jul 1 07:07:13 2022 -0700
adding lrg ncbi accession to search specs, refs #29689

diff --git src/hg/makeDb/doc/hg38/hg38.txt src/hg/makeDb/doc/hg38/hg38.txt
index 5a38b61..3e3eaaf 100644
--- src/hg/makeDb/doc/hg38/hg38.txt
+++ src/hg/makeDb/doc/hg38/hg38.txt
@@ -3636,30 +3636,31 @@
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -syntenicNet) > swap.log 2>&1
     # real    102m41.443s
 
     cat fb.monDom5.chainHg38Link.txt
     # 420069915 bases of 3501660299 (11.996%) in intersection
     time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \
         -dbHost=hgwdev -workhorse=hgwdev monDom5 hg38) > rbest.log 2>&1
     #  real    90m56.189s
 
 _EOF_
 #############################################################################
 # LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 10/25/19 angie)
 # Redmine #13359, #24285 -- otto-mate To Do #17877
 # previously done 7/7/14, 9/9/16, 5/30/18
+# THIS IS NOW AN OTTO JOB !!
     set today = `date +%Y_%m_%d`
     mkdir -p /hive/data/genomes/hg38/bed/lrg/$today
     cd /hive/data/genomes/hg38/bed/lrg/$today
     wget ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_public_xml_files.zip
     unzip LRG_public_xml_files.zip
 
     # Run script to convert LRG*.xml files to BED+ for regions and genePredExt+fa for transcripts:
     # parseLrgXml.pl updated 2020-09-16 to add four new fields to the gp output
     # the four extra fields are identifiers for:
     # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein
 
     ~/kent/src/hg/utils/automation/parseLrgXml.pl GRCh38
     genePredCheck lrgTranscriptsUnmapped.gp
 #Error: lrgTranscriptsUnmapped.gp:765: LRG_7t1 no exonFrame on CDS exon 46
 #checked: 1029 failed: 1
@@ -3674,32 +3675,36 @@
 
     # and we need the transcript plus gene name later:
     cut -f1,12 lrgTranscriptsUnmapped.gp | sort > transcript.gene.name.txt
 
     # five extra columns have been added to the genePred (2020-10-05 - Hiram)
     # extract them so they can be added to the psl:
     awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s %s %s %s\n", $1,$16,$17,$18,$19, $16,$18,$17,$19}' lrgTranscriptsUnmapped.gp | sort \
        | join -t$'\t' - transcript.gene.name.txt \
          | awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s\t%s %s\n", $1,$2,$3,$4,$5,$7,$6,$7}' > lrgTransExtraFields.tsv
 
     # the five extra fields are identifiers for:
     # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein,
     #	Gene name
 
     # Load LRG regions:
+    #bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \
+    #-tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name
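+    # after ML #29689, added ncbiAcc to -extraIndex so the NCBI accession is searchable, Max, July 1, 2022
+    # the commented-out command above is the previous version; changed to: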
     bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \
-      -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name
+    -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name,ncbiAcc
     ln -sf `pwd`/lrg.bb /gbdb/hg38/bbi/lrg.bb
     hgBbiDbLink hg38 lrg /gbdb/hg38/bbi/lrg.bb
 
     # Map LRG fixed_annotation transcripts from LRG coords to hg38 coords (HT MarkD):
     lrgToPsl lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.psl
     pslCheck lrg.psl
 #checked: 919 failed: 0 errors: 0
     awk '{print $10 "\t" $11;}' lrg.psl > lrg.sizes
     genePredToFakePsl -chromSize=lrg.sizes placeholder \
       lrgTranscriptsUnmapped.gp lrgTranscriptsFakePsl.psl lrgTranscripts.cds
     pslMap lrgTranscriptsFakePsl.psl lrg.psl lrgTranscriptsHg38.psl
     mrnaToGene -genePredExt -cdsFile=lrgTranscripts.cds -keepInvalid \
       lrgTranscriptsHg38.psl lrgTranscriptsHg38NoName2.gp
 #Warning: no CDS for LRG_163t1
 #Warning: no CDS for LRG_347t1