fbd7a94213923088270a2bc3da63d0b8139451f0
angie
  Thu Oct 10 14:33:52 2019 -0700
LRG now distinguishes between fix_patch and novel_patch in their XML, so we can include mappings to fix & alt patch sequences.  Overdue for an update anyway.  refs #24285

diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt
index 1f76430..01a19e0 100644
--- src/hg/makeDb/doc/hg19.txt
+++ src/hg/makeDb/doc/hg19.txt
@@ -28710,89 +28710,93 @@
     string hgvsCod;         "coding HGVS"
     string hgvsProt;         "protein HGVS"
     string lastEval;         "last evaluation"
     string guidelines;         "guidelines"
     lstring otherIds;         "other identifiers (OMIM)"
     )
 '_EOF_'
 
 bedToBigBed clinvarMain.bed /scratch/data/hg19/chrom.sizes clinvarMain.bb -type=bed4+18 -tab -as=clinvar.as
 bedToBigBed clinvarCnv.bed /scratch/data/hg19/chrom.sizes clinvarCnv.bb -type=bed4+18 -tab -as=clinvar.as
 cp clinvarMain.bb /hive/data/genomes/hg19/bed/clinvar/
 cp clinvarCnv.bb /hive/data/genomes/hg19/bed/clinvar/
 
 _EOF_
 #########################################################################
-# LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 5/30/18 angie)
-# Redmine #13359 -- otto-mate To Do #17877
-# previously done 7/7/14, 9/9/16
+# LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 10/10/19 angie)
+# Redmine #13359, #24285 -- otto-mate To Do #17877
+# previously done 7/7/14, 9/9/16, 5/30/18
     screen -S lrg -t lrg
     set today = `date +%Y_%m_%d`
     mkdir -p /hive/data/genomes/hg19/bed/lrg/$today
     cd /hive/data/genomes/hg19/bed/lrg/$today
     wget ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_public_xml_files.zip
     unzip LRG_public_xml_files.zip
     # The .atree file was useful for getting a handle on the hierarchy and types of nodes:
     # autoDtd LRG_1.xml lrg.dtd lrg.stats -atree=lrg.atree
 
     # Run script to convert LRG*.xml files to BED+ for regions and genePredExt+fa for transcripts:
     ~/kent/src/hg/utils/automation/parseLrgXml.pl GRCh37
     genePredCheck lrgTranscriptsUnmapped.gp
 #Error: lrgTranscriptsUnmapped.gp:765: LRG_7t1 no exonFrame on CDS exon 46
-#checked: 917 failed: 1
+#checked: 1029 failed: 1
     # If there are complaints e.g. about exonFrame, look for inconsistencies in the
     # affected transcript's coding_region/coordinates vs. exon/intron info in xml.
     # Contact Variation team leader Fiona Cunningham @EBI to resolve in the background
     # (missing exonFrame info doesn't affect our track representation because we end up using
     # psl).  We agreed to disagree about exon 46 of LRG_7t1 because that last coding exon
     # portion is only the stop codon.
 
-    # Filter out alts and patches not (yet) included in hg19:
+    # hg19 has patches on hgwdev but not on the RR, and the patches may remain on hgwdev.
+    # To avoid confusion, exclude patch sequences for now; if we release patches, rebuild
+    # LRG tracks without this part.
     mv lrg.bed lrg.allSeqs.bed
-    cut -f 1 ../../../chrom.sizes | grep -Fwf - lrg.allSeqs.bed > lrg.bed
+    cut -f 1 ../../../chrom.sizes.initial | grep -Fwf - lrg.allSeqs.bed > lrg.bed
+    wc -l lrg*bed
+#   930 lrg.allSeqs.bed
+#   888 lrg.bed
 
     # Load LRG regions:
     bedToBigBed lrg.bed /hive/data/genomes/hg19/chrom.sizes lrg.bb \
       -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name
-    rm -f /gbdb/hg19/bbi/lrg.bb
-    ln -s `pwd`/lrg.bb /gbdb/hg19/bbi/lrg.bb
+    ln -sf `pwd`/lrg.bb /gbdb/hg19/bbi/lrg.bb
     hgBbiDbLink hg19 lrg /gbdb/hg19/bbi/lrg.bb
 
     # Map LRG fixed_annotation transcripts from LRG coords to hg19 coords (HT MarkD):
     lrgToPsl lrg.bed /hive/data/genomes/hg19/chrom.sizes lrg.psl
     pslCheck lrg.psl
-#checked: 781 failed: 0 errors: 0
+#checked: 888 failed: 0 errors: 0
     awk '{print $10 "\t" $11;}' lrg.psl > lrg.sizes
     genePredToFakePsl -chromSize=lrg.sizes placeholder \
       lrgTranscriptsUnmapped.gp lrgTranscriptsFakePsl.psl lrgTranscripts.cds
     pslMap lrgTranscriptsFakePsl.psl lrg.psl lrgTranscriptsHg19.psl
     mrnaToGene -genePredExt -cdsFile=lrgTranscripts.cds -keepInvalid \
       lrgTranscriptsHg19.psl lrgTranscriptsHg19NoName2.gp
 #Warning: no CDS for LRG_163t1
 #Warning: no CDS for LRG_347t1
-    # It's OK if mrnaToGene complains about "no CDS" for a non-coding tx e.g. LRG_163t1.
+    # It's OK if mrnaToGene complains about "no CDS" for a non-coding tx (RefSeq accession NR_*).
     grep -l NR_ LRG_163.xml LRG_347.xml
 #LRG_163.xml
 #LRG_347.xml
 
     # Load PSL, CDS and sequences.
     hgLoadPsl hg19 -table=lrgTranscriptAli lrgTranscriptsHg19.psl
     hgLoadSqlTab hg19 lrgCds ~/kent/src/hg/lib/cdsSpec.sql lrgTranscripts.cds
     hgPepPred hg19 tab lrgCdna lrgCdna.tab
     hgPepPred hg19 tab lrgPep lrgPep.tab
 
-    # OPTIONAL
+    # OPTIONAL (only done for initial track development)
     # For a rough comparison of mapping methods, and to get some extra error-checking on the PSL
     # from the chain code, try this too:
     pslToChain lrg.psl lrg.chain
     chainSwap lrg.chain lrgMap.chain
     liftOver -genePred lrgTranscriptsUnmapped.gp lrgMap.chain lrgTLiftOver.gp noMap
     # The noMap file has a few "Boundary problem" errors because liftOver doesn't have
     # sophisticated handling of exonFrames and indels.  Also, liftOver carries over exonFrames
     # as-is without regard to strand, so they end up being reversed for LRG's on the - strand.
     # That's why we're using MarkD's method above!
     # The resulting genePredExt from that process is missing the name2 column from
     # the original; add it back:
     cut -f 1,12 lrgTranscriptsUnmapped.gp > txName2
     join -j 1 -a 1 -t'	' \
       -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,2.2,1.13,1.14,1.15 \
       lrgTranscriptsHg19NoName2.gp txName2 > lrgTranscriptsHg19.gp