fbd7a94213923088270a2bc3da63d0b8139451f0 angie Thu Oct 10 14:33:52 2019 -0700 LRG now distinguishes between fix_patch and novel_patch in their XML, so we can include mappings to fix & alt patch sequences. Overdue for an update anyway. refs #24285 diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index 1f76430..01a19e0 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -28710,89 +28710,93 @@ string hgvsCod; "coding HGVS" string hgvsProt; "protein HGVS" string lastEval; "last evaluation" string guidelines; "guidelines" lstring otherIds; "other identifiers (OMIM)" ) '_EOF_' bedToBigBed clinvarMain.bed /scratch/data/hg19/chrom.sizes clinvarMain.bb -type=bed4+18 -tab -as=clinvar.as bedToBigBed clinvarCnv.bed /scratch/data/hg19/chrom.sizes clinvarCnv.bb -type=bed4+18 -tab -as=clinvar.as cp clinvarMain.bb /hive/data/genomes/hg19/bed/clinvar/ cp clinvarCnv.bb /hive/data/genomes/hg19/bed/clinvar/ _EOF_ ######################################################################### -# LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 5/30/18 angie) -# Redmine #13359 -- otto-mate To Do #17877 -# previously done 7/7/14, 9/9/16 +# LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 10/10/19 angie) +# Redmine #13359, #24285 -- otto-mate To Do #17877 +# previously done 7/7/14, 9/9/16, 5/30/18 screen -S lrg -t lrg set today = `date +%Y_%m_%d` mkdir -p /hive/data/genomes/hg19/bed/lrg/$today cd /hive/data/genomes/hg19/bed/lrg/$today wget ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_public_xml_files.zip unzip LRG_public_xml_files.zip # The .atree file was useful for getting a handle on the hierarchy and types of nodes: # autoDtd LRG_1.xml lrg.dtd lrg.stats -atree=lrg.atree # Run script to convert LRG*.xml files to BED+ for regions and genePredExt+fa for transcripts: ~/kent/src/hg/utils/automation/parseLrgXml.pl GRCh37 genePredCheck lrgTranscriptsUnmapped.gp #Error: lrgTranscriptsUnmapped.gp:765: LRG_7t1 no exonFrame on CDS exon 46 -#checked: 917 failed: 1 +#checked: 1029 failed: 1 # If there are complaints e.g. about exonFrame, look for inconsistencies in the # affected transcript's coding_region/coordinates vs. exon/intron info in xml. # Contact Variation team leader Fiona Cunningham @EBI to resolve in the background # (missing exonFrame info doesn't affect our track representation because we end up using # psl). We agreed to disagree about exon 46 of LRG_7t1 because that last coding exon # portion is only the stop codon. - # Filter out alts and patches not (yet) included in hg19: + # hg19 has patches on hgwdev but not on the RR, and the patches may remain on hgwdev. + # To avoid confusion, exclude patch sequences for now; if we release patches, rebuild + # LRG tracks without this part. mv lrg.bed lrg.allSeqs.bed - cut -f 1 ../../../chrom.sizes | grep -Fwf - lrg.allSeqs.bed > lrg.bed + cut -f 1 ../../../chrom.sizes.initial | grep -Fwf - lrg.allSeqs.bed > lrg.bed + wc -l lrg*bed +# 930 lrg.allSeqs.bed +# 888 lrg.bed # Load LRG regions: bedToBigBed lrg.bed /hive/data/genomes/hg19/chrom.sizes lrg.bb \ -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name - rm -f /gbdb/hg19/bbi/lrg.bb - ln -s `pwd`/lrg.bb /gbdb/hg19/bbi/lrg.bb + ln -sf `pwd`/lrg.bb /gbdb/hg19/bbi/lrg.bb hgBbiDbLink hg19 lrg /gbdb/hg19/bbi/lrg.bb # Map LRG fixed_annotation transcripts from LRG coords to hg19 coords (HT MarkD): lrgToPsl lrg.bed /hive/data/genomes/hg19/chrom.sizes lrg.psl pslCheck lrg.psl -#checked: 781 failed: 0 errors: 0 +#checked: 888 failed: 0 errors: 0 awk '{print $10 "\t" $11;}' lrg.psl > lrg.sizes genePredToFakePsl -chromSize=lrg.sizes placeholder \ lrgTranscriptsUnmapped.gp lrgTranscriptsFakePsl.psl lrgTranscripts.cds pslMap lrgTranscriptsFakePsl.psl lrg.psl lrgTranscriptsHg19.psl mrnaToGene -genePredExt -cdsFile=lrgTranscripts.cds -keepInvalid \ lrgTranscriptsHg19.psl lrgTranscriptsHg19NoName2.gp #Warning: no CDS for LRG_163t1 #Warning: no CDS for LRG_347t1 - # It's OK if mrnaToGene complains about "no CDS" for a non-coding tx e.g. LRG_163t1. + # It's OK if mrnaToGene complains about "no CDS" for a non-coding tx (RefSeq accession NR_*). grep -l NR_ LRG_163.xml LRG_347.xml #LRG_163.xml #LRG_347.xml # Load PSL, CDS and sequences. hgLoadPsl hg19 -table=lrgTranscriptAli lrgTranscriptsHg19.psl hgLoadSqlTab hg19 lrgCds ~/kent/src/hg/lib/cdsSpec.sql lrgTranscripts.cds hgPepPred hg19 tab lrgCdna lrgCdna.tab hgPepPred hg19 tab lrgPep lrgPep.tab - # OPTIONAL + # OPTIONAL (only done for initial track development) # For a rough comparison of mapping methods, and to get some extra error-checking on the PSL # from the chain code, try this too: pslToChain lrg.psl lrg.chain chainSwap lrg.chain lrgMap.chain liftOver -genePred lrgTranscriptsUnmapped.gp lrgMap.chain lrgTLiftOver.gp noMap # The noMap file has a few "Boundary problem" errors because liftOver doesn't have # sophisticated handling of exonFrames and indels. Also, liftOver carries over exonFrames # as-is without regard to strand, so they end up being reversed for LRG's on the - strand. # That's why we're using MarkD's method above! # The resulting genePredExt from that process is missing the name2 column from # the original; add it back: cut -f 1,12 lrgTranscriptsUnmapped.gp > txName2 join -j 1 -a 1 -t' ' \ -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,2.2,1.13,1.14,1.15 \ lrgTranscriptsHg19NoName2.gp txName2 > lrgTranscriptsHg19.gp