272a25f286c4041d89fa5c159a1c2b59d6d78271 hiram Thu Oct 15 09:19:39 2020 -0700 add gene name to pop message for LRG track refs #24672 diff --git src/hg/makeDb/doc/hg38/hg38.txt src/hg/makeDb/doc/hg38/hg38.txt index 6b29194..aa41c8b 100644 --- src/hg/makeDb/doc/hg38/hg38.txt +++ src/hg/makeDb/doc/hg38/hg38.txt @@ -3660,71 +3660,77 @@ # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein ~/kent/src/hg/utils/automation/parseLrgXml.pl GRCh38 genePredCheck lrgTranscriptsUnmapped.gp #Error: lrgTranscriptsUnmapped.gp:765: LRG_7t1 no exonFrame on CDS exon 46 #checked: 1029 failed: 1 # If there are complaints e.g. about exonFrame, look for inconsistencies in the # affected transcript's coding_region/coordinates vs. exon/intron info in xml. # Contact Variation team leader Fiona Cunningham @EBI to resolve in the background # (missing exonFrame info doesn't affect our track representation because we end up using # psl). We agreed to disagree about exon 46 of LRG_7t1 because that last coding exon # portion is only the stop codon. # No longer necessary to filter out alt and fix patches since they have been added to hg38. 
- # four extra columns have been added to the genePred (2020-09-16 - Hiram) + # and we need the transcript plus gene name later: + cut -f1,12 lrgTranscriptsUnmapped.gp | sort > transcript.gene.name.txt + + # five extra columns have been added to the genePred (2020-10-05 - Hiram) # extract them so they can be added to the psl: - awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s %s %s %s\n", $1,$16,$17,$18,$19, $16,$18,$17,$19}' lrgTranscriptsUnmapped.gp | sort > lrgTransExtraFields.tsv + awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s %s %s %s\n", $1,$16,$17,$18,$19, $16,$18,$17,$19}' lrgTranscriptsUnmapped.gp | sort \ + | join -t$'\t' - transcript.gene.name.txt \ + | awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s\t%s %s\n", $1,$2,$3,$4,$5,$7,$6,$7}' > lrgTransExtraFields.tsv - # the four extra fields are identifiers for: - # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein + # the five extra fields are identifiers for: + # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein, + # Gene name # Load LRG regions: bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \ -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name ln -sf `pwd`/lrg.bb /gbdb/hg38/bbi/lrg.bb hgBbiDbLink hg38 lrg /gbdb/hg38/bbi/lrg.bb # Map LRG fixed_annotation transcripts from LRG coords to hg38 coords (HT MarkD): lrgToPsl lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.psl pslCheck lrg.psl #checked: 919 failed: 0 errors: 0 awk '{print $10 "\t" $11;}' lrg.psl > lrg.sizes genePredToFakePsl -chromSize=lrg.sizes placeholder \ lrgTranscriptsUnmapped.gp lrgTranscriptsFakePsl.psl lrgTranscripts.cds pslMap lrgTranscriptsFakePsl.psl lrg.psl lrgTranscriptsHg38.psl mrnaToGene -genePredExt -cdsFile=lrgTranscripts.cds -keepInvalid \ lrgTranscriptsHg38.psl lrgTranscriptsHg38NoName2.gp #Warning: no CDS for LRG_163t1 #Warning: no CDS for LRG_347t1 # It's OK if mrnaToGene complains about "no CDS" for a non-coding tx (RefSeq accession NR_*). 
grep -l NR_ LRG_163.xml LRG_347.xml #LRG_163.xml #LRG_347.xml - # construct bigPsl with four extra fields + # construct bigPsl with six extra fields (five identifiers plus one combined text field) pslToBigPsl lrgTranscriptsHg38.psl bigPsl.txt - # add the four extra identifiers to the bigPsl file: + # add the five extra identifiers and the combined text field (columns 2-7 of lrgTransExtraFields.tsv) to the bigPsl file: join -t$'\t' -1 4 \ -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15\ -,1.16,1.17,1.18,1.19,1.20,1.21,1.22,1.23,1.24,1.25,2.2,2.3,2.4,2.5,2.6 \ +,1.16,1.17,1.18,1.19,1.20,1.21,1.22,1.23,1.24,1.25,2.2,2.3,2.4,2.5,2.6,2.7 \ <(sort -k4 bigPsl.txt) lrgTransExtraFields.tsv \ | sort -k1,1 -k2,2n > lrgExtraTranscriptsHg38.bigPsl.bed - bedToBigBed -as=bigPsl+5.as -type=bed12+17 -tab \ + bedToBigBed -as=bigPsl+6.as -type=bed12+19 -tab \ lrgExtraTranscriptsHg38.bigPsl.bed ../../../chrom.sizes lrgBigPsl.bb bigBedInfo lrgBigPsl.bb ln -sf `pwd`/lrgBigPsl.bb /gbdb/hg38/bbi hgBbiDbLink hg38 lrgBigPsl /gbdb/hg38/bbi/lrgBigPsl.bb # Load PSL, CDS and sequences. hgLoadPsl hg38 -table=lrgTranscriptAli lrgTranscriptsHg38.psl hgLoadSqlTab hg38 lrgCds ~/kent/src/hg/lib/cdsSpec.sql lrgTranscripts.cds hgPepPred hg38 tab lrgCdna lrgCdna.tab hgPepPred hg38 tab lrgPep lrgPep.tab ############################################################################# ## 7-Way Multiz (DONE - 2014-06-02 - Hiram)