cd2eb699ee42cfe22aa6d1587ed3ac444953d8e9 hiram Fri Apr 17 13:00:43 2026 -0700 fixup to get geneName2 column correctly populated refs #14450 diff --git src/hg/utils/automation/doXenoRefGene.pl src/hg/utils/automation/doXenoRefGene.pl index a7b8e41912f..9b0fafe91f6 100755 --- src/hg/utils/automation/doXenoRefGene.pl +++ src/hg/utils/automation/doXenoRefGene.pl @@ -302,32 +302,38 @@ if [ -s "\$db.xenoRefGene.psl" ]; then grep NR_ \$db.xenoRefGene.psl > NR.psl grep NM_ \$db.xenoRefGene.psl > NM.psl mrnaToGene -cdsDb=hgFixed NM.psl NM.gp mrnaToGene -noCds NR.psl NR.gp cat NM.gp NR.gp | genePredSingleCover stdin \$db.xenoRefGene.gp genePredCheck -db=\$db -chromSizes=\$db.chrom.sizes \$db.xenoRefGene.gp genePredToBed \$db.xenoRefGene.gp stdout \\ | bedToExons stdin stdout | bedSingleCover.pl stdin > \$db.exons.bed export baseCount=`awk '{sum+=\$3-\$2}END{printf "%d", sum}' \$db.exons.bed` export asmSizeNoGaps=`grep sequences ../../\$db.faSize.txt | awk '{print \$5}'` export perCent=`echo \$baseCount \$asmSizeNoGaps | awk '{printf "%.3f", 100.0*\$1/\$2}'` printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$db.xenoRefGene.txt rm -f \$db.exons.bed - genePredToBigGenePred -geneNames=$mrnas/geneOrgXref.txt \$db.xenoRefGene.gp \\ + # The genePred names have version suffixes (e.g. NM_000014.6) but + # geneOrgXref.txt keys are unversioned (NM_000014). Build a versioned + # xref so genePredToBigGenePred can match them and populate geneName2. + awk -F'\\t' 'NR==FNR{key[\$1]=\$2"\\t"\$3; next} {split(\$1,a,"."); if(a[1] in key) print \$1"\\t"key[a[1]]}' \\ + $mrnas/geneOrgXref.txt \$db.xenoRefGene.gp > \$db.versionedXref.txt + genePredToBigGenePred -geneNames=\$db.versionedXref.txt \$db.xenoRefGene.gp \\ stdout | sort -k1,1 -k2,2n > \$db.bgpInput + rm -f \$db.versionedXref.txt sed -e 's#Alternative/human readable gene name#species of origin of the mRNA#; s#Name or ID of item, ideally both human readable and unique#RefSeq accession id#; s#Primary identifier for gene#gene name#;' \\ \$HOME/kent/src/hg/lib/bigGenePred.as > xenoRefGene.as bedToBigBed -extraIndex=name,geneName -type=bed12+8 -tab -as=xenoRefGene.as \\ \$db.bgpInput \$db.chrom.sizes \$db.xenoRefGene.bb \$HOME/kent/src/hg/utils/automation/xenoRefGeneIx.pl \$db.bgpInput | sort -u > \$db.ix.txt ixIxx \$db.ix.txt \$db.xenoRefGene.ix \$db.xenoRefGene.ixx mkdir -p /dev/shm/\$db cp -p \$db.xenoRefGene.gp /dev/shm/\$db/xenoRefGene.\$db cd /dev/shm/\$db genePredToGtf -utr file xenoRefGene.\$db stdout | gzip -c \\ > \$buildDir/\$db.xenoRefGene.gtf.gz cd \$buildDir rm -f /dev/shm/\$db/xenoRefGene.\$db rmdir /dev/shm/\$db fi