fcdf5c401c80840d28c346409c4fbb527544fac7 markd Tue Jun 9 13:59:33 2020 -0700 make gencode hgc robust against metadata that is sometimes not mapped in the backmap releases diff --git src/hg/makeDb/outside/gencode/gencodeLoad.mk src/hg/makeDb/outside/gencode/gencodeLoad.mk index cb37e47..2c30cf0 100644 --- src/hg/makeDb/outside/gencode/gencodeLoad.mk +++ src/hg/makeDb/outside/gencode/gencodeLoad.mk @@ -23,32 +23,32 @@ SHELL = bash -e export SHELLOPTS=pipefail ## # programs, etc ## mach = $(shell uname -m) ## # Release info and files from Sanger. # BEGIN EDIT THESE EACH RELEASE # # - ensemblPrevVersion is used to get chrom name mappings for pre-release, # as this doesn't change between releases. ## -db = hg38 -#db = hg19 +#db = hg38 +db = hg19 #db = mm10 #preRelease = no preRelease = yes ifeq (${db},mm10) grcRefAssembly = GRCm38 ver = M25 prevVer = M24 gencodeOrg = Gencode_mouse ftpReleaseSubdir = release_${ver} annGffTypeName = chr_patch_hapl_scaff.annotation ensemblVer = 100_38 ensemblPrevVer = 99_38 ensemblCDnaDb = mus_musculus_cdna_${ensemblPrevVer} else ifeq (${db},hg38) grcRefAssembly = GRCh38 @@ -407,34 +407,38 @@ # generic tables ${loadedDir}/%.tab.loaded: ${tableDir}/%.tab @mkdir -p $(dir $@) ${loadLock} hgLoadSqlTab ${db} $* ${encodeAutoSqlDir}/$(subst ${rel},,$*).sql $< touch $@ ## # sanity checks ## # check if the .incorrect file is empty define checkForIncorrect awk 'END{if (NR != 0) {print "Incorrect data, see " FILENAME>"/dev/stderr"; exit 1}}' $(basename $@).incorrect endef -checkSanity: ${checkDir}/${tableGeneSource}.checked ${checkDir}/${tableTranscriptSource}.checked \ - ${checkDir}/${tableBasic}.checked ${checkDir}/${tableBasic}.pseudo.checked \ +checkSanity:: ${checkDir}/${tableBasic}.checked ${checkDir}/${tableBasic}.pseudo.checked \ ${checkDir}/${tableComp}.pseudo.checked +# backmap does not have all gene/transcript source entries. 
+ifneq (${isBackmap},yes) +checkSanity:: ${checkDir}/${tableGeneSource}.checked ${checkDir}/${tableTranscriptSource}.checked +endif + # are gene source all in attrs ${checkDir}/${tableGeneSource}.checked: ${loadedDir}/${tableGeneSource}.tab.loaded ${loadedDir}/${tableAttrs}.tab.loaded @mkdir -p $(dir $@) hgsql -Ne 'select geneId from ${tableAttrs} where geneId not in (select geneId from ${tableGeneSource})' ${db} | sort -u >$(basename $@).incorrect @$(checkForIncorrect) touch $@ # are transcript source all in attrs ${checkDir}/${tableTranscriptSource}.checked: ${loadedDir}/${tableTranscriptSource}.tab.loaded ${loadedDir}/${tableAttrs}.tab.loaded @mkdir -p $(dir $@) hgsql -Ne 'select transcriptId from ${tableAttrs} where transcriptId not in (select transcriptId from ${tableTranscriptSource})' ${db} | sort -u >$(basename $@).incorrect @$(checkForIncorrect) touch $@ # make sure all basic are in comprehensive