70fde199fda0e697d5c0e5e01470ff433fa00dc1 markd Tue May 14 00:06:40 2024 -0700 Import of GENCODE V46lift37, including add an option to import code to drop specific transcripts. This deals incorrectly included PAR transcript. diff --git src/hg/makeDb/outside/gencode/gencodeLoad.mk src/hg/makeDb/outside/gencode/gencodeLoad.mk index 8446b91..2c0133a 100644 --- src/hg/makeDb/outside/gencode/gencodeLoad.mk +++ src/hg/makeDb/outside/gencode/gencodeLoad.mk @@ -22,32 +22,32 @@ tmpExt = ${host}.${ppid}.tmp SHELL = bash -e export SHELLOPTS=pipefail ## # programs, etc ## mach = $(shell uname -m) ## # Release info and files from Sanger. # BEGIN EDIT THESE EACH RELEASE ## preRelease = no #preRelease = yes -db = hg38 -#db = hg19 +#db = hg38 +db = hg19 #db = mm39 ifeq (${db},mm10) # mm10 lift back was never revewed and released grcRefAssembly = GRCm38 verBase = M25 prevVer = M24 backmapTargetVer = M25 ver = ${verBase}lift37 gencodeOrg = Gencode_mouse ftpReleaseSubdir = release_${verBase}/GRCm38_mapping annGffTypeName = chr_patch_hapl_scaff.annotation isBackmap = yes else ifeq (${db},mm39) grcRefAssembly = GRCm39 ver = M35 @@ -60,30 +60,33 @@ ver = 46 prevVer = 44 gencodeOrg = Gencode_human ftpReleaseSubdir = release_${ver} annGffTypeName = chr_patch_hapl_scaff.annotation else ifeq (${db},hg19) grcRefAssembly = GRCh37 verBase = 46 prevVer = 45lift37 ver = ${verBase}lift37 backmapTargetVer = 19 ftpReleaseSubdir = release_${verBase}/GRCh37_mapping gencodeOrg = Gencode_human annGffTypeName = annotation isBackmap = yes + # caused by change in PAR gencode ids, backmap needs to be made smarted, until then, + # just drop old transcipts that gets included + dropIdsOpts = --drop=ENST00000302805.2 else $(error unimplement genome database: ${db}) endif # END EDIT THESE EACH RELEASE ifeq (${preRelease},yes) # pre-release baseUrl = rsync://ftp.ebi.ac.uk/pub/databases/havana/gencode_pre else # official release baseUrl = rsync://ftp.ebi.ac.uk/pub/databases/gencode endif rel = V${ver} @@ -282,31 +285,31 @@ mv -f $@.${tmpExt} $@ ${tableUniProtTab}: ${tableSwissProtMeta} ${tableTrEMBLMeta} ${gencodeTsv} @mkdir -p $(dir $@) ((${metaFilterCmdGz} ${tableSwissProtMeta} | tawk '{print $$0,"SwissProt"}') && (${metaFilterCmdGz} ${tableTrEMBLMeta} | tawk '{print $$0,"TrEMBL"}')) | sort -k 1,1 > $@.${tmpExt} mv -f $@.${tmpExt} $@ ${gencodeToUcscChain}: @mkdir -p $(dir $@) ${buildGencodeToUcscLift} ${db} $@.${tmpExt} mv -f $@.${tmpExt} $@ # other tab files, just copy to name following convention to make load rules # work ifeq (${isBackmap}, yes) - metaFilterCmd = ${gencodeBackMapMetadataIds} ${db} ${gencodeTsv} ${targetGencodeTsv} + metaFilterCmd = ${gencodeBackMapMetadataIds} ${dropIdsOpts} ${db} ${gencodeTsv} ${targetGencodeTsv} metaFilterCmdGz = ${metaFilterCmd} metaFilterDepend = ${gencodeTsv} ${targetGencodeTsv} else metaFilterCmd = cat metaFilterCmdGz = zcat metaFilterDepend = ${gencodeTsv} endif define copyMetadataTabGz mkdir -p $(dir $@) ${metaFilterCmdGz} $< > $@.${tmpExt} mv -f $@.${tmpExt} $@ endef define copyMetadataTab mkdir -p $(dir $@) ${metaFilterCmd} $< > $@.${tmpExt} @@ -340,48 +343,48 @@ ${tableAnnotationRemarkTab}: ${tableAnnotationRemarkMeta} ${metaFilterDepend} @mkdir -p $(dir $@) ${metaFilterCmdGz} $< | tawk '{print $$1,gensub("\\\\n|\\\\","","g",$$2)}' | sort -k 1,1 > $@.${tmpExt} mv -f $@.${tmpExt} $@ # drop ENSTR entries that are a hack to support PAR sequences in GTF ${tableEntrezGeneTab}: ${tableEntrezGeneMeta} ${metaFilterDepend} @mkdir -p $(dir $@) zcat $< | tawk '$$1!~/^ENSTR/' | sort -k 1,1 | ${metaFilterCmd} /dev/stdin > $@.${tmpExt} mv -f $@.${tmpExt} $@ ## # intermediate data for ensembl/havana, not loaded into databases ## ${gencodeGp}: ${annotationGff} ${gencodeToUcscChain} @mkdir -p $(dir $@) - ${gencodeGxfToGenePred} ${db} ${annotationGff} ${gencodeToUcscChain} $@.${tmpExt} + ${gencodeGxfToGenePred} ${dropIdsOpts} ${db} ${annotationGff} ${gencodeToUcscChain} $@.${tmpExt} mv -f $@.${tmpExt} $@ touch $@ ${gencodeTsv}: ${annotationGff} @mkdir -p $(dir $@) - ${gencodeGxfToAttrs} ${transcriptRanksOpt} ${annotationGff} $@.${tmpExt} + ${gencodeGxfToAttrs} ${dropIdsOpts} ${transcriptRanksOpt} ${annotationGff} $@.${tmpExt} mv -f $@.${tmpExt} $@ ${targetGencodeTsv}: @mkdir -p $(dir $@) hgsql ${db} -e 'select * from wgEncodeGencodeAttrsV${backmapTargetVer}' > $@.${tmpExt} mv -f $@.${tmpExt} $@ # check attributes so code can be updated to handle new biotypes checkAttrs: ${annotationGff} - ${gencodeGxfToAttrs} ${transcriptRanksOpt} ${annotationGff} /dev/null + ${gencodeGxfToAttrs} ${dropIdsOpts} ${transcriptRanksOpt} ${annotationGff} /dev/null ## # load tables # browser commands use static tmp file name, so use lock file to serialize ## loadLock = flock load.lock loadTables: ${genePredExtTables:%=${loadedDir}/%.genePredExt.loaded} \ ${genePredTables:%=${loadedDir}/%.genePred.loaded} \ ${tabTables:%=${loadedDir}/%.tab.loaded} ${loadedDir}/%.genePredExt.loaded: ${tableDir}/%.gp @mkdir -p $(dir $@) ${loadLock} hgLoadGenePred -genePredExt ${db} $* $< touch $@