b6aee4c6471cddebd638fec8dbb988c29a69bc22 markd Thu Apr 23 21:58:41 2026 -0700 import of GENCODE V50, MV39, and V50lift37; added a command to do import with a single command diff --git src/hg/makeDb/outside/gencode/gencodeLoad.mk src/hg/makeDb/outside/gencode/gencodeLoad.mk index 8c922489981..9d7c1fa08c8 100644 --- src/hg/makeDb/outside/gencode/gencodeLoad.mk +++ src/hg/makeDb/outside/gencode/gencodeLoad.mk @@ -1,214 +1,218 @@ #### # build GENCODE tracks requires CCDS and markd python junk. -# targets: +# +# This is normally run by gencodeBuildRelease +# +# Targets: # all - do everything # fetch - download gencode # checkAttrs - check attribute conversion, so code can be updated to handle new biotypes # mkTables - create table files # loadTables - load tables # checkSanity - do some checks on the tables # cmpRelease - compare with previous release # joinerCheck - run joinerCheck on gencode tables # -# can use -j n to run multiple jobs in parallel. +# use -j n to run multiple jobs in parallel. +# +# The following arguments must be passed on the command line: +# db - hg38, hg19, mm39 +# version - 47, M38, 47lift37 +# prevVersion - 46, .. +# baseVersion - 47, etc; source version for lift, otherwise same as version +# relType - pre, final #### ## # make/bash robustness stuff ## .SECONDARY: host=$(shell hostname) ppid=$(shell echo $$PPID) tmpExt = ${host}.${ppid}.tmp SHELL = bash -e export SHELLOPTS=pipefail ## # programs, etc ## mach = $(shell uname -m) ## -# Release info and files from Sanger. 
-# BEGIN EDIT THESE EACH RELEASE +# validate parameters ## -#preRelease = yes -preRelease = no -#db = hg38 -db = hg19 -#db = mm39 -ifeq (${db},mm39) - ver = M38 - prevVer = M37 -else ifeq (${db},hg38) - ver = 49 - prevVer = 48 -else ifeq (${db},hg19) - verBase = 49 - prevVerBase = 48 - ver = ${verBase}lift37 -else - $(error unimplement genome database: ${db}) +ifeq (${db},) + $(error Error: must specify db=) +endif +ifeq (${version},) + $(error Error: must specify version=) +endif +ifeq (${prevVersion},) + $(error Error: must specify prevVersion=) +endif +ifeq (${baseVersion},) + $(error Error: must specify baseVersion=) +endif +ifeq (${relType},) + $(error Error: must specify relType=) endif -# END EDIT THESE EACH RELEASE ifeq (${db},mm39) grcRefAssembly = GRCm39 gencodeOrg = Gencode_mouse - ftpReleaseSubdir = release_${ver} + ftpReleaseSubdir = release_${version} annGffTypeName = chr_patch_hapl_scaff.annotation else ifeq (${db},hg38) grcRefAssembly = GRCh38 gencodeOrg = Gencode_human - ftpReleaseSubdir = release_${ver} + ftpReleaseSubdir = release_${version} annGffTypeName = chr_patch_hapl_scaff.annotation else ifeq (${db},hg19) grcRefAssembly = GRCh37 - ver = ${verBase}lift37 - prevVer = ${prevVerBase}lift37 backmapTargetVer = 19 - ftpReleaseSubdir = release_${verBase}/GRCh37_mapping + ftpReleaseSubdir = release_${baseVersion}/GRCh37_mapping gencodeOrg = Gencode_human annGffTypeName = annotation isBackmap = yes # caused by change in PAR gencode ids, backmap needs to be made smarter, until then, # just drop old transcripts that get included dropIdsOpts = --drop=ENST00000302805.2 else $(error unimplement genome database: ${db}) endif -ifeq (${preRelease},yes) +ifeq (${relType},pre) # pre-release baseUrl = rsync://ftp.ebi.ac.uk/pub/databases/havana/gencode_pre else # official release baseUrl = rsync://ftp.ebi.ac.uk/pub/databases/gencode endif -rel = V${ver} +flagDir = flags +fetchDone = ${flagDir}/fetch.done + + +rel = V${version} releaseUrl = 
${baseUrl}/${gencodeOrg}/${ftpReleaseSubdir} dataDir = data -relDir = ${dataDir}/release_${ver} -annotationGff = ${relDir}/gencode.v${ver}.${annGffTypeName}.gff3.gz -polyAGff = ${relDir}/gencode.v${ver}.polyAs.gff3.gz +relDir = ${dataDir}/release_${version} +annotationGff = ${relDir}/gencode.v${version}.${annGffTypeName}.gff3.gz +polyAGff = ${relDir}/gencode.v${version}.polyAs.gff3.gz ifneq (${isBackmap},yes) - transcriptRanks = ${relDir}/gencode.v${ver}.transcript_rankings.txt.gz + transcriptRanks = ${relDir}/gencode.v${version}.transcript_rankings.txt.gz transcriptRanksOpt = --transcriptRanks=${transcriptRanks} endif gencodeBinDir = ${HOME}/kent/src/hg/makeDb/outside/gencode/bin gencodeMakeTracks = ${gencodeBinDir}/gencodeMakeTracks gencodeMakeAttrs = ${gencodeBinDir}/gencodeMakeAttrs gencodeExonSupportToTable = ${gencodeBinDir}/gencodeExonSupportToTable gencodeGxfToGenePred = ${gencodeBinDir}/gencodeGxfToGenePred gencodePolyaGxfToGenePred = ${gencodeBinDir}/gencodePolyaGxfToGenePred gencodeGxfToAttrs = ${gencodeBinDir}/gencodeGxfToAttrs buildGencodeToUcscLift = ${HOME}/kent/src/hg/makeDb/outside/gencode/bin/buildGencodeToUcscLift gencodeBackMapMetadataIds = ${gencodeBinDir}/gencodeBackMapMetadataIds encodeAutoSqlDir = ${HOME}/kent/src/hg/lib/encode ## # intermediate data not loaded into tracks ## gencodeGp = ${dataDir}/gencode.gp gencodeTsv = ${dataDir}/gencode.tsv gencodeToUcscChain = ${dataDir}/gencodeToUcsc.chain -# flag indicating fetch was done -fetchDone = ${relDir}/done - ## # track and table data ## tableDir = tables tablePre = wgEncodeGencode # subset track and pattern for generating genePred and track names for each subset # obtained from gencode.v*.annotation.level_1_2.gtf, gencode.v*.annotation.level_3.gtf tableBasic = ${tablePre}Basic${rel} tableBasicGp = ${tableDir}/${tableBasic}.gp tableComp = ${tablePre}Comp${rel} tableCompGp = ${tableDir}/${tableComp}.gp tablePseudo = ${tablePre}PseudoGene${rel} tablePseudoGp = ${tableDir}/${tablePseudo}.gp 
tableAttrs = ${tablePre}Attrs${rel} tableAttrsTab = ${tableDir}/${tableAttrs}.tab tableTag = ${tablePre}Tag${rel} tableTagTab = ${tableDir}/${tableTag}.tab # obtained from gencode.v*.polyAs.gff3.gz tablePolyA = ${tablePre}Polya${rel} tablePolyAGp = ${tableDir}/${tablePolyA}.gp # other metadata -tableGeneSourceMeta = ${relDir}/gencode.v${ver}.metadata.Gene_source.gz +tableGeneSourceMeta = ${relDir}/gencode.v${version}.metadata.Gene_source.gz tableGeneSource = ${tablePre}GeneSource${rel} tableGeneSourceTab = ${tableDir}/${tableGeneSource}.tab -tableTranscriptSourceMeta = ${relDir}/gencode.v${ver}.metadata.Transcript_source.gz +tableTranscriptSourceMeta = ${relDir}/gencode.v${version}.metadata.Transcript_source.gz tableTranscriptSource = ${tablePre}TranscriptSource${rel} tableTranscriptSourceTab = ${tableDir}/${tableTranscriptSource}.tab -tableTranscriptSupportMeta = ${relDir}/gencode.v${ver}.metadata.Transcript_supporting_feature.gz +tableTranscriptSupportMeta = ${relDir}/gencode.v${version}.metadata.Transcript_supporting_feature.gz tableTranscriptSupport = ${tablePre}TranscriptSupport${rel} tableTranscriptSupportTab = ${tableDir}/${tableTranscriptSupport}.tab -tableExonSupportMeta = ${relDir}/gencode.v${ver}.metadata.Exon_supporting_feature.gz +tableExonSupportMeta = ${relDir}/gencode.v${version}.metadata.Exon_supporting_feature.gz tableExonSupport = ${tablePre}ExonSupport${rel} tableExonSupportTab = ${tableDir}/${tableExonSupport}.tab ifeq (${gencodeOrg}, Gencode_human) - tableGeneSymbolMeta = ${relDir}/gencode.v${ver}.metadata.HGNC.gz + tableGeneSymbolMeta = ${relDir}/gencode.v${version}.metadata.HGNC.gz else - tableGeneSymbolMeta = ${relDir}/gencode.v${ver}.metadata.MGI.gz + tableGeneSymbolMeta = ${relDir}/gencode.v${version}.metadata.MGI.gz endif tableGeneSymbol = ${tablePre}GeneSymbol${rel} tableGeneSymbolTab = ${tableDir}/${tableGeneSymbol}.tab -tablePdbMeta = ${relDir}/gencode.v${ver}.metadata.PDB.gz +tablePdbMeta = ${relDir}/gencode.v${version}.metadata.PDB.gz 
tablePdb = ${tablePre}Pdb${rel} tablePdbTab = ${tableDir}/${tablePdb}.tab -tablePubMedMeta = ${relDir}/gencode.v${ver}.metadata.Pubmed_id.gz +tablePubMedMeta = ${relDir}/gencode.v${version}.metadata.Pubmed_id.gz tablePubMed = ${tablePre}PubMed${rel} tablePubMedTab = ${tableDir}/${tablePubMed}.tab -tableRefSeqMeta = ${relDir}/gencode.v${ver}.metadata.RefSeq.gz +tableRefSeqMeta = ${relDir}/gencode.v${version}.metadata.RefSeq.gz tableRefSeq = ${tablePre}RefSeq${rel} tableRefSeqTab = ${tableDir}/${tableRefSeq}.tab -tableSwissProtMeta = ${relDir}/gencode.v${ver}.metadata.SwissProt.gz -tableTrEMBLMeta = ${relDir}/gencode.v${ver}.metadata.TrEMBL.gz +tableSwissProtMeta = ${relDir}/gencode.v${version}.metadata.SwissProt.gz +tableTrEMBLMeta = ${relDir}/gencode.v${version}.metadata.TrEMBL.gz tableUniProt = ${tablePre}UniProt${rel} tableUniProtTab = ${tableDir}/${tableUniProt}.tab -tablePolyAFeatureMeta = ${relDir}/gencode.v${ver}.metadata.PolyA_feature.gz +tablePolyAFeatureMeta = ${relDir}/gencode.v${version}.metadata.PolyA_feature.gz tablePolyAFeature = ${tablePre}PolyAFeature${rel} tablePolyAFeatureTab = ${tableDir}/${tablePolyAFeature}.tab -tableAnnotationRemarkMeta = ${relDir}/gencode.v${ver}.metadata.Annotation_remark.gz +tableAnnotationRemarkMeta = ${relDir}/gencode.v${version}.metadata.Annotation_remark.gz tableAnnotationRemark = ${tablePre}AnnotationRemark${rel} tableAnnotationRemarkTab = ${tableDir}/${tableAnnotationRemark}.tab -tableEntrezGeneMeta = ${relDir}/gencode.v${ver}.metadata.EntrezGene.gz +tableEntrezGeneMeta = ${relDir}/gencode.v${version}.metadata.EntrezGene.gz tableEntrezGene = ${tablePre}EntrezGene${rel} tableEntrezGeneTab = ${tableDir}/${tableEntrezGene}.tab tableTranscriptionSupportLevel = ${tablePre}TranscriptionSupportLevel${rel} tableTranscriptionSupportLevelTab = ${tableDir}/${tableTranscriptionSupportLevel}.tab genePredExtTables = ${tableBasic} ${tableComp} ${tablePseudo} genePredTables = tabTables = ${tableAttrs} ${tableTag} ${tableGeneSource} \ 
${tableTranscriptSource} ${tableTranscriptSupport} \ ${tableGeneSymbol} ${tablePdb} ${tablePubMed} ${tableRefSeq} ${tableUniProt} \ ${tableAnnotationRemark} ${tableEntrezGene} ${tableTranscriptionSupportLevel} ifeq (${isBackmap}, yes) targetGencodeTsv = ${dataDir}/target-gencode.tsv else @@ -220,31 +224,31 @@ # directory for flags indicating tables were loaded loadedDir = loaded # directory for output and flags for sanity checks checkDir = check all: fetch mkTables loadTables checkSanity cmpRelease listTables ## # fetch release, this doesn't get subdirectories so as not to copy the lift releases ## fetch: ${fetchDone} ${fetchDone}: - @mkdir -p $(dir $@) + @mkdir -p ${relDir} $(dir $@) rsync -a --include='gencode.*' --exclude='*' '${releaseUrl}/' ${relDir} touch $@ ## # dependencies for files from release ## ${annotationGff}: ${fetchDone} ${polyAGff}: ${fetchDone} ${tableGeneSourceMeta}: ${fetchDone} ${tableTranscriptSourceMeta}: ${fetchDone} ${tableTranscriptSupportMeta}: ${fetchDone} ${tableExonSupportMeta}: ${fetchDone} ${tableGeneSymbolMeta}: ${fetchDone} ${tablePdbMeta}: ${fetchDone} ${tablePubMedMeta}: ${fetchDone} @@ -255,38 +259,36 @@ ${tableAnnotationRemarkMeta}: ${fetchDone} ${tableEntrezGeneMeta}: ${fetchDone} ## # primary table files ## mkTables: ${genePredExtTables:%=${tableDir}/%.gp} ${genePredTables:%=${tableDir}/%.gp} \ ${tabTables:%=${tableDir}/%.tab} # derive the subset name from the pattern stem; the tr command below lowercases it ${tableDir}/${tablePre}%${rel}.gp: ${gencodeGp} ${gencodeTsv} @mkdir -p $(dir $@) ${gencodeMakeTracks} $$(echo $* | tr A-Z a-z) ${gencodeGp} ${gencodeTsv} $@.${tmpExt} mv -f $@.${tmpExt} $@ -${tableTagTab}: ${tableAttrsTab} -${tableTranscriptionSupportLevelTab}: ${tableAttrsTab} -${tableAttrsTab}: ${gencodeGp} ${gencodeTsv} - @mkdir -p $(dir $@) - ${gencodeMakeAttrs} ${gencodeGp} ${gencodeTsv} $@.${tmpExt} ${tableTagTab}.${tmpExt} ${tableTranscriptionSupportLevelTab}.${tmpExt} +${tableAttrsTab} ${tableTagTab} 
${tableTranscriptionSupportLevelTab} &: ${gencodeGp} ${gencodeTsv} + @mkdir -p $(dir ${tableAttrsTab}) + ${gencodeMakeAttrs} ${gencodeGp} ${gencodeTsv} ${tableAttrsTab}.${tmpExt} ${tableTagTab}.${tmpExt} ${tableTranscriptionSupportLevelTab}.${tmpExt} mv -f ${tableTranscriptionSupportLevelTab}.${tmpExt} ${tableTranscriptionSupportLevelTab} mv -f ${tableTagTab}.${tmpExt} ${tableTagTab} - mv -f $@.${tmpExt} $@ + mv -f ${tableAttrsTab}.${tmpExt} ${tableAttrsTab} ${tablePolyAGp}: ${polyAGff} ${gencodeToUcscChain} @mkdir -p $(dir $@) ${gencodePolyaGxfToGenePred} $< ${gencodeToUcscChain} $@.${tmpExt} mv -f $@.${tmpExt} $@ ${tableUniProtTab}: ${tableSwissProtMeta} ${tableTrEMBLMeta} ${gencodeTsv} @mkdir -p $(dir $@) ((${metaFilterCmdGz} ${tableSwissProtMeta} | tawk '{print $$0,"SwissProt"}') && (${metaFilterCmdGz} ${tableTrEMBLMeta} | tawk '{print $$0,"TrEMBL"}')) | sort -k 1,1 > $@.${tmpExt} mv -f $@.${tmpExt} $@ ${gencodeToUcscChain}: @mkdir -p $(dir $@) ${buildGencodeToUcscLift} ${db} $@.${tmpExt} mv -f $@.${tmpExt} $@ @@ -343,31 +345,30 @@ mv -f $@.${tmpExt} $@ # drop ENSTR entries that are a hack to support PAR sequences in GTF ${tableEntrezGeneTab}: ${tableEntrezGeneMeta} ${metaFilterDepend} @mkdir -p $(dir $@) zcat $< | tawk '$$1!~/^ENSTR/' | sort -k 1,1 | ${metaFilterCmd} /dev/stdin > $@.${tmpExt} mv -f $@.${tmpExt} $@ ## # intermediate data for ensembl/havana, not loaded into databases ## ${gencodeGp}: ${annotationGff} ${gencodeToUcscChain} @mkdir -p $(dir $@) ${gencodeGxfToGenePred} ${dropIdsOpts} ${db} ${annotationGff} ${gencodeToUcscChain} $@.${tmpExt} mv -f $@.${tmpExt} $@ - touch $@ ${gencodeTsv}: ${annotationGff} @mkdir -p $(dir $@) ${gencodeGxfToAttrs} ${dropIdsOpts} ${transcriptRanksOpt} ${annotationGff} $@.${tmpExt} mv -f $@.${tmpExt} $@ ${targetGencodeTsv}: @mkdir -p $(dir $@) hgsql ${db} -e 'select * from wgEncodeGencodeAttrsV${backmapTargetVer}' > $@.${tmpExt} mv -f $@.${tmpExt} $@ # check attributes so code can be updated to handle new biotypes 
checkAttrs: ${annotationGff} ${gencodeGxfToAttrs} ${dropIdsOpts} ${transcriptRanksOpt} ${annotationGff} /dev/null @@ -440,43 +441,39 @@ hgsql -Ne 'select * from ${tableBasic}, ${tableAttrs} where name = transcriptId and geneType like "%pseudo%" and geneType != "polymorphic_pseudogene"' ${db} | sort -u >$(basename $@).incorrect @$(checkForIncorrect) touch $@ # make sure there are no pseudo in comprehensive ${checkDir}/${tableComp}.pseudo.checked: ${loadedDir}/${tableComp}.genePredExt.loaded ${loadedDir}/${tableAttrs}.tab.loaded @mkdir -p $(dir $@) hgsql -Ne 'select * from ${tableComp}, ${tableAttrs} where name = transcriptId and geneType like "%pseudo%" and geneType != "polymorphic_pseudogene"' ${db} | sort -u >$(basename $@).incorrect @$(checkForIncorrect) touch $@ # create table list to paste into redmine listTables: tables.lst tables.lst: loadTables - hgsql -Ne 'show tables like "wgEncodeGencode%V${ver}"' ${db} >$@.tmp + hgsql -Ne 'show tables like "wgEncodeGencode%V${version}"' ${db} >$@.tmp mv -f $@.tmp $@ ## # compare number of tracks with previous ## cmpRelease: gencode-cmp.tsv gencode-cmp.tsv: loadTables - @echo 'table V${prevVer} V${ver} delta' >$@.tmp + @echo 'table V${prevVersion} V${version} delta' >$@.tmp @for tab in ${allTables} ; do \ - prevTab=$$(echo "$$tab" | sed 's/V${ver}/V${prevVer}/g') ; \ + prevTab=$$(echo "$$tab" | sed 's/V${version}/V${prevVersion}/g') ; \ echo "$${tab} "$$(hgsql -Ne "select count(*) from $${prevTab}" ${db})" "$$(hgsql -Ne "select count(*) from $${tab}" ${db}) ; \ done | tawk '{print $$1, $$2, $$3, $$3-$$2}' >>$@.tmp mv -f $@.tmp $@ joinerCheck: loadTables @mkdir -p check - for tbl in $$(hgsql -Ne 'show tables like "wgEncodeGencode%V${ver}"' ${db} | egrep -v 'wgEncodeGencode2wayConsPseudo|wgEncodeGencodePolya') ; do \ - echo table=$$tbl; \ - runJoiner.csh ${db} $$tbl ~/kent/src/hg/makeDb/schema/all.joiner noTimes; \ - done >check/joiner.out 2>&1 + ${gencodeBinDir}/gencodeJoinerCheck ${db} ${version} >check/joiner.out 2>&1 if fgrep 
Error: check/joiner.out ; then false; else true; fi - clean: rm -rf ${dataDir} ${tableDir} ${loadedDir} ${checkDir} ${flagDir}