0a6421e764888bb6e8e1b2058501f7ce4188d814 markd Mon Jun 17 02:08:53 2019 -0700 import of prerelease of gencode V31lift37 diff --git src/hg/makeDb/outside/gencode/gencodeLoad.mk src/hg/makeDb/outside/gencode/gencodeLoad.mk index 159f4bc..45edbc5 100644 --- src/hg/makeDb/outside/gencode/gencodeLoad.mk +++ src/hg/makeDb/outside/gencode/gencodeLoad.mk @@ -52,30 +52,31 @@ ensemblCDnaDb = mus_musculus_cdna_${ensemblPrevVer} else ifeq (${db},hg38) grcRefAssembly = GRCh38 ver = 31 prevVer = 30 gencodeOrg = Gencode_human ftpReleaseSubdir = release_${ver} annGffTypeName = chr_patch_hapl_scaff.annotation ensemblVer = 97_38 ensemblPrevVer = 96_38 ensemblCDnaDb = homo_sapiens_cdna_${ensemblPrevVer} else ifeq (${db},hg19) grcRefAssembly = GRCh37 verBase = 31 ver = ${verBase}lift37 + backmapTargetVer = 19 ftpReleaseSubdir = release_${verBase}/GRCh37_mapping prevVer = 29lift37 gencodeOrg = Gencode_human annGffTypeName = annotation ensemblVer = 74_37 # only used to get genome chromsome name mappings ensemblPrevVer = ${ensemblVer} # doesn't change ensemblCDnaDb = homo_sapiens_cdna_${ensemblPrevVer} isBackmap = yes else $(error unimplement genome database: ${db}) endif # END EDIT THESE EACH RELEASE ifeq (${preRelease},yes) @@ -198,31 +199,33 @@ tableEntrezGeneMeta = ${relDir}/gencode.v${ver}.metadata.EntrezGene.gz tableEntrezGene = ${tablePre}EntrezGene${rel} tableEntrezGeneTab = ${tableDir}/${tableEntrezGene}.tab tableTranscriptionSupportLevelData = ${dataDir}/gencode.v${ver}.transcriptionSupportLevel.tab tableTranscriptionSupportLevel = ${tablePre}TranscriptionSupportLevel${rel} tableTranscriptionSupportLevelTab = ${tableDir}/${tableTranscriptionSupportLevel}.tab genePredExtTables = ${tableBasic} ${tableComp} ${tablePseudo} genePredTables = tabTables = ${tableAttrs} ${tableTag} ${tableGeneSource} \ ${tableTranscriptSource} ${tableTranscriptSupport} \ ${tableGeneSymbol} ${tablePdb} ${tablePubMed} ${tableRefSeq} ${tableUniProt} \ ${tableAnnotationRemark} ${tableEntrezGene} ${tableTranscriptionSupportLevel} -ifneq (${isBackmap}, yes) +ifeq (${isBackmap}, yes) + targetGencodeTsv = ${dataDir}/target-gencode.tsv +else # these are not included in backmap releases genePredTables = ${table2WayConsPseudo} genePredExtTables += ${tablePolyA} tabTables += ${tableExonSupport} endif allTables = ${genePredExtTables} ${genePredTables} ${tabTables} # directory for flags indicating tables were loaded loadedDir = loaded # directory for output and flags for sanity checks checkDir = check all: fetch mkTables loadTables checkSanity cmpRelease listTables @@ -286,102 +289,110 @@ mv -f $@.${tmpExt} $@ ${tableUniProtTab}: ${tableSwissProtMeta} ${tableTrEMBLMeta} ${gencodeTsv} @mkdir -p $(dir $@) ((${metaFilterCmdGz} ${tableSwissProtMeta} | tawk '{print $$0,"SwissProt"}') && (${metaFilterCmdGz} ${tableTrEMBLMeta} | tawk '{print $$0,"TrEMBL"}')) | sort -k 1,1 > $@.${tmpExt} mv -f $@.${tmpExt} $@ ${ensemblToUcscChain}: @mkdir -p $(dir $@) ${ensToUcscChromMap} ${ensemblCDnaDb} ${grcRefAssembly} ${db} /dev/stdout | pslSwap stdin stdout | pslToChain stdin $@.${tmpExt} mv -f $@.${tmpExt} $@ # other tab files, just copy to name following convention to make load rules # work ifeq (${isBackmap}, yes) - metaFilterCmd = ${gencodeBackMapMetadataIds} ${gencodeTsv} + metaFilterCmd = ${gencodeBackMapMetadataIds} ${gencodeTsv} ${targetGencodeTsv} metaFilterCmdGz = ${metaFilterCmd} + metaFilterDepend = ${gencodeTsv} ${targetGencodeTsv} else metaFilterCmd = cat metaFilterCmdGz = zcat + metaFilterDepend = ${gencodeTsv} endif define copyMetadataTabGz mkdir -p $(dir $@) ${metaFilterCmdGz} $< > $@.${tmpExt} mv -f $@.${tmpExt} $@ endef define copyMetadataTab mkdir -p $(dir $@) ${metaFilterCmd} $< > $@.${tmpExt} mv -f $@.${tmpExt} $@ endef -${tableGeneSourceTab}: ${tableGeneSourceMeta} ${gencodeTsv} +${tableGeneSourceTab}: ${tableGeneSourceMeta} ${metaFilterDepend} ${copyMetadataTabGz} -${tableTranscriptSourceTab}: ${tableTranscriptSourceMeta} ${gencodeTsv} +${tableTranscriptSourceTab}: ${tableTranscriptSourceMeta} ${metaFilterDepend} ${copyMetadataTabGz} -${tableTranscriptSupportTab}: ${tableTranscriptSupportMeta} ${gencodeTsv} +${tableTranscriptSupportTab}: ${tableTranscriptSupportMeta} ${metaFilterDepend} ${copyMetadataTabGz} -${tableExonSupportTab}: ${tableExonSupportMeta} ${ensemblToUcscChain} ${gencodeTsv} +${tableExonSupportTab}: ${tableExonSupportMeta} ${ensemblToUcscChain} ${metaFilterDepend} @mkdir -p $(dir $@) ${gencodeExonSupportToTable} ${tableExonSupportMeta} ${ensemblToUcscChain} $@.${tmpExt} mv -f $@.${tmpExt} $@ -${tableGeneSymbolTab}: ${tableGeneSymbolMeta} ${gencodeTsv} +${tableGeneSymbolTab}: ${tableGeneSymbolMeta} ${metaFilterDepend} ${copyMetadataTabGz} -${tablePdbTab}: ${tablePdbMeta} ${gencodeTsv} +${tablePdbTab}: ${tablePdbMeta} ${metaFilterDepend} ${copyMetadataTabGz} -${tablePubMedTab}: ${tablePubMedMeta} ${gencodeTsv} +${tablePubMedTab}: ${tablePubMedMeta} ${metaFilterDepend} ${copyMetadataTabGz} -${tableRefSeqTab}: ${tableRefSeqMeta} ${gencodeTsv} +${tableRefSeqTab}: ${tableRefSeqMeta} ${metaFilterDepend} ${copyMetadataTabGz} ${tableTranscriptionSupportLevelTab}: ${tableTranscriptionSupportLevelData} mkdir -p $(dir $@) cp $< $@.${tmpExt} mv -f $@.${tmpExt} $@ # convert to zero-based, 1/2 open -${tablePolyAFeatureTab}: ${tablePolyAFeatureMeta} ${gencodeTsv} +${tablePolyAFeatureTab}: ${tablePolyAFeatureMeta} ${metaFilterDepend} @mkdir -p $(dir $@) zcat $< | tawk '{print $$1,$$2-1,$$3,$$4,$$5-1,$$6,$$7,$$8}' | sort -k 4,4 -k 5,5n > $@.${tmpExt} mv -f $@.${tmpExt} $@ -${tableAnnotationRemarkTab}: ${tableAnnotationRemarkMeta} ${gencodeTsv} +${tableAnnotationRemarkTab}: ${tableAnnotationRemarkMeta} ${metaFilterDepend} @mkdir -p $(dir $@) ${metaFilterCmdGz} $< | tawk '{print $$1,gensub("\\\\n|\\\\","","g",$$2)}' | sort -k 1,1 > $@.${tmpExt} mv -f $@.${tmpExt} $@ # drop ENSTR entries that are a hack to support PAR sequences in GTF -${tableEntrezGeneTab}: ${tableEntrezGeneMeta} ${gencodeTsv} +${tableEntrezGeneTab}: ${tableEntrezGeneMeta} ${metaFilterDepend} @mkdir -p $(dir $@) zcat $< | tawk '$$1!~/^ENSTR/' | sort -k 1,1 | ${metaFilterCmd} /dev/stdin > $@.${tmpExt} mv -f $@.${tmpExt} $@ ## # intermediate data for ensembl/havana, not loaded into databases ## ${gencodeGp}: ${annotationGff} ${ensemblToUcscChain} @mkdir -p $(dir $@) ${gencodeGxfToGenePred} ${annotationGff} ${ensemblToUcscChain} $@.${tmpExt} mv -f $@.${tmpExt} $@ -${tableTranscriptionSupportLevelData}: ${gencodeTsv} +${tableTranscriptionSupportLevelData}: ${metaFilterDepend} touch $@ ${gencodeTsv}: ${annotationGff} @mkdir -p $(dir $@) ${gencodeGxfToAttrs} --keepGoing ${annotationGff} $@.${tmpExt} --tslTabOut=${tableTranscriptionSupportLevelData}.${tmpExt} mv -f ${tableTranscriptionSupportLevelData}.${tmpExt} ${tableTranscriptionSupportLevelData} mv -f $@.${tmpExt} $@ +${targetGencodeTsv}: + @mkdir -p $(dir $@) + hgsql ${db} -e 'select * from wgEncodeGencodeAttrsV${backmapTargetVer}' > $@.${tmpExt} + mv -f $@.${tmpExt} $@ + + # check attributes so code can be updated to handle new biotypes checkAttrs: ${annotationGff} ${gencodeGxfToAttrs} ${annotationGff} /dev/null ## # load tables # browser commands use static tmp file name, so use lock file to serialize ## loadLock = flock load.lock loadTables: ${genePredExtTables:%=${loadedDir}/%.genePredExt.loaded} \ ${genePredTables:%=${loadedDir}/%.genePred.loaded} \ ${tabTables:%=${loadedDir}/%.tab.loaded} ${loadedDir}/%.genePredExt.loaded: ${tableDir}/%.gp