7b9d4ae382ad4ef6f30ea743d518543b25da87a7 markd Thu Jan 27 17:21:32 2022 -0800 Update gencode import to finish remove of use of older code in a different tree. This is the first step towards simplifying the build process diff --git src/hg/makeDb/outside/gencode/gencodeLoad.mk src/hg/makeDb/outside/gencode/gencodeLoad.mk index e3949af..6c267a2 100644 --- src/hg/makeDb/outside/gencode/gencodeLoad.mk +++ src/hg/makeDb/outside/gencode/gencodeLoad.mk @@ -22,33 +22,33 @@ tmpExt = ${host}.${ppid}.tmp SHELL = bash -e export SHELLOPTS=pipefail ## # programs, etc ## mach = $(shell uname -m) ## # Release info and files from Sanger. # BEGIN EDIT THESE EACH RELEASE ## #preRelease = no preRelease = yes -#db = hg38 +db = hg38 #db = hg19 -db = mm39 +#db = mm39 #db = mm10 ifeq (${db},mm10) grcRefAssembly = GRCm38 verBase = M25 prevVer = M24 backmapTargetVer = M25 ver = ${verBase}lift37 gencodeOrg = Gencode_mouse ftpReleaseSubdir = release_${verBase}/GRCm38_mapping annGffTypeName = chr_patch_hapl_scaff.annotation isBackmap = yes else ifeq (${db},mm39) grcRefAssembly = GRCm39 ver = M29 prevVer = M28 @@ -82,39 +82,39 @@ # pre-release baseUrl = rsync://ftp.ebi.ac.uk/pub/databases/havana/gencode_pre else # official release baseUrl = rsync://ftp.ebi.ac.uk/pub/databases/gencode endif rel = V${ver} releaseUrl = ${baseUrl}/${gencodeOrg}/${ftpReleaseSubdir} dataDir = data relDir = ${dataDir}/release_${ver} annotationGff = ${relDir}/gencode.v${ver}.${annGffTypeName}.gff3.gz pseudo2WayGff = ${relDir}/gencode.v${ver}.2wayconspseudos.gff3.gz polyAGff = ${relDir}/gencode.v${ver}.polyAs.gff3.gz -ccdsBinDir = ~markd/compbio/ccds/ccds2/output/bin/$(mach)/opt -gencodeMakeTracks = ${ccdsBinDir}/gencodeMakeTracks -gencodeMakeAttrs = ${ccdsBinDir}/gencodeMakeAttrs -gencodeExonSupportToTable = ${ccdsBinDir}/gencodeExonSupportToTable -gencodeGxfToGenePred = ${ccdsBinDir}/gencodeGxfToGenePred -gencodePolyaGxfToGenePred = ${ccdsBinDir}/gencodePolyaGxfToGenePred -gencodeGxfToAttrs = ${ccdsBinDir}/gencodeGxfToAttrs +gencodeBinDir = ${HOME}/kent/src/hg/makeDb/outside/gencode/bin +gencodeMakeTracks = ${gencodeBinDir}/gencodeMakeTracks +gencodeMakeAttrs = ${gencodeBinDir}/gencodeMakeAttrs +gencodeExonSupportToTable = ${gencodeBinDir}/gencodeExonSupportToTable +gencodeGxfToGenePred = ${gencodeBinDir}/gencodeGxfToGenePred +gencodePolyaGxfToGenePred = ${gencodeBinDir}/gencodePolyaGxfToGenePred +gencodeGxfToAttrs = ${gencodeBinDir}/gencodeGxfToAttrs buildGencodeToUcscLift = ${HOME}/kent/src/hg/makeDb/outside/gencode/bin/buildGencodeToUcscLift -gencodeBackMapMetadataIds = ${ccdsBinDir}/gencodeBackMapMetadataIds +gencodeBackMapMetadataIds = ${gencodeBinDir}/gencodeBackMapMetadataIds encodeAutoSqlDir = ${HOME}/kent/src/hg/lib/encode ## # intermediate data not loaded into tracks ## gencodeGp = ${dataDir}/gencode.gp gencodeTsv = ${dataDir}/gencode.tsv gencodeToUcscChain = ${dataDir}/gencodeToUcsc.chain # flag indicating fetch was done fetchDone = ${relDir}/done ## # track and table data ## @@ -188,31 +188,30 @@ tableUniProt = ${tablePre}UniProt${rel} tableUniProtTab = ${tableDir}/${tableUniProt}.tab tablePolyAFeatureMeta = ${relDir}/gencode.v${ver}.metadata.PolyA_feature.gz tablePolyAFeature = ${tablePre}PolyAFeature${rel} tablePolyAFeatureTab = ${tableDir}/${tablePolyAFeature}.tab tableAnnotationRemarkMeta = ${relDir}/gencode.v${ver}.metadata.Annotation_remark.gz tableAnnotationRemark = ${tablePre}AnnotationRemark${rel} tableAnnotationRemarkTab = ${tableDir}/${tableAnnotationRemark}.tab tableEntrezGeneMeta = ${relDir}/gencode.v${ver}.metadata.EntrezGene.gz tableEntrezGene = ${tablePre}EntrezGene${rel} tableEntrezGeneTab = ${tableDir}/${tableEntrezGene}.tab -tableTranscriptionSupportLevelData = ${dataDir}/gencode.v${ver}.transcriptionSupportLevel.tab tableTranscriptionSupportLevel = ${tablePre}TranscriptionSupportLevel${rel} tableTranscriptionSupportLevelTab = ${tableDir}/${tableTranscriptionSupportLevel}.tab genePredExtTables = ${tableBasic} ${tableComp} ${tablePseudo} genePredTables = tabTables = ${tableAttrs} ${tableTag} ${tableGeneSource} \ ${tableTranscriptSource} ${tableTranscriptSupport} \ ${tableGeneSymbol} ${tablePdb} ${tablePubMed} ${tableRefSeq} ${tableUniProt} \ ${tableAnnotationRemark} ${tableEntrezGene} ${tableTranscriptionSupportLevel} ifeq (${isBackmap}, yes) targetGencodeTsv = ${dataDir}/target-gencode.tsv else # these are not included in backmap releases genePredTables = ${table2WayConsPseudo} genePredExtTables += ${tablePolyA} @@ -258,33 +257,36 @@ ${tableEntrezGeneMeta}: ${fetchDone} ## # primary table files ## mkTables: ${genePredExtTables:%=${tableDir}/%.gp} ${genePredTables:%=${tableDir}/%.gp} \ ${tabTables:%=${tableDir}/%.tab} # grab subset name from file pattern (this is what tr command below does) ${tableDir}/${tablePre}%${rel}.gp: ${gencodeGp} ${gencodeTsv} @mkdir -p $(dir $@) ${gencodeMakeTracks} $$(echo $* | tr A-Z a-z) ${gencodeGp} ${gencodeTsv} $@.${tmpExt} mv -f $@.${tmpExt} $@ ${tableTagTab}: ${tableAttrsTab} +${tableTranscriptionSupportLevelTab}: ${tableAttrsTab} ${tableAttrsTab}: ${gencodeGp} ${gencodeTsv} @mkdir -p $(dir $@) - ${gencodeMakeAttrs} ${gencodeGp} ${gencodeTsv} $@.${tmpExt} ${tableTagTab} + ${gencodeMakeAttrs} ${gencodeGp} ${gencodeTsv} $@.${tmpExt} ${tableTagTab}.${tmpExt} ${tableTranscriptionSupportLevelTab}.${tmpExt} + mv -f ${tableTranscriptionSupportLevelTab}.${tmpExt} ${tableTranscriptionSupportLevelTab} + mv -f ${tableTagTab}.${tmpExt} ${tableTagTab} mv -f $@.${tmpExt} $@ ${table2WayConsPseudoGp}: ${pseudo2WayGff} @mkdir -p $(dir $@) gff3ToGenePred -allowMinimalGenes $< $@.${tmpExt} mv -f $@.${tmpExt} $@ ${tablePolyAGp}: ${polyAGff} ${gencodeToUcscChain} @mkdir -p $(dir $@) ${gencodePolyaGxfToGenePred} $< ${gencodeToUcscChain} $@.${tmpExt} mv -f $@.${tmpExt} $@ ${tableUniProtTab}: ${tableSwissProtMeta} ${tableTrEMBLMeta} ${gencodeTsv} @mkdir -p $(dir $@) ((${metaFilterCmdGz} ${tableSwissProtMeta} | tawk '{print $$0,"SwissProt"}') && (${metaFilterCmdGz} ${tableTrEMBLMeta} | tawk '{print $$0,"TrEMBL"}')) | sort -k 1,1 > $@.${tmpExt} @@ -324,64 +326,57 @@ ${tableTranscriptSupportTab}: ${tableTranscriptSupportMeta} ${metaFilterDepend} ${copyMetadataTabGz} ${tableExonSupportTab}: ${tableExonSupportMeta} ${gencodeToUcscChain} ${metaFilterDepend} @mkdir -p $(dir $@) ${gencodeExonSupportToTable} ${tableExonSupportMeta} ${gencodeToUcscChain} $@.${tmpExt} mv -f $@.${tmpExt} $@ ${tableGeneSymbolTab}: ${tableGeneSymbolMeta} ${metaFilterDepend} ${copyMetadataTabGz} ${tablePdbTab}: ${tablePdbMeta} ${metaFilterDepend} ${copyMetadataTabGz} ${tablePubMedTab}: ${tablePubMedMeta} ${metaFilterDepend} ${copyMetadataTabGz} ${tableRefSeqTab}: ${tableRefSeqMeta} ${metaFilterDepend} ${copyMetadataTabGz} -${tableTranscriptionSupportLevelTab}: ${tableTranscriptionSupportLevelData} - mkdir -p $(dir $@) - cp $< $@.${tmpExt} - mv -f $@.${tmpExt} $@ - # convert to zero-based, 1/2 open ${tablePolyAFeatureTab}: ${tablePolyAFeatureMeta} ${metaFilterDepend} @mkdir -p $(dir $@) zcat $< | tawk '{print $$1,$$2-1,$$3,$$4,$$5-1,$$6,$$7,$$8}' | sort -k 4,4 -k 5,5n > $@.${tmpExt} mv -f $@.${tmpExt} $@ ${tableAnnotationRemarkTab}: ${tableAnnotationRemarkMeta} ${metaFilterDepend} @mkdir -p $(dir $@) ${metaFilterCmdGz} $< | tawk '{print $$1,gensub("\\\\n|\\\\","","g",$$2)}' | sort -k 1,1 > $@.${tmpExt} mv -f $@.${tmpExt} $@ # drop ENSTR entries that are a hack to support PAR sequences in GTF ${tableEntrezGeneTab}: ${tableEntrezGeneMeta} ${metaFilterDepend} @mkdir -p $(dir $@) zcat $< | tawk '$$1!~/^ENSTR/' | sort -k 1,1 | ${metaFilterCmd} /dev/stdin > $@.${tmpExt} mv -f $@.${tmpExt} $@ ## # intermediate data for ensembl/havana, not loaded into databases ## ${gencodeGp}: ${annotationGff} ${gencodeToUcscChain} @mkdir -p $(dir $@) ${gencodeGxfToGenePred} ${db} ${annotationGff} ${gencodeToUcscChain} $@.${tmpExt} mv -f $@.${tmpExt} $@ -${tableTranscriptionSupportLevelData}: ${metaFilterDepend} touch $@ ${gencodeTsv}: ${annotationGff} @mkdir -p $(dir $@) - ${gencodeGxfToAttrs} --keepGoing ${annotationGff} $@.${tmpExt} --tslTabOut=${tableTranscriptionSupportLevelData}.${tmpExt} - mv -f ${tableTranscriptionSupportLevelData}.${tmpExt} ${tableTranscriptionSupportLevelData} + ${gencodeGxfToAttrs} ${annotationGff} $@.${tmpExt} mv -f $@.${tmpExt} $@ ${targetGencodeTsv}: @mkdir -p $(dir $@) hgsql ${db} -e 'select * from wgEncodeGencodeAttrsV${backmapTargetVer}' > $@.${tmpExt} mv -f $@.${tmpExt} $@ # check attributes so code can be updated to handle new biotypes checkAttrs: ${annotationGff} ${gencodeGxfToAttrs} ${annotationGff} /dev/null ## # load tables # browser commands use static tmp file name, so use lock file to serialize