7b9d4ae382ad4ef6f30ea743d518543b25da87a7
markd
  Thu Jan 27 17:21:32 2022 -0800
Update gencode import to finish remove of use of older code in a different tree.  This is the first step towards simplifying the build process

diff --git src/hg/makeDb/outside/gencode/gencodeLoad.mk src/hg/makeDb/outside/gencode/gencodeLoad.mk
index e3949af..6c267a2 100644
--- src/hg/makeDb/outside/gencode/gencodeLoad.mk
+++ src/hg/makeDb/outside/gencode/gencodeLoad.mk
@@ -22,33 +22,33 @@
 tmpExt = ${host}.${ppid}.tmp
 SHELL = bash -e
 export SHELLOPTS=pipefail
 
 ##
 # programs, etc
 ##
 mach = $(shell uname -m)
 
 ##
 # Release info and files from Sanger.
 # BEGIN EDIT THESE EACH RELEASE
 ##
 #preRelease = no
 preRelease = yes
-#db = hg38
+db = hg38
 #db = hg19
-db = mm39
+#db = mm39
 #db = mm10
 ifeq (${db},mm10)
     grcRefAssembly = GRCm38
     verBase = M25
     prevVer = M24
     backmapTargetVer = M25
     ver = ${verBase}lift37
     gencodeOrg = Gencode_mouse
     ftpReleaseSubdir = release_${verBase}/GRCm38_mapping
     annGffTypeName = chr_patch_hapl_scaff.annotation
     isBackmap = yes
 else ifeq (${db},mm39)
     grcRefAssembly = GRCm39
     ver = M29
     prevVer = M28
@@ -82,39 +82,39 @@
     # pre-release
     baseUrl = rsync://ftp.ebi.ac.uk/pub/databases/havana/gencode_pre
 else
     # official release
     baseUrl = rsync://ftp.ebi.ac.uk/pub/databases/gencode
 endif
 
 rel = V${ver}
 releaseUrl = ${baseUrl}/${gencodeOrg}/${ftpReleaseSubdir}
 dataDir = data
 relDir = ${dataDir}/release_${ver}
 annotationGff = ${relDir}/gencode.v${ver}.${annGffTypeName}.gff3.gz
 pseudo2WayGff = ${relDir}/gencode.v${ver}.2wayconspseudos.gff3.gz
 polyAGff = ${relDir}/gencode.v${ver}.polyAs.gff3.gz
 
-ccdsBinDir = ~markd/compbio/ccds/ccds2/output/bin/$(mach)/opt
-gencodeMakeTracks = ${ccdsBinDir}/gencodeMakeTracks
-gencodeMakeAttrs = ${ccdsBinDir}/gencodeMakeAttrs
-gencodeExonSupportToTable = ${ccdsBinDir}/gencodeExonSupportToTable
-gencodeGxfToGenePred = ${ccdsBinDir}/gencodeGxfToGenePred
-gencodePolyaGxfToGenePred = ${ccdsBinDir}/gencodePolyaGxfToGenePred
-gencodeGxfToAttrs = ${ccdsBinDir}/gencodeGxfToAttrs
+gencodeBinDir = ${HOME}/kent/src/hg/makeDb/outside/gencode/bin
+gencodeMakeTracks = ${gencodeBinDir}/gencodeMakeTracks
+gencodeMakeAttrs = ${gencodeBinDir}/gencodeMakeAttrs
+gencodeExonSupportToTable = ${gencodeBinDir}/gencodeExonSupportToTable
+gencodeGxfToGenePred = ${gencodeBinDir}/gencodeGxfToGenePred
+gencodePolyaGxfToGenePred = ${gencodeBinDir}/gencodePolyaGxfToGenePred
+gencodeGxfToAttrs = ${gencodeBinDir}/gencodeGxfToAttrs
 buildGencodeToUcscLift = ${HOME}/kent/src/hg/makeDb/outside/gencode/bin/buildGencodeToUcscLift
-gencodeBackMapMetadataIds = ${ccdsBinDir}/gencodeBackMapMetadataIds
+gencodeBackMapMetadataIds = ${gencodeBinDir}/gencodeBackMapMetadataIds
 encodeAutoSqlDir = ${HOME}/kent/src/hg/lib/encode
 
 ##
 # intermediate data not loaded into tracks
 ##
 gencodeGp = ${dataDir}/gencode.gp
 gencodeTsv = ${dataDir}/gencode.tsv
 gencodeToUcscChain = ${dataDir}/gencodeToUcsc.chain
 
 # flag indicating fetch was done
 fetchDone = ${relDir}/done
 
 ##
 # track and table data
 ##
@@ -188,31 +188,30 @@
 tableUniProt = ${tablePre}UniProt${rel}
 tableUniProtTab = ${tableDir}/${tableUniProt}.tab
 
 tablePolyAFeatureMeta = ${relDir}/gencode.v${ver}.metadata.PolyA_feature.gz
 tablePolyAFeature = ${tablePre}PolyAFeature${rel}
 tablePolyAFeatureTab = ${tableDir}/${tablePolyAFeature}.tab
 
 tableAnnotationRemarkMeta = ${relDir}/gencode.v${ver}.metadata.Annotation_remark.gz
 tableAnnotationRemark = ${tablePre}AnnotationRemark${rel}
 tableAnnotationRemarkTab = ${tableDir}/${tableAnnotationRemark}.tab
 
 tableEntrezGeneMeta = ${relDir}/gencode.v${ver}.metadata.EntrezGene.gz
 tableEntrezGene = ${tablePre}EntrezGene${rel}
 tableEntrezGeneTab = ${tableDir}/${tableEntrezGene}.tab
 
-tableTranscriptionSupportLevelData = ${dataDir}/gencode.v${ver}.transcriptionSupportLevel.tab
 tableTranscriptionSupportLevel  = ${tablePre}TranscriptionSupportLevel${rel}
 tableTranscriptionSupportLevelTab = ${tableDir}/${tableTranscriptionSupportLevel}.tab
 
 genePredExtTables = ${tableBasic} ${tableComp} ${tablePseudo}
 genePredTables =
 tabTables = ${tableAttrs} ${tableTag} ${tableGeneSource} \
 	    ${tableTranscriptSource} ${tableTranscriptSupport} \
 	    ${tableGeneSymbol} ${tablePdb} ${tablePubMed} ${tableRefSeq} ${tableUniProt} \
 	    ${tableAnnotationRemark} ${tableEntrezGene}  ${tableTranscriptionSupportLevel}
 ifeq (${isBackmap}, yes)
     targetGencodeTsv = ${dataDir}/target-gencode.tsv
 else
     # these are not included in backmap releases
     genePredTables = ${table2WayConsPseudo}
     genePredExtTables += ${tablePolyA}
@@ -258,33 +257,36 @@
 ${tableEntrezGeneMeta}: ${fetchDone}
 
 ##
 # primary table files
 ##
 mkTables: ${genePredExtTables:%=${tableDir}/%.gp} ${genePredTables:%=${tableDir}/%.gp} \
 	  ${tabTables:%=${tableDir}/%.tab}
 
 # grab subset name from file pattern (this is what tr command below does)
 ${tableDir}/${tablePre}%${rel}.gp: ${gencodeGp} ${gencodeTsv}
 	@mkdir -p $(dir $@)
 	${gencodeMakeTracks}  $$(echo $* | tr A-Z a-z) ${gencodeGp} ${gencodeTsv} $@.${tmpExt}
 	mv -f $@.${tmpExt} $@
 
 ${tableTagTab}: ${tableAttrsTab}
+${tableTranscriptionSupportLevelTab}: ${tableAttrsTab}
 ${tableAttrsTab}: ${gencodeGp} ${gencodeTsv}
 	@mkdir -p $(dir $@)
-	${gencodeMakeAttrs} ${gencodeGp} ${gencodeTsv} $@.${tmpExt} ${tableTagTab}
+	${gencodeMakeAttrs} ${gencodeGp} ${gencodeTsv} $@.${tmpExt} ${tableTagTab}.${tmpExt} ${tableTranscriptionSupportLevelTab}.${tmpExt}
+	mv -f ${tableTranscriptionSupportLevelTab}.${tmpExt} ${tableTranscriptionSupportLevelTab}
+	mv -f ${tableTagTab}.${tmpExt} ${tableTagTab}
 	mv -f $@.${tmpExt} $@
 
 ${table2WayConsPseudoGp}: ${pseudo2WayGff}
 	@mkdir -p $(dir $@)
 	gff3ToGenePred -allowMinimalGenes $< $@.${tmpExt}
 	mv -f $@.${tmpExt} $@
 
 ${tablePolyAGp}: ${polyAGff} ${gencodeToUcscChain}
 	@mkdir -p $(dir $@)
 	${gencodePolyaGxfToGenePred} $< ${gencodeToUcscChain} $@.${tmpExt}
 	mv -f $@.${tmpExt} $@
 
 ${tableUniProtTab}: ${tableSwissProtMeta} ${tableTrEMBLMeta} ${gencodeTsv}
 	@mkdir -p $(dir $@)
 	((${metaFilterCmdGz} ${tableSwissProtMeta} | tawk '{print $$0,"SwissProt"}') && (${metaFilterCmdGz}  ${tableTrEMBLMeta} | tawk '{print $$0,"TrEMBL"}')) | sort -k 1,1 > $@.${tmpExt}
@@ -324,64 +326,57 @@
 ${tableTranscriptSupportTab}: ${tableTranscriptSupportMeta} ${metaFilterDepend}
 	${copyMetadataTabGz}
 ${tableExonSupportTab}: ${tableExonSupportMeta} ${gencodeToUcscChain} ${metaFilterDepend}
 	@mkdir -p $(dir $@)
 	${gencodeExonSupportToTable} ${tableExonSupportMeta} ${gencodeToUcscChain} $@.${tmpExt}
 	mv -f $@.${tmpExt} $@
 ${tableGeneSymbolTab}: ${tableGeneSymbolMeta} ${metaFilterDepend}
 	${copyMetadataTabGz}
 ${tablePdbTab}: ${tablePdbMeta} ${metaFilterDepend}
 	${copyMetadataTabGz}
 ${tablePubMedTab}: ${tablePubMedMeta} ${metaFilterDepend}
 	${copyMetadataTabGz}
 ${tableRefSeqTab}: ${tableRefSeqMeta} ${metaFilterDepend}
 	${copyMetadataTabGz}
 
-${tableTranscriptionSupportLevelTab}: ${tableTranscriptionSupportLevelData}
-	mkdir -p $(dir $@)
-	cp $< $@.${tmpExt}
-	mv -f $@.${tmpExt} $@
-
 # convert to zero-based, 1/2 open
 ${tablePolyAFeatureTab}: ${tablePolyAFeatureMeta} ${metaFilterDepend}
 	@mkdir -p $(dir $@)
 	zcat $< | tawk '{print $$1,$$2-1,$$3,$$4,$$5-1,$$6,$$7,$$8}' | sort -k 4,4 -k 5,5n > $@.${tmpExt}
 	mv -f $@.${tmpExt} $@
 ${tableAnnotationRemarkTab}: ${tableAnnotationRemarkMeta} ${metaFilterDepend}
 	@mkdir -p $(dir $@)
 	${metaFilterCmdGz} $<  | tawk '{print $$1,gensub("\\\\n|\\\\","","g",$$2)}' | sort -k 1,1 > $@.${tmpExt}
 	mv -f $@.${tmpExt} $@
 # drop ENSTR entries that are a hack to support PAR sequences in GTF
 ${tableEntrezGeneTab}: ${tableEntrezGeneMeta} ${metaFilterDepend}
 	@mkdir -p $(dir $@)
 	zcat $< | tawk '$$1!~/^ENSTR/' | sort -k 1,1 | ${metaFilterCmd} /dev/stdin > $@.${tmpExt}
 	mv -f $@.${tmpExt} $@
 
 ##
 # intermediate data for ensembl/havana, not loaded into databases
 ##
 ${gencodeGp}: ${annotationGff} ${gencodeToUcscChain}
 	@mkdir -p $(dir $@)
 	${gencodeGxfToGenePred} ${db} ${annotationGff} ${gencodeToUcscChain} $@.${tmpExt}
 	mv -f $@.${tmpExt} $@
 
-${tableTranscriptionSupportLevelData}: ${metaFilterDepend}
 	touch $@
 ${gencodeTsv}: ${annotationGff}
 	@mkdir -p $(dir $@)
-	${gencodeGxfToAttrs} --keepGoing ${annotationGff} $@.${tmpExt} --tslTabOut=${tableTranscriptionSupportLevelData}.${tmpExt}
-	mv -f ${tableTranscriptionSupportLevelData}.${tmpExt} ${tableTranscriptionSupportLevelData}
+	${gencodeGxfToAttrs} ${annotationGff} $@.${tmpExt}
 	mv -f $@.${tmpExt} $@
 
 ${targetGencodeTsv}:
 	@mkdir -p $(dir $@)
 	hgsql ${db}  -e 'select * from wgEncodeGencodeAttrsV${backmapTargetVer}' > $@.${tmpExt}
 	mv -f $@.${tmpExt} $@
 
 
 # check attributes so code can be updated to handle new biotypes
 checkAttrs: ${annotationGff}
 	${gencodeGxfToAttrs} ${annotationGff} /dev/null
 
 ##
 # load tables
 #  browser commands use static tmp file name, so use lock file to serialize