c5026c9eeaff1077f67822e478031ef9552fbd47
lrnassar
Tue Oct 22 17:35:33 2024 -0700

I made a bonehead move when I first made this validation and did not re-assign the variable. This meant that the md5sum values always were md5sum+file name, and since the file names are always different the otto job always ran. This fixes the validation to only run when there is new data. No RM.

diff --git src/hg/utils/otto/genCC/doGenCC.py src/hg/utils/otto/genCC/doGenCC.py
index 43642089..28f9942 100644
--- src/hg/utils/otto/genCC/doGenCC.py
+++ src/hg/utils/otto/genCC/doGenCC.py
@@ -205,33 +205,33 @@
             chromStart = geneDic['txStart']
             chromEnd = geneDic['txEnd']
             strand = geneDic['strand']
             outputHg19File.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd, "\t".join(line[3:6]),chromStart,chromEnd,"\t".join(line[8:])))
         except:
             n+=1
             print("No match for: "+geneSymbol)
     hg38GenCCbedFile.close()
     outputHg19File.close()
     print("hg19 genCC bed file completed. Total number of failed entries: "+str(n))


 def checkIfUpdateIsNeeded():
     bash("wget -q https://search.thegencc.org/download/action/submissions-export-tsv -O /hive/data/outside/otto/genCC/newSubmission.tsv")
     newMd5sum = bash("md5sum /hive/data/outside/otto/genCC/newSubmission.tsv")
-    newMd5sum.split(" ")[0]
+    newMd5sum = newMd5sum.split(" ")[0]
     oldMd5sum = bash("md5sum /hive/data/outside/otto/genCC/prevSubmission.tsv")
-    oldMd5sum.split(" ")[0]
+    oldMd5sum = oldMd5sum.split(" ")[0]
     if oldMd5sum != newMd5sum:
         return(True)
     else:
         return(False)


 if checkIfUpdateIsNeeded():
     date = str(datetime.now()).split(" ")[0]
     workDir = "/hive/data/outside/otto/genCC/"+date
     bash("mkdir -p "+workDir)
     hg19outPutFile = workDir+"/hg19genCC.bed"
     hg38outPutFile = workDir+"/hg38genCC.bed"
     bash("cp /hive/data/outside/otto/genCC/newSubmission.tsv "+workDir)
     genCCtsvFile = "/hive/data/outside/otto/genCC/newSubmission.tsv"
     buildFileHg38(genCCtsvFile,hg38outPutFile)