d17cf65947b999bfaf88af1fc6eaaff2c1f1080a mspeir Wed Jun 25 10:54:12 2025 -0700 changing hgdownload urls from cse -> gi diff --git src/cbPyLib/cellbrowser/genes.py src/cbPyLib/cellbrowser/genes.py index 0bb3f14..cbdac9f 100755 --- src/cbPyLib/cellbrowser/genes.py +++ src/cbPyLib/cellbrowser/genes.py @@ -204,58 +204,58 @@ outFname = getStaticPath(getGeneSymPath(geneType)) writeRows(rows, outFname) def iterGencodePairs(release, doTransGene=False): " generator, yields geneId,symbol or transId,geneId pairs for a given gencode release" # e.g. trackName = "wgEncodeGencodeBasicV34" #attrFname = trackName.replace("Basic", "Attrs").replace("Comp", "Attrs") #assert(release[1:].isdigit()) db = "hg38" if release[0]=="M": db = "mm10" if int(release.strip("M"))>=26: db='mm39' if release in ["7", "14", "17", "19"] or "lift" in release: db = "hg19" - url = "https://hgdownload.cse.ucsc.edu/goldenPath/%s/database/wgEncodeGencodeAttrsV%s.txt.gz" % (db, release) + url = "https://hgdownload.gi.ucsc.edu/goldenPath/%s/database/wgEncodeGencodeAttrsV%s.txt.gz" % (db, release) logging.info("Downloading %s" % url) doneIds = set() lines = downloadUrlLines(url) for line in lines: row = line.rstrip("\n").split("\t") if doTransGene: # key = transcript ID, val is geneId key = row[4] val = row[0] val = val else: # key = geneId, val is symbol key = row[0] key = key val = row[1] if key not in doneIds: yield key, val doneIds.add(key) def iterGencodeBed(db, release): " generator, yields a BED12+1 with a 'canonical' transcript for every gencode comprehensive gene " transToGene = dict(iterGencodePairs(release, doTransGene=True)) - url = "http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/wgEncodeGencodeCompV%s.txt.gz" % (db, release) + url = "http://hgdownload.gi.ucsc.edu/goldenPath/%s/database/wgEncodeGencodeCompV%s.txt.gz" % (db, release) logging.info("Downloading %s" % url) geneToTransList = defaultdict(list) for line in downloadUrlLines(url): row = tuple(line.split('\t')) transId = row[1] geneId = transToGene[transId] score = int(''.join(c for c in geneId if c.isdigit())) # extract only the xxx part of the ENSGxxx ID geneToTransList[geneId].append( (score, row) ) logging.info("Picking one transcript per gene") for geneId, transList in iterItems(geneToTransList): transList.sort() # prefer older transcripts canonTransRow = transList[0][1] binIdx, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, score, name2, cdsStartStat, cdsEndStat, exonFrames = canonTransRow blockStarts = [] @@ -362,34 +362,34 @@ sep = "\n" print("Pre-built gene model mapping files available for 'fetch' at %s" % url) print(sep.join(geneFnames)) #for g in geneFnames: #print(g.replace(".bed.gz","")) print() print("Pre-built geneId/symbol tables available for 'fetch' at %s" % url) print(sep.join(symFnames)) #for g in symFnames: #print(g.replace(".symbols.tsv.gz", "")) def listModelRemoteBuild(): sep = "\n" - urls = [("hg38", "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/"), - ("mm10", "https://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/"), - ("mm39", "https://hgdownload.cse.ucsc.edu/goldenPath/mm39/database/"), - ("hg19", "https://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/") + urls = [("hg38", "https://hgdownload.gi.ucsc.edu/goldenPath/hg38/database/"), + ("mm10", "https://hgdownload.gi.ucsc.edu/goldenPath/mm10/database/"), + ("mm39", "https://hgdownload.gi.ucsc.edu/goldenPath/mm39/database/"), + ("hg19", "https://hgdownload.gi.ucsc.edu/goldenPath/hg19/database/") ] allNames = defaultdict(list) for db, url in urls: print() print("Files available for 'build' for assembly %s (%s)" % (db, url)) lines = downloadUrlLines(url) fnames = parseApacheDir(lines) geneFnames = [x for x in fnames if x.startswith("wgEncodeGencodeAttrs") and x.endswith(".txt.gz")] relNames = [x.replace("wgEncodeGencodeAttrsV", "gencode-").replace(".txt.gz", "") for x in geneFnames] allNames[db].extend(relNames) print(sep.join(relNames)) #for db, names in allNames.items(): #for name in names: