dc808927b55b21c72db0cd528f76624fcba339dd max Mon Jan 24 05:15:33 2022 -0800 adding igv-style chrom alias support to chromToUcsc, refs #28722 diff --git src/utils/chromToUcsc/chromToUcsc src/utils/chromToUcsc/chromToUcsc index 31ce38d..c1f884f 100755 --- src/utils/chromToUcsc/chromToUcsc +++ src/utils/chromToUcsc/chromToUcsc @@ -78,48 +78,64 @@ yield lineNo, " ", row, line # *step lines are always space-separated else: if sep==-1: # why test first for space? Because after fixedStep, the lines are just single numbers if "\t" in line: sep = "\t" elif " " in line: sep = None # = split on any whitespace, consec. whitespc counts as one else: sep = "\t" # default is to split on tab row = line.rstrip("\n\r").split(sep) if sep is None: sep = " " yield lineNo, sep, row, line +def parseNewAlias(ifh): + " IGV-compatible format: first is UCSC, all other columns are aliases " + toUcsc = {} + for line in ifh: + if line.startswith("#"): + continue + row = line.rstrip("\n").split("\t") + for i in range(1, len(row)): + toUcsc[row[i]] = row[0] + return toUcsc + def parseAlias(fname): " parse tsv file with at least two columns, orig chrom name and new chrom name " + logging.debug("alias file is in IGV-format") toUcsc = {} if fname.startswith("http://") or fname.startswith("https://"): ifh = urlopen(fname) if fname.endswith(".gz"): data = gzip.GzipFile(fileobj=ifh).read().decode() ifh = data.splitlines() elif fname.endswith(".gz"): ifh = gzip.open(fname, "rt") else: ifh = open(fname) + firstLine = True for line in ifh: + if line.startswith("#") and firstLine: + return parseNewAlias(ifh) if line.startswith("alias"): continue row = line.rstrip("\n").split("\t") toUcsc[row[0]] = row[1] + firstLine = False return toUcsc def handledUnmappedChrom(chrom, skipUnknown, skipWarned, message): "either generate an error or warning when an unknown chromosome is encountered." if skipUnknown: if chrom not in skipWarned: logging.warning(message) skipWarned.add(chrom) else: logging.error(message) exit(1) def chromToUcsc(aliasFname, fieldIdx, skipUnknown, ifh, ofh): " convert column number fieldIdx to UCSC-style chrom names "