59ed2ee65401685e0a3f7c9e4277bdc1e6bde82e chmalee Mon May 6 10:35:21 2019 -0700 Small script for translating dgv gff to bed12, refs #23371 diff --git src/hg/utils/automation/translateDgvGold.py src/hg/utils/automation/translateDgvGold.py new file mode 100755 index 0000000..9e72afc --- /dev/null +++ src/hg/utils/automation/translateDgvGold.py @@ -0,0 +1,114 @@ +#!/bin/env python +""" +Parses the Gold-standard copy number variants GFF from DGV and writes +a tab separated bed 12 with extra fields to stdout. + +Example: +./translateDgvGold.py input.gff | sort -k1,1 -k2,2n | uniq > output.bed12 +""" + +import sys + +header = "chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\t" \ + "itemRgb\tblockCount\tblockSizes\tblockStarts" + +# gain blue, loss red +colors = {"Gain": "0,0,200", "Loss": "200,0,0"} + +def parseExtraInfo(extraInfoList, doHeader=False): + """ Parses the 9th field of the DGV Gold GFF and optionally assigns the header.""" + info = [] + extraHeader = [] + for x in extraInfoList: + s = x.split("=") + if doHeader: + extraHeader.append(s[0]) + info.append(s[1]) + + name = info[1] + subType = info[3] + outerStart = info[4] + innerStart = info[5] + innerEnd = info[6] + outerEnd = info[7] + + # delete fields that are gonna be in the normal bed fields + info.remove(name) + info.remove(outerStart) + info.remove(innerStart) + info.remove(innerEnd) + info.remove(outerEnd) + extraString = "\t".join(info) + + if doHeader: + global header + extraHeader.remove("Name") + extraHeader.remove("outer_start") + extraHeader.remove("inner_start") + extraHeader.remove("inner_end") + extraHeader.remove("outer_end") + header += "\t" + "\t".join(extraHeader) + + return name, subType, outerStart, innerStart, innerEnd, outerEnd, extraString + +### main ### + +if len(sys.argv) != 1: + print("Usage: translateDgvGold.py input.gff | sort -k1,1 -k2,2n | uniq > out.bed12\n"\ + "\nWrites a bed12 to stdout.\n") + sys.exit(1) + +doHeader = True + +for line in open(sys.argv[1]): + gffFields = line.strip().split("\t") + extraInfo = gffFields[8].split(";") + chrom = gffFields[0] + name, subType, outerStart, innerStart, innerEnd, chromEnd, extraString = parseExtraInfo(extraInfo, doHeader) + if subType not in colors: + sys.exit("CNV subtype: %s not recognized.\n" % subType) + + chromStart = str(int(outerStart) - 1) + + # one block + if innerStart == outerStart and innerEnd == chromEnd: + blockCount = "1" + blockSizes = str(int(chromEnd) - int(chromStart)) + blockStarts = "0" + # no left intron + elif outerStart == innerStart: + blockCount = "2" + blockStarts = "0," + str(int(chromEnd) - int(chromStart) - 1) + blockSizes = str(int(innerEnd) - int(chromStart)) + ",1" + # no right intron + elif innerEnd == chromEnd: + blockCount = "2" + blockStarts = "0," + str(int(innerEnd) - (int(innerEnd) - int(innerStart)) - int(chromStart)) + blockSizes = "1," + str(int(innerEnd) - int(innerStart)) + # two introns + else: + blockCount = "3" + blockSizes = "1,"+ str(int(innerEnd) - int(innerStart))+",1" + blockStarts = "0," + \ + str(int(innerStart)-int(chromStart)) + \ + ","+str(int(chromEnd)-int(chromStart)-1) + + bedLine = [chrom, + chromStart, + chromEnd, + name, + "0", + ".", + chromEnd, + chromEnd, + colors[subType], + blockCount, + blockSizes, + blockStarts, + extraString] + + if doHeader: + print("#" + header) + + print("\t".join(bedLine)) + doHeader = False