src/hg/tcga/scripts/processData.py 1.7
1.7 2009/04/25 02:17:38 sbenz
Added pipeline for baylor OV data
Index: src/hg/tcga/scripts/processData.py
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/scripts/processData.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -b -B -U 1000000 -r1.6 -r1.7
--- src/hg/tcga/scripts/processData.py 17 Nov 2008 05:23:17 -0000 1.6
+++ src/hg/tcga/scripts/processData.py 25 Apr 2009 02:17:38 -0000 1.7
@@ -1,262 +1,294 @@
import os, sys, fnmatch, string, math, getopt
from DataTypes import *
def usage():
    """Print the command-line help for processData to standard output.

    The list of type names mirrors the values that handleOpts() accepts for
    -t/--type, so every handled type has exactly one entry here.
    """
    helpLines = (
        "processData [-options]",
        "\t-h, --help\tthis usage statement",
        "\t-t, --type=<string>\tType of data to process",
        "\twhere <string> can be:",
        "\t\tmethyl2 -- JHU-USC Methylation - OMA002: Cy3 / Cy5 ratio",
        "\t\tmethyl2Beta -- JHU-USC Methylation - OMA002: Avg Beta Values",
        "\t\tmethyl3 -- JHU-USC Methylation - OMA003: Cy3 / Cy5 ratio",
        "\t\tmethyl3Beta -- JHU-USC Methylation - OMA003: Avg Beta Values",
        "\t\tharvard -- Harvard CGH 244A GBM",
        "\t\tharvardOV -- Harvard CGH 244A Ovarian",
        "\t\tmskcc -- MSKCC 244A",
        "\t\thuEx -- human exon array",
        "\t\tbroadABI -- Broad ABI SNP data -- GBM",
        # The ovarian Broad set is selected with "broadOVABI", not "broadABI".
        "\t\tbroadOVABI -- Broad ABI SNP data -- Ovarian",
        "\t\tbaylorABI -- Baylor ABI SNP data",
        "\t\tbaylorOVABI -- Baylor ABI SNP data -- Ovarian",
        "\t\twustlABI -- WUSTL ABI SNP Data",
        "\t\tsnp6 -- Broad SNP 6.0",
        "",
    )
    for line in helpLines:
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print(line)
def _makeArrayData(dataClass, baseDir, pattern, bed12Name, bed15Name, prefix):
    """Instantiate dataClass for one file pattern, with both BED paths placed under baseDir."""
    return dataClass(baseDir, [pattern], baseDir + bed12Name,
                     baseDir + bed15Name, prefix)


def _makeAbi(baseDir, bed15Name, prefix, fields, collapseStatus, collapseClass):
    """Instantiate ABI for the "*.maf" files under baseDir and attach its column/collapse settings.

    fields is a 7-tuple of input column names in the order
    (chrom, start, stop, strand, status, class, sample id).
    """
    data = ABI(baseDir, ["*.maf"], "", baseDir + bed15Name, prefix)
    data.bed12Files = []
    data.chromField = fields[0]
    data.startField = fields[1]
    data.stopField = fields[2]
    data.strandField = fields[3]
    data.statusField = fields[4]
    data.classField = fields[5]
    data.sampleIdField = fields[6]
    data.collapseStatus = collapseStatus
    data.collapseClass = collapseClass
    return data


def _broadMafSettings():
    """Return fresh (fields, collapseStatus, collapseClass) used by the broadABI and broadOVABI types."""
    fields = ("Chromosome", "Start_position", "End_position", "Strand",
              "Mutation_Status", "Variant_Classification",
              "Tumor_Sample_Barcode")
    collapseStatus = {
        "Somatic": ["Somatic"],
        "Germline": ["Germline", "LOH", "Unknown"],
    }
    collapseClass = {
        "Quiet": ["Silent", "Targeted_Region"],
        "Loud": ["Frame_Shift_Del", "Frame_Shift_Ins",
                 "In_Frame_Del", "In_Frame_Ins",
                 "Missense_Mutation", "Nonsense_Mutation",
                 "Splice_Site_Indel", "Splice_Site_SNP"],
    }
    return fields, collapseStatus, collapseClass


def _baylorMafSettings(sampleIdField):
    """Return fresh (fields, collapseStatus, collapseClass) used by the baylorABI and baylorOVABI types."""
    fields = ("CHROM", "START_POSITION", "END_POSITION", "STRAND",
              "MUTATION_STATUS", "VARIANT_CLASSIFICATION", sampleIdField)
    collapseStatus = {
        "Somatic": ["Somatic", '"somatic, homozygous"',
                    '"somatic, heterozygous"'],
        "Germline": ["Germline", "LOH", "Unknown", "germline"],
    }
    collapseClass = {
        "Quiet": ["Silent", "Targeted_Region", "Synonymous"],
        "Loud": ["frame_shift", "Frame_Shift_Del", "Frame_Shift_Ins",
                 "in_frame_insertion", "in_frame_deletion",
                 "In_Frame_Del", "In_Frame_Ins", "Splice_Site_Indel",
                 "Splice_Site_SNP", "Splice_site",
                 "Nonsense_Mutation", "Missense_Mutation", "Missense",
                 "Nonsense"],
    }
    return fields, collapseStatus, collapseClass


def _wustlMafSettings():
    """Return fresh (fields, collapseStatus, collapseClass) used by the wustlABI type."""
    fields = ("Chromosome", "Start_position", "End_position", "Strand",
              "Mutation_Status", "Variant_Classification",
              "Tumor_Sample_Barcode")
    collapseStatus = {
        "Somatic": ["Somatic"],
        "Germline": ["Germline", "LOH", "Unknown", "Valid"],
    }
    collapseClass = {
        "Quiet": ["Silent_Mutation", "Targeted_Region"],
        "Loud": ["Frame_Shift_Del", "Frame_Shift_Ins",
                 "In_Frame_Del", "In_Frame_Ins",
                 "Missense_Mutation", "Nonsense_Mutation",
                 "Splice_Site_Del", "Splice_Site_Ins", "Splice_Site_SNP"],
    }
    return fields, collapseStatus, collapseClass


def handleOpts(argv):
    """Parse the command line and build the data object for the requested type.

    argv -- the argument list without the program name (as passed by main()).

    Recognised options are -h/--help and -t/--type=<string>.  Returns the
    object constructed for the selected type.  Prints the usage text and
    terminates via sys.exit() when help is requested or when the type is
    missing or not recognised.  getopt.GetoptError raised for malformed
    options is not caught here.
    """
    opts, args = getopt.getopt(argv, "ht:", ["help", "type="])

    # Named dataType rather than "type" so the builtin is not shadowed.
    dataType = ""
    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("-t", "--type"):
            dataType = a
        else:
            print("Unhandled option")
            usage()
            sys.exit()

    methylDir = "/data/TCGA/O_jhuUscMethyl/working/"

    if dataType == "methyl2":
        Data = _makeArrayData(MethylData, methylDir,
                              "*OMA002*cy3-cy5-value.txt",
                              "jhuUscMethyl_OMA002.bed",
                              "jhuUscMethyl2_bed15.bed",
                              "jhuUscMethyl2")
    elif dataType == "methyl3":
        Data = _makeArrayData(MethylData, methylDir,
                              "*OMA003*cy3-cy5-value.txt",
                              "jhuUscMethyl_OMA003.bed",
                              "jhuUscMethyl3_bed15.bed",
                              "jhuUscMethyl3")
    elif dataType == "methyl2Beta":
        Data = _makeArrayData(MethylBetaData, methylDir,
                              "*OMA002*beta-value.txt",
                              "jhuUscMethyl_OMA002.bed",
                              "jhuUscMethylBeta2_bed15.bed",
                              "jhuUscMethyl2Beta")
    elif dataType == "methyl3Beta":
        # Select the OMA003 beta-value files so the inputs agree with the
        # OMA003 probe BED used below (this previously globbed *OMA002*).
        Data = _makeArrayData(MethylBetaData, methylDir,
                              "*OMA003*beta-value.txt",
                              "jhuUscMethyl_OMA003.bed",
                              "jhuUscMethyl3Beta_bed15.bed",
                              "jhuUscMethyl3Beta")
    elif dataType == "harvard":
        Data = _makeArrayData(CGH244A, "/data/TCGA/O_harvardCGH/working/",
                              "*data.txt",
                              "agilentCGH244A.bed",
                              "harvardCGH244A_bed15.bed",
                              "harvardCGH")
    elif dataType == "harvardOV":
        Data = _makeArrayData(CGH244A, "/data/TCGA/O_harvardOVCGH/working/",
                              "*data.txt",
                              "agilentCGH244A.bed",
                              "harvardOVCGH244A_bed15.bed",
                              "harvardOVCGH")
    elif dataType == "mskcc":
        Data = _makeArrayData(CGH244A, "/data/TCGA/O_mskccCGH/working/",
                              "*data.txt",
                              "agilentCGH244A.bed",
                              "mskccCGH244A_bed15.bed",
                              "mskccCGH")
    elif dataType == "huEx":
        Data = _makeArrayData(HuEx, "/data/TCGA/C_lblHuEx/working/",
                              "*data.txt",
                              "affyHuEx1_bed12.bed",
                              "lblHuEx1_bed15.bed",
                              "lblHuEx1")
    elif dataType == "snp6":
        Data = _makeArrayData(SNP6, "/data/TCGA/C_broadSNP6.0/working/",
                              "*copynumber.txt",
                              "GenomeWideSNP_6.bed",
                              "broadSNP6_bed15.bed",
                              "broadSNP6")
    elif dataType == "broadABI":
        # NOTE(review): the bed15 name reuses the snp6 file name
        # ("broadSNP6_bed15.bed"); kept as-is -- confirm it is intended.
        Data = _makeAbi("/data/TCGA/C_broadABI/working/",
                        "broadSNP6_bed15.bed", "broadABI",
                        *_broadMafSettings())
    elif dataType == "broadOVABI":
        # NOTE(review): same SNP6-style bed15 name as broadABI -- confirm.
        Data = _makeAbi("/data/TCGA/C_broadOVABI/working/",
                        "broadOVSNP6_bed15.bed", "broadOVABI",
                        *_broadMafSettings())
    elif dataType == "baylorABI":
        Data = _makeAbi("/data/TCGA/C_baylorABI/working/",
                        "baylorABI_bed15.bed", "hgscABI",
                        *_baylorMafSettings("TUMOR_SAMPLE_BARCODE"))
    elif dataType == "baylorOVABI":
        # Same settings as baylorABI except for the sample id column name.
        Data = _makeAbi("/data/TCGA/OV/mutation/hgsc.bcm.edu_OV.ABI.1.1.0/",
                        "baylorOVABI_bed15.bed", "hgscOVABI",
                        *_baylorMafSettings("TUMOR_SAMPLE_ID"))
    elif dataType == "wustlABI":
        Data = _makeAbi("/data/TCGA/C_wustlABI/working/",
                        "wustlABI_bed15.bed", "wustlABI",
                        *_wustlMafSettings())
    else:
        # Also reached when no -t/--type option was given (dataType == "").
        print("Unhandled type: " + dataType)
        usage()
        sys.exit()

    return Data
def main(argv):
    """Run the processing pipeline for the data type selected on the command line.

    argv -- the argument list without the program name; it is handed to
    handleOpts(), which returns the object to operate on.  Each pipeline
    step is announced on standard output immediately before it is invoked.
    """
    dataObject = handleOpts(argv)
    if dataObject is None:
        return

    # (progress message, method name) pairs, executed strictly in this order.
    pipeline = (
        ("Processing data...", "process"),
        ("Merging data into single file...", "mergeFiles"),
        ("Converting merged data to BED15...", "convertToBed15"),
        ("Writing entry for microarrayGroups.ra file...", "writeRaFile"),
    )
    for message, methodName in pipeline:
        print(message)
        # Look the method up only when its turn comes, as a direct call would.
        getattr(dataObject, methodName)()
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    cliArgs = sys.argv[1:]  # drop the program name
    main(cliArgs)