src/hg/tcga/scripts/processData.py 1.10

1.10 2009/09/12 01:39:07 jsanborn
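Fixed a few bugs: updated the bed12 file name used by the harvard, harvardOV and mskcc types, the bed15 file name used by broadOVABI, and the base directory used by baylorOVABI. Also added the rnaSeqOV (RNA Seq DGE Ovarian) data type.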
Index: src/hg/tcga/scripts/processData.py
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/scripts/processData.py,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 1000000 -r1.9 -r1.10
--- src/hg/tcga/scripts/processData.py	12 Aug 2009 05:05:24 -0000	1.9
+++ src/hg/tcga/scripts/processData.py	12 Sep 2009 01:39:07 -0000	1.10
@@ -1,377 +1,388 @@
 import os, sys, fnmatch, string, math, getopt
 from DataTypes import *
 
 def usage():
     print "processData [-options]"
     print "\t-h, --help\tthis usage statement"
     print "\t-t, --type=<string>\tType of data to process"
     print "\twhere <string> can be:"
     print "\t\tmethyl2 -- JHU-USC Methylation - OMA002: Cy3 / Cy5 ratio"
     print "\t\tmethyl2Beta -- JHU-USC Methylation - OMA002: Avg Beta Values"
     print "\t\tmethyl3 -- JHU-USC Methylation - OMA003: Cy3 / Cy5 ratio"
     print "\t\tmethyl3Beta -- JHU-USC Methylation - OMA003: Avg Beta Values"
     print "\t\tharvard -- Harvard CGH 244A GBM"
     print "\t\tharvardOV -- Harvard CGH 244A Ovarian"
     print "\t\tmskcc -- MSKCC 244A"
     print "\t\tmskccOV -- MSKCC Ovarian 244A"
     print "\t\tmskccOV1x1M - MSKCC Ovarian 1x1M" 
     print "\t\thuEx -- human exon array"
     print "\t\tbroadABI -- Broad ABI SNP data -- GBM"
     print "\t\tbroadABI -- Broad ABI SNP data -- Ovarian"
     print "\t\tbaylorABI -- Baylor ABI SNP data"
     print "\t\twustlABI -- WUSTL ABI SNP Data"
     print "\t\tsnp6 -- Broad SNP 6.0"
     print "\t\tbroadAffyOV -- Affy U133A Ovarian"
     print "\t\tuncOVG4502  -- Agilent G4502 Ovarian"
     print "\t\tharvardOVCGH415 -- Agilent G4124A 415K CGH Ovarian"
     print "\t\tuncOVMiRNA -- Agilent MiRNA Ovarian"
     print "\t\tjhuMethylOV -- Methylation27 Ovarian"
+    print "\t\trnaSeqOV -- RNA Seq DGE Ovarian"
     print ""
     
 def handleOpts(argv):
     opts, args = getopt.getopt(argv, "ht:", ["help", "type="])
 
     type = ""
     
     for o, a in opts:
         if o in ("-h", "--help"):
             usage()
             sys.exit()
         elif o in ("-t", "--type"):
             type = a
         else:
             print "Unhandled option"
             usage()
             sys.exit()
 
     if type == "methyl2":
         baseDir = "/data/TCGA/O_jhuUscMethyl/working/"
         patterns = ["*OMA002*cy3-cy5-value.txt"]
         bed12file = baseDir + "jhuUscMethyl_OMA002.bed"
         bed15file = baseDir + "jhuUscMethyl2_bed15.bed"
         prefix = "jhuUscMethyl2"
             
         Data = MethylData(baseDir, patterns, bed12file, bed15file, prefix)
 
     elif type == "methyl3":
         baseDir = "/data/TCGA/O_jhuUscMethyl/working/"
         patterns = ["*OMA003*cy3-cy5-value.txt"]
         bed12file = baseDir + "jhuUscMethyl_OMA003.bed"
         bed15file = baseDir + "jhuUscMethyl3_bed15.bed"
         prefix = "jhuUscMethyl3"
             
         Data = MethylData(baseDir, patterns, bed12file, bed15file, prefix)
 
     elif type == "methyl2Beta":
         baseDir = "/data/TCGA/O_jhuUscMethyl/working/"
         patterns = ["*OMA002*beta-value.txt"]
         bed12file = baseDir + "jhuUscMethyl_OMA002.bed"
         bed15file = baseDir + "jhuUscMethylBeta2_bed15.bed"
         prefix = "jhuUscMethyl2Beta"
 
         Data = MethylBetaData(baseDir, patterns, bed12file, bed15file, prefix)
 
     elif type == "methyl3Beta":
         baseDir = "/data/TCGA/O_jhuUscMethyl/working/"
         patterns = ["*OMA002*beta-value.txt"]
         bed12file = baseDir + "jhuUscMethyl_OMA003.bed"
         bed15file = baseDir + "jhuUscMethyl3Beta_bed15.bed"
         prefix = "jhuUscMethyl3Beta"
 
         Data = MethylBetaData(baseDir, patterns, bed12file, bed15file, prefix)
 
     elif type == "jhuMethylOV":
         baseDir = "/data/TCGA/O_jhuMethylOV/working/"
         patterns = ["*lvl-2*"]
         bed12file = baseDir + "methylIllumina.bed"
         bed15file = baseDir + "jhuMethylOV.bed"
         prefix = "jhuMethyOV"
 
         Data = Methyl27Data(baseDir, patterns, bed12file, bed15file, prefix)
         Data.dataStartIndex = 1
         Data.medianNormalize = 1
         
     elif type == "harvard":
         baseDir = "/data/TCGA/O_harvardCGH/working/"
         patterns = ["*data.txt"]
-        bed12file = baseDir + "agilentCGH244A.bed"
+        bed12file = baseDir + "agilentCgh244a_bed12.bed"
         bed15file = baseDir + "harvardCGH244A_bed15.bed"
         prefix = "harvardCGH"
         
         Data = CGH244A(baseDir, patterns, bed12file, bed15file, prefix)
 
     elif type == "harvardOV":
         baseDir = "/data/TCGA/O_harvardOVCGH/working/"
         patterns = ["*.data.txt"]
-        bed12file = baseDir + "agilentCGH244A.bed"
+        bed12file = baseDir + "agilentCgh244a_bed12.bed"
         bed15file = baseDir + "harvardOVCGH244A_bed15.bed"
         prefix = "harvardOVCGH"
         
         Data = CGH244A(baseDir, patterns, bed12file, bed15file, prefix)
 
     elif type == "harvardOVCGH415":
         baseDir = "/data/TCGA/O_harvardOVCGH415/working/"
         patterns = ["*.data.txt"]
         bed12file = baseDir + "agilentG4124A.bed"
         bed15file = baseDir + "harvardOVCGH415_bed15.bed"
         prefix = "harvardOVCGH415"
         
         Data = CGH244A(baseDir, patterns, bed12file, bed15file, prefix)
 
     elif type == "broadAffyOV":
         baseDir = "/data/TCGA/O_broadAffyOV/working/"
         patterns = ["*data.txt"]
         bed12file = baseDir + "affyU133A.bed"
         bed15file = baseDir + "broadAffyOV_bed15.bed"
         prefix = "broadAffyOV"
         
         Data = AffyU133A(baseDir, patterns, bed12file, bed15file, prefix)
         Data.medianNormalize = 1
         
     elif type == "uncOVG4502":
         baseDir = "/data/TCGA/O_uncOVG4502/working/"
         patterns = ["*data.txt"]
         bed12file = baseDir + "agilentG4502A.bed"
         bed15file = baseDir + "uncOVG4502_bed15.bed"
         prefix = "uncOVG4502"
         
         Data = AffyU133A(baseDir, patterns, bed12file, bed15file, prefix)
         Data.medianNormalize = 1
         
     elif type == "mskcc":
         baseDir = "/data/TCGA/O_mskccCGH/working/"
         patterns = ["*data.txt"]
-        bed12file = baseDir + "agilentCGH244A.bed"
+        bed12file = baseDir + "agilentCgh244a_bed12.bed"
         bed15file = baseDir + "mskccCGH244A_bed15.bed"
         prefix = "mskccCGH"
         
         Data = CGH244A(baseDir, patterns, bed12file, bed15file, prefix)
         Data.dataStartIndex = 3
         
     elif type == "mskccOV":
         baseDir = "/data/TCGA/O_mskccOVCGH/working/"
         patterns = ["*transformation"]
         bed12file = baseDir + "agilentCGH244A.bed"
         bed15file = baseDir + "mskccOVCGH244A_bed15.bed"
         prefix = "mskccOVCGH"
                 
         Data = CGH244A(baseDir, patterns, bed12file, bed15file, prefix)
         Data.dataStartIndex = 2
 
     elif type == "mskccOV1x1M":
         baseDir = "/data/TCGA/O_mskccOV1x1M/working/"
         patterns = ["*data.txt"]
         bed12file = baseDir + "agilent1x1M.bed"
         bed15file = baseDir + "mskccOV1x1M_bed15.bed"
         prefix = "mskccOV1x1M"
                 
         Data = CGH1x1M(baseDir, patterns, bed12file, bed15file, prefix)
         Data.dataStartIndex = 3
 
     elif type == "uncOVMiRNA":
         baseDir = "/data/TCGA/O_uncOVMiRNA/working/"
         patterns = ["*data.txt"]
         bed12file = baseDir + "miRNA_8x15K.bed"
         bed15file = baseDir + "uncOVHmiRNA_bed15.bed"
         prefix = "uncOVHmiRNA"
                 
         Data = MiRNA(baseDir, patterns, bed12file, bed15file, prefix)
         Data.medianNormalize = 1
         
     elif type == "huEx":
         baseDir = "/data/TCGA/C_lblHuEx/working/"
         patterns = ["*data.txt"]
         bed12file = baseDir + "affyHuEx1_bed12.bed"
         bed15file = baseDir + "lblHuEx1_bed15.bed"
         prefix = "lblHuEx1"
         
         Data = HuEx(baseDir, patterns, bed12file, bed15file, prefix)
 
     elif type == "snp6":
         baseDir = "/data/TCGA/C_broadSNP6.0/working/"
         patterns = ["*copynumber.txt"]
         bed12file = baseDir + "GenomeWideSNP_6.bed"
         bed15file = baseDir + "broadSNP6_bed15.bed"
         prefix = "broadSNP6"
         
         Data = SNP6(baseDir, patterns, bed12file, bed15file, prefix)
 
+    elif type == "rnaSeqOV":
+        baseDir = "/data/TCGA/O_rnaSeqOV/working/"
+        patterns = ["*genes.txt"]
+        bed12file = baseDir + "rnaSeq32samples.bed"
+        bed15file = baseDir + "rnaSeqOV_bed15.bed"
+        prefix = "rnaSeqOV"
+        
+        Data = RNASeq(baseDir, patterns, bed12file, bed15file, prefix)
+        Data.medianNormalize = 1
+        
     elif type == "broadABI":
         baseDir = "/data/TCGA/C_broadABI/working/"
         patterns = ["*.maf"]
         bed12file = ""
         bed15file = baseDir + "broadSNP6_bed15.bed"
         prefix = "broadABI"
 
         Data = ABI(baseDir, patterns, bed12file, bed15file, prefix)
 
         Data.bed12Files = []
         Data.chromField = "Chromosome"
         Data.startField = "Start_position"
         Data.stopField = "End_position"
         Data.strandField = "Strand"
         Data.statusField = "Mutation_Status"
         Data.classField = "Variant_Classification"
         Data.sampleIdField = "Tumor_Sample_Barcode"
 
         Data.collapseStatus = {}
         Data.collapseStatus["Somatic"] = ["Somatic"]
         Data.collapseStatus["Germline"] = ["Germline", "LOH", "Unknown"]
         
         Data.collapseClass = {}
         Data.collapseClass["Quiet"] = ["Silent", "Targeted_Region"]
         Data.collapseClass["Loud"] = ["Frame_Shift_Del", "Frame_Shift_Ins",
                                       "In_Frame_Del", "In_Frame_Ins",
                                       "Missense_Mutation", "Nonsense_Mutation",
                                       "Splice_Site_Indel", "Splice_Site_SNP"]
     elif type == "broadOVABI":
         baseDir = "/data/TCGA/C_broadOVABI/working/"
         patterns = ["*.maf"]
         bed12file = ""
-        bed15file = baseDir + "broadOVSNP6_bed15.bed"
+        bed15file = baseDir + "broadOVABI_bed15.bed"
         prefix = "broadOVABI"
 
         Data = ABI(baseDir, patterns, bed12file, bed15file, prefix)
 
         Data.bed12Files = []
         Data.chromField = "Chromosome"
         Data.startField = "Start_position"
         Data.stopField = "End_position"
         Data.strandField = "Strand"
         Data.statusField = "Mutation_Status"
         Data.classField = "Variant_Classification"
         Data.sampleIdField = "Tumor_Sample_Barcode"
 
         Data.collapseStatus = {}
         Data.collapseStatus["Somatic"] = ["Somatic"]
         Data.collapseStatus["Germline"] = ["Germline"] #, "LOH", "Unknown"]
         
         Data.collapseClass = {}
         Data.collapseClass["Quiet"] = ["Silent"]
         Data.collapseClass["Loud"] = ["Frame_Shift_Del", "Frame_Shift_Ins",
                                       "In_Frame_Del", "In_Frame_Ins",
                                       "Missense_Mutation", "Nonsense_Mutation",
                                       "Splice_Site_Indel", "Splice_Site_SNP"]  
     elif type == "baylorABI":
         baseDir = "/data/TCGA/C_baylorABI/working/"
         patterns = ["*.maf"]
         bed12file = ""
         bed15file = baseDir + "baylorABI_bed15.bed"
         prefix = "hgscABI"
 
         Data = ABI(baseDir, patterns, bed12file, bed15file, prefix)
 
         Data.bed12Files = []
         Data.chromField = "CHROM"
         Data.startField = "START_POSITION"
         Data.stopField = "END_POSITION"
         Data.strandField = "STRAND"
         Data.statusField = "MUTATION_STATUS"
         Data.classField = "VARIANT_CLASSIFICATION"
         Data.sampleIdField = "TUMOR_SAMPLE_BARCODE"
 
         Data.collapseStatus = {}
         Data.collapseStatus["Somatic"] = ["Somatic", '"somatic, homozygous"',
                                           '"somatic, heterozygous"']
         Data.collapseStatus["Germline"] = ["Germline", "LOH", "Unknown", "germline"]
         
         Data.collapseClass = {}
         Data.collapseClass["Quiet"] = ["Silent", "Targeted_Region", "Synonymous"]
         Data.collapseClass["Loud"] = ["frame_shift", "Frame_Shift_Del", "Frame_Shift_Ins",
                                       "in_frame_insertion", "in_frame_deletion",
                                       "In_Frame_Del", "In_Frame_Ins", "Splice_Site_Indel",
                                       "Splice_Site_SNP", "Splice_site",
                                       "Nonsense_Mutation", "Missense_Mutation", "Missense",
                                       "Nonsense"]  
 
     elif type == "baylorOVABI":
-        baseDir = "/data/TCGA/OV/mutation/hgsc.bcm.edu_OV.ABI.1.1.0/"
+        baseDir = "/data/TCGA/C_baylorOVABI/working/"
         patterns = ["*.maf"]
         bed12file = ""
         bed15file = baseDir + "baylorOVABI_bed15.bed"
         prefix = "hgscOVABI"
 
         Data = ABI(baseDir, patterns, bed12file, bed15file, prefix)
 
         Data.bed12Files = []
         Data.chromField = "CHROM"
         Data.startField = "START_POSITION"
         Data.stopField = "END_POSITION"
         Data.strandField = "STRAND"
         Data.statusField = "MUTATION_STATUS"
         Data.classField = "VARIANT_CLASSIFICATION"
         Data.sampleIdField = "TUMOR_SAMPLE_ID"
 
         Data.collapseStatus = {}
         Data.collapseStatus["Somatic"] = ["Somatic", '"somatic, homozygous"',
                                           '"somatic, heterozygous"']
         Data.collapseStatus["Germline"] = ["Germline", "LOH", "Unknown", "germline"]
         
         Data.collapseClass = {}
         Data.collapseClass["Quiet"] = ["Silent", "Targeted_Region", "Synonymous"]
         Data.collapseClass["Loud"] = ["frame_shift", "Frame_Shift_Del", "Frame_Shift_Ins",
                                       "in_frame_insertion", "in_frame_deletion",
                                       "In_Frame_Del", "In_Frame_Ins", "Splice_Site_Indel",
                                       "Splice_Site_SNP", "Splice_site",
                                       "Nonsense_Mutation", "Missense_Mutation", "Missense",
                                       "Nonsense"]  
 
     elif type == "wustlABI":
         baseDir = "/data/TCGA/C_wustlABI/working/"
         patterns = ["*.maf"]
         bed12file = ""
         bed15file = baseDir + "wustlABI_bed15.bed"
         prefix = "wustlABI"
 
         Data = ABI(baseDir, patterns, bed12file, bed15file, prefix)
 
         Data.bed12Files = []
         Data.chromField = "Chromosome"
         Data.startField = "Start_position"
         Data.stopField = "End_position"
         Data.strandField = "Strand"
         Data.statusField = "Mutation_Status"
         Data.classField = "Variant_Classification"
         Data.sampleIdField = "Tumor_Sample_Barcode"
 
         Data.collapseStatus = {}
         Data.collapseStatus["Somatic"] = ["Somatic"]
         Data.collapseStatus["Germline"] = ["Germline", "LOH", "Unknown", "Valid"]
         
         Data.collapseClass = {}
         Data.collapseClass["Quiet"] = ["Silent_Mutation", "Targeted_Region"]
         Data.collapseClass["Loud"] = ["Frame_Shift_Del", "Frame_Shift_Ins",
                                       "In_Frame_Del", "In_Frame_Ins",
                                       "Missense_Mutation", "Nonsense_Mutation",
                                       "Splice_Site_Del", "Splice_Site_Ins", "Splice_Site_SNP"]  
     else:
         print "Unhandled type: " + type
         usage()
         sys.exit()
         
     return Data
 
 def main(argv):
     dataObject = handleOpts(argv)
 
     if not dataObject is None:
         print "Processing data..."
         dataObject.process()
 
         print "Merging data into single file..."
         dataObject.mergeFiles()
 
         print "Converting merged data to BED15..."
         dataObject.convertToBed15()
 
         print "Writing entry for microarrayGroups.ra file..."
         dataObject.writeRaFile()
 
         print "Add txLength data to score field, only works on SNP data..."
         dataObject.addTxLength()
         
 if __name__ == "__main__":
     main(sys.argv[1:])