src/utils/phegenToBed c23a3877f7282272a39ca38bbca6fc8544d2e8ac

c23a3877f7282272a39ca38bbca6fc8544d2e8ac
max
  Tue Sep 11 06:47:32 2018 -0700
updating phegenToBed, refs #21052

diff --git src/utils/phegenToBed src/utils/phegenToBed
index df4ce95..3e77416 100755
--- src/utils/phegenToBed
+++ src/utils/phegenToBed
@@ -25,67 +25,81 @@
         logging.basicConfig(level=logging.INFO)
     return args, options
 # ----------- main --------------
 def main():
     args, options = parseArgs()
 
     filename = args[0]
     snpBed = args[1]
 
     #	Trait	SNP rs	Context	Gene	Gene ID	Gene 2	Gene ID 2	Chromosome	Location	P-Value	Source	PubMed	Analysis ID	Study ID	Study Name
     bySnp = defaultdict(list)
     for line in open(filename):
         if line.startswith("#"):
             continue
         row = line.rstrip("\n").split("\t")
+
+        if row[-2]!="":
+            row[-2] = "phs%06d" % int(row[-2])
+        if row[2]!="":
+            row[2] = "rs"+row[2]
+
         _, trait, snpId, context, gene, geneId, gene2, geneId2, chrom, start, pVal, source, pmid, analysisId, studyId, studyName = row
         #if source=="NHGRI": # remove GWAS catalog for now?
             #continue
         # need phsxxxxxx for linkout to NCBI
-        if row[-2]!="":
-            row[-2] = "phs%06d" % int(row[-2])
         extraCount = len(row)
         bySnp[snpId].append(row)
 
     bySnp = dict(bySnp)
 
     for line in open(snpBed):
         line = line.rstrip("\n")
         snpId = line.rstrip("\n").split('\t')[3]
         chrom, start, end, name = line.split("\t") # has chrom, start, end, name
         bed = [chrom, start, end, name]
-        rows = bySnp.get(snpId.replace("rs",""), None)
+        rows = bySnp.get(snpId, None)
         if rows==None:
             continue
 
         bed.append(str(len(rows)))
         bed.append(".")
         bed.append(start)
         bed.append(end)
         bed.append("0")
 
         bedCount = len(bed)
 
         extras = []
         for i in range(1, extraCount):
             extras.append( [] )
 
+        hasGwasCatalog = False
         for row in rows:
             for i in range(1, len(row)):
                 #print row, i, len(row)
                 extras[i-1].append(row[i].replace(",", ";"))
+                _, trait, snpId, context, gene, geneId, gene2, geneId2, chrom, start, pVal, source, pmid, analysisId, studyId, studyName = row
+
+                if source=="NHGRI":
+                    hasGwasCatalog = True
 
         extras = [", ".join(x) for x in extras]
 
         bed.extend(extras)
 
         # fixup a few things
         bed[10] = "rs"+bed[10]
         # use trait as name, shorten
         traitShort = bed[9].split(",")[0].split(';')[0].replace(" Disorders", "").replace(" Tests", "")
         if len(traitShort) > 17:
             traitShort = traitShort[:16]+".."
         bed[3] = traitShort
 
+        if hasGwasCatalog:
+            bed.append(snpId)
+        else:
+            bed.append("")
+
         print "\t".join(bed)
 
 main()