src/hg/encode3/cellsFromEncode3.py 5a39815b8ab378d38285b805538121c591f82e07

5a39815b8ab378d38285b805538121c591f82e07
kate
  Tue Jan 20 13:01:44 2015 -0800
Add 'subGroupMetaTables' setting to trackDb.  This setting allows specification of a metadata table (minimally description+url) describing terms in a subgroup (e.g. hgFixed.wgEncodeCell for the 'cellType' subgroup in ENCODE tracks).  This is aimed at providing basic metadata functionality with lower overhead than the cv.ra + metaDb table used during ENCODE2. refs #14353
diff --git src/hg/encode3/cellsFromEncode3.py src/hg/encode3/cellsFromEncode3.py
new file mode 100755
index 0000000..e7d1044
--- /dev/null
+++ src/hg/encode3/cellsFromEncode3.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python2
+
+# List all ENCODE cell types in the ENCODE2 CV and retrieve ENCODE3 fields of interest.
+# Writes a header line followed by one tab-separated line per human cell type to stdout.
+#
+# Here are the URLs for ENCODE2 and ENCODE3:
+# 1) List all ENCODE2 cell types:
+#       http://genome.ucsc.edu/cgi-bin/hgEncodeApi?cmd=cv&type=cellType
+# 2) Retrieve info from ENCODE2 CV for a cell type term:
+#       http://genome.ucsc.edu/cgi-bin/hgEncodeApi?cmd=cv&type=cellType&term=K562
+# 3) Retrieve info from ENCODE3 for an ENCODE2 cell type term:
+#       http://www.encodeproject.org/search/?type=biosample&dbxrefs=UCSC-ENCODE-cv:K562&format=json&frame=object
+
+import requests, json
+
+E3_SEARCH_URL = "http://www.encodeproject.org/search/"
+UNKNOWN = "unknown"
+
+
+def firstOrUnknown(e3Cell, key):
+    """Return the first item of the list e3Cell[key], or "unknown" if it is missing or empty."""
+    values = e3Cell.get(key)
+    return values[0] if values else UNKNOWN
+
+
+def printRow(fields):
+    """Write fields to stdout as a single tab-separated, UTF-8 encoded line."""
+    # Joining first and printing once avoids the stray spaces that 'print a, b,' inserts
+    # between columns; encoding explicitly keeps non-ASCII text from raising when piped.
+    print u"\t".join(fields).encode("utf-8")
+
+
+# get list of all ENCODE2 cells
+URL = "http://genome.ucsc.edu/cgi-bin/hgEncodeApi?cmd=cv&type=cellType"
+response = requests.get(URL)
+response.raise_for_status()
+cells = response.json()
+
+# print headers
+e2Header = ['e2-term', 'e2-lineage', 'e2-tissue', 'e2-karyotype', 'e2-vendorId',
+                'e2-vendorName', 'e2-termId']
+# No 'e3-url' column: the url value is not emitted below, so listing it would shift
+# 'e3-description' out of alignment with the data rows.
+e3Header = ['e3-type', 'e3-developmental', 'e3-organ', 'e3-donor', 'e3-sex', 'e3-age',
+                'e3-health', 'e3-termId', 'e3-description']
+printRow(e2Header + e3Header)
+
+# get ENCODE2 and ENCODE3 info for each cell type
+cells.sort(key=lambda cell: cell['term'])
+for cell in cells:
+    if cell['organism'] != 'human':
+        continue
+
+    # ENCODE2 info
+    row = [cell['term'], cell['lineage'], cell['tissue'], cell['karyotype'],
+           cell['vendorId'], cell['vendorName'], cell['termId']]
+
+    # get ENCODE3 info; passing the term via params lets requests URL-encode it, so terms
+    # containing characters such as '+' are not misread by the server
+    response = requests.get(E3_SEARCH_URL, params={
+        'type': 'biosample',
+        'dbxrefs': 'UCSC-ENCODE-cv:' + cell['term'],
+        'format': 'json',
+        'frame': 'object',
+    })
+    if response.status_code == requests.codes.ok:
+        graph = response.json().get('@graph')
+        if graph:
+            # only the first matching biosample is reported
+            e3Cell = graph[0]
+            row += [
+                e3Cell['biosample_type'],
+                firstOrUnknown(e3Cell, 'developmental_slims'),
+                firstOrUnknown(e3Cell, 'organ_slims'),
+                e3Cell.get('donor', UNKNOWN),
+                e3Cell.get('sex', UNKNOWN),
+                e3Cell.get('age', UNKNOWN),
+                e3Cell.get('health_status', UNKNOWN),
+                e3Cell['biosample_term_id'],
+                e3Cell['description'],
+            ]
+
+    # cell types without an ENCODE3 match get the ENCODE2 columns only
+    printRow(row)