cbfe2c0af6949ffd02b9f52f5734d2d44c3c9011 cvaske Thu Dec 12 14:26:34 2024 -0800 CIViC: add mouseover text and full otto crontab refs #34371

diff --git src/hg/utils/otto/civic/civicToBed.py src/hg/utils/otto/civic/civicToBed.py
index 82e9abe..b5cee02 100644
--- src/hg/utils/otto/civic/civicToBed.py
+++ src/hg/utils/otto/civic/civicToBed.py
@@ -109,31 +109,31 @@
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)

 DOWNLOAD_BASE_URL: Final = "https://civicdb.org/downloads"
 DATA_TABLES: Final = [
     "MolecularProfileSummaries",
     "VariantSummaries",
     "ClinicalEvidenceSummaries",
     "AssertionSummaries",
 ]

 ## Maximum length of a string (e.g. insAGCATGACCAG...) before
 ## being truncated and appended with an ellipsis
-MAX_VARIANT_LENGTH: Final = 15
+MAX_VARIANT_LENGTH: Final = 20

 MAX_LONG_BED_FIELD_LENGTH: Final = 5000

 ## Default color for items in output bed
 DEFAULT_BED_VARIANT_COLOR: Final = "0"

 DOWNLOAD_DIR: Final = "downloads"

 # these are in percentages
 MAX_ROW_INCREASE: Final = 100.0
 MAX_ROW_DECREASE: Final = -10.0

 # Minimum fraction of ENSEMBL transcripts found for fusions for success
 MIN_ENSEMBL_TX = 0.90

 ##
@@ -244,59 +244,67 @@
     )
     return (
         bed12df.assign(
             variant_link=(
                 sourcedf.variant_id.astype("str")
                 + "|"
                 + sourcedf.feature_name
                 + " "
                 + sourcedf.variant
             )
         )
         .assign(reference_build=sourcedf["reference_build"])
         .assign(allele_registry_id=sourcedf["allele_registry_id"].fillna(""))
         .assign(clinvar_id=cvids)
         .assign(last_review_date=sourcedf["last_review_date"].fillna(""))
-        .assign(disease_link=sourcedf["diseases"])
+        .assign(disease_link=sourcedf["disease_links"])
         .assign(therapies=sourcedf["therapies"])
+        .assign(
+            mouseOverHTML="<b>Associated therapies:</b> "
+            + sourcedf["therapies"].str.replace(",", ", ")
+            + "<br><b>Associated diseases:</b> "
+            + sourcedf["disease_html"]
+        )
     )


 ## These are included sometimes for debugging purposes...
 # "variant_id": gdf.variant_id,
 # "reference_bases": gdf.reference_bases,
 # "variant_bases": gdf.variant_bases,


 def extract_variant_name(gdf: pd.DataFrame) -> pd.Series:
     """
     Make a name for a variant to show in the browser, summarizing the genomic
     change (sequence change, fusion partners, expression, etc.)
     """
     # this got a bit ugly, but functional if statements be like that
     full_variant_string = pd.Series(
         np.where(
             gdf.feature_type == "Fusion",
             gdf.feature_name,  # fusions have very descriptive names.
-            np.where(  # these are the gene type variants (TODO: consider adding the gene)
+            np.where(  # these are the gene type variants
                 gdf.reference_bases.isnull(),
                 np.where(  # empty variant bases --> variant description or an insertion
-                    gdf.variant_bases.isnull(), gdf.variant, "ins" + gdf.variant_bases
+                    gdf.variant_bases.isnull(),
+                    gdf.gene + " " + gdf.variant,
+                    gdf.gene + " ins" + gdf.variant_bases,
                 ),
                 np.where(  # specified variant bases --> deletion or variant
                     gdf.variant_bases.isnull(),
-                    "del" + gdf.reference_bases,
-                    gdf.reference_bases + ">" + gdf.variant_bases,
+                    gdf.gene + " " + "del" + gdf.reference_bases,
+                    gdf.gene + " " + gdf.reference_bases + ">" + gdf.variant_bases,
                 ),
             ),
         )
     )
     return pd.Series(
         np.where(
             full_variant_string.str.len() > MAX_VARIANT_LENGTH,
             full_variant_string.str.slice(start=0, stop=MAX_VARIANT_LENGTH - 3) + "...",
             full_variant_string,
         )
     )


 def transform_gene_variant_summaries(df: pd.DataFrame) -> pd.DataFrame:
@@ -558,66 +566,52 @@
 def transform_assertion_summaries(df: pd.DataFrame, mpdf: pd.DataFrame) -> pd.DataFrame:
     # Check overlap with molecular profiles
     assertions_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"])
     assertions_with_mps_p = assertions_with_mps.mean() * 100
     assertions_without_mps = str(list(df["assertion_id"][~assertions_with_mps]))
     logging.info(
         f"AssertionSummaries whose molecular_profile_id exists: {assertions_with_mps_p:.2f}%, assertion_ids with missing:{assertions_without_mps}"
     )
     expect(
         assertions_with_mps.mean() > 0.95,
         message="At least 95% of Assertions should have an existing MolecularProfile",
     )
+    df["disease_html"] = "<i>" + df["disease"] + "</i>"
     doid = df["doid"].astype("Int64").astype("str")
-    df["disease_html"] = doid.where(doid != "<NA>", df["disease"]).where(
-        doid == "<NA>",
-        "<a href='https://www.disease-ontology.org/?id=DOID:"
-        + doid
-        + "'>"
-        + df["disease"]
-        + "</a>",
-    )
     df["disease_link"] = doid.where(doid != "<NA>", df["disease"]).where(
         doid == "<NA>", doid + "|" + df["disease"]
     )
     return df


 def transform_clinical_evidence(df: pd.DataFrame, mpdf: pd.DataFrame):
     evidence_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"])
     evidence_with_mps_p = evidence_with_mps.mean() * 100
     evidence_without_mps = str(list(df["evidence_id"][~evidence_with_mps]))
     logging.info(
         f"ClinicalEvidenceSummaries whose molecular_profile_id exists: {evidence_with_mps_p:.2f}%, evidence_ids with missing:{evidence_without_mps}"
     )
     expect(
         evidence_with_mps.mean() > 0.95,
         message="At least 95% of Evidence should have an existing MolecularProfile",
     )
+    df["disease_html"] = "<i>" + df["disease"] + "</i>"
     doid = df["doid"].astype("Int64").astype("str")
-    df["disease_html"] = doid.where(doid != "<NA>", df["disease"]).where(
-        doid == "<NA>",
-        "<a href='https://www.disease-ontology.org/?id=DOID:"
-        + doid
-        + "'>"
-        + df["disease"]
-        + "</a>",
-    )
     df["disease_link"] = doid.where(doid != "<NA>", df["disease"]).where(
         doid == "<NA>", doid + "|" + df["disease"]
     )
     return df


 def load_dataframes(table_dict: dict[str, str]) -> dict[str, pd.DataFrame]:
     """Load several dataframes. Input is a dict from name to the source path.
     Output is a dict from name to a Pandas DataFrame"""
     return {name: pd.read_csv(path, sep="\t") for name, path in table_dict.items()}


 def urlretrieve(url, filename):
@@ -780,105 +774,114 @@
     )
     logging.info(
         f"Dataframe shape of variants after filtering flagged: {orig_vs_df.shape}"
     )

     return orig_vs_df.assign(
         short_variant_string=extract_variant_name(orig_vs_df),
         molecular_profile_desc="",
         assertion_desc="",
         evidence_desc="",
     )


 def join_set(
     sep: str,
-    values: Sequence[str],
+    values: set[str],
     max_length: int = MAX_LONG_BED_FIELD_LENGTH,
     ellipses: str = "...",
 ) -> str:
     """Join strings together, but only up to a maximum length.
     If the maximum length is exceeded, add the ellipses string at the end.
     """
     ellipses_len = len(sep) + len(ellipses)
     max_length -= ellipses_len
     r = ""
     loop_sep = ""  # after first iteration switch to `sep`
     for v in values:
         if len(r) + len(v) > max_length:
             return r + loop_sep + ellipses
         r += loop_sep + v
         loop_sep = sep
     return r


 def add_variant_diseases_therapies(
     assertion_df: pd.DataFrame,
     evidence_df: pd.DataFrame,
     variant_df: pd.DataFrame,
     mid2vids: dict[int, list[int]],
 ) -> pd.DataFrame:
-    vid2diseases = defaultdict(set)
-    vid2therapies = defaultdict(set)
+    vid2diseases: dict[int, set[str]] = defaultdict(set)
+    vid2dis_html: dict[int, set[str]] = defaultdict(set)
+    vid2therapies: dict[int, set[str]] = defaultdict(set)
     missing_mps = []
-    columns = ["molecular_profile_id", "disease_link", "therapies"]
+    columns = ["molecular_profile_id", "disease_link", "disease_html", "therapies"]

-    for _, mpid, disease, therapies in assertion_df[columns].itertuples():
+    # Gather all the associations from the Assertion table
+    for _, mpid, disease, disease_html, therapies in assertion_df[columns].itertuples():
+        # tests for self-equality will remove NaNs
         tset = set(therapies.split(",")) if therapies == therapies else set()
         dset = set([disease]) if disease == disease else set()
+        dhset = set([disease_html]) if disease_html == disease_html else set()
         if mpid in mid2vids:
             for vid in mid2vids[mpid]:
                 vid2diseases[vid] |= dset
+                vid2dis_html[vid] |= dhset
                 vid2therapies[vid] |= tset
         else:
             missing_mps.append(mpid)

-    for _, mpid, disease, therapies in evidence_df[columns].itertuples():
+    # Gather all the associations from the Evidence table
+    for _, mpid, disease, disease_html, therapies in evidence_df[columns].itertuples():
         tset = set(therapies.split(",")) if therapies == therapies else set()
         dset = set([disease]) if disease == disease else set()
+        dhset = set([disease_html]) if disease_html == disease_html else set()
         if mpid in mid2vids:
             for vid in mid2vids[mpid]:
                 vid2diseases[vid] |= dset
+                vid2dis_html[vid] |= dhset
                 vid2therapies[vid] |= tset
         else:
             missing_mps.append(mpid)

-    disease_df = pd.DataFrame(
-        {
-            "diseases": [
-                join_set(",", d, ellipses="|...") for d in vid2diseases.values()
-            ]
-        },
-        index=vid2diseases.keys(),
-    )
+    variant_id = []
+    dhtml_col = []
+    dlink_col = []
+    therapy_col = []

-    therapy_df = pd.DataFrame(
-        {"therapies": [join_set(", ", t) for t in vid2therapies.values()]},
-        index=vid2therapies.keys(),
-    )
+    for vid in set(vid2diseases.keys()) | set(vid2therapies.keys()):
+        variant_id.append(vid)
+        dhtml_col.append(join_set(", ", vid2dis_html.get(vid, set())))
+        dlink_col.append(join_set(",", vid2diseases.get(vid, set()), ellipses="|..."))
+        therapy_col.append(join_set(", ", vid2therapies.get(vid, set())))

-    return (
-        variant_df.set_index("variant_id")
-        .join(disease_df)
-        .join(therapy_df)
-        .reset_index()
-    )
+    dt_df = pd.DataFrame(
+        {
+            "variant_id": variant_id,
+ "disease_html": dhtml_col, + "disease_links": dlink_col, + "therapies": therapy_col, + } + ).set_index("variant_id") + + return variant_df.set_index("variant_id").join(dt_df).reset_index() def transform_dfs(dfs: dict[str, pd.DataFrame]) -> dict[str, str]: """From a dictionary of data frames for VariantSummaries, MolecularProfileSummaries, AssertionSummaries, and ClinicalEvidenceSummaries, create a unifed bigBed file for each reference genome. Returns a dictionary from reference genome slug (hg19/hg38) to the file path of the bidBed files. """ adf = transform_assertion_summaries( dfs["AssertionSummaries"], dfs["MolecularProfileSummaries"] ) edf = transform_clinical_evidence( dfs["ClinicalEvidenceSummaries"], dfs["MolecularProfileSummaries"]