src/hg/utils/otto/civic/civicToBed.py 92cebf70e231042c0a994708fd8e2f850e71844a

92cebf70e231042c0a994708fd8e2f850e71844a
cvaske
  Tue Feb 11 16:04:45 2025 -0800
CIViC variant track: disease parsing

Handle any cases that may come up with disease names
that include commas

diff --git src/hg/utils/otto/civic/civicToBed.py src/hg/utils/otto/civic/civicToBed.py
index df2f06e3354..f15d7d51925 100644
--- src/hg/utils/otto/civic/civicToBed.py
+++ src/hg/utils/otto/civic/civicToBed.py
@@ -560,50 +560,52 @@
 
 
 def transform_assertion_summaries(df: pd.DataFrame, mpdf: pd.DataFrame) -> pd.DataFrame:
     """Add disease display columns to the assertion summary table.

     Logs the share of assertions whose ``molecular_profile_id`` appears in
     ``mpdf`` and passes the "more than 95%" condition to ``expect``. Then
     the following columns are written onto ``df`` in place:

     * ``disease``: commas replaced by ``&#44`` (nulls are left as nulls).
     * ``disease_html``: the disease name wrapped in ``<i>`` tags.
     * ``disease_link``: ``"<doid>|<disease>"`` when a DOID is present,
       otherwise just the disease name.

     Args:
         df: Assertion rows; the columns ``molecular_profile_id``,
             ``assertion_id``, ``disease`` and ``doid`` are read.
         mpdf: Molecular profile rows; ``molecular_profile_id`` is read.

     Returns:
         The same ``df`` object that was passed in.
     """
     # Check overlap with molecular profiles
     assertions_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"])
     assertions_with_mps_p = assertions_with_mps.mean() * 100
     assertions_without_mps = str(list(df["assertion_id"][~assertions_with_mps]))
     logging.info(
         f"AssertionSummaries whose molecular_profile_id exists: {assertions_with_mps_p:.2f}%, assertion_ids with missing:{assertions_without_mps}"
     )
     expect(
         assertions_with_mps.mean() > 0.95,
         message="At least 95% of Assertions should have an existing MolecularProfile",
     )
 
     # Commas inside a disease name are swapped for a character reference.
     df["disease"] = df["disease"].where(df["disease"].isnull(), df["disease"].str.replace(',', '&#44'))
     df["disease_html"] = "<i>" + df["disease"] + "</i>"

     # Decide which rows have a DOID *before* converting to text. Casting a
     # missing nullable integer to str can produce the literal "<NA>", after
     # which a null test on the text column would never match and rows
     # without a DOID would end up as "<NA>|disease".
     doid_int = df["doid"].astype("Int64")
     has_doid = doid_int.notnull()
     doid = doid_int.astype("str")
     df["disease_link"] = (doid + "|" + df["disease"]).where(has_doid, df["disease"])
     return df
 
 
 def transform_clinical_evidence(df: pd.DataFrame, mpdf: pd.DataFrame):
     """Add disease display columns to the clinical evidence table.

     Logs the share of evidence rows whose ``molecular_profile_id`` appears
     in ``mpdf`` and passes the "more than 95%" condition to ``expect``.
     Then ``disease`` (commas replaced by ``&#44``), ``disease_html`` and
     ``disease_link`` are written onto ``df`` in place, and ``df`` is
     returned.
     """
     known_profile = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"])
     known_profile_pct = known_profile.mean() * 100
     orphan_evidence_ids = str(list(df["evidence_id"][~known_profile]))
     logging.info(
         f"ClincialEvidenceSummaries whose molecular_profile_id exists: {known_profile_pct:.2f}%, evidence_ids with missing:{orphan_evidence_ids}"
     )
     expect(
         known_profile.mean() > 0.95,
         message="At least 95% of Evidence should have an existing MolecularProfile",
     )
 
     # Swap commas in disease names for a character reference; nulls stay null.
     raw_disease = df["disease"]
     escaped_disease = raw_disease.str.replace(',', '&#44')
     df["disease"] = raw_disease.where(raw_disease.isnull(), escaped_disease)
     df["disease_html"] = "<i>" + df["disease"] + "</i>"

     # Rows with a DOID get "<doid>|<disease>"; the rest keep the bare name.
     doid = df["doid"]
     disease = df["disease"]
     df["disease_link"] = disease.mask(doid.notnull(), doid + "|" + disease)
     return df
 
 
 def load_dataframes(table_dict: dict[str, str]) -> dict[str, pd.DataFrame]:
     """Read a set of tab-separated tables.

     Takes a mapping of table name to source path and returns a mapping of
     the same names to the loaded Pandas DataFrames. A ``doid`` column, when
     present, is read as text.
     """
     frames: dict[str, pd.DataFrame] = {}
     for table_name, source_path in table_dict.items():
         frames[table_name] = pd.read_csv(source_path, sep="\t", dtype={"doid": str})
     return frames