92cebf70e231042c0a994708fd8e2f850e71844a
cvaske
Tue Feb 11 16:04:45 2025 -0800
CIViC variant track: disease parsing
Handle any cases that may come up with disease names
that include commas
diff --git src/hg/utils/otto/civic/civicToBed.py src/hg/utils/otto/civic/civicToBed.py
index df2f06e3354..f15d7d51925 100644
--- src/hg/utils/otto/civic/civicToBed.py
+++ src/hg/utils/otto/civic/civicToBed.py
@@ -560,50 +560,52 @@
def transform_assertion_summaries(df: pd.DataFrame, mpdf: pd.DataFrame) -> pd.DataFrame:
    """Check assertion rows against molecular profiles and add disease columns.

    Logs the percentage of rows in ``df`` whose ``molecular_profile_id`` is
    present in ``mpdf`` (together with the ``assertion_id`` values of the rows
    that are not), and hands the "more than 95% matched" check to ``expect``.
    Afterwards the ``disease`` column is rewritten and the ``disease_html`` and
    ``disease_link`` columns are (re)assigned.

    Args:
        df: Assertion table. The columns ``molecular_profile_id``,
            ``assertion_id``, ``disease`` and ``doid`` are read. The frame is
            modified in place.
        mpdf: Molecular profile table; only ``molecular_profile_id`` is read.

    Returns:
        The same ``df`` object that was passed in, with the columns above
        updated.
    """
    # Check overlap with molecular profiles
    assertions_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"])
    matched_fraction = assertions_with_mps.mean()
    assertions_with_mps_p = matched_fraction * 100
    assertions_without_mps = str(list(df["assertion_id"][~assertions_with_mps]))
    logging.info(
        f"AssertionSummaries whose molecular_profile_id exists: {assertions_with_mps_p:.2f}%, assertion_ids with missing:{assertions_without_mps}"
    )
    expect(
        matched_fraction > 0.95,
        message="At least 95% of Assertions should have an existing MolecularProfile",
    )

    # NOTE(review): as written this replaces ',' with ',' and therefore leaves
    # the text unchanged. If commas in disease names are meant to be escaped,
    # the replacement string has to differ -- confirm the intended value.
    df["disease"] = df["disease"].where(df["disease"].isnull(), df["disease"].str.replace(',', ','))
    # Concatenating empty strings leaves non-null values as they are.
    # NOTE(review): the empty prefix/suffix look like placeholders for
    # markup -- confirm.
    df["disease_html"] = "" + df["disease"] + ""

    # Record which rows lack a DOID *before* converting to text: casting a
    # nullable integer column to str can turn missing values into the literal
    # text "<NA>", which no longer registers as null and would end up in the
    # link as "<NA>|disease".
    doid_int = df["doid"].astype("Int64")
    doid_missing = doid_int.isnull()
    doid = doid_int.astype("str")

    # Rows without a DOID fall back to the bare disease name; all other rows
    # get "doid|disease".
    df["disease_link"] = df["disease"].where(doid_missing, doid + "|" + df["disease"])
    return df
def transform_clinical_evidence(df: pd.DataFrame, mpdf: pd.DataFrame):
    """Check evidence rows against molecular profiles and add disease columns.

    Logs the percentage of rows in ``df`` whose ``molecular_profile_id`` is
    present in ``mpdf`` (together with the ``evidence_id`` values of the rows
    that are not), and hands the "more than 95% matched" check to ``expect``.
    Afterwards the ``disease`` column is rewritten and the ``disease_html`` and
    ``disease_link`` columns are (re)assigned.

    Args:
        df: Evidence table. The columns ``molecular_profile_id``,
            ``evidence_id``, ``disease`` and ``doid`` are read. The frame is
            modified in place.
        mpdf: Molecular profile table; only ``molecular_profile_id`` is read.

    Returns:
        The same ``df`` object that was passed in, with the columns above
        updated.
    """
    has_profile = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"])
    evidence_with_mps_p = has_profile.mean() * 100
    evidence_without_mps = str(list(df["evidence_id"][~has_profile]))
    logging.info(
        f"ClincialEvidenceSummaries whose molecular_profile_id exists: {evidence_with_mps_p:.2f}%, evidence_ids with missing:{evidence_without_mps}"
    )
    expect(
        has_profile.mean() > 0.95,
        message="At least 95% of Evidence should have an existing MolecularProfile",
    )

    # NOTE(review): as written this replaces ',' with ',' and therefore leaves
    # the text unchanged -- confirm the intended replacement value.
    disease = df["disease"]
    df["disease"] = disease.where(disease.isnull(), disease.str.replace(',', ','))
    # Concatenating empty strings leaves non-null values as they are.
    df["disease_html"] = "" + df["disease"] + ""

    doid = df["doid"]
    # Step 1: keep the DOID where there is one, otherwise use the disease name.
    doid_or_disease = doid.where(doid.notnull(), df["disease"])
    # Step 2: rows that do have a DOID become "doid|disease".
    doid_and_disease = doid + "|" + df["disease"]
    df["disease_link"] = doid_or_disease.where(doid.isnull(), doid_and_disease)
    return df
def load_dataframes(table_dict: dict[str, str]) -> dict[str, pd.DataFrame]:
    """Read each tab-separated table listed in ``table_dict``.

    Args:
        table_dict: Maps a table name to the path of its source file.

    Returns:
        A dict with the same names, in the same order, each mapped to the
        Pandas DataFrame read from the corresponding path.
    """
    frames: dict[str, pd.DataFrame] = {}
    for table_name, source_path in table_dict.items():
        # A "doid" column is always read as text rather than type-inferred.
        frames[table_name] = pd.read_csv(source_path, sep="\t", dtype={"doid": str})
    return frames