92cebf70e231042c0a994708fd8e2f850e71844a cvaske Tue Feb 11 16:04:45 2025 -0800 CIViC variant track: disease parsing Handle any cases that may come up with disease names that include commas diff --git src/hg/utils/otto/civic/civicToBed.py src/hg/utils/otto/civic/civicToBed.py index df2f06e3354..f15d7d51925 100644 --- src/hg/utils/otto/civic/civicToBed.py +++ src/hg/utils/otto/civic/civicToBed.py @@ -560,50 +560,52 @@ def transform_assertion_summaries(df: pd.DataFrame, mpdf: pd.DataFrame) -> pd.DataFrame: # Check overlap with molecular profiles assertions_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"]) assertions_with_mps_p = assertions_with_mps.mean() * 100 assertions_without_mps = str(list(df["assertion_id"][~assertions_with_mps])) logging.info( f"AssertionSummaries whose molecular_profile_id exists: {assertions_with_mps_p:.2f}%, assertion_ids with missing:{assertions_without_mps}" ) expect( assertions_with_mps.mean() > 0.95, message="At least 95% of Assertions should have an existing MolecularProfile", ) + df["disease"] = df["disease"].where(df["disease"].isnull(), df["disease"].str.replace(',', ',')) df["disease_html"] = "" + df["disease"] + "" doid = df["doid"].astype("Int64").astype("str") df["disease_link"] = doid.where(doid.notnull(), df["disease"]).where( doid.isnull(), doid + "|" + df["disease"] ) return df def transform_clinical_evidence(df: pd.DataFrame, mpdf: pd.DataFrame): evidence_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"]) evidence_with_mps_p = evidence_with_mps.mean() * 100 evidence_without_mps = str(list(df["evidence_id"][~evidence_with_mps])) logging.info( f"ClincialEvidenceSummaries whose molecular_profile_id exists: {evidence_with_mps_p:.2f}%, evidence_ids with missing:{evidence_without_mps}" ) expect( evidence_with_mps.mean() > 0.95, message="At least 95% of Evidence should have an existing MolecularProfile", ) + df["disease"] = df["disease"].where(df["disease"].isnull(), df["disease"].str.replace(',', ',')) df["disease_html"] = "" + df["disease"] + "" doid = df["doid"] df["disease_link"] = doid.where(doid.notnull(), df["disease"]).where( doid.isnull(), doid + "|" + df["disease"] ) return df def load_dataframes(table_dict: dict[str, str]) -> dict[str, pd.DataFrame]: """Load several dataframes. Input is a dict from name to the source path. Output is a dict from name to a Pandas DataFrame""" return {name: pd.read_csv(path, sep="\t", dtype={"doid": str}) for name, path in table_dict.items()}