3bbcbf1c3ac0742cafa3eb6fb2bcdfe7c51fde52 cvaske Tue Feb 11 16:06:26 2025 -0800 CIViC variant track: disease parsing Handle any cases that may come up with disease names that include commas diff --git src/hg/utils/otto/civic/civicToBed.py src/hg/utils/otto/civic/civicToBed.py index f15d7d51925..ac9ce13300f 100644 --- src/hg/utils/otto/civic/civicToBed.py +++ src/hg/utils/otto/civic/civicToBed.py @@ -560,52 +560,52 @@ def transform_assertion_summaries(df: pd.DataFrame, mpdf: pd.DataFrame) -> pd.DataFrame: # Check overlap with molecular profiles assertions_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"]) assertions_with_mps_p = assertions_with_mps.mean() * 100 assertions_without_mps = str(list(df["assertion_id"][~assertions_with_mps])) logging.info( f"AssertionSummaries whose molecular_profile_id exists: {assertions_with_mps_p:.2f}%, assertion_ids with missing:{assertions_without_mps}" ) expect( assertions_with_mps.mean() > 0.95, message="At least 95% of Assertions should have an existing MolecularProfile", ) - df["disease"] = df["disease"].where(df["disease"].isnull(), df["disease"].str.replace(',', ',')) + df["disease"] = df["disease"].where(df["disease"].isnull(), df["disease"].str.replace(',', ',')) df["disease_html"] = "<i>" + df["disease"] + "</i>" doid = df["doid"].astype("Int64").astype("str") df["disease_link"] = doid.where(doid.notnull(), df["disease"]).where( doid.isnull(), doid + "|" + df["disease"] ) return df def transform_clinical_evidence(df: pd.DataFrame, mpdf: pd.DataFrame): evidence_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"]) evidence_with_mps_p = evidence_with_mps.mean() * 100 evidence_without_mps = str(list(df["evidence_id"][~evidence_with_mps])) logging.info( f"ClincialEvidenceSummaries whose molecular_profile_id exists: {evidence_with_mps_p:.2f}%, evidence_ids with missing:{evidence_without_mps}" ) expect( evidence_with_mps.mean() > 0.95, message="At least 95% of Evidence should have an existing MolecularProfile", ) - df["disease"] = df["disease"].where(df["disease"].isnull(), df["disease"].str.replace(',', ',')) + df["disease"] = df["disease"].where(df["disease"].isnull(), df["disease"].str.replace(',', ',')) df["disease_html"] = "<i>" + df["disease"] + "</i>" doid = df["doid"] df["disease_link"] = doid.where(doid.notnull(), df["disease"]).where( doid.isnull(), doid + "|" + df["disease"] ) return df def load_dataframes(table_dict: dict[str, str]) -> dict[str, pd.DataFrame]: """Load several dataframes. Input is a dict from name to the source path. Output is a dict from name to a Pandas DataFrame""" return {name: pd.read_csv(path, sep="\t", dtype={"doid": str}) for name, path in table_dict.items()}