155aea7ccb73e2e11ce80d93c44f7502f39ea617 cvaske Tue Feb 11 15:42:35 2025 -0800 CIViC cancer variant track: fix DOID parsing The doid column of several tables looks a lot like a number, but many entries have leading zeroes. Pandas was reading the entire column as integer. The parsing command was changed to force the `doid` column to be treated as a string. diff --git src/hg/utils/otto/civic/civicToBed.py src/hg/utils/otto/civic/civicToBed.py index 40c14533de5..df2f06e3354 100644 --- src/hg/utils/otto/civic/civicToBed.py +++ src/hg/utils/otto/civic/civicToBed.py @@ -562,62 +562,62 @@ def transform_assertion_summaries(df: pd.DataFrame, mpdf: pd.DataFrame) -> pd.DataFrame: # Check overlap with molecular profiles assertions_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"]) assertions_with_mps_p = assertions_with_mps.mean() * 100 assertions_without_mps = str(list(df["assertion_id"][~assertions_with_mps])) logging.info( f"AssertionSummaries whose molecular_profile_id exists: {assertions_with_mps_p:.2f}%, assertion_ids with missing:{assertions_without_mps}" ) expect( assertions_with_mps.mean() > 0.95, message="At least 95% of Assertions should have an existing MolecularProfile", ) df["disease_html"] = "<i>" + df["disease"] + "</i>" doid = df["doid"].astype("Int64").astype("str") - df["disease_link"] = doid.where(doid != "<NA>", df["disease"]).where( - doid == "<NA>", doid + "|" + df["disease"] + df["disease_link"] = doid.where(doid.notnull(), df["disease"]).where( + doid.isnull(), doid + "|" + df["disease"] ) return df def transform_clinical_evidence(df: pd.DataFrame, mpdf: pd.DataFrame): evidence_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"]) evidence_with_mps_p = evidence_with_mps.mean() * 100 evidence_without_mps = str(list(df["evidence_id"][~evidence_with_mps])) logging.info( f"ClincialEvidenceSummaries whose molecular_profile_id exists: {evidence_with_mps_p:.2f}%, evidence_ids with missing:{evidence_without_mps}" ) expect( evidence_with_mps.mean() > 0.95, message="At least 95% of Evidence should have an existing MolecularProfile", ) df["disease_html"] = "<i>" + df["disease"] + "</i>" - doid = df["doid"].astype("Int64").astype("str") - df["disease_link"] = doid.where(doid != "<NA>", df["disease"]).where( - doid == "<NA>", doid + "|" + df["disease"] + doid = df["doid"] + df["disease_link"] = doid.where(doid.notnull(), df["disease"]).where( + doid.isnull(), doid + "|" + df["disease"] ) return df def load_dataframes(table_dict: dict[str, str]) -> dict[str, pd.DataFrame]: """Load several dataframes. Input is a dict from name to the source path. Output is a dict from name to a Pandas DataFrame""" - return {name: pd.read_csv(path, sep="\t") for name, path in table_dict.items()} + return {name: pd.read_csv(path, sep="\t", dtype={"doid": str}) for name, path in table_dict.items()} def urlretrieve(url, filename): with closing(open(filename, "wb")) as outfile: with closing(urllib.request.urlopen(url)) as instream: outfile.write(instream.read()) def download_datadir( basedir: str, baseurl: str, dateslug: str, tablelist: list[str], overwrite: bool = True, ) -> dict[str, str]: