src/hg/utils/otto/civic/civicToBed.py 155aea7ccb73e2e11ce80d93c44f7502f39ea617

155aea7ccb73e2e11ce80d93c44f7502f39ea617
cvaske
  Tue Feb 11 15:42:35 2025 -0800
CIViC cancer variant track: fix DOID parsing

The `doid` column of several tables looks numeric, but many entries
have leading zeroes. Pandas was inferring the entire column as an
integer type, which dropped those leading zeroes. The `pd.read_csv`
call was changed to force the `doid` column to be read as a string.

diff --git src/hg/utils/otto/civic/civicToBed.py src/hg/utils/otto/civic/civicToBed.py
index 40c14533de5..df2f06e3354 100644
--- src/hg/utils/otto/civic/civicToBed.py
+++ src/hg/utils/otto/civic/civicToBed.py
@@ -562,62 +562,62 @@
 def transform_assertion_summaries(df: pd.DataFrame, mpdf: pd.DataFrame) -> pd.DataFrame:
     # Check overlap with molecular profiles
     assertions_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"])
     assertions_with_mps_p = assertions_with_mps.mean() * 100
     assertions_without_mps = str(list(df["assertion_id"][~assertions_with_mps]))
     logging.info(
         f"AssertionSummaries whose molecular_profile_id exists: {assertions_with_mps_p:.2f}%, assertion_ids with missing:{assertions_without_mps}"
     )
     expect(
         assertions_with_mps.mean() > 0.95,
         message="At least 95% of Assertions should have an existing MolecularProfile",
     )
 
     df["disease_html"] = "<i>" + df["disease"] + "</i>"
     doid = df["doid"].astype("Int64").astype("str")
-    df["disease_link"] = doid.where(doid != "<NA>", df["disease"]).where(
-        doid == "<NA>", doid + "|" + df["disease"]
+    df["disease_link"] = doid.where(doid.notnull(), df["disease"]).where(
+        doid.isnull(), doid + "|" + df["disease"]
     )
     return df
 
 
 def transform_clinical_evidence(df: pd.DataFrame, mpdf: pd.DataFrame):
     evidence_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"])
     evidence_with_mps_p = evidence_with_mps.mean() * 100
     evidence_without_mps = str(list(df["evidence_id"][~evidence_with_mps]))
     logging.info(
         f"ClincialEvidenceSummaries whose molecular_profile_id exists: {evidence_with_mps_p:.2f}%, evidence_ids with missing:{evidence_without_mps}"
     )
     expect(
         evidence_with_mps.mean() > 0.95,
         message="At least 95% of Evidence should have an existing MolecularProfile",
     )
 
     df["disease_html"] = "<i>" + df["disease"] + "</i>"
-    doid = df["doid"].astype("Int64").astype("str")
-    df["disease_link"] = doid.where(doid != "<NA>", df["disease"]).where(
-        doid == "<NA>", doid + "|" + df["disease"]
+    doid = df["doid"]
+    df["disease_link"] = doid.where(doid.notnull(), df["disease"]).where(
+        doid.isnull(), doid + "|" + df["disease"]
     )
     return df
 
 
 def load_dataframes(table_dict: dict[str, str]) -> dict[str, pd.DataFrame]:
     """Load several dataframes.
     Input is a dict from name to the source path.
     Output is a dict from name to a Pandas DataFrame"""
 
-    return {name: pd.read_csv(path, sep="\t") for name, path in table_dict.items()}
+    return {name: pd.read_csv(path, sep="\t", dtype={"doid": str}) for name, path in table_dict.items()}
 
 
 def urlretrieve(url, filename):
     with closing(open(filename, "wb")) as outfile:
         with closing(urllib.request.urlopen(url)) as instream:
             outfile.write(instream.read())
 
 
 def download_datadir(
     basedir: str,
     baseurl: str,
     dateslug: str,
     tablelist: list[str],
     overwrite: bool = True,
 ) -> dict[str, str]: