cbfe2c0af6949ffd02b9f52f5734d2d44c3c9011 cvaske Thu Dec 12 14:26:34 2024 -0800 CIViC: add mouseover text and full otto crontab refs #34371

diff --git src/hg/utils/otto/civic/civicToBed.py src/hg/utils/otto/civic/civicToBed.py
index 82e9abe..b5cee02 100644
--- src/hg/utils/otto/civic/civicToBed.py
+++ src/hg/utils/otto/civic/civicToBed.py
@@ -109,31 +109,31 @@
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)

 DOWNLOAD_BASE_URL: Final = "https://civicdb.org/downloads"
 DATA_TABLES: Final = [
     "MolecularProfileSummaries",
     "VariantSummaries",
     "ClinicalEvidenceSummaries",
     "AssertionSummaries",
 ]

 ## Maximum length of a string (e.g. insAGCATGACCAG...) before
 ## being truncated and appended with an ellipsis
-MAX_VARIANT_LENGTH: Final = 15
+MAX_VARIANT_LENGTH: Final = 20

 MAX_LONG_BED_FIELD_LENGTH: Final = 5000

 ## Default color for items in output bed
 DEFAULT_BED_VARIANT_COLOR: Final = "0"

 DOWNLOAD_DIR: Final = "downloads"

 # these are in percentages
 MAX_ROW_INCREASE: Final = 100.0
 MAX_ROW_DECREASE: Final = -10.0

 # Minimum fraction of ENSEMBL transcripts found for fusions for success
 MIN_ENSEMBL_TX = 0.90

 ##
@@ -244,59 +244,67 @@
     )
     return (
         bed12df.assign(
             variant_link=(
                 sourcedf.variant_id.astype("str")
                 + "|"
                 + sourcedf.feature_name
                 + " "
                 + sourcedf.variant
             )
         )
         .assign(reference_build=sourcedf["reference_build"])
         .assign(allele_registry_id=sourcedf["allele_registry_id"].fillna(""))
         .assign(clinvar_id=cvids)
         .assign(last_review_date=sourcedf["last_review_date"].fillna(""))
-        .assign(disease_link=sourcedf["diseases"])
+        .assign(disease_link=sourcedf["disease_links"])
         .assign(therapies=sourcedf["therapies"])
+        .assign(
+            mouseOverHTML="<b>Associated therapies:</b> "
+            + sourcedf["therapies"].str.replace(",", ", ")
+            + "<br><b>Associated diseases:</b> "
+            + sourcedf["disease_html"]
+        )
     )


 ## These are included sometimes for debugging purposes...
 # "variant_id": gdf.variant_id,
 # "reference_bases": gdf.reference_bases,
 # "variant_bases": gdf.variant_bases,


 def extract_variant_name(gdf: pd.DataFrame) -> pd.Series:
     """
     Make a name for a variant to show in the browser, summarizing the genomic
     change (sequence change, fusion partners, expression, etc.)
     """
     # this got a bit ugly, but functional if statements be like that
     full_variant_string = pd.Series(
         np.where(
             gdf.feature_type == "Fusion",
             gdf.feature_name,  # fusions have very descriptive names.
-            np.where(  # these are the gene type variants (TODO: consider adding the gene)
+            np.where(  # these are the gene type variants
                 gdf.reference_bases.isnull(),
                 np.where(  # empty variant bases --> variant description or an insertion
-                    gdf.variant_bases.isnull(), gdf.variant, "ins" + gdf.variant_bases
+                    gdf.variant_bases.isnull(),
+                    gdf.gene + " " + gdf.variant,
+                    gdf.gene + " ins" + gdf.variant_bases,
                 ),
                 np.where(  # specified variant bases --> deletion or variant
                     gdf.variant_bases.isnull(),
-                    "del" + gdf.reference_bases,
-                    gdf.reference_bases + ">" + gdf.variant_bases,
+                    gdf.gene + " " + "del" + gdf.reference_bases,
+                    gdf.gene + " " + gdf.reference_bases + ">" + gdf.variant_bases,
                 ),
             ),
         )
     )
     return pd.Series(
         np.where(
             full_variant_string.str.len() > MAX_VARIANT_LENGTH,
             full_variant_string.str.slice(start=0, stop=MAX_VARIANT_LENGTH - 3) + "...",
             full_variant_string,
         )
     )


 def transform_gene_variant_summaries(df: pd.DataFrame) -> pd.DataFrame:
@@ -558,66 +566,52 @@
 def transform_assertion_summaries(df: pd.DataFrame, mpdf: pd.DataFrame) -> pd.DataFrame:
     # Check overlap with molecular profiles
     assertions_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"])
     assertions_with_mps_p = assertions_with_mps.mean() * 100
     assertions_without_mps = str(list(df["assertion_id"][~assertions_with_mps]))
     logging.info(
         f"AssertionSummaries whose molecular_profile_id exists: {assertions_with_mps_p:.2f}%, assertion_ids with missing:{assertions_without_mps}"
     )
     expect(
         assertions_with_mps.mean() > 0.95,
         message="At least 95% of Assertions should have an existing MolecularProfile",
     )
+    df["disease_html"] = "<i>" + df["disease"] + "</i>"
     doid = df["doid"].astype("Int64").astype("str")
-    df["disease_html"] = doid.where(doid != "<NA>", df["disease"]).where(
-        doid == "<NA>",
-        "<a href='https://www.disease-ontology.org/?id=DOID:"
-        + doid
-        + "'>"
-        + df["disease"]
-        + "</a>",
-    )
     df["disease_link"] = doid.where(doid != "<NA>", df["disease"]).where(
         doid == "<NA>", doid + "|" + df["disease"]
     )
     return df


 def transform_clinical_evidence(df: pd.DataFrame, mpdf: pd.DataFrame):
     evidence_with_mps = df["molecular_profile_id"].isin(mpdf["molecular_profile_id"])
     evidence_with_mps_p = evidence_with_mps.mean() * 100
     evidence_without_mps = str(list(df["evidence_id"][~evidence_with_mps]))
     logging.info(
         f"ClinicalEvidenceSummaries whose molecular_profile_id exists: {evidence_with_mps_p:.2f}%, evidence_ids with missing:{evidence_without_mps}"
     )
     expect(
         evidence_with_mps.mean() > 0.95,
         message="At least 95% of Evidence should have an existing MolecularProfile",
     )
+    df["disease_html"] = "<i>" + df["disease"] + "</i>"
     doid = df["doid"].astype("Int64").astype("str")
-    df["disease_html"] = doid.where(doid != "<NA>", df["disease"]).where(
-        doid == "<NA>",
-        "<a href='https://www.disease-ontology.org/?id=DOID:"
-        + doid
-        + "'>"
-        + df["disease"]
-        + "</a>",
-    )
     df["disease_link"] = doid.where(doid != "<NA>", df["disease"]).where(
         doid == "<NA>", doid + "|" + df["disease"]
     )
     return df


 def load_dataframes(table_dict: dict[str, str]) -> dict[str, pd.DataFrame]:
     """Load several dataframes. Input is a dict from name to the source path.
     Output is a dict from name to a Pandas DataFrame"""
     return {name: pd.read_csv(path, sep="\t") for name, path in table_dict.items()}


 def urlretrieve(url, filename):
@@ -780,105 +774,114 @@
     )
     logging.info(
         f"Dataframe shape of variants after filtering flagged: {orig_vs_df.shape}"
     )

     return orig_vs_df.assign(
         short_variant_string=extract_variant_name(orig_vs_df),
         molecular_profile_desc="",
         assertion_desc="",
         evidence_desc="",
     )


 def join_set(
     sep: str,
-    values: Sequence[str],
+    values: set[str],
     max_length: int = MAX_LONG_BED_FIELD_LENGTH,
     ellipses: str = "...",
 ) -> str:
     """Join strings together, but only up to a maximum length.
     If the maximum length is exceeded, add the ellipses string at the end.
     """
     ellipses_len = len(sep) + len(ellipses)
     max_length -= ellipses_len
     r = ""
     loop_sep = ""  # after first iteration switch to `sep`
     for v in values:
         if len(r) + len(v) > max_length:
             return r + loop_sep + ellipses
         r += loop_sep + v
         loop_sep = sep
     return r


 def add_variant_diseases_therapies(
     assertion_df: pd.DataFrame,
     evidence_df: pd.DataFrame,
     variant_df: pd.DataFrame,
     mid2vids: dict[int, list[int]],
 ) -> pd.DataFrame:
-    vid2diseases = defaultdict(set)
-    vid2therapies = defaultdict(set)
+    vid2diseases: dict[int, set[str]] = defaultdict(set)
+    vid2dis_html: dict[int, set[str]] = defaultdict(set)
+    vid2therapies: dict[int, set[str]] = defaultdict(set)
     missing_mps = []
-    columns = ["molecular_profile_id", "disease_link", "therapies"]
+    columns = ["molecular_profile_id", "disease_link", "disease_html", "therapies"]

-    for _, mpid, disease, therapies in assertion_df[columns].itertuples():
+    # Gather all the associations from the Assertion table
+    for _, mpid, disease, disease_html, therapies in assertion_df[columns].itertuples():
+        # tests for self-equality will remove NaNs
         tset = set(therapies.split(",")) if therapies == therapies else set()
         dset = set([disease]) if disease == disease else set()
+        dhset = set([disease_html]) if disease_html == disease_html else set()
         if mpid in mid2vids:
             for vid in mid2vids[mpid]:
                 vid2diseases[vid] |= dset
+                vid2dis_html[vid] |= dhset
                 vid2therapies[vid] |= tset
         else:
             missing_mps.append(mpid)

-    for _, mpid, disease, therapies in evidence_df[columns].itertuples():
+    # Gather all the associations from the Evidence table
+    for _, mpid, disease, disease_html, therapies in evidence_df[columns].itertuples():
         tset = set(therapies.split(",")) if therapies == therapies else set()
         dset = set([disease]) if disease == disease else set()
+        dhset = set([disease_html]) if disease_html == disease_html else set()
         if mpid in mid2vids:
             for vid in mid2vids[mpid]:
                 vid2diseases[vid] |= dset
+                vid2dis_html[vid] |= dhset
                 vid2therapies[vid] |= tset
         else:
             missing_mps.append(mpid)

-    disease_df = pd.DataFrame(
-        {
-            "diseases": [
-                join_set(",", d, ellipses="|...") for d in vid2diseases.values()
-            ]
-        },
-        index=vid2diseases.keys(),
-    )
+    variant_id = []
+    dhtml_col = []
+    dlink_col = []
+    therapy_col = []

-    therapy_df = pd.DataFrame(
-        {"therapies": [join_set(", ", t) for t in vid2therapies.values()]},
-        index=vid2therapies.keys(),
-    )
+    for vid in set(vid2diseases.keys()) | set(vid2therapies.keys()):
+        variant_id.append(vid)
+        dhtml_col.append(join_set(", ", vid2dis_html.get(vid, set())))
+        dlink_col.append(join_set(",", vid2diseases.get(vid, set()), ellipses="|..."))
+        therapy_col.append(join_set(", ", vid2therapies.get(vid, set())))

-    return (
-        variant_df.set_index("variant_id")
-        .join(disease_df)
-        .join(therapy_df)
-        .reset_index()
-    )
+    dt_df = pd.DataFrame(
+        {
+            "variant_id": variant_id,
+ "disease_html": dhtml_col, + "disease_links": dlink_col, + "therapies": therapy_col, + } + ).set_index("variant_id") + + return variant_df.set_index("variant_id").join(dt_df).reset_index() def transform_dfs(dfs: dict[str, pd.DataFrame]) -> dict[str, str]: """From a dictionary of data frames for VariantSummaries, MolecularProfileSummaries, AssertionSummaries, and ClinicalEvidenceSummaries, create a unifed bigBed file for each reference genome. Returns a dictionary from reference genome slug (hg19/hg38) to the file path of the bidBed files. """ adf = transform_assertion_summaries( dfs["AssertionSummaries"], dfs["MolecularProfileSummaries"] ) edf = transform_clinical_evidence( dfs["ClinicalEvidenceSummaries"], dfs["MolecularProfileSummaries"]