d069df7855eb648b5b53c51ef5218fe2e4a1d087 lrnassar Wed May 13 12:48:53 2026 -0700
mpraVarDb script rebuild: standardize NA, fix upstream typos, add pvalue note. refs #37359

Build-script changes (mpravardbToBed.py + mpravardb.as):
- sanitize_text() now normalizes "None"/"NA"/"N/A"/"null"/"NULL"/"nan" to empty string. Removes 55,108 stale disease sentinels (53,144 disease="None" eQTL rows and 1,964 disease="NA" Kircher rows), plus 44 ref/alt=NA Myint rows.
- Literal-replacement table for three upstream typos: "30 UTR" -> "3'UTR" (26,546 Schuster description rows), "Familial hypercholesterol emia" -> "Familial hypercholesterolemia" (2,176 Kircher disease rows), "Alchol use disorder" -> "Alcohol use disorder" (88 Rao disease rows).
- fmt_mo() renders NaN floats as "NA" in mouseOver helper fields; 30,921 rows that previously showed literal "nan" now read "NA".
- Tightened name+rsid handling: treat a value as an rsID only if it starts with "rs". 2,088 hg19-coord-style names like "1_1403972_CG" are reformatted to "chr<X>:<hg38pos>:<ref>><alt>" and the rsid field cleared to "" so the dbSNP linkout doesn't fire on a bogus value.
- Removed the 250-char truncation that cut Griesemer descriptions mid-sentence; mpravardb.as switched description and mpraStudy from "string" to "lstring" to admit the full upstream text.

Description page (mpraVarDb.html):
- Added a "Note (pending upstream fix)" paragraph in Methods explaining the 5,092 rows with pvalue > 1 (test statistic mislabeled in the pvalue field by upstream curators for Mouri 2022 and Tewhey 2016). Bracketed by an HTML comment "TEMP: remove once Tao Wang fixes..." for future cleanup when the next snapshot lands.

bigBed rebuilt; itemCount 239,028 preserved.
Pre-rebuild backup: /hive/data/genomes/hg38/bed/mpra/mpravardb/mpravardb.bb.pre-2026-05-14-backup
Makedoc updated with the QA-2 build-script rebuild section.
diff --git src/hg/makeDb/scripts/mpravardb/mpravardb.as src/hg/makeDb/scripts/mpravardb/mpravardb.as
index 927c655c1b9..11e7d9a6175 100644
--- src/hg/makeDb/scripts/mpravardb/mpravardb.as
+++ src/hg/makeDb/scripts/mpravardb/mpravardb.as
@@ -1,26 +1,26 @@
 table mpravardb
 "MPRA Variant Database - experimentally tested regulatory variants from MPRA studies"
     (
     string chrom; "Reference sequence chromosome or scaffold"
     uint chromStart; "Start position in chromosome"
     uint chromEnd; "End position in chromosome"
     string name; "Variant name (rsID or chr:pos:ref>alt)"
     uint score; "Score from 0-1000 based on -log10(pvalue)"
     char[1] strand; "Strand (always .)"
     uint thickStart; "Same as chromStart"
     uint thickEnd; "Same as chromEnd"
     uint reserved; "itemRgb - color by significance"
     string ref; "Reference allele"
     string alt; "Alternate allele"
     string rsid; "dbSNP rsID"
     string disease; "Associated disease or trait"
     string cellLine; "Cell line used in MPRA experiment"
-    string description; "Description of the MPRA experiment"
+    lstring description; "Description of the MPRA experiment"
     float log2FC; "Log2 fold change (alt vs ref allele activity)"
     float pvalue; "P-value for allelic difference"
     float fdr; "False discovery rate"
-    string mpraStudy; "MPRA study citation"
+    lstring mpraStudy; "MPRA study citation"
     string _mouseOverLog2FC; "log2FC rounded to 3 digits for mouseOver"
     string _mouseOverPvalue; "p-value rounded to 3 digits for mouseOver"
     string _mouseOverFdr; "FDR rounded to 3 digits for mouseOver"
     )