Validation Report Demo¶
Path: examples/demos/03_validation_report
Goal¶
Compare real vs synthetic tables, compute KS/TVD metrics, correlation distance, and render an HTML report.
Run¶
python examples/demos/03_validation_report/run.py
Outputs¶
- outputs/validation_metrics.json
- outputs/validation_report.html
- Sample inputs: outputs/real_users.csv, outputs/synthetic_users.csv
Notes¶
- Uses `ValidationReport.generate` for both JSON and HTML formats.
- Open the HTML in a browser to inspect column-level results and previews.
Source Code¶
from pathlib import Path
import argparse
import numpy as np
import pandas as pd
from syntho_hive.validation.report_generator import ValidationReport
def make_real_data(num_rows: int) -> pd.DataFrame:
    """Build the deterministic "real" users table for the demo.

    All random columns come from a single generator seeded with 21, so two
    calls with the same ``num_rows`` return identical frames.

    Args:
        num_rows: Number of rows to generate.

    Returns:
        A DataFrame with the columns, in order:
        ``user_id`` (1..num_rows), ``age`` (integers in [18, 70)),
        ``region`` (one of NE/SE/MW/W, equally likely),
        ``monthly_spend`` (normal, mean 120, std 35, rounded to 2 decimals)
        and ``active`` (0 or 1 with probabilities 0.35 / 0.65).
    """
    generator = np.random.default_rng(21)

    # Each draw advances the shared generator, so the order of these four
    # statements determines the generated values; do not reorder them.
    ages = generator.integers(18, 70, size=num_rows)
    regions = generator.choice(["NE", "SE", "MW", "W"], size=num_rows, p=[0.25, 0.25, 0.25, 0.25])
    spend = generator.normal(120, 35, size=num_rows).round(2)
    active_flags = generator.choice([0, 1], size=num_rows, p=[0.35, 0.65])

    columns = {
        "user_id": np.arange(1, num_rows + 1),
        "age": ages,
        "region": regions,
        "monthly_spend": spend,
        "active": active_flags,
    }
    return pd.DataFrame(columns)
def make_synthetic_variant(real_df: pd.DataFrame) -> pd.DataFrame:
    """Derive a "synthetic" table from ``real_df`` with small distribution shifts.

    The input frame is copied, not modified. Three columns are replaced using
    a generator seeded with 22 so the validator has differences to surface:

    * ``monthly_spend`` is scaled row-wise by a normal(1.05, 0.05) factor and
      rounded to 2 decimals.
    * ``active`` is redrawn as 0/1 with probabilities 0.4 / 0.6.
    * ``region`` is redrawn from NE/SE/MW/W with probabilities
      0.35 / 0.2 / 0.25 / 0.2.

    Every other column is carried over unchanged.

    Args:
        real_df: Source table; must contain a ``monthly_spend`` column.

    Returns:
        A new DataFrame with the same number of rows as ``real_df``.
    """
    generator = np.random.default_rng(22)
    row_count = len(real_df)

    # Draw order matters: each call advances the shared generator.
    spend_factor = generator.normal(1.05, 0.05, size=row_count)
    shifted_active = generator.choice([0, 1], size=row_count, p=[0.4, 0.6])
    shifted_region = generator.choice(["NE", "SE", "MW", "W"], size=row_count, p=[0.35, 0.2, 0.25, 0.2])

    variant = real_df.copy()
    scaled_spend = variant["monthly_spend"] * spend_factor
    variant["monthly_spend"] = scaled_spend.round(2)
    variant["active"] = shifted_active
    variant["region"] = shifted_region
    return variant
def main(argv=None) -> None:
    """Generate the demo datasets and write the validation report files.

    Steps: parse CLI options, create the output directory, build the real and
    synthetic ``users`` tables, save both as CSV, then call
    ``ValidationReport.generate`` once per report file.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` (the default) makes argparse read ``sys.argv``,
            which is the behaviour when run as a script. Passing a list lets
            the demo be driven programmatically.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, including a
            negative ``--rows`` value.
    """
    parser = argparse.ArgumentParser(description="Generate a validation report for real vs synthetic data.")
    parser.add_argument("--rows", type=int, default=300, help="Rows per dataset to generate.")
    parser.add_argument(
        "--output-dir",
        default="examples/demos/03_validation_report/outputs",
        help="Directory where the report files will be written.",
    )
    args = parser.parse_args(argv)

    # A negative row count would otherwise fail inside NumPy with a traceback
    # ("negative dimensions are not allowed"); report it as a usage error.
    if args.rows < 0:
        parser.error("--rows must be a non-negative integer")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    real_df = make_real_data(args.rows)
    synthetic_df = make_synthetic_variant(real_df)

    # Keep the sample inputs next to the reports so they can be inspected.
    real_df.to_csv(output_dir / "real_users.csv", index=False)
    synthetic_df.to_csv(output_dir / "synthetic_users.csv", index=False)

    report = ValidationReport()
    html_path = output_dir / "validation_report.html"
    json_path = output_dir / "validation_metrics.json"

    # Both report calls receive the same single-table mappings.
    real_tables = {"users": real_df}
    synth_tables = {"users": synthetic_df}

    # NOTE(review): generate() is called once per target file; presumably it
    # picks HTML vs JSON from the output path's suffix — confirm against
    # ValidationReport.
    report.generate(real_data=real_tables, synth_data=synth_tables, output_path=str(html_path))
    report.generate(real_data=real_tables, synth_data=synth_tables, output_path=str(json_path))

    print(f"Wrote HTML report to {html_path}")
    print(f"Wrote JSON metrics to {json_path}")


if __name__ == "__main__":
    main()