Skip to content

Validation

syntho_hive.validation.report_generator.ValidationReport

Generate summary reports of validation metrics.

Source code in syntho_hive/validation/report_generator.py
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
class ValidationReport:
    """Generate summary reports of validation metrics."""

    def __init__(self):
        """Initialize statistical validator and metric store."""
        self.validator = StatisticalValidator()
        self.metrics = {}

    def _calculate_detailed_stats(self, real_df: pd.DataFrame, synth_df: pd.DataFrame) -> Dict[str, Any]:
        """Calculate descriptive statistics for side-by-side comparison.

        Args:
            real_df: Real dataframe.
            synth_df: Synthetic dataframe aligned to the real columns.

        Returns:
            Nested dict of summary stats for each column.
        """
        stats = {}
        for col in real_df.columns:
            if col not in synth_df.columns:
                continue

            col_stats = {"real": {}, "synth": {}}

            for df, res in ((real_df, col_stats["real"]), (synth_df, col_stats["synth"])):
                series = df[col]
                if pd.api.types.is_numeric_dtype(series):
                    res["mean"] = series.mean()
                    res["std"] = series.std()
                    res["min"] = series.min()
                    res["max"] = series.max()
                else:
                    res["unique_count"] = series.nunique()
                    # Compute mode()/value_counts() once each, and guard on the
                    # *result* being empty rather than on the series: both drop
                    # NaNs, so a non-empty all-NaN column would otherwise raise
                    # IndexError on .iloc[0].
                    modes = series.mode()
                    res["top_value"] = modes.iloc[0] if not modes.empty else "N/A"
                    counts = series.value_counts()
                    res["top_freq"] = counts.iloc[0] if not counts.empty else 0

            stats[col] = col_stats
        return stats

    def generate(self, real_data: Dict[str, pd.DataFrame], synth_data: Dict[str, pd.DataFrame], output_path: str):
        """Run validation and save a report.

        Args:
            real_data: Mapping of table name to real dataframe.
            synth_data: Mapping of table name to synthetic dataframe.
            output_path: Destination path for HTML or JSON report.
        """
        report = {
            "tables": {},
            "summary": "Validation Report"
        }

        for table_name, real_df in real_data.items():
            # Tables without a synthetic counterpart are skipped silently.
            if table_name not in synth_data:
                continue

            synth_df = synth_data[table_name]

            # 1. Column comparisons
            col_metrics = self.validator.compare_columns(real_df, synth_df)

            # 2. Correlation
            corr_diff = self.validator.check_correlations(real_df, synth_df)

            # 3. Detailed Stats
            stats = self._calculate_detailed_stats(real_df, synth_df)

            # 4. Data Preview
            # Use Pandas to_html for easy formatting, strict constraints
            preview = {
                "real_html": real_df.head(10).to_html(index=False, classes='scroll-table', border=0),
                "synth_html": synth_df.head(10).to_html(index=False, classes='scroll-table', border=0)
            }

            report["tables"][table_name] = {
                "column_metrics": col_metrics,
                "correlation_distance": corr_diff,
                "detailed_stats": stats,
                "preview": preview
            }

        if output_path.endswith(".html"):
            self._save_html(report, output_path)
        else:
            # Save to JSON for now (PDF requires more deps).
            # default=str stringifies anything non-serializable (e.g. numpy scalars).
            with open(output_path, "w") as f:
                json.dump(report, f, indent=2, default=str)

        import os
        print(f"Report saved to {os.path.abspath(output_path)}")

    def _save_html(self, report: Dict[str, Any], output_path: str):
        """Render a rich HTML report with metric explanations, stats, and previews.

        Args:
            report: Structured report dictionary produced by ``generate``.
            output_path: Filesystem path to write the HTML file.
        """
        html_content = [
            """<html>
            <head>
                <style>
                    body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; margin: 20px; background-color: #f9f9f9; color: #333; }
                    h1, h2, h3 { color: #2c3e50; }
                    .container { max-width: 1200px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }

                    /* Tables */
                    table { border-collapse: collapse; width: 100%; margin-bottom: 20px; font-size: 14px; }
                    th, td { border: 1px solid #e1e4e8; padding: 10px; text-align: left; }
                    th { background-color: #f1f8ff; color: #0366d6; font-weight: 600; }
                    tr:nth-child(even) { background-color: #f8f9fa; }

                    /* Status Colors */
                    .pass { color: #28a745; font-weight: bold; }
                    .fail { color: #dc3545; font-weight: bold; }

                    /* Layout */
                    .section { margin-top: 40px; border-top: 1px solid #eee; padding-top: 20px; }
                    .metric-box { background: #f0f4f8; padding: 15px; border-radius: 5px; margin-bottom: 20px; border-left: 5px solid #0366d6; }
                    .row { display: flex; gap: 20px; }
                    .col { flex: 1; overflow-x: auto; }

                    /* Tabs/Previews */
                    .preview-header { font-weight: bold; margin-bottom: 10px; color: #555; }
                    .scroll-table { max-height: 400px; overflow-y: auto; display: block; }
                </style>
            </head>
            <body>
            <div class="container">
                <h1>Validation Report</h1>

                <div class="metric-box">
                    <h3>Metric Explanations</h3>
                    <ul>
                        <li><strong>KS Test (Kolmogorov-Smirnov):</strong> Used for continuous numerical columns. Compares the cumulative distribution functions of the real and synthetic data. <br>
                            <em>Result:</em> Returns a p-value. If p > 0.05, we fail to reject the null hypothesis (i.e., distributions are likely the same).</li>
                        <li><strong>TVD (Total Variation Distance):</strong> Used for categorical or discrete columns. Measures the maximum difference between probabilities assigned to the same event by two distributions. <br>
                            <em>Result:</em> Value between 0 and 1. Lower is better (0 means identical). We consider < 0.1 as passing.</li>
                        <li><strong>Correlation Distance:</strong> Measures how well the pairwise correlations between numerical columns are preserved. Calculated as the Frobenius norm of the difference between correlation matrices. <br>
                            <em>Result:</em> Lower is better (0 means identical correlation structure).</li>
                    </ul>
                </div>
            """]

        for table_name, data in report["tables"].items():
            html_content.append(f"<div class='section'><h2>Table: {table_name}</h2>")

            # --- 1. Correlation & Overall ---
            corr_dist = data.get('correlation_distance', 0.0)
            html_content.append(f"<p><strong>Correlation Distance:</strong> {corr_dist:.4f}</p>")

            # --- 2. Column Metrics ---
            html_content.append("<h3>Column Validation Metrics</h3>")
            html_content.append("<table><tr><th>Column</th><th>Test Type</th><th>Statistic</th><th>P-Value / Score</th><th>Status</th></tr>")

            for col, metrics in data["column_metrics"].items():
                if "error" in metrics:
                    html_content.append(f"<tr><td>{col}</td><td colspan='4' class='fail'>Error: {metrics['error']}</td></tr>")
                    continue

                status = "PASS" if metrics.get("passed", False) else "FAIL"
                cls = "pass" if status == "PASS" else "fail"

                stat = f"{metrics.get('statistic', 0):.4f}"
                # TVD doesn't have a p-value, KS does.
                pval = f"{metrics.get('p_value', 0):.4f}" if metrics.get('p_value') is not None else "N/A"
                test_name = metrics.get('test', 'N/A')

                html_content.append(f"<tr><td>{col}</td><td>{test_name}</td><td>{stat}</td><td>{pval}</td><td class='{cls}'>{status}</td></tr>")

            html_content.append("</table>")

            # --- 3. Detailed Statistics ---
            if "detailed_stats" in data:
                html_content.append("<h3>Detailed Statistics (Real vs Synthetic)</h3>")
                html_content.append("<table><tr><th>Column</th><th>Metric</th><th>Real</th><th>Synthetic</th></tr>")

                for col, stats in data["detailed_stats"].items():
                    # stats has "real": {...}, "synth": {...}
                    real_s = stats.get("real", {})
                    synth_s = stats.get("synth", {})

                    # Merge keys to show
                    all_keys = sorted(list(set(real_s.keys()) | set(synth_s.keys())))
                    # Usually we want mean, std, min, max or unique, top

                    first = True
                    for k in all_keys:
                        r_val = real_s.get(k, "-")
                        s_val = synth_s.get(k, "-")

                        # Format floats
                        if isinstance(r_val, (float, np.floating)): r_val = f"{r_val:.4f}"
                        if isinstance(s_val, (float, np.floating)): s_val = f"{s_val:.4f}"

                        # First metric row carries a rowspan cell with the column name.
                        row_start = f"<tr><td rowspan='{len(all_keys)}'>{col}</td>" if first else "<tr>"
                        row_end = f"<td>{k}</td><td>{r_val}</td><td>{s_val}</td></tr>"
                        html_content.append(row_start + row_end)
                        first = False
                html_content.append("</table>")

            # --- 4. Data Preview ---
            if "preview" in data:
                html_content.append("<h3>Data Preview (First 10 Rows)</h3>")
                html_content.append("<div class='row'>")

                # Real
                html_content.append("<div class='col'>")
                html_content.append("<div class='preview-header'>Original Data (Real)</div>")
                html_content.append(data["preview"]["real_html"])
                html_content.append("</div>")

                # Synth
                html_content.append("<div class='col'>")
                html_content.append("<div class='preview-header'>Synthetic Data (Generated)</div>")
                html_content.append(data["preview"]["synth_html"])
                html_content.append("</div>")

                html_content.append("</div>") # End row

            html_content.append("</div>") # End section

        html_content.append("</div></body></html>")

        with open(output_path, "w") as f:
            f.write("\n".join(html_content))

generate

generate(real_data: Dict[str, DataFrame], synth_data: Dict[str, DataFrame], output_path: str)

Run validation and save a report.

Parameters:

Name Type Description Default
real_data Dict[str, DataFrame]

Mapping of table name to real dataframe.

required
synth_data Dict[str, DataFrame]

Mapping of table name to synthetic dataframe.

required
output_path str

Destination path for HTML or JSON report.

required
Source code in syntho_hive/validation/report_generator.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def generate(self, real_data: Dict[str, pd.DataFrame], synth_data: Dict[str, pd.DataFrame], output_path: str):
    """Run validation and save a report.

    Args:
        real_data: Mapping of table name to real dataframe.
        synth_data: Mapping of table name to synthetic dataframe.
        output_path: Destination path for HTML or JSON report.
    """
    report = {"tables": {}, "summary": "Validation Report"}

    for table_name, real_df in real_data.items():
        # Only validate tables present on both sides.
        if table_name not in synth_data:
            continue
        synth_df = synth_data[table_name]

        # Column distributions, correlation structure, descriptive stats,
        # and an HTML preview of the first rows of each side.
        report["tables"][table_name] = {
            "column_metrics": self.validator.compare_columns(real_df, synth_df),
            "correlation_distance": self.validator.check_correlations(real_df, synth_df),
            "detailed_stats": self._calculate_detailed_stats(real_df, synth_df),
            "preview": {
                "real_html": real_df.head(10).to_html(index=False, classes='scroll-table', border=0),
                "synth_html": synth_df.head(10).to_html(index=False, classes='scroll-table', border=0),
            },
        }

    if output_path.endswith(".html"):
        self._save_html(report, output_path)
    else:
        # Save to JSON for now (PDF requires more deps)
        with open(output_path, "w") as f:
            json.dump(report, f, indent=2, default=str)

    import os
    print(f"Report saved to {os.path.abspath(output_path)}")

syntho_hive.validation.statistical.StatisticalValidator

Perform statistical checks between real and synthetic data.

Source code in syntho_hive/validation/statistical.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
class StatisticalValidator:
    """Perform statistical checks between real and synthetic data."""

    def compare_columns(self, real_df: pd.DataFrame, synth_df: pd.DataFrame) -> Dict[str, Any]:
        """Compare column-wise distributions between real and synthetic data.

        Args:
            real_df: Real dataframe.
            synth_df: Synthetic dataframe aligned to the same schema.

        Returns:
            Mapping of column name to test results or error descriptions.
        """
        results = {}

        if real_df.empty or synth_df.empty:
            return {"error": "One or both DataFrames are empty."}

        for col in real_df.columns:
            if col not in synth_df.columns:
                results[col] = {"error": "Column missing in synthetic data"}
                continue

            real_data = real_df[col].dropna()
            synth_data = synth_df[col].dropna()

            if real_data.empty or synth_data.empty:
                results[col] = {"error": "Column data is empty after dropping NaNs"}
                continue

            # Check for type mismatch
            if real_data.dtype != synth_data.dtype:
                # Try to cast if compatible (e.g. float vs int)
                if pd.api.types.is_numeric_dtype(real_data) and pd.api.types.is_numeric_dtype(synth_data):
                    pass # Compatible enough for stats
                else:
                    results[col] = {"error": f"Type mismatch: Real {real_data.dtype} vs Synth {synth_data.dtype}"}
                    continue

            if pd.api.types.is_numeric_dtype(real_data):
                # KS Test for continuous columns.
                try:
                    stat, p_value = ks_2samp(real_data, synth_data)
                    results[col] = {
                        "test": "ks_test",
                        "statistic": stat,
                        "p_value": p_value,
                        "passed": p_value > 0.05 # Null hypothesis: Same distribution
                    }
                except Exception as e:
                    results[col] = {"error": f"KS Test failed: {str(e)}"}
            else:
                # TVD (Total Variation Distance) for categorical columns.
                try:
                    real_counts = real_data.value_counts(normalize=True)
                    synth_counts = synth_data.value_counts(normalize=True)

                    # Align categories across both sides so missing ones count as 0.
                    all_cats = set(real_counts.index).union(set(synth_counts.index))

                    tvd = 0.5 * sum(abs(real_counts.get(c, 0) - synth_counts.get(c, 0)) for c in all_cats)

                    results[col] = {
                        "test": "tvd",
                        "statistic": tvd,
                        "passed": tvd < 0.1 # Threshold arbitrary
                    }
                except Exception as e:
                    results[col] = {"error": f"TVD Checks failed: {str(e)}"}

        return results

    def check_correlations(self, real_df: pd.DataFrame, synth_df: pd.DataFrame) -> float:
        """Compare correlation matrices using Frobenius norm.

        Args:
            real_df: Real dataframe.
            synth_df: Synthetic dataframe.

        Returns:
            Frobenius norm distance between correlation matrices (0 when identical).
        """
        real_num = real_df.select_dtypes(include=[np.number])
        synth_num = synth_df.select_dtypes(include=[np.number])

        # Restrict both sides to their shared numeric columns: subtracting
        # correlation matrices with mismatched labels aligns on the union,
        # injecting NaNs and making the Frobenius norm NaN.
        common = [c for c in real_num.columns if c in synth_num.columns]
        if not common:
            return 0.0

        real_corr = real_num[common].corr().fillna(0)
        synth_corr = synth_num[common].corr().fillna(0)

        if real_corr.empty or synth_corr.empty:
            return 0.0

        diff = real_corr - synth_corr
        frobenius_norm = np.linalg.norm(diff.values)

        return float(frobenius_norm)

check_correlations

check_correlations(real_df: DataFrame, synth_df: DataFrame) -> float

Compare correlation matrices using Frobenius norm.

Parameters:

Name Type Description Default
real_df DataFrame

Real dataframe.

required
synth_df DataFrame

Synthetic dataframe.

required

Returns:

Type Description
float

Frobenius norm distance between correlation matrices (0 when identical).

Source code in syntho_hive/validation/statistical.py
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def check_correlations(self, real_df: pd.DataFrame, synth_df: pd.DataFrame) -> float:
    """Compare correlation matrices using Frobenius norm.

    Args:
        real_df: Real dataframe.
        synth_df: Synthetic dataframe.

    Returns:
        Frobenius norm distance between correlation matrices (0 when identical).
    """
    real_num = real_df.select_dtypes(include=[np.number])
    synth_num = synth_df.select_dtypes(include=[np.number])

    # Restrict both sides to their shared numeric columns: subtracting
    # correlation matrices with mismatched labels aligns on the union,
    # injecting NaNs and making the Frobenius norm NaN.
    common = [c for c in real_num.columns if c in synth_num.columns]
    if not common:
        return 0.0

    real_corr = real_num[common].corr().fillna(0)
    synth_corr = synth_num[common].corr().fillna(0)

    if real_corr.empty or synth_corr.empty:
        return 0.0

    diff = real_corr - synth_corr
    frobenius_norm = np.linalg.norm(diff.values)

    return float(frobenius_norm)

compare_columns

compare_columns(real_df: DataFrame, synth_df: DataFrame) -> Dict[str, Any]

Compare column-wise distributions between real and synthetic data.

Parameters:

Name Type Description Default
real_df DataFrame

Real dataframe.

required
synth_df DataFrame

Synthetic dataframe aligned to the same schema.

required

Returns:

Type Description
Dict[str, Any]

Mapping of column name to test results or error descriptions.

Source code in syntho_hive/validation/statistical.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def compare_columns(self, real_df: pd.DataFrame, synth_df: pd.DataFrame) -> Dict[str, Any]:
    """Compare column-wise distributions between real and synthetic data.

    Args:
        real_df: Real dataframe.
        synth_df: Synthetic dataframe aligned to the same schema.

    Returns:
        Mapping of column name to test results or error descriptions.
    """
    if real_df.empty or synth_df.empty:
        return {"error": "One or both DataFrames are empty."}

    def _ks(real_vals, synth_vals):
        # Continuous columns: two-sample Kolmogorov-Smirnov test.
        try:
            statistic, p_value = ks_2samp(real_vals, synth_vals)
            return {
                "test": "ks_test",
                "statistic": statistic,
                "p_value": p_value,
                "passed": p_value > 0.05,  # Null hypothesis: same distribution
            }
        except Exception as exc:
            return {"error": f"KS Test failed: {str(exc)}"}

    def _tvd(real_vals, synth_vals):
        # Categorical columns: total variation distance over the union of categories.
        try:
            real_freq = real_vals.value_counts(normalize=True)
            synth_freq = synth_vals.value_counts(normalize=True)
            categories = set(real_freq.index) | set(synth_freq.index)
            distance = 0.5 * sum(abs(real_freq.get(c, 0) - synth_freq.get(c, 0)) for c in categories)
            return {
                "test": "tvd",
                "statistic": distance,
                "passed": distance < 0.1,  # Threshold arbitrary
            }
        except Exception as exc:
            return {"error": f"TVD Checks failed: {str(exc)}"}

    results: Dict[str, Any] = {}
    for column in real_df.columns:
        if column not in synth_df.columns:
            results[column] = {"error": "Column missing in synthetic data"}
            continue

        real_vals = real_df[column].dropna()
        synth_vals = synth_df[column].dropna()

        if real_vals.empty or synth_vals.empty:
            results[column] = {"error": "Column data is empty after dropping NaNs"}
            continue

        # Differing dtypes are tolerated only when both sides are numeric
        # (e.g. int vs float); anything else is reported as a mismatch.
        both_numeric = pd.api.types.is_numeric_dtype(real_vals) and pd.api.types.is_numeric_dtype(synth_vals)
        if real_vals.dtype != synth_vals.dtype and not both_numeric:
            results[column] = {"error": f"Type mismatch: Real {real_vals.dtype} vs Synth {synth_vals.dtype}"}
            continue

        if pd.api.types.is_numeric_dtype(real_vals):
            results[column] = _ks(real_vals, synth_vals)
        else:
            results[column] = _tvd(real_vals, synth_vals)

    return results