Skip to content

Privacy Sanitization Demo

Path: examples/demos/02_privacy_sanitization

Goal

Detect PII, apply sanitization (mask/hash/fake), and compare raw vs sanitized CSV outputs.

Run

python examples/demos/02_privacy_sanitization/run.py

Outputs

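  • outputs/raw_users.csv (relative to the demo path above; use --output-dir to write elsewhere)
  • outputs/sanitized_users.csv (written to the same directory)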

Notes

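  • Uses PIISanitizer + ContextualFaker. The demo script itself imports only PIISanitizer, PrivacyConfig, and PiiRule; it does not reference ContextualFaker directly.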
  • Adjust rules or actions in the demo script to experiment with masking vs hashing vs faking.

Source Code

from pathlib import Path
import argparse
import numpy as np
import pandas as pd

from syntho_hive.privacy.sanitizer import PIISanitizer, PrivacyConfig, PiiRule


def make_raw_users(num_rows: int, *, seed: int = 7) -> pd.DataFrame:
    """Generate a reproducible table of synthetic user records with PII-like fields.

    Each row carries a sequential ``user_id`` (starting at 1), a randomly
    chosen first name, last name and city, and formatted strings for email,
    phone, SSN, credit card, loyalty ID and a free-text note.

    Args:
        num_rows: Number of rows to generate. Zero (or a negative value, for
            which ``range`` is empty) yields an empty frame that still has
            the full set of columns.
        seed: Seed for the NumPy random generator. The default of 7 keeps
            the output identical from run to run.

    Returns:
        A DataFrame with the columns ``user_id``, ``first_name``,
        ``last_name``, ``city``, ``email``, ``phone``, ``ssn``,
        ``credit_card``, ``loyalty_id`` and ``notes``.
    """
    # Declared up front so an empty result still has the expected schema
    # (and therefore a header row when written to CSV).
    columns = [
        "user_id",
        "first_name",
        "last_name",
        "city",
        "email",
        "phone",
        "ssn",
        "credit_card",
        "loyalty_id",
        "notes",
    ]

    rng = np.random.default_rng(seed)
    first_names = ["Alex", "Jordan", "Sam", "Taylor", "Jamie", "Riley", "Casey", "Drew"]
    last_names = ["Lee", "Patel", "Garcia", "Chen", "Olsen", "Diaz", "Nguyen", "Brown"]
    cities = ["NY", "SF", "SEA", "DAL", "BOS"]

    rows = []
    for i in range(num_rows):
        # NOTE: the order of the rng calls below determines the generated
        # values for a given seed; reordering them changes the output.
        first = rng.choice(first_names)
        last = rng.choice(last_names)
        city = rng.choice(cities)
        # The row index is appended so that emails stay unique even when the
        # same first/last name pair is drawn more than once.
        email = f"{first.lower()}.{last.lower()}{i}@example.com"
        phone = f"({rng.integers(200, 999)})-{rng.integers(200, 999)}-{rng.integers(1000, 9999)}"
        ssn = f"{rng.integers(100, 999):03d}-{rng.integers(10, 99):02d}-{rng.integers(1000, 9999):04d}"
        credit_card = f"{rng.integers(1000, 9999):04d}-{rng.integers(1000, 9999):04d}-{rng.integers(1000, 9999):04d}-{rng.integers(1000, 9999):04d}"
        loyalty_id = f"L-{rng.integers(10_000, 99_999)}"

        rows.append(
            {
                "user_id": i + 1,
                "first_name": first,
                "last_name": last,
                "city": city,
                "email": email,
                "phone": phone,
                "ssn": ssn,
                "credit_card": credit_card,
                "loyalty_id": loyalty_id,
                "notes": f"Called support on ticket {rng.integers(1000, 9999)}",
            }
        )

    return pd.DataFrame(rows, columns=columns)


def build_config() -> PrivacyConfig:
    """
    Return the default privacy configuration with one extra rule appended:
    values shaped like a loyalty ID ("L-" followed by five digits) use the
    "hash" action.
    """
    privacy_config = PrivacyConfig.default()

    # Custom rule layered on top of whatever the defaults already contain.
    loyalty_rule = PiiRule(name="loyalty_id", patterns=[r"L-\d{5}"], action="hash")
    privacy_config.rules.append(loyalty_rule)

    return privacy_config


def main():
    """Command-line entry point for the PII sanitization demo.

    Parses ``--rows`` and ``--output-dir``, writes the generated raw users to
    ``raw_users.csv``, runs the sanitizer's analyze and sanitize steps, and
    writes the result to ``sanitized_users.csv`` in the same directory.
    Progress messages and a preview of the sanitized frame are printed.
    """
    cli = argparse.ArgumentParser(description="Run the PII sanitization demo.")
    cli.add_argument("--rows", type=int, default=50, help="How many raw rows to generate.")
    cli.add_argument(
        "--output-dir",
        default="examples/demos/02_privacy_sanitization/outputs",
        help="Directory to place raw and sanitized CSVs.",
    )
    options = cli.parse_args()

    # Make sure the destination exists before anything is written to it.
    target_dir = Path(options.output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    raw_csv = target_dir / "raw_users.csv"
    sanitized_csv = target_dir / "sanitized_users.csv"

    # Step 1: generate and persist the unsanitized data.
    users = make_raw_users(options.rows)
    users.to_csv(raw_csv, index=False)
    print(f"Wrote raw data to {raw_csv}")

    # Step 2: detect PII columns using the extended configuration.
    scrubber = PIISanitizer(config=build_config())
    pii_columns = scrubber.analyze(users)
    print("Detected PII columns:", pii_columns)

    # Step 3: sanitize with the detected mapping and persist the result.
    scrubbed = scrubber.sanitize(users, pii_map=pii_columns)
    scrubbed.to_csv(sanitized_csv, index=False)
    print(f"Wrote sanitized data to {sanitized_csv}")
    print(scrubbed.head())


# Run the demo only when this file is executed as a script, so importing the
# module (e.g. to reuse make_raw_users or build_config) has no side effects.
if __name__ == "__main__":
    main()