Skip to content

Privacy & Sanitization

The privacy module ensures that sensitive information is detected and handled correctly before any modeling takes place.

Sanitizer

syntho_hive.privacy.sanitizer.PIISanitizer

Detect and sanitize PII columns based on configurable rules.

Source code in syntho_hive/privacy/sanitizer.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
class PIISanitizer:
    """Detect and sanitize PII columns based on configurable rules.

    ``analyze`` maps columns to rule names, first by column-name aliases and
    then by regex matching on sampled values. ``sanitize`` applies each
    matched rule's action (drop, mask, hash, fake or custom) to a copy of
    the input frame.
    """

    # Column-name tokens that indicate a rule. Rules without an entry here
    # are matched against their own name.
    COLUMN_NAME_ALIASES: Dict[str, List[str]] = {
        "email": ["email", "e_mail", "email_address"],
        "ssn": ["ssn", "social_security", "social_sec"],
        "phone": ["phone", "mobile", "cell", "tel", "telephone", "phone_number"],
        "name": ["first_name", "last_name", "full_name", "firstname", "lastname"],
        "address": ["address", "street", "city", "zip", "zipcode", "postal"],
        "date_of_birth": ["dob", "date_of_birth", "birth_date", "birthday"],
        "credit_card": ["credit_card", "card_number", "cc_num", "card_num"],
    }

    def __init__(self, config: Optional[PrivacyConfig] = None):
        """Create a sanitizer with contextual faker support.

        Args:
            config: Optional privacy configuration; defaults to ``PrivacyConfig.default``.
        """
        self.config = config or PrivacyConfig.default()
        self.faker = ContextualFaker()
        # Random key generated per instance, so hashes produced by different
        # sanitizer instances are not comparable with each other.
        self._hash_salt = secrets.token_bytes(32)

    def analyze(self, df: pd.DataFrame) -> Dict[str, str]:
        """Detect potential PII columns using configured rules.

        Column names are checked first; columns that are not matched by name
        are then checked by content on a sample of up to 100 rows.

        Args:
            df: DataFrame to inspect for PII.

        Returns:
            Mapping of column name to matched PII rule name.
        """
        detected: Dict[str, str] = {}
        rules = self.config.rules

        # 1. Check column names using alias-based heuristics.
        # An alias matches when it is the whole column name or a token
        # delimited by "_", "-" or whitespace. The matchers are compiled once
        # here rather than rebuilt for every column.
        name_matchers = [
            (
                rule.name,
                [
                    re.compile(
                        r"(?:^|[_\-\s])" + re.escape(alias) + r"(?:$|[_\-\s])"
                    )
                    for alias in self.COLUMN_NAME_ALIASES.get(rule.name, [rule.name])
                ],
            )
            for rule in rules
        ]

        for col in df.columns:
            # Column labels are not guaranteed to be strings (e.g. integer
            # labels), so normalise before lower-casing.
            col_lower = str(col).lower()
            for rule_name, matchers in name_matchers:
                if any(matcher.search(col_lower) for matcher in matchers):
                    # First matching rule (in configuration order) wins.
                    detected[col] = rule_name
                    break

        # 2. Check content for remaining columns.
        # Sample up to 100 random rows to avoid positional bias; the fixed
        # seed keeps detection reproducible.
        sample = df.sample(min(100, len(df)), random_state=42)

        for col in df.columns:
            if col in detected:
                continue

            # Drop missing values *before* the dtype check: on newer pandas
            # releases is_string_dtype inspects the elements of object
            # columns, so missing values can make a text column look
            # non-string and hide it from content detection.
            non_null = sample[col].dropna()
            if non_null.empty or not pd.api.types.is_string_dtype(non_null):
                continue

            valid_rows = non_null.astype(str)
            threshold = len(valid_rows) * 0.5

            best_rule = None
            max_matches = 0
            for rule in rules:
                # A value counts once if any of the rule's patterns matches.
                match_count = sum(
                    1
                    for val in valid_rows
                    if any(re.search(pat, val) for pat in rule.patterns)
                )
                # Candidate only if more than half of the sampled values
                # match; ties keep the earlier rule.
                if match_count > threshold and match_count > max_matches:
                    max_matches = match_count
                    best_rule = rule.name

            if best_rule is not None:
                detected[col] = best_rule

        return detected

    def sanitize(
        self, df: pd.DataFrame, pii_map: Optional[Dict[str, str]] = None
    ) -> pd.DataFrame:
        """Apply sanitization rules to a dataframe.

        Args:
            df: Input dataframe containing potential PII. It is not modified.
            pii_map: Optional precomputed map of column name to PII rule name.
                When omitted, ``analyze`` is used to build it.

        Returns:
            Sanitized dataframe with PII handled according to configured actions.

        Raises:
            ValueError: If ``pii_map`` names a column that is not in ``df``.
        """
        import warnings

        if pii_map is None:
            pii_map = self.analyze(df)
        else:
            invalid_cols = [col for col in pii_map if col not in df.columns]
            if invalid_cols:
                raise ValueError(
                    f"pii_map contains columns not in DataFrame: {invalid_cols}"
                )

        output_df = df.copy()

        # Index rules by name once; setdefault keeps the first rule when
        # names are duplicated, matching a front-to-back search.
        rules_by_name = {}
        for rule in self.config.rules:
            rules_by_name.setdefault(rule.name, rule)

        for col, rule_name in pii_map.items():
            rule = rules_by_name.get(rule_name)
            if rule is None:
                # The column is left as-is, but say so: a typo in pii_map
                # would otherwise leave PII in the output unnoticed.
                warnings.warn(
                    f"No PII rule named {rule_name!r}; "
                    f"column {col!r} was left unsanitized",
                    stacklevel=2,
                )
                continue

            action = rule.action
            if action == "drop":
                output_df = output_df.drop(columns=[col])

            elif action == "mask":
                output_df[col] = output_df[col].apply(self._mask_value)

            elif action == "hash":
                output_df[col] = output_df[col].apply(self._hash_value)

            elif action == "fake":
                output_df[col] = self._fake_column(output_df, col, rule)

            elif action == "custom":
                if rule.custom_generator:
                    # Row-wise apply on an empty frame does not reliably
                    # yield a Series, and there is nothing to generate anyway.
                    if len(output_df) > 0:
                        # The generator receives the whole (partially
                        # sanitized) row as a dict; slow but flexible.
                        output_df[col] = output_df.apply(
                            lambda row: rule.custom_generator(row.to_dict()), axis=1
                        )
                else:
                    # Fallback if no generator provided
                    output_df[col] = output_df[col].apply(self._mask_value)

            # Any other action (including "keep") leaves the column untouched.

        return output_df

    def _mask_value(self, val: Any) -> Any:
        """Mask a value, preserving only the last four characters.

        Missing values are returned unchanged; values of four characters or
        fewer are masked entirely.
        """
        if pd.isna(val) or val is None:
            return val
        s = str(val)
        if len(s) <= 4:
            return "*" * len(s)
        return "*" * (len(s) - 4) + s[-4:]

    def _hash_value(self, val: Any) -> Any:
        """Return an HMAC-SHA256 hex digest of a value using the per-instance salt.

        Missing values are returned unchanged.
        """
        if pd.isna(val) or val is None:
            return val
        return hmac.new(self._hash_salt, str(val).encode(), hashlib.sha256).hexdigest()

    def _fake_column(self, df: pd.DataFrame, col: str, rule: PiiRule) -> pd.Series:
        """Generate fake data for a column using contextual faker.

        Each row is passed to the faker as context so that it can pick a
        locale from the row's values.

        Args:
            df: DataFrame containing the column to fake.
            col: Column name (currently unused; the full row is the context).
            rule: PII rule describing the type being faked.

        Returns:
            Series of fake values aligned to ``df``.
        """
        if len(df) == 0:
            # Row-wise apply on an empty frame does not reliably return a
            # Series, so build the empty result explicitly.
            return pd.Series(index=df.index, dtype=object)

        return df.apply(
            lambda row: self.faker.generate_pii(rule.name, context=row.to_dict())[0],
            axis=1,
        )

analyze

analyze(df: DataFrame) -> Dict[str, str]

Detect potential PII columns using configured rules.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `df` | `DataFrame` | DataFrame to inspect for PII. | required |

Returns:

| Type | Description |
| --- | --- |
| `Dict[str, str]` | Mapping of column name to matched PII rule name. |

Source code in syntho_hive/privacy/sanitizer.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
def analyze(self, df: pd.DataFrame) -> Dict[str, str]:
    """Detect potential PII columns using configured rules.

    Column names are checked first; columns that are not matched by name
    are then checked by content on a sample of up to 100 rows.

    Args:
        df: DataFrame to inspect for PII.

    Returns:
        Mapping of column name to matched PII rule name.
    """
    detected: Dict[str, str] = {}
    rules = self.config.rules

    # 1. Check column names using alias-based heuristics.
    # An alias matches when it is the whole column name or a token delimited
    # by "_", "-" or whitespace. The matchers are compiled once here rather
    # than rebuilt for every column.
    name_matchers = [
        (
            rule.name,
            [
                re.compile(r"(?:^|[_\-\s])" + re.escape(alias) + r"(?:$|[_\-\s])")
                for alias in self.COLUMN_NAME_ALIASES.get(rule.name, [rule.name])
            ],
        )
        for rule in rules
    ]

    for col in df.columns:
        # Column labels are not guaranteed to be strings (e.g. integer
        # labels), so normalise before lower-casing.
        col_lower = str(col).lower()
        for rule_name, matchers in name_matchers:
            if any(matcher.search(col_lower) for matcher in matchers):
                # First matching rule (in configuration order) wins.
                detected[col] = rule_name
                break

    # 2. Check content for remaining columns.
    # Sample up to 100 random rows to avoid positional bias; the fixed seed
    # keeps detection reproducible.
    sample = df.sample(min(100, len(df)), random_state=42)

    for col in df.columns:
        if col in detected:
            continue

        # Drop missing values *before* the dtype check: on newer pandas
        # releases is_string_dtype inspects the elements of object columns,
        # so missing values can make a text column look non-string and hide
        # it from content detection.
        non_null = sample[col].dropna()
        if non_null.empty or not pd.api.types.is_string_dtype(non_null):
            continue

        valid_rows = non_null.astype(str)
        threshold = len(valid_rows) * 0.5

        best_rule = None
        max_matches = 0
        for rule in rules:
            # A value counts once if any of the rule's patterns matches.
            match_count = sum(
                1
                for val in valid_rows
                if any(re.search(pat, val) for pat in rule.patterns)
            )
            # Candidate only if more than half of the sampled values match;
            # ties keep the earlier rule.
            if match_count > threshold and match_count > max_matches:
                max_matches = match_count
                best_rule = rule.name

        if best_rule is not None:
            detected[col] = best_rule

    return detected

sanitize

sanitize(df: DataFrame, pii_map: Optional[Dict[str, str]] = None) -> pd.DataFrame

Apply sanitization rules to a dataframe.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `df` | `DataFrame` | Input dataframe containing potential PII. | required |
| `pii_map` | `Optional[Dict[str, str]]` | Optional precomputed map of column name to PII rule name. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | Sanitized dataframe with PII handled according to configured actions. |

Source code in syntho_hive/privacy/sanitizer.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
def sanitize(
    self, df: pd.DataFrame, pii_map: Optional[Dict[str, str]] = None
) -> pd.DataFrame:
    """Apply sanitization rules to a dataframe.

    Args:
        df: Input dataframe containing potential PII. It is not modified.
        pii_map: Optional precomputed map of column name to PII rule name.
            When omitted, ``analyze`` is used to build it.

    Returns:
        Sanitized dataframe with PII handled according to configured actions.

    Raises:
        ValueError: If ``pii_map`` names a column that is not in ``df``.
    """
    import warnings

    if pii_map is None:
        pii_map = self.analyze(df)
    else:
        invalid_cols = [col for col in pii_map if col not in df.columns]
        if invalid_cols:
            raise ValueError(
                f"pii_map contains columns not in DataFrame: {invalid_cols}"
            )

    output_df = df.copy()
    # Dropping columns never changes the row count, so this can be decided once.
    has_rows = len(output_df) > 0

    # Index rules by name once; setdefault keeps the first rule when names
    # are duplicated, matching a front-to-back search.
    rules_by_name = {}
    for rule in self.config.rules:
        rules_by_name.setdefault(rule.name, rule)

    for col, rule_name in pii_map.items():
        rule = rules_by_name.get(rule_name)
        if rule is None:
            # The column is left as-is, but say so: a typo in pii_map would
            # otherwise leave PII in the output unnoticed.
            warnings.warn(
                f"No PII rule named {rule_name!r}; "
                f"column {col!r} was left unsanitized",
                stacklevel=2,
            )
            continue

        action = rule.action
        if action == "drop":
            output_df = output_df.drop(columns=[col])

        elif action == "mask":
            output_df[col] = output_df[col].apply(self._mask_value)

        elif action == "hash":
            output_df[col] = output_df[col].apply(self._hash_value)

        elif action == "fake":
            # Faking is row-wise; with no rows there is nothing to generate.
            if has_rows:
                output_df[col] = self._fake_column(output_df, col, rule)

        elif action == "custom":
            if rule.custom_generator:
                # Row-wise apply on an empty frame does not reliably yield a
                # Series, and there is nothing to generate anyway.
                if has_rows:
                    # The generator receives the whole (partially sanitized)
                    # row as a dict; slow but flexible.
                    output_df[col] = output_df.apply(
                        lambda row: rule.custom_generator(row.to_dict()), axis=1
                    )
            else:
                # Fallback if no generator provided
                output_df[col] = output_df[col].apply(self._mask_value)

        # Any other action (including "keep") leaves the column untouched.

    return output_df

Configuration

syntho_hive.privacy.sanitizer.PiiRule dataclass

Configuration for a single PII type and handling strategy.

Source code in syntho_hive/privacy/sanitizer.py
11
12
13
14
15
16
17
18
19
20
21
22
23
@dataclass
class PiiRule:
    """Describe one PII type and the strategy used to handle it.

    Attributes:
        name: Identifier of the PII type.
        patterns: Regex patterns to match.
        action: Handling strategy; one of "drop", "mask", "hash", "fake",
            "custom" or "keep".
        context_key: Key to look for in context (e.g. 'country' for locale).
        custom_generator: Custom callable for generation; it takes a dict
            and returns the generated value.
    """

    name: str
    # List of regex patterns to match
    patterns: List[str]
    # Options: "drop", "mask", "hash", "fake", "custom", "keep"
    action: str = "drop"
    # Key to look for in context (e.g. 'country' for locale)
    context_key: Optional[str] = None
    # Custom lambda for generation
    custom_generator: Optional[Callable[[Dict[str, Any]], Any]] = None

syntho_hive.privacy.sanitizer.PrivacyConfig dataclass

Collection of rules for PII detection and handling.

Source code in syntho_hive/privacy/sanitizer.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
@dataclass
class PrivacyConfig:
    """Hold the ordered set of rules used for PII detection and handling."""

    # Each instance gets its own list; empty unless rules are supplied.
    rules: List[PiiRule] = field(default_factory=list)

    @classmethod
    def default(cls) -> "PrivacyConfig":
        """Build a configuration pre-populated with common PII rules.

        Returns:
            A new ``PrivacyConfig`` whose rules cover email, SSN, phone,
            credit card, IPv4, name, address and date of birth, in that order.
        """
        email = PiiRule(
            name="email",
            patterns=[r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"],
            action="fake",
        )
        # Dashed (3-2-4) or nine bare digits.
        ssn = PiiRule(
            name="ssn",
            patterns=[r"^\d{3}-\d{2}-\d{4}$", r"^\d{9}$"],
            action="mask",
        )
        # Ten-digit numbers with optional separators, or "+"-prefixed numbers.
        phone = PiiRule(
            name="phone",
            patterns=[
                r"^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$",
                r"^\+\d{1,3}[-.\s]?\d{1,14}$",
            ],
            action="fake",
        )
        # 4-4-4-4 or 4-6-5 digit groupings.
        credit_card = PiiRule(
            name="credit_card",
            patterns=[
                r"^\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}$",
                r"^\d{4}[-\s]?\d{6}[-\s]?\d{5}$",
            ],
            action="mask",
        )
        ipv4 = PiiRule(
            name="ipv4",
            patterns=[
                r"^(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$"
            ],
            action="fake",
        )
        # These two rules carry no content patterns.
        name = PiiRule(name="name", patterns=[], action="fake")
        address = PiiRule(name="address", patterns=[], action="fake")
        # YYYY-MM-DD or DD/MM/YYYY-style digit layouts.
        date_of_birth = PiiRule(
            name="date_of_birth",
            patterns=[r"^\d{4}-\d{2}-\d{2}$", r"^\d{2}/\d{2}/\d{4}$"],
            action="mask",
        )

        ordered_rules = [
            email,
            ssn,
            phone,
            credit_card,
            ipv4,
            name,
            address,
            date_of_birth,
        ]
        return cls(rules=ordered_rules)

default classmethod

default() -> PrivacyConfig

Create a default privacy configuration with common PII rules.

Source code in syntho_hive/privacy/sanitizer.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
@classmethod
def default(cls) -> "PrivacyConfig":
    """Build a configuration pre-populated with common PII rules.

    Returns:
        A new instance whose rules cover email, SSN, phone, credit card,
        IPv4, name, address and date of birth, in that order.
    """
    # One entry per rule: (name, detection patterns, action). The list order
    # is the order of the resulting rules.
    rule_specs = [
        (
            "email",
            [r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"],
            "fake",
        ),
        (
            "ssn",
            [r"^\d{3}-\d{2}-\d{4}$", r"^\d{9}$"],
            "mask",
        ),
        (
            "phone",
            [
                r"^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$",
                r"^\+\d{1,3}[-.\s]?\d{1,14}$",
            ],
            "fake",
        ),
        (
            "credit_card",
            [
                r"^\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}$",
                r"^\d{4}[-\s]?\d{6}[-\s]?\d{5}$",
            ],
            "mask",
        ),
        (
            "ipv4",
            [
                r"^(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$"
            ],
            "fake",
        ),
        # These two rules carry no content patterns.
        ("name", [], "fake"),
        ("address", [], "fake"),
        (
            "date_of_birth",
            [r"^\d{4}-\d{2}-\d{2}$", r"^\d{2}/\d{2}/\d{4}$"],
            "mask",
        ),
    ]

    built_rules = []
    for rule_name, rule_patterns, rule_action in rule_specs:
        built_rules.append(
            PiiRule(name=rule_name, patterns=rule_patterns, action=rule_action)
        )
    return cls(rules=built_rules)

Faking Strategy

syntho_hive.privacy.faker_contextual.ContextualFaker

Context-aware PII generator leveraging Faker locales.

Source code in syntho_hive/privacy/faker_contextual.py
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
class ContextualFaker:
    """Context-aware PII generator leveraging Faker locales.

    Faker instances are created lazily per locale and cached, so repeated
    requests for the same locale reuse one instance.
    """

    # Short country code -> Faker locale code.
    LOCALE_MAP = {
        "JP": "ja_JP",
        "US": "en_US",
        "UK": "en_GB",
        "GB": "en_GB",
        "DE": "de_DE",
        "FR": "fr_FR",
        "CN": "zh_CN",
        "IN": "en_IN",
        # Add more as needed
    }

    # Row-context keys inspected, in priority order, to infer a locale.
    _CONTEXT_LOCALE_KEYS = ("country", "locale", "region")

    def __init__(self):
        """Initialize faker cache and logger."""
        self._fakers: Dict[str, Faker] = {}
        # Initialize default
        self._fakers["default"] = Faker()
        self.logger = logging.getLogger(__name__)

    def _get_faker(self, locale: Optional[str]) -> Faker:
        """Get or create a Faker instance for a locale.

        Args:
            locale: Optional locale string: either a country code from
                ``LOCALE_MAP`` (e.g., ``"JP"``) or one of the Faker locale
                codes it maps to (e.g., ``"ja_JP"``). Matching ignores case
                and surrounding whitespace. Unrecognised values fall back
                to ``"en_US"``.

        Returns:
            Faker instance configured for the requested locale, or the
            default instance when ``locale`` is empty.
        """
        if not locale:
            return self._fakers["default"]

        key = locale.strip()
        mapped_locale = self.LOCALE_MAP.get(key.upper())
        if mapped_locale is None:
            # Not a country code: accept a full locale code that the map
            # already knows (e.g. "ja_JP" or "ja-jp") instead of silently
            # turning it into en_US.
            normalized = key.replace("-", "_").lower()
            mapped_locale = next(
                (
                    code
                    for code in self.LOCALE_MAP.values()
                    if code.lower() == normalized
                ),
                "en_US",
            )

        if mapped_locale not in self._fakers:
            try:
                self._fakers[mapped_locale] = Faker(mapped_locale)
            except Exception as e:
                self.logger.warning(
                    "Could not load locale %s, falling back to default. Error: %s",
                    mapped_locale,
                    e,
                )
                # Cache the fallback so the failing locale is not retried.
                self._fakers[mapped_locale] = self._fakers["default"]

        return self._fakers[mapped_locale]

    def generate_pii(
        self, pii_type: str, context: Optional[Dict[str, Any]] = None, count: int = 1
    ) -> List[str]:
        """Generate PII values with optional contextual locale.

        Args:
            pii_type: Faker provider method name (e.g., ``"email"``) or one
                of the aliases handled by ``_generate_single_value``
                (e.g., ``"phone"``).
            context: Optional row context used to infer locale
                (``country``, ``locale``, then ``region`` keys).
            count: Number of values to generate.

        Returns:
            List of generated PII strings.
        """
        if context is None:
            context = {}

        # Use the first context key holding a non-empty string. Checking the
        # type per key keeps a missing value such as NaN (which is truthy)
        # under "country" from hiding a usable "locale" or "region".
        locale = None
        for key in self._CONTEXT_LOCALE_KEYS:
            candidate = context.get(key)
            if isinstance(candidate, str) and candidate:
                locale = candidate
                break

        fake = self._get_faker(locale)
        return [self._generate_single_value(fake, pii_type) for _ in range(count)]

    def _generate_single_value(self, fake: Faker, pii_type: str) -> str:
        """Generate a single PII value using a Faker instance.

        Returns ``"REDACTED"`` if generation raises, so one failing value
        does not abort a whole run.
        """
        try:
            if hasattr(fake, pii_type):
                # Dynamic method call on Faker instance
                return str(getattr(fake, pii_type)())

            # Custom mappings for common PII types if name mismatch or special logic
            if pii_type == "phone":
                return fake.phone_number()
            elif pii_type == "ip" or pii_type == "ipv4":
                return fake.ipv4()
            elif pii_type == "credit_card":
                return fake.credit_card_number()
            elif pii_type == "date_of_birth":
                return str(fake.date_of_birth())
            elif pii_type == "address":
                return fake.address()

            # Fallback
            return str(fake.text(max_nb_chars=20))
        except Exception as e:
            self.logger.error("Error generating %s: %s", pii_type, e)
            return "REDACTED"

    def process_dataframe(
        self, df: pd.DataFrame, pii_cols: Dict[str, str]
    ) -> pd.DataFrame:
        """Replace placeholders with generated PII in a dataframe.

        Args:
            df: Input dataframe containing placeholder columns. It is not
                modified.
            pii_cols: Mapping of column name to PII type (e.g., ``{"user_email": "email"}``).

        Returns:
            DataFrame with specified columns replaced by generated PII.
        """
        output_df = df.copy()

        # Use the same context keys as generate_pii so that a frame carrying
        # only a "region" column still gets locale-aware values.
        has_locale_context = any(
            key in df.columns for key in self._CONTEXT_LOCALE_KEYS
        )

        if not has_locale_context:
            # Fast path: no row context, so every value comes from the
            # default faker.
            fake = self._get_faker(None)
            for col, pii_type in pii_cols.items():
                output_df[col] = [
                    self._generate_single_value(fake, pii_type)
                    for _ in range(len(df))
                ]
        else:
            # Slow path: row-by-row for context awareness. Values are
            # collected positionally and assigned per column, which stays
            # correct when index labels repeat and does not depend on the
            # placeholder column's dtype.
            generated = {col: [] for col in pii_cols}
            for _, row in df.iterrows():
                context = row.to_dict()
                for col, pii_type in pii_cols.items():
                    generated[col].append(
                        self.generate_pii(pii_type, context=context, count=1)[0]
                    )
            for col, values in generated.items():
                output_df[col] = values

        return output_df

generate_pii

generate_pii(pii_type: str, context: Optional[Dict[str, Any]] = None, count: int = 1) -> List[str]

Generate PII values with optional contextual locale.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pii_type` | `str` | PII type: a Faker provider method name (e.g., "email") or a supported alias such as "phone". | required |
| `context` | `Optional[Dict[str, Any]]` | Optional row context used to infer locale (country/locale/region keys). | `None` |
| `count` | `int` | Number of values to generate. | `1` |

Returns:

| Type | Description |
| --- | --- |
| `List[str]` | List of generated PII strings. |

Source code in syntho_hive/privacy/faker_contextual.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def generate_pii(
    self, pii_type: str, context: Optional[Dict[str, Any]] = None, count: int = 1
) -> List[str]:
    """Generate PII values with optional contextual locale.

    Args:
        pii_type: PII type passed through to ``_generate_single_value``
            (e.g., ``"email"`` or ``"phone"``).
        context: Optional row context used to infer locale
            (``country``, ``locale``, then ``region`` keys).
        count: Number of values to generate.

    Returns:
        List of generated PII strings.
    """
    if context is None:
        context = {}

    # Use the first context key holding a non-empty string. Checking the
    # type per key keeps a missing value such as NaN (which is truthy) under
    # "country" from hiding a usable "locale" or "region".
    locale = None
    for key in ("country", "locale", "region"):
        candidate = context.get(key)
        if isinstance(candidate, str) and candidate:
            locale = candidate
            break

    fake = self._get_faker(locale)
    return [self._generate_single_value(fake, pii_type) for _ in range(count)]

process_dataframe

process_dataframe(df: DataFrame, pii_cols: Dict[str, str]) -> pd.DataFrame

Replace placeholders with generated PII in a dataframe.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `df` | `DataFrame` | Input dataframe containing placeholder columns. | required |
| `pii_cols` | `Dict[str, str]` | Mapping of column name to PII type (e.g., {"user_email": "email"}). | required |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | DataFrame with specified columns replaced by generated PII. |

Source code in syntho_hive/privacy/faker_contextual.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def process_dataframe(
    self, df: pd.DataFrame, pii_cols: Dict[str, str]
) -> pd.DataFrame:
    """Replace placeholders with generated PII in a dataframe.

    Args:
        df: Input dataframe containing placeholder columns. It is not modified.
        pii_cols: Mapping of column name to PII type (e.g., ``{"user_email": "email"}``).

    Returns:
        DataFrame with specified columns replaced by generated PII.
    """
    output_df = df.copy()

    # A row-by-row pass is only needed when the frame carries a column that
    # can supply a locale. "region" is included alongside "country" and
    # "locale" so that a region-only frame is not sent down the
    # context-free path.
    has_locale_context = any(
        key in df.columns for key in ("country", "locale", "region")
    )

    if not has_locale_context:
        # Fast path: no row context, so every value comes from the default
        # faker.
        fake = self._get_faker(None)
        for col, pii_type in pii_cols.items():
            output_df[col] = [
                self._generate_single_value(fake, pii_type) for _ in range(len(df))
            ]
    else:
        # Slow path: row-by-row for context awareness. Values are collected
        # positionally and assigned per column, which stays correct when
        # index labels repeat and does not depend on the placeholder
        # column's dtype.
        generated = {col: [] for col in pii_cols}
        for _, row in df.iterrows():
            context = row.to_dict()
            for col, pii_type in pii_cols.items():
                generated[col].append(
                    self.generate_pii(pii_type, context=context, count=1)[0]
                )
        for col, values in generated.items():
            output_df[col] = values

    return output_df