Skip to main content

flow_security/
sanitizer.rs

1use regex::Regex;
2use std::collections::HashMap;
3
// Base placeholder tokens, one per detected entity kind. They are never emitted
// verbatim: `replace_with_placeholder` strips the trailing `]`, appends `_<n>`
// and re-closes the bracket, so the text ends up with e.g. `[IP_3]`.

/// Base token for redacted host names.
const PLACEHOLDER_HOSTNAME: &str = "[HOSTNAME]";
/// Base token for redacted dataset names.
const PLACEHOLDER_DATASET: &str = "[DATASET_NAME]";
/// Base token for redacted IP addresses.
const PLACEHOLDER_IP: &str = "[IP]";
/// Base token for redacted credential assignments.
const PLACEHOLDER_CRED: &str = "[CRED]";
8
/// Lookup table from each placeholder token emitted during sanitization back
/// to the original text it replaced.
#[derive(Debug, Clone, Default)]
pub struct RehydrationMap {
    /// Placeholder token (e.g. `[IP_3]`) -> original matched text.
    ///
    /// Keys are expected to be bracket-delimited tokens with no `[` or `]`
    /// between the outer brackets, which is what the sanitizer generates.
    entries: HashMap<String, String>,
}

impl RehydrationMap {
    /// Returns `text` with every known placeholder token replaced by the
    /// original value it stands for.
    ///
    /// The substitution is a single left-to-right pass: restored originals are
    /// appended to the output and never re-scanned. An original value that
    /// happens to contain something shaped like another placeholder is
    /// therefore restored verbatim, and the result does not depend on the
    /// map's iteration order.
    ///
    /// Bracketed tokens that are not in the map, and unbalanced brackets, are
    /// copied through unchanged.
    pub fn rehydrate(&self, text: &str) -> String {
        let mut out = String::with_capacity(text.len());
        let mut rest = text;

        while let Some(open) = rest.find('[') {
            // Everything before the bracket is plain text.
            out.push_str(&rest[..open]);
            let candidate = &rest[open..];

            // Find whichever bracket follows the opening one. `[` and `]` are
            // ASCII, so all byte offsets used below fall on char boundaries.
            match candidate[1..].find(|c: char| c == '[' || c == ']') {
                None => {
                    // Unterminated `[`: the remainder is literal text.
                    rest = candidate;
                    break;
                }
                Some(offset) => {
                    let end = offset + 1; // index of that bracket in `candidate`
                    if candidate.as_bytes()[end] == b']' {
                        let token = &candidate[..=end];
                        match self.entries.get(token) {
                            Some(original) => out.push_str(original),
                            None => out.push_str(token),
                        }
                        rest = &candidate[end + 1..];
                    } else {
                        // Another `[` came first, so the first one was literal;
                        // resume scanning at the inner bracket.
                        out.push_str(&candidate[..end]);
                        rest = &candidate[end..];
                    }
                }
            }
        }

        out.push_str(rest);
        out
    }

    /// Returns `true` when no placeholder mappings are recorded.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }
}
27
/// Result of a sanitization pass: the redacted text together with the map
/// needed to restore the original values.
#[derive(Debug, Clone)]
pub struct SanitizedInput {
    /// Input text with every detected entity replaced by a numbered placeholder.
    pub text: String,
    /// Placeholder -> original mapping for the replacements made in `text`.
    pub map: RehydrationMap,
}
33
/// Regex-based redactor that swaps sensitive-looking substrings for numbered
/// placeholders. Holds one pre-compiled pattern per entity kind.
pub struct PiiSanitizer {
    /// Dotted quads of 1-3 digit groups (octet ranges are not validated).
    ip_re: Regex,
    /// Dot-separated labels ending in an alphabetic label of at least two letters.
    hostname_re: Regex,
    /// Upper-case, dot-qualified names with qualifiers of at most 8 characters.
    /// NOTE(review): presumably mainframe dataset names (the tests use a JCL
    /// `DSN=` line) - confirm.
    dataset_re: Regex,
    /// `key = value` / `key: value` pairs whose key is a password-, secret- or
    /// token-like word (case-insensitive); the whole pair is matched.
    cred_re: Regex,
}
40
41impl Default for PiiSanitizer {
42    fn default() -> Self {
43        Self::new()
44    }
45}
46
47impl PiiSanitizer {
48    pub fn new() -> Self {
49        Self {
50            ip_re: Regex::new(r"\b\d{1,3}(?:\.\d{1,3}){3}\b").unwrap(),
51            hostname_re: Regex::new(
52                r"\b(?:[A-Za-z0-9][A-Za-z0-9\-]*\.)+[A-Za-z]{2,}\b",
53            )
54            .unwrap(),
55            dataset_re: Regex::new(
56                r"\b[A-Z][A-Z0-9$#@]{0,7}(?:\.[A-Z][A-Z0-9$#@]{0,7}){1,21}\b",
57            )
58            .unwrap(),
59            cred_re: Regex::new(
60                r"(?i)\b(?:password|passwd|pwd|secret|token|apikey|api_key|authorization)\s*[:=]\s*\S+",
61            )
62            .unwrap(),
63        }
64    }
65
66    pub fn sanitize(&self, input: &str) -> SanitizedInput {
67        let mut entries: HashMap<String, String> = HashMap::new();
68        let mut counter = 0u32;
69
70        let mut text = input.to_string();
71        text = replace_with_placeholder(
72            &self.cred_re,
73            &text,
74            PLACEHOLDER_CRED,
75            &mut counter,
76            &mut entries,
77        );
78        text = replace_with_placeholder(
79            &self.ip_re,
80            &text,
81            PLACEHOLDER_IP,
82            &mut counter,
83            &mut entries,
84        );
85        // Datasets (all-caps qualified names) before hostnames so they don't get mis-classified.
86        text = replace_with_placeholder(
87            &self.dataset_re,
88            &text,
89            PLACEHOLDER_DATASET,
90            &mut counter,
91            &mut entries,
92        );
93        text = replace_with_placeholder(
94            &self.hostname_re,
95            &text,
96            PLACEHOLDER_HOSTNAME,
97            &mut counter,
98            &mut entries,
99        );
100
101        SanitizedInput {
102            text,
103            map: RehydrationMap { entries },
104        }
105    }
106}
107
108fn replace_with_placeholder(
109    re: &Regex,
110    text: &str,
111    base_placeholder: &str,
112    counter: &mut u32,
113    entries: &mut HashMap<String, String>,
114) -> String {
115    let mut out = String::with_capacity(text.len());
116    let mut last = 0;
117    for m in re.find_iter(text) {
118        out.push_str(&text[last..m.start()]);
119        *counter += 1;
120        let placeholder = format!("{}_{}", base_placeholder.trim_end_matches(']'), counter);
121        let placeholder = format!("{}]", placeholder);
122        entries.insert(placeholder.clone(), m.as_str().to_string());
123        out.push_str(&placeholder);
124        last = m.end();
125    }
126    out.push_str(&text[last..]);
127    out
128}
129
#[cfg(test)]
mod tests {
    use super::*;

    /// An IPv4-looking token is removed and replaced by an `[IP_n]` placeholder.
    #[test]
    fn redacts_ip_address() {
        let s = PiiSanitizer::new();
        let r = s.sanitize("connect 10.20.30.40 now");
        assert!(!r.text.contains("10.20.30.40"));
        assert!(r.text.contains("[IP_"));
    }

    /// An upper-case qualified dataset name is replaced by `[DATASET_NAME_n]`.
    #[test]
    fn redacts_dataset_name() {
        let s = PiiSanitizer::new();
        let r = s.sanitize("//SYSIN DD DSN=USER01.PROD.DATA,DISP=SHR");
        assert!(!r.text.contains("USER01.PROD.DATA"));
        assert!(r.text.contains("[DATASET_NAME_"));
    }

    /// A `key=value` credential is removed and a `[CRED_n]` placeholder is left.
    #[test]
    fn redacts_credential_pattern() {
        let s = PiiSanitizer::new();
        let r = s.sanitize("password=hunter2");
        assert!(!r.text.contains("hunter2"));
        // Redaction must leave a placeholder behind, not just drop the secret.
        assert!(r.text.contains("[CRED_"));
    }

    /// Rehydrating sanitized text yields the original input.
    #[test]
    fn rehydrate_restores_originals() {
        let s = PiiSanitizer::new();
        let r = s.sanitize("server 10.0.0.1 down");
        let back = r.map.rehydrate(&r.text);
        assert_eq!(back, "server 10.0.0.1 down");
    }

    /// Host name, IP and dataset in one input are all redacted and all restored.
    #[test]
    fn rehydrate_round_trips_mixed_entities() {
        let s = PiiSanitizer::new();
        let input = "login to db01.example.com (10.0.0.1) and read USER01.PROD.DATA";
        let r = s.sanitize(input);
        assert!(!r.text.contains("db01.example.com"));
        assert!(!r.text.contains("10.0.0.1"));
        assert!(!r.text.contains("USER01.PROD.DATA"));
        assert_eq!(r.map.rehydrate(&r.text), input);
    }

    /// Empty input produces empty text and an empty rehydration map.
    #[test]
    fn empty_input_is_empty_output() {
        let s = PiiSanitizer::new();
        let r = s.sanitize("");
        assert_eq!(r.text, "");
        assert!(r.map.is_empty());
    }
}