From 5e41ac23b13ea596689f92919d8062305826ff41 Mon Sep 17 00:00:00 2001 From: Edward Date: Mon, 17 Nov 2025 15:04:28 -0600 Subject: [PATCH 1/8] Add domain cleaner --- src/vcr_cleaner/cleaners/uri.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/vcr_cleaner/cleaners/uri.py b/src/vcr_cleaner/cleaners/uri.py index db857be..bc84e3b 100644 --- a/src/vcr_cleaner/cleaners/uri.py +++ b/src/vcr_cleaner/cleaners/uri.py @@ -16,3 +16,29 @@ def clean_uri(request: dict, response: dict): clean_uri.__doc__ = f"Replaces the request URI string with all " \ f"occurrences of substring '{old}' replaced by '{new}'." return clean_uri + + +def _clean_dict_hostnames(message: dict, rule: str, replacement: str): + '''Update the dictionary with rule matches replaced.''' + + cleaned = re.sub(rule, replacement, json.dumps(message)) + + # Update the original dict + message.clear() + message.update(json.loads(cleaned)) + + +def clean_domains(domain: str, replacement: str='cleaned.example.edu'): + '''Replace anything that looks like the given domain.''' + + rule = f"/[^/]+{ domain.replace('.', '\.') }" + rep = f"/{ replacement }" + + def wrapper(request: dict, response: dict): + _clean_dict_hostnames(request, rule, rep) + _clean_dict_hostnames(response, rule, rep) + + wrapper.__doc__ = clean_domains.__doc__ + + return wrapper + From b5e5873c34af33c0fb859760f1e9c39c5abd5f08 Mon Sep 17 00:00:00 2001 From: Edward Date: Mon, 17 Nov 2025 15:18:21 -0600 Subject: [PATCH 2/8] WIP - Add TODO --- src/vcr_cleaner/cleaners/uri.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vcr_cleaner/cleaners/uri.py b/src/vcr_cleaner/cleaners/uri.py index bc84e3b..14ad354 100644 --- a/src/vcr_cleaner/cleaners/uri.py +++ b/src/vcr_cleaner/cleaners/uri.py @@ -30,6 +30,7 @@ def _clean_dict_hostnames(message: dict, rule: str, replacement: str): def clean_domains(domain: str, replacement: str='cleaned.example.edu'): '''Replace anything that looks like the given domain.''' + # TODO: Add a unit test for this. - Confirm it can handle top level domain and sub-domain. rule = f"/[^/]+{ domain.replace('.', '\.') }" rep = f"/{ replacement }" From 4f1d4b270650ea97d10ba177bbf71f3de07e709b Mon Sep 17 00:00:00 2001 From: Edward Date: Mon, 17 Nov 2025 15:26:26 -0600 Subject: [PATCH 3/8] Fixup --- src/vcr_cleaner/cleaners/uri.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/vcr_cleaner/cleaners/uri.py b/src/vcr_cleaner/cleaners/uri.py index 14ad354..f0ba5f2 100644 --- a/src/vcr_cleaner/cleaners/uri.py +++ b/src/vcr_cleaner/cleaners/uri.py @@ -28,18 +28,17 @@ def _clean_dict_hostnames(message: dict, rule: str, replacement: str): message.update(json.loads(cleaned)) -def clean_domains(domain: str, replacement: str='cleaned.example.edu'): +def clean_domains(domain: str, replacement: str = 'cleaned.example.edu'): '''Replace anything that looks like the given domain.''' # TODO: Add a unit test for this. - Confirm it can handle top level domain and sub-domain. - rule = f"/[^/]+{ domain.replace('.', '\.') }" - rep = f"/{ replacement }" + rule = f"/[^/]+{domain.replace('.', '\.')}" + rep = f"/{replacement}" def wrapper(request: dict, response: dict): - _clean_dict_hostnames(request, rule, rep) - _clean_dict_hostnames(response, rule, rep) + _clean_dict_hostnames(request, rule, rep) + _clean_dict_hostnames(response, rule, rep) wrapper.__doc__ = clean_domains.__doc__ return wrapper - From 6fdc2b995d029576e09b7a42b054f64100d4067e Mon Sep 17 00:00:00 2001 From: Edward Date: Mon, 17 Nov 2025 15:28:43 -0600 Subject: [PATCH 4/8] Fixup --- src/vcr_cleaner/cleaners/uri.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vcr_cleaner/cleaners/uri.py b/src/vcr_cleaner/cleaners/uri.py index f0ba5f2..b99f86a 100644 --- a/src/vcr_cleaner/cleaners/uri.py +++ b/src/vcr_cleaner/cleaners/uri.py @@ -32,7 +32,7 @@ def clean_domains(domain: str, replacement: str = 'cleaned.example.edu'): '''Replace anything that looks like the given domain.''' # TODO: Add a unit test for this. - Confirm it can handle top level domain and sub-domain. - rule = f"/[^/]+{domain.replace('.', '\.')}" + rule = f"/[^/]+{domain.replace('.', r'\.')}" rep = f"/{replacement}" def wrapper(request: dict, response: dict): From 570397ca20c237a9e48b0d08ccf25fd90fdc2b7e Mon Sep 17 00:00:00 2001 From: Edward Date: Mon, 17 Nov 2025 15:30:27 -0600 Subject: [PATCH 5/8] Fixup --- src/vcr_cleaner/cleaners/uri.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/vcr_cleaner/cleaners/uri.py b/src/vcr_cleaner/cleaners/uri.py index b99f86a..570f883 100644 --- a/src/vcr_cleaner/cleaners/uri.py +++ b/src/vcr_cleaner/cleaners/uri.py @@ -1,3 +1,7 @@ +import json +import re + + def clean_uri(old: str, new: str): """Returns a cleaner function that replaces the request URI string with all occurrences of substring old replaced by new. @@ -20,7 +24,6 @@ def clean_uri(request: dict, response: dict): def _clean_dict_hostnames(message: dict, rule: str, replacement: str): '''Update the dictionary with rule matches replaced.''' - cleaned = re.sub(rule, replacement, json.dumps(message)) # Update the original dict @@ -30,7 +33,8 @@ def _clean_dict_hostnames(message: dict, rule: str, replacement: str): def clean_domains(domain: str, replacement: str = 'cleaned.example.edu'): '''Replace anything that looks like the given domain.''' - # TODO: Add a unit test for this. - Confirm it can handle top level domain and sub-domain. + # TODO: Add a unit test for this. + # - Confirm it can handle top level domain and sub-domain. rule = f"/[^/]+{domain.replace('.', r'\.')}" rep = f"/{replacement}" From 7c9fbe404739110e4ff4449ca983cba99a34aa77 Mon Sep 17 00:00:00 2001 From: Edward Date: Mon, 17 Nov 2025 15:50:10 -0600 Subject: [PATCH 6/8] Add tests for clean_domain --- src/vcr_cleaner/cleaners/uri.py | 4 +--- tests/test_uri.py | 29 ++++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/vcr_cleaner/cleaners/uri.py b/src/vcr_cleaner/cleaners/uri.py index 570f883..9967caa 100644 --- a/src/vcr_cleaner/cleaners/uri.py +++ b/src/vcr_cleaner/cleaners/uri.py @@ -33,10 +33,8 @@ def _clean_dict_hostnames(message: dict, rule: str, replacement: str): def clean_domains(domain: str, replacement: str = 'cleaned.example.edu'): '''Replace anything that looks like the given domain.''' - # TODO: Add a unit test for this. - # - Confirm it can handle top level domain and sub-domain. - rule = f"/[^/]+{domain.replace('.', r'\.')}" + rule = f"/[^/]*{domain.replace('.', r'\.')}" rep = f"/{replacement}" def wrapper(request: dict, response: dict): diff --git a/tests/test_uri.py b/tests/test_uri.py index b64fbc5..f288264 100644 --- a/tests/test_uri.py +++ b/tests/test_uri.py @@ -1,4 +1,7 @@ -from vcr_cleaner.cleaners.uri import clean_uri +from vcr_cleaner.cleaners.uri import ( + clean_domains, + clean_uri, +) def test_simple_clean_uri(): @@ -13,3 +16,27 @@ def test_simple_clean_uri(): assert str(cleaner.__doc__) != 'None' assert 'example' in str(cleaner.__doc__) assert 'foo' in str(cleaner.__doc__) + + +def test_clean_domain(): + request = { + 'sub-domain': 'https://foo.illinois.edu', + 'sub-sub-domain': 'https://foo.bar.illinois.edu', + 'uri': 'https://illinois.edu', + 'insecure': 'http://illinois.edu', + 'essay': 'Lorum ipsum https://illinois.edu, and so on...', + } + response = request.copy() + + cleaner = clean_domains('illinois.edu') + cleaner(request, response) + + assert str(cleaner.__doc__) != 'None' + + for key in ['uri', 'sub-domain', 'sub-sub-domain']: + assert request[key] == 'https://cleaned.example.edu' + + assert request['insecure'] == 'http://cleaned.example.edu' + assert 'illinois.edu' not in request['essay'] + + assert request == response From fa873dfcf9baea467e0da5cae0ae7eef96a1fa10 Mon Sep 17 00:00:00 2001 From: Edward Date: Mon, 17 Nov 2025 15:50:53 -0600 Subject: [PATCH 7/8] Fixup --- src/vcr_cleaner/cleaners/uri.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vcr_cleaner/cleaners/uri.py b/src/vcr_cleaner/cleaners/uri.py index 9967caa..e2695ed 100644 --- a/src/vcr_cleaner/cleaners/uri.py +++ b/src/vcr_cleaner/cleaners/uri.py @@ -33,7 +33,6 @@ def _clean_dict_hostnames(message: dict, rule: str, replacement: str): def clean_domains(domain: str, replacement: str = 'cleaned.example.edu'): '''Replace anything that looks like the given domain.''' - rule = f"/[^/]*{domain.replace('.', r'\.')}" rep = f"/{replacement}" From 9621796b054e3dd6f5dc6c78dd23a1e74d74b47b Mon Sep 17 00:00:00 2001 From: Edward Date: Mon, 17 Nov 2025 15:55:20 -0600 Subject: [PATCH 8/8] Fixup; --- src/vcr_cleaner/cleaners/uri.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/vcr_cleaner/cleaners/uri.py b/src/vcr_cleaner/cleaners/uri.py index e2695ed..a6aac07 100644 --- a/src/vcr_cleaner/cleaners/uri.py +++ b/src/vcr_cleaner/cleaners/uri.py @@ -22,7 +22,7 @@ def clean_uri(request: dict, response: dict): return clean_uri -def _clean_dict_hostnames(message: dict, rule: str, replacement: str): +def _regex_sub_dict(message: dict, rule: str, replacement: str): '''Update the dictionary with rule matches replaced.''' cleaned = re.sub(rule, replacement, json.dumps(message)) @@ -32,13 +32,13 @@ def _clean_dict_hostnames(message: dict, rule: str, replacement: str): def clean_domains(domain: str, replacement: str = 'cleaned.example.edu'): - '''Replace anything that looks like the given domain.''' + '''Replace anything that looks like the given domain and sub-domains.''' rule = f"/[^/]*{domain.replace('.', r'\.')}" rep = f"/{replacement}" def wrapper(request: dict, response: dict): - _clean_dict_hostnames(request, rule, rep) - _clean_dict_hostnames(response, rule, rep) + _regex_sub_dict(request, rule, rep) + _regex_sub_dict(response, rule, rep) wrapper.__doc__ = clean_domains.__doc__