From ca35299dcd03f6283e6b78bf8303c4462115e7a0 Mon Sep 17 00:00:00 2001 From: "marc.torsoc" Date: Wed, 21 May 2025 12:08:53 +0200 Subject: [PATCH] add link cleaning and a test for it --- src/pipecat/utils/text/markdown_text_filter.py | 3 +++ tests/test_markdown_text_filter.py | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/pipecat/utils/text/markdown_text_filter.py b/src/pipecat/utils/text/markdown_text_filter.py index 6f5e16bd0..5ec960ad2 100644 --- a/src/pipecat/utils/text/markdown_text_filter.py +++ b/src/pipecat/utils/text/markdown_text_filter.py @@ -100,6 +100,9 @@ class MarkdownTextFilter(BaseTextFilter): # Restore leading and trailing spaces filtered_text = re.sub("§", " ", filtered_text) + # Make links more readable by stripping the http:// or https:// scheme + filtered_text = re.sub(r"https?://", "", filtered_text) + return filtered_text else: return text diff --git a/tests/test_markdown_text_filter.py b/tests/test_markdown_text_filter.py index a82a85811..d1cd79a4a 100644 --- a/tests/test_markdown_text_filter.py +++ b/tests/test_markdown_text_filter.py @@ -137,6 +137,18 @@ class TestMarkdownTextFilter(unittest.IsolatedAsyncioTestCase): result, expected, f"Newline handling failed for:\n{input_text}\nGot:\n{result}" ) + async def test_links_cleaning(self): + """Test cleaning of links and URLs, i.e. the http:// or https:// scheme prefix is removed.""" + test_cases = { + "Please check http://example.com": "Please check example.com", + "Visit https://www.google.com for more": "Visit www.google.com for more", + "No link here": "No link here", # No link to clean + } + + for input_text, expected in test_cases.items(): + result = await self.filter.filter(input_text) + self.assertEqual(result, expected, f"Link cleaning failed for: '{input_text}'") + async def test_numbered_list_marker_handling(self): """Test handling of numbered lists with the special §NUM§ marker.""" test_cases = {