# pipecat/tests/test_simple_text_aggregator.py

#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import unittest

from pipecat.utils.text.simple_text_aggregator import SimpleTextAggregator


class TestSimpleTextAggregator(unittest.IsolatedAsyncioTestCase):
    """Tests for a SimpleTextAggregator constructed with default arguments.

    Each test feeds text through ``aggregate()``, collects whatever
    aggregations are yielded, and then checks ``flush()`` / ``reset()`` /
    the ``text`` accessor for what is left in the buffer.
    """

    def setUp(self):
        """Create a fresh default aggregator for every test."""
        self.aggregator = SimpleTextAggregator()

    async def test_reset_aggregations(self):
        """Test that reset() clears text that is still buffered."""
        text = "Hello "
        results = [agg async for agg in self.aggregator.aggregate(text)]

        # No complete sentences yet
        assert len(results) == 0
        # NOTE(review): the input ends with a space but the buffer reads back
        # without it — presumably the `text` accessor strips whitespace; confirm.
        assert self.aggregator.text.text == "Hello"
        await self.aggregator.reset()
        assert self.aggregator.text.text == ""

    async def test_simple_sentence(self):
        """Test that a single sentence is only returned once flushed."""
        text = "Hello Pipecat!"
        results = [agg async for agg in self.aggregator.aggregate(text)]

        # No complete sentences yet (waiting for lookahead after "!")
        assert len(results) == 0

        # Flush to get the pending sentence
        aggregate = await self.aggregator.flush()
        assert aggregate.text == "Hello Pipecat!"
        assert aggregate.type == "sentence"
        assert self.aggregator.text.text == ""

    async def test_multiple_sentences(self):
        """Test that the first of two sentences is yielded and the second flushed."""
        text = "Hello Pipecat! How are you?"
        results = [agg async for agg in self.aggregator.aggregate(text)]

        # First sentence should be complete (lookahead from "H" confirmed it)
        assert len(results) == 1
        assert results[0].text == "Hello Pipecat!"

        # Flush to get the pending sentence
        result = await self.aggregator.flush()
        assert result.text == "How are you?"

    async def test_lookahead_decimal_number(self):
        """Test that $29.95 is not split at $29."""
        text = "Ask me for only $29.95/month."
        results = [agg async for agg in self.aggregator.aggregate(text)]

        # No complete sentences yet (waiting for lookahead after final ".")
        assert len(results) == 0

        # Can use flush() to get the pending sentence at end of stream
        result = await self.aggregator.flush()
        assert result.text == "Ask me for only $29.95/month."

    async def test_lookahead_abbreviation(self):
        """Test that Mr. Smith is not split at Mr."""
        text = "Hello Mr. Smith."
        results = [agg async for agg in self.aggregator.aggregate(text)]

        # No complete sentences yet (waiting for lookahead after final ".")
        assert len(results) == 0

        # Can use flush() to get the pending sentence at end of stream
        result = await self.aggregator.flush()
        assert result.text == "Hello Mr. Smith."

    async def test_lookahead_actual_sentence_end(self):
        """Test that a real sentence end is detected after lookahead."""
        text = "Hello world. Next sentence"
        results = [agg async for agg in self.aggregator.aggregate(text)]

        # First sentence should be complete (lookahead from "N" confirmed it)
        assert len(results) == 1
        assert results[0].text == "Hello world."

    async def test_flush_pending_sentence(self):
        """Test that flush() returns pending sentence waiting for lookahead."""
        text = "Hello world."
        results = [agg async for agg in self.aggregator.aggregate(text)]

        # No complete sentences yet (waiting for lookahead)
        assert len(results) == 0

        # Call flush to get it
        result = await self.aggregator.flush()
        assert result is not None
        assert result.text == "Hello world."
        # Flush again should return None (identity check: None is a singleton)
        assert await self.aggregator.flush() is None

    async def test_flush_with_no_pending(self):
        """Test that flush() returns any remaining text in buffer."""
        text = "Hello"
        results = [agg async for agg in self.aggregator.aggregate(text)]

        # No complete sentences
        assert len(results) == 0

        result = await self.aggregator.flush()
        # flush() now returns any remaining text, not just pending lookahead
        assert result is not None
        assert result.text == "Hello"
        # Buffer should be empty after flush
        assert self.aggregator.text.text == ""

    async def test_flush_after_lookahead_confirmed(self):
        """Test flush after lookahead has already confirmed sentence."""
        text = "Hello. W"
        results = [agg async for agg in self.aggregator.aggregate(text)]

        # First sentence should be complete (lookahead from "W" confirmed it)
        assert len(results) == 1
        assert results[0].text == "Hello."

        # flush() returns any remaining text (the "W" in this case)
        result = await self.aggregator.flush()
        assert result.text == "W"

    async def test_japanese_multiple_sentences(self):
        """Test that Japanese sentences are properly split during streaming."""
        text = "こんにちは。元気ですか？"
        results = [agg async for agg in self.aggregator.aggregate(text)]

        # First sentence detected when 元 arrives as lookahead after 。
        assert len(results) == 1
        assert results[0].text == "こんにちは。"

        # Flush returns the second sentence
        result = await self.aggregator.flush()
        assert result.text == "元気ですか？"

    async def test_japanese_sentence_with_lookahead(self):
        """Test that a Japanese sentence is detected with a lookahead character."""
        text = "こんにちは。元"
        results = [agg async for agg in self.aggregator.aggregate(text)]

        # 。 triggers lookahead, then 元 confirms it
        assert len(results) == 1
        assert results[0].text == "こんにちは。"

        # Flush returns remainder
        result = await self.aggregator.flush()
        assert result.text == "元"

    async def test_chinese_streaming_tokens(self):
        """Test Chinese text split across multiple streaming tokens."""
        tokens = ["你好", "世界", "。", "下一", "句话", "。"]
        all_results = []
        # setUp() already provides a fresh aggregator for this test, so the
        # tokens are streamed through it one aggregate() call at a time.
        for token in tokens:
            all_results.extend([agg async for agg in self.aggregator.aggregate(token)])

        # First sentence detected when 下 arrives after 。
        assert len(all_results) == 1
        assert all_results[0].text == "你好世界。"

        # Flush returns the second sentence
        result = await self.aggregator.flush()
        assert result.text == "下一句话。"

    async def test_japanese_single_sentence_flush(self):
        """Test that a single Japanese sentence with no lookahead flushes correctly."""
        text = "こんにちは。"
        results = [agg async for agg in self.aggregator.aggregate(text)]

        # No lookahead yet - waiting
        assert len(results) == 0

        # Flush returns the complete sentence
        result = await self.aggregator.flush()
        assert result.text == "こんにちは。"


class TestSimpleTextAggregatorTokenMode(unittest.IsolatedAsyncioTestCase):
    """Tests for a SimpleTextAggregator constructed with AggregationType.TOKEN."""

    def setUp(self):
        """Create a fresh TOKEN-mode aggregator for every test."""
        from pipecat.utils.text.base_text_aggregator import AggregationType

        self.aggregator = SimpleTextAggregator(aggregation_type=AggregationType.TOKEN)

    async def test_token_passthrough(self):
        """TOKEN mode yields text immediately without buffering."""
        results = [agg async for agg in self.aggregator.aggregate("Hello")]
        assert len(results) == 1
        assert results[0].text == "Hello"
        assert results[0].type == "token"

    async def test_token_multiple_calls(self):
        """Each aggregate call yields its text independently."""
        r1 = [agg async for agg in self.aggregator.aggregate("Hello ")]
        r2 = [agg async for agg in self.aggregator.aggregate("world.")]
        assert len(r1) == 1
        assert r1[0].text == "Hello "
        assert len(r2) == 1
        assert r2[0].text == "world."

    async def test_token_empty_text(self):
        """Empty text yields nothing."""
        results = [agg async for agg in self.aggregator.aggregate("")]
        assert len(results) == 0

    async def test_token_flush_returns_none(self):
        """Flush returns None in TOKEN mode since nothing is buffered."""
        # Drain the generator completely instead of pulling one item with
        # __anext__(): an abandoned, half-run async generator is only finalized
        # later by the event loop / garbage collector.
        results = [agg async for agg in self.aggregator.aggregate("Hello")]
        # The original __anext__() call would have raised if nothing was
        # yielded; keep that expectation explicit.
        assert len(results) == 1
        result = await self.aggregator.flush()
        assert result is None


# Run the unittest command-line runner when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()