Use majority language for Soniox transcripts
This commit is contained in:
@@ -8,6 +8,7 @@
|
||||
|
||||
import json
|
||||
import time
|
||||
from collections import Counter
|
||||
from collections.abc import AsyncGenerator
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
@@ -202,16 +203,21 @@ def _prepare_language_hints(
|
||||
|
||||
|
||||
def _language_from_tokens(tokens: list[dict]) -> Language | None:
    """Return the language that occurs most often across ``tokens``.

    Each token is a dict that may carry a ``"language"`` entry. Tokens with a
    missing or empty ``"language"`` value are ignored, as are values that
    ``Language(...)`` rejects with ``ValueError``.

    Args:
        tokens: Token dicts to inspect.

    Returns:
        The most frequent recognized ``Language``. On a tie, the language that
        was seen first wins, because ``Counter.most_common`` keeps
        first-encountered order for equal counts. ``None`` when no token
        carries a recognized language.
    """
    language_counts: Counter[Language] = Counter()

    for token in tokens:
        language = token.get("language")
        if not language:
            continue
        # Keep the try body minimal: only the enum conversion can raise.
        try:
            recognized = Language(language)
        except ValueError:
            # Unrecognized language code; it must not influence the vote.
            continue
        language_counts[recognized] += 1

    if not language_counts:
        return None

    return language_counts.most_common(1)[0][0]
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -34,25 +34,26 @@ def test_language_from_tokens_uses_single_recognized_language():
|
||||
assert _language_from_tokens(tokens) == Language.EN
|
||||
|
||||
|
||||
def test_language_from_tokens_uses_most_common_language():
    """The majority language wins even when the last token differs."""
    tokens = [
        {"text": "Ik", "language": "nl"},
        {"text": " zoek", "language": "nl"},
        {"text": " computer", "language": "en"},
    ]

    # Two "nl" tokens outvote the single trailing "en" token.
    assert _language_from_tokens(tokens) == Language.NL
|
||||
|
||||
|
||||
def test_language_from_tokens_skips_unknown_language():
    """Tokens with an unrecognized language code do not affect the result."""
    tokens = [
        {"text": " world", "language": "en"},
        {"text": "Hello", "language": "en"},
        {"text": "!", "language": "klingon"},
    ]

    # "klingon" is expected to be rejected, leaving "en" as the only candidate.
    assert _language_from_tokens(tokens) == Language.EN
|
||||
|
||||
|
||||
def test_language_from_tokens_skips_missing_latest_language():
|
||||
def test_language_from_tokens_skips_missing_language():
|
||||
tokens = [
|
||||
{"text": "Hello", "language": "en"},
|
||||
{"text": " wereld"},
|
||||
@@ -71,6 +72,15 @@ def test_language_from_tokens_ignores_unknown_and_missing_languages():
|
||||
assert _language_from_tokens(tokens) is None
|
||||
|
||||
|
||||
def test_language_from_tokens_uses_first_language_on_tie():
    """With equal counts, the language seen first is the one reported."""
    english_token = {"text": "Hello", "language": "en"}
    dutch_token = {"text": " wereld", "language": "nl"}
    tokens = [english_token, dutch_token]

    detected = _language_from_tokens(tokens)

    # One token per language: the earlier "en" token decides the tie.
    assert detected == Language.EN
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_receive_messages_sets_final_transcription_language(monkeypatch):
|
||||
service = SonioxSTTService(api_key="test-key")
|
||||
@@ -90,8 +100,9 @@ async def test_receive_messages_sets_final_transcription_language(monkeypatch):
|
||||
json.dumps(
|
||||
{
|
||||
"tokens": [
|
||||
{"text": "Hello", "is_final": True, "language": "en"},
|
||||
{"text": " world", "is_final": True, "language": "en"},
|
||||
{"text": "Ik", "is_final": True, "language": "nl"},
|
||||
{"text": " zoek", "is_final": True, "language": "nl"},
|
||||
{"text": " computer", "is_final": True, "language": "en"},
|
||||
{"text": END_TOKEN, "is_final": True},
|
||||
]
|
||||
}
|
||||
@@ -108,14 +119,15 @@ async def test_receive_messages_sets_final_transcription_language(monkeypatch):
|
||||
|
||||
final_frames = [frame for frame in pushed_frames if isinstance(frame, TranscriptionFrame)]
|
||||
assert len(final_frames) == 1
|
||||
assert final_frames[0].text == "Hello world"
|
||||
assert final_frames[0].language == Language.EN
|
||||
assert final_frames[0].text == "Ik zoek computer"
|
||||
assert final_frames[0].language == Language.NL
|
||||
assert final_frames[0].finalized is True
|
||||
assert final_frames[0].result == [
|
||||
{"text": "Hello", "is_final": True, "language": "en"},
|
||||
{"text": " world", "is_final": True, "language": "en"},
|
||||
{"text": "Ik", "is_final": True, "language": "nl"},
|
||||
{"text": " zoek", "is_final": True, "language": "nl"},
|
||||
{"text": " computer", "is_final": True, "language": "en"},
|
||||
]
|
||||
assert traced_transcriptions == [("Hello world", True, Language.EN)]
|
||||
assert traced_transcriptions == [("Ik zoek computer", True, Language.NL)]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
Reference in New Issue
Block a user