one morning later… english speech to text, then translated to spanish — in realtime & offline

we live in the future!! this is some crazy stuff

here’s a short/random video demo (needs audio on)


The translation engine really wants proper grammar (e.g. capitalization and punctuation), and the transcription engine really doesn’t care about all that nonsense.

So now I’m working through grammar checkers.

attempts at grammar fixing

I found a promising library that unfortunately turned out to be just a wrapper around an API service, i.e. it doesn’t function offline T^T. It does work quite well, though.
The other two options: language_tool_python, which is a wrapper around the grammar/spell check that libreoffice uses. It’s more rules-based. And then some random transformers model.

Here are the results. The random transformers model doesn’t perform as well as gingerit (the company-backed fix) and is rather … unpredictable. The language tool doesn’t perform great either, but it’s relatively fast. And then we have gingerit, which is slow purely b/c it’s an online model (so it’s not a fair comparison on accuracy).

I also show how fixing the grammar/spelling matters when the text is then put through argostranslate.

grammar fix tool comparison

Timing Code for above

import time
from funcy import print_durations
from gingerit.gingerit import GingerIt
import language_tool_python
from happytransformer import HappyTextToText
from happytransformer import TTSettings

import argostranslate.package
import argostranslate.translate

def trans(txt):
    """Translate English text to Spanish using Argos Translate.

    Args:
        txt: The English text to translate.

    Returns:
        Whatever ``argostranslate.translate.translate`` returns for the
        'en' -> 'es' language pair.
    """
    source_lang, target_lang = 'en', 'es'
    return argostranslate.translate.translate(txt, source_lang, target_lang)

# The three grammar-correction back ends being compared.
parser = GingerIt()
tool = language_tool_python.LanguageTool('en-US')
happy_tt = HappyTextToText("T5", "prithivida/grammar_error_correcter_v1")
settings = TTSettings(do_sample=True, top_k=10, temperature=0.5, min_length=1, max_length=100)

txt1 = 'i have a cat in my pants'
txt2 = 'hi hows it going'

for i, txt in enumerate([txt1, txt2]):
    print('Text to fix: ', txt)
    print('Translation w/o grammar fix: ', trans(txt))

    # Run each corrector exactly once per sample and reuse the output for
    # both the "fixed" column and the translation column. The T5 settings
    # use do_sample=True, so a second generate call could return different
    # text than the one displayed.
    # The translation of the corrected text is only shown for the second
    # sample (i == 1), as in the original comparison.
    ginger_fixed = parser.parse(txt)['result']
    print('\tgingerit\t\t', ginger_fixed, '\t',
          trans(ginger_fixed) if i == 1 else '')

    lt_fixed = tool.correct(txt)
    print('\tlanguage_tool_python\t', lt_fixed, '\t',
          trans(lt_fixed) if i == 1 else '')

    t5_fixed = happy_tt.generate_text(txt, args=settings).text
    print('\tt5 transformer\t\t', t5_fixed, '\t',
          trans(t5_fixed) if i == 1 else '')

    # Time 20 corrections per tool; `_` avoids clobbering the outer index.
    with print_durations('Timing gingerit'):
        for _ in range(20):
            parser.parse(txt)['result']

    with print_durations('Timing language_tool_python'):
        for _ in range(20):
            tool.correct(txt)

    with print_durations('Timing t5 transformer'):
        for _ in range(20):
            happy_tt.generate_text(txt, args=settings).text

code for the video demo above (just mashed up from the argostranslate and vosk-api readmes)

# 12 May 2023
# nrobot

import sys
import argostranslate.package
import argostranslate.translate
import queue
import json

import sounddevice as sd
import wave
from vosk import Model, KaldiRecognizer, SetLogLevel

# Language pair used for translation: source code first, target code second.
from_code, to_code = "en", "es"

def setup_trans():
    """Download and install the Argos Translate package for the language pair.

    Uses the module-level ``from_code`` / ``to_code`` to pick the package.
    Needs network access the first time; afterwards translation can run
    offline.

    Raises:
        LookupError: If the package index has no entry for the language pair.
    """
    # Download and install Argos Translate package
    argostranslate.package.update_package_index()
    available_packages = argostranslate.package.get_available_packages()
    package_to_install = next(
        (
            pkg
            for pkg in available_packages
            if pkg.from_code == from_code and pkg.to_code == to_code
        ),
        None,
    )
    if package_to_install is None:
        raise LookupError(
            f"No Argos Translate package available for {from_code} -> {to_code}"
        )
    argostranslate.package.install_from_path(package_to_install.download())

def transl(phrase):
    """Translate ``phrase`` between the module-level language pair.

    Args:
        phrase: Text in the ``from_code`` language.

    Returns:
        The result of ``argostranslate.translate.translate`` for
        ``from_code`` -> ``to_code``.
    """
    translated = argostranslate.translate.translate(phrase, from_code, to_code)
    return translated

def setup_transcribe():
    """Set the Vosk log level before any recognizer is created."""
    # You can set log level to -1 to disable debug messages
    # NOTE(review): the function body was missing in the source; this call
    # follows the Vosk example the comment above comes from — confirm the
    # intended level.
    SetLogLevel(0)

if __name__ == '__main__':
    # NOTE(review): several lines of this script were missing in the source
    # (file open call, loop exits, queue feed, `try:`). They are restored
    # here following the vosk-api example scripts — confirm against the
    # original. Also assumes the Argos en -> es package is already installed.

    # One model is enough for both passes below.
    model = Model(lang="en-us")

    # ---- Pass 1: transcribe a WAV file ----
    with wave.open('test.wav', "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            sys.exit(1)

        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                # End of file.
                break
            if rec.AcceptWaveform(data):
                print(rec.Result())

    # Fetch the final result once so the printed and parsed values match.
    result = rec.FinalResult()
    print('final result' , result)
    res = json.loads(result)

    # ---- Pass 2: live microphone transcription + translation ----
    q = queue.Queue()

    def callback(indata, frames, time, status):
        """This is called (from a separate thread) for each audio block."""
        if status:
            print(status, file=sys.stderr)
        # Hand the raw audio to the main loop; without this q.get() below
        # would block forever.
        q.put(bytes(indata))

    device = None
    device_info = sd.query_devices(device, 'input')
    # soundfile expects an int, sounddevice provides a float:
    samplerate = int(device_info['default_samplerate'])

    try:
        with sd.RawInputStream(samplerate=samplerate, blocksize=8000, device=device,
                               dtype='int16', channels=1, callback=callback):
            print('#' * 80)
            print('Press Ctrl+C to stop the recording')
            print('#' * 80)
            print(f'Samplerate: {samplerate}, device: {device}')

            # Fresh recognizer at the microphone's sample rate.
            rec = KaldiRecognizer(model, samplerate)

            while True:
                data = q.get()
                # AcceptWaveform is used as the "full sentence ready" signal;
                # only then is the text extracted and translated.
                if rec.AcceptWaveform(data):
                    sentence = json.loads(rec.Result())['text']
                    print('\t !------ \n')
                    print('sentence: ', sentence)
                    print('translation: ', transl(sentence))
                    print('listening for input again')

    except KeyboardInterrupt:
        # Ctrl+C is the documented way to stop recording; exit quietly.
        print('\nDone')
    except Exception as e:
        print('Exception: ', e)