I am trying to use the speech-to-text API in a pytest test, but I am running into an issue. I am not sure whether it is caused by my configuration or by a misunderstanding on my part. Am I going about this correctly?
I am downloading wav files from here:
https://evolution.voxeo.com/library/audio/prompts/numbers/0.wav
Then I re-encode each file using this:
# Normalise the downloaded audio to 16 kHz, mono, 16-bit samples.
audio_segment = (
    AudioSegment.from_file(BytesIO(data))
    .set_frame_rate(16000)
    .set_channels(1)
    .set_sample_width(2)  # pydub sample width is in bytes: 2 bytes = 16 bits
)
# Export into an in-memory buffer instead of calling export() without a
# destination, which hands back an open file handle that was never closed.
wav_buffer = BytesIO()
audio_segment.export(wav_buffer, format="wav")
wav_data = wav_buffer.getvalue()
# Set up the RhasspyClient that the transcription request is sent through.
client = RhasspyClient("http://localhost", 12101, "/api")
The test then calls my transcribe_wav_file function:
# Send the audio through the client and wait for the raw reply body (a str).
result = await client.transcribe_wav_file(file)
# The reply is parsed as JSON; json.loads raises if the body is not valid JSON.
# NOTE(review): the argument here is `file`, not the `wav_data` produced by the
# re-encoding step — confirm the re-encoded bytes are what is being sent.
response_dict = json.loads(result)
# The transcription is read from the "text" field of the parsed reply.
response = response_dict["text"]
print(f"Transcribed text: {response}")
async def transcribe_wav_file(self, wav_data: bytes) -> str:
    """POST WAV audio to the speech-to-text endpoint and return the reply body.

    Args:
        wav_data: Audio bytes sent as the request body with
            ``Content-Type: audio/wav``.

    Returns:
        The response body decoded as text, exactly as the server sent it.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            status of 400 or higher.
    """
    url = f"{self.url}/speech-to-text"
    # NOTE(review): callers that json.loads() the result may need an
    # "Accept: application/json" request header — confirm against the
    # Rhasspy HTTP API documentation.
    headers = {"Content-Type": "audio/wav"}
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, data=wav_data) as response:
            # Fail loudly on HTTP errors instead of handing an error body to
            # the caller as if it were a transcription.
            response.raise_for_status()
            return await response.text()
The transcription should be “zero”, but I am getting “is the garage door open” back instead.
I made these changes to profile.json, but I am not sure if they are correct.
cat ~/.config/rhasspy/profiles/en/profile.json
{
"dialogue": {
"system": "rhasspy"
},
"microphone": {
"pyaudio": {
"device": "10"
},
"system": "pyaudio"
},
"sounds": {
"system": "aplay"
},
"speech_to_text": {
"kaldi": {
"speech_to_text.kaldi.open_transcription": true
},
"system": "kaldi"
},
"text_to_speech": {
"system": "nanotts"
},
"wake": {
"system": "porcupine"
}