Sounddevice amplitude different to system?

rolyan_trauts · December 3, 2022, 4:14am

I have started working on KWS (keyword spotting) again, as I guess one might be needed.
Could anyone test this? I may be misremembering, but I am confused: the input level on my computer in this setup seems out of whack with what sounddevice delivers.
After some head scratching and looking at the inputs, I have had to apply a gain of x10?!?

The Marvin tflite model is here, but really the question is just about gain: has something happened with Ubuntu or sounddevice?

The script I was starting with is:

import tensorflow as tf
import sounddevice as sd
import numpy as np
import threading
  
def sd_callback(rec, frames, time, status):
    """Run one streaming inference step on a captured audio block.

    Used as the ``callback`` of the ``sd.InputStream`` created at the
    bottom of the script. The block is reshaped to
    ``(1, int(sample_rate * rec_duration))``, scaled by the module-level
    ``gain`` and fed to ``interpreter1`` together with the state tensors
    kept in ``inputs1``. After inference the state tensors are read back
    into ``inputs1`` for the next call.

    Args:
        rec: Samples for the current block, as handed over by the stream.
        frames: Unused.
        time: Unused.
        status: Stream status; printed when truthy.

    Side effects:
        Updates the module-level ``max_rec``, ``kw_hit`` and ``kw_count``,
        overwrites entries of ``inputs1`` and prints detection messages.
    """
    global max_rec, kw_hit, kw_count

    # Notify if errors
    if status:
        print('Error:', status)

    # Shape the block as a single-row batch and apply the software gain.
    block_len = int(sample_rate * rec_duration)
    audio = np.multiply(np.reshape(rec, (1, block_len)), gain)

    # Tensor 0 receives the audio; tensors 1.. receive the saved states.
    interpreter1.set_tensor(input_details1[0]['index'], audio)
    state_slots = range(1, len(input_details1))
    for slot in state_slots:
        interpreter1.set_tensor(input_details1[slot]['index'], inputs1[slot])

    interpreter1.invoke()
    scores = interpreter1.get_tensor(output_details1[0]['index'])

    # Carry the output states over as the input states of the next
    # inference cycle. `get_tensor()` returns a copy of the tensor data;
    # `tensor()` would give a pointer to the tensor instead.
    for slot in state_slots:
        inputs1[slot] = interpreter1.get_tensor(output_details1[slot]['index'])

    # Track the loudest (post-gain) sample seen so far.
    lvl = np.max(np.abs(audio))
    max_rec = max(max_rec, lvl)

    # Score 0 is reported as "Marvin" below.
    marvin_score = scores[0][0]
    if marvin_score > 0.95:
        kw_hit = True
        kw_count += 1
        print("Marvin:", marvin_score, lvl)
        return

    # NOTE(review): score 1 is presumably a non-keyword/silence class --
    # confirm against the model's label order.
    if scores[0][1] > 0.90:
        if kw_hit:
            # A run of keyword blocks just ended: report and reset the peak.
            print('Max lvl:', max_rec)
            kw_hit = False
            max_rec = 0.0
            if kw_count > 60:
                print('Hello Marvin', kw_count)
        kw_count = 0

        
# Audio capture parameters
sample_rate = 16000     # samples per second requested from the input stream
rec_duration = 0.020    # seconds of audio per block (sets the stream blocksize)
num_channels = 1

# sounddevice module-wide defaults, given as (input, output) pairs
sd.default.dtype = ('float32', 'float32')
sd.default.latency = ('high', 'high')

# Software gain applied to every captured block before inference
gain = 10.0

# Detection state updated by sd_callback
max_rec = 0.0    # largest absolute (post-gain) sample seen since the last reset
kw_hit = False   # True while keyword blocks are being detected
kw_count = 0     # number of keyword blocks counted in the current run

# Load the TFLite model and allocate tensors.
interpreter1 = tf.lite.Interpreter(
    model_path="../GoogleKWS/models2/crnn/tflite_stream_state_external/stream_state_external.tflite"
)
#interpreter1 = tf.lite.Interpreter(model_path="../GoogleKWS/models2/bc_resnet_2/tflite_stream_state_external/stream_state_external.tflite")
interpreter1.allocate_tensors()

# Get input and output tensors.
input_details1 = interpreter1.get_input_details()
output_details1 = interpreter1.get_output_details()

# One zero-filled float32 buffer per model input, shaped like that input.
# Entries 1.. hold the streaming state that sd_callback feeds back in.
inputs1 = [
    np.zeros(detail['shape'], dtype=np.float32)
    for detail in input_details1
]
    

# Start streaming from microphone
mic_stream = sd.InputStream(
    samplerate=sample_rate,
    blocksize=int(sample_rate * rec_duration),
    channels=num_channels,
    callback=sd_callback,
)
with mic_stream:
    # Park the main thread on an event that is never set, so the stream
    # stays open while sd_callback handles each block.
    threading.Event().wait()

Does anyone notice if I am doing something dumb? You should be able to see the max amplitude on an utterance of ‘Marvin’. It is just a test model, but OK.