commit 0ce39d14146df3261bceed824ab36f29c1068902
parent d52d260f3003e3dbebccd9f64594d80f0114b805
Author: khanumballz <[email protected]>
Date: Sat, 18 May 2024 07:10:27 +0100
Piper TTS and Ollama Optimization
Diffstat:
2 files changed, 104 insertions(+), 0 deletions(-)
diff --git a/python/conversation/llama/tts_output.wav b/python/conversation/llama/tts_output.wav
Binary files differ.
diff --git a/python/conversation/llama/voice-and-motor.py b/python/conversation/llama/voice-and-motor.py
@@ -0,0 +1,104 @@
+import serial
+import time
+import json
+import simpleaudio as sa
+import pyaudio
+from vosk import Model as srModel
+from vosk import KaldiRecognizer
+from dimits import Dimits
+import ollama
+#from ollama import generate
+
+# Open the serial link used for motor commands. pySerial opens the port
+# immediately when a device path is passed to the constructor.
+ser = serial.Serial('/dev/ttyUSB0', 115200, timeout=1)
+
+# Text-to-speech voice, loaded once and reused for every reply.
+dt_model = Dimits("en_US-amy-medium")
+
+# Vosk speech-recognition model and a recognizer configured for 16 kHz audio
+# (the same rate the microphone stream is opened with in main()).
+srmodel = srModel("/home/khan/srmodels/vosk-model-small-en-us-0.15")
+recognizer = KaldiRecognizer(srmodel, 16000)
+
+# Report the actual port state; only claim success when the port is open.
+if ser.is_open:
+    print("Serial port opened successfully.")
+else:
+    print("Failed to open serial port.")
+
+def send_command(ser, command):
+    """Write *command* to the serial port, followed by a newline.
+
+    The command text is encoded with the default ``str.encode()`` settings
+    and the terminator is sent as a separate write.
+    """
+    # Commands are assumed to be newline-terminated on the receiving side.
+    for payload in (command.encode(), b'\n'):
+        ser.write(payload)
+
+def generate_and_save_tts(
+    dt_model,
+    text,
+    filename="tts_output",
+    directory="/home/khan/sylvie-2024/python/conversation/llama",
+):
+    """Synthesize *text* to a WAV file and return the synthesizer's result.
+
+    Args:
+        dt_model: Already-loaded TTS engine exposing ``text_2_audio_file``.
+        text: Text to speak.
+        filename: Output name handed to the engine (default ``tts_output``).
+        directory: Folder handed to the engine as the output location.
+
+    Returns:
+        Whatever ``text_2_audio_file`` returns. Callers pass it on as the
+        path of the WAV file to play.
+    """
+    return dt_model.text_2_audio_file(text, filename, directory, format="wav")
+
+def play_audio_file(audio_path):
+    """Play the WAV file at *audio_path* and block until playback ends."""
+    playback = sa.WaveObject.from_wave_file(audio_path).play()
+    # Block here so the caller only continues once the audio has finished.
+    playback.wait_done()
+
+def _listen_for_question(prompt_prefix):
+    """Record from the microphone until speech is recognized; return the prompt.
+
+    The recognized text is appended to *prompt_prefix* together with the
+    fixed question/response framing. The audio stream and the PyAudio
+    instance are always released, even if reading or recognition fails.
+    """
+    mic = pyaudio.PyAudio()
+    stream = None
+    try:
+        stream = mic.open(format=pyaudio.paInt16, channels=1, rate=16000,
+                          input=True, frames_per_buffer=8192)
+        stream.start_stream()
+
+        print('Ask Sylvie a question: ')
+        while True:
+            srdata = stream.read(4096)
+
+            # Keep feeding audio until the recognizer reports a final result.
+            if not recognizer.AcceptWaveform(srdata):
+                continue
+
+            # Result() is a JSON string; the transcript is under "text".
+            heard = json.loads(recognizer.Result()).get("text", "")
+            if not heard:
+                print("No input sound")
+                continue
+
+            return prompt_prefix + heard + "?" + " So here is how you responded:"
+    finally:
+        # main() runs in an endless loop, so leaking a stream per turn
+        # would eventually exhaust audio resources.
+        if stream is not None:
+            stream.stop_stream()
+            stream.close()
+        mic.terminate()
+
+
+def _collect_reply(prompt, min_words=20):
+    """Stream the model's answer, stopping once it reaches *min_words* words.
+
+    If the model finishes with fewer words, the shorter text is returned
+    instead of being discarded.
+    """
+    ollama_stream = ollama.chat(
+        model='dolphin-phi2-usb',
+        messages=[{'role': 'user', 'content': prompt}],
+        stream=True,
+    )
+
+    reply = ""
+    for chunk in ollama_stream:
+        reply += chunk['message']['content']
+        if len(reply.split()) >= min_words:
+            break
+    return reply
+
+
+def _speech_command_for(text):
+    """Pick the speech motor command from the word count of *text*."""
+    word_count = len(text.split())
+    if word_count > 10:
+        return "1sm"  # Use long command for speech
+    if word_count > 5:
+        return "1so"  # Use medium command for speech
+    return "1sn"
+
+
+def main():
+    """Run one conversation turn: listen, query the model, speak the reply.
+
+    Uses the module-level serial port, recognizer and TTS model.
+    """
+    user_prompt = _listen_for_question(
+        "You are Sylvie Deane who is a cute librarian from French Polynesia, and somebody asked you: "
+    )
+    print(user_prompt)
+
+    # NOTE(review): "1sr" is sent before the model starts generating; its
+    # meaning is defined by the motor firmware - confirm there.
+    send_command(ser, "1sr")
+
+    # Generate a response using the Ollama API
+    user_text = _collect_reply(user_prompt)
+    print(user_text)
+
+    # Determine the command type based on the length of the reply
+    command_type = _speech_command_for(user_text)
+
+    # Generate the TTS audio file with the model's reply
+    audio_path = generate_and_save_tts(dt_model, user_text)
+
+    # Start the motor movement, then play the TTS audio file
+    send_command(ser, command_type)
+    print("Sent motor command")
+    play_audio_file(audio_path)
+
+    # Short pause before the caller starts the next turn.
+    time.sleep(3)
+
+if __name__ == "__main__":
+    # Keep taking conversation turns until interrupted (Ctrl-C), then
+    # release the serial port instead of leaving it open.
+    try:
+        while True:
+            main()
+    except KeyboardInterrupt:
+        print("Stopping.")
+    finally:
+        ser.close()