Skip to content

Commit

Permalink
Merge pull request #364 from deepgram/is-final-handling
Browse files Browse the repository at this point in the history
Handle is_final and endpointing together with utterance end + clean u…
  • Loading branch information
dvonthenen committed Apr 15, 2024
2 parents 3904fcc + 10a8b2e commit 81e2940
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 23 deletions.
58 changes: 45 additions & 13 deletions examples/streaming/async_microphone/main.py
Expand Up @@ -18,6 +18,8 @@

load_dotenv()

# We will collect the is_final=true messages here so we can use them when the person finishes speaking
is_finals = []

async def main():
try:
Expand All @@ -42,31 +44,52 @@ async def main():
dg_connection = deepgram.listen.asynclive.v("1")

async def on_open(self, open, **kwargs):
    """Handler for the Deepgram websocket Open event: announce the connection."""
    print("Deepgram Connection Open")

async def on_message(self, result, **kwargs):
    """Handler for Transcript events.

    Collects is_final=True fragments in the module-level ``is_finals`` list and
    emits the concatenated utterance once speech_final=True reports sufficient
    trailing silence (endpointing). Interim results are printed for live captioning.
    """
    global is_finals
    sentence = result.channel.alternatives[0].transcript
    if not sentence:
        return
    if result.is_final:
        # Collect final fragments and concatenate them when speech_final=True arrives.
        # See docs: https://developers.deepgram.com/docs/understand-endpointing-interim-results
        is_finals.append(sentence)

        # speech_final=True means sufficient silence was detected to consider this the
        # end of speech. It is the lowest-latency end-of-utterance signal, firing as
        # soon as the endpointing threshold is crossed.
        if result.speech_final:
            utterance = " ".join(is_finals)
            print(f"Speech Final: {utterance}")
            is_finals = []
        else:
            # Useful for real-time captioning: upgrades what Interim Results produced.
            print(f"Is Final: {sentence}")
    else:
        # Useful for real-time captioning of what is currently being spoken.
        print(f"Interim Results: {sentence}")

async def on_metadata(self, metadata, **kwargs):
    """Handler for Metadata events: log the metadata payload."""
    print(f"Deepgram Metadata: {metadata}")

async def on_speech_started(self, speech_started, **kwargs):
    """Handler for SpeechStarted (VAD) events: announce that speech began."""
    print("Deepgram Speech Started")

async def on_utterance_end(self, utterance_end, **kwargs):
    """Handler for UtteranceEnd events.

    Flushes any is_final fragments that never got a speech_final=True
    (e.g. endpointing missed the pause), so no speech is lost.
    """
    global is_finals
    if is_finals:
        utterance = " ".join(is_finals)
        print(f"Deepgram Utterance End: {utterance}")
        is_finals = []

async def on_close(self, close, **kwargs):
    """Handler for the websocket Close event (async to match the asynclive client)."""
    print("Deepgram Connection Closed")

async def on_error(self, error, **kwargs):
    """Handler for errors surfaced by the SDK (async to match the asynclive client)."""
    print(f"Deepgram Handled Error: {error}")

async def on_unhandled(self, unhandled, **kwargs):
    """Handler for websocket messages not covered by any other event (async client)."""
    print(f"Deepgram Unhandled Websocket Message: {unhandled}")

dg_connection.on(LiveTranscriptionEvents.Open, on_open)
dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
Expand All @@ -80,19 +103,28 @@ def on_unhandled(self, unhandled, **kwargs):
# connect to websocket
options: LiveOptions = LiveOptions(
model="nova-2",
punctuate=True,
language="en-US",
# Apply smart formatting to the output
smart_format=True,
# Raw audio format details
encoding="linear16",
channels=1,
sample_rate=16000,
# To get UtteranceEnd, the following must be set:
interim_results=True,
utterance_end_ms="1000",
vad_events=True,
# Time in milliseconds of silence to wait for before finalizing speech
endpointing=300
)

addons = {
# Prevent waiting for additional numbers
"no_delay": "true"
}

print("\n\nStart talking! Press Ctrl+C to stop...\n")
if await dg_connection.start(options) is False:
if await dg_connection.start(options, addons=addons) is False:
print("Failed to connect to Deepgram")
return

Expand Down
52 changes: 42 additions & 10 deletions examples/streaming/microphone/main.py
Expand Up @@ -16,6 +16,8 @@

load_dotenv()

# We will collect the is_final=true messages here so we can use them when the person finishes speaking
is_finals = []

def main():
try:
Expand All @@ -30,31 +32,52 @@ def main():
dg_connection = deepgram.listen.live.v("1")

def on_open(self, open, **kwargs):
    """Handler for the Deepgram websocket Open event: announce the connection."""
    print("Deepgram Connection Open")

def on_message(self, result, **kwargs):
    """Handler for Transcript events.

    Collects is_final=True fragments in the module-level ``is_finals`` list and
    emits the concatenated utterance once speech_final=True reports sufficient
    trailing silence (endpointing). Interim results are printed for live captioning.
    """
    global is_finals
    sentence = result.channel.alternatives[0].transcript
    if not sentence:
        return
    if result.is_final:
        # Collect final fragments and concatenate them when speech_final=True arrives.
        # See docs: https://developers.deepgram.com/docs/understand-endpointing-interim-results
        is_finals.append(sentence)

        # speech_final=True means sufficient silence was detected to consider this the
        # end of speech. It is the lowest-latency end-of-utterance signal, firing as
        # soon as the endpointing threshold is crossed.
        if result.speech_final:
            utterance = " ".join(is_finals)
            print(f"Speech Final: {utterance}")
            is_finals = []
        else:
            # Useful for real-time captioning: upgrades what Interim Results produced.
            print(f"Is Final: {sentence}")
    else:
        # Useful for real-time captioning of what is currently being spoken.
        print(f"Interim Results: {sentence}")

def on_metadata(self, metadata, **kwargs):
    """Handler for Metadata events: log the metadata payload."""
    print(f"Deepgram Metadata: {metadata}")

def on_speech_started(self, speech_started, **kwargs):
    """Handler for SpeechStarted (VAD) events: announce that speech began."""
    print("Deepgram Speech Started")

def on_utterance_end(self, utterance_end, **kwargs):
    """Handler for UtteranceEnd events.

    Flushes any is_final fragments that never got a speech_final=True
    (e.g. endpointing missed the pause), so no speech is lost.
    """
    global is_finals
    if is_finals:
        utterance = " ".join(is_finals)
        print(f"Deepgram Utterance End: {utterance}")
        is_finals = []

def on_close(self, close, **kwargs):
    """Handler for the websocket Close event."""
    print("Deepgram Connection Closed")

def on_error(self, error, **kwargs):
    """Handler for errors surfaced by the SDK."""
    print(f"Deepgram Handled Error: {error}")

def on_unhandled(self, unhandled, **kwargs):
    """Handler for websocket messages not covered by any other event."""
    print(f"Deepgram Unhandled Websocket Message: {unhandled}")

dg_connection.on(LiveTranscriptionEvents.Open, on_open)
dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
Expand All @@ -67,19 +90,28 @@ def on_unhandled(self, unhandled, **kwargs):

options: LiveOptions = LiveOptions(
model="nova-2",
punctuate=True,
language="en-US",
# Apply smart formatting to the output
smart_format=True,
# Raw audio format details
encoding="linear16",
channels=1,
sample_rate=16000,
# To get UtteranceEnd, the following must be set:
interim_results=True,
utterance_end_ms="1000",
vad_events=True,
# Time in milliseconds of silence to wait for before finalizing speech
endpointing=300
)

addons = {
# Prevent waiting for additional numbers
"no_delay": "true"
}

print("\n\nPress Enter to stop recording...\n\n")
if dg_connection.start(options) is False:
if dg_connection.start(options, addons=addons) is False:
print("Failed to connect to Deepgram")
return

Expand Down

0 comments on commit 81e2940

Please sign in to comment.