Merge pull request #625 from joy-void-joy/whisper_integration
[feat(whisper)] Add recognize_whisper
ftnext committed Nov 6, 2022
2 parents cafe8e9 + b3665f4 commit 7461563
Showing 6 changed files with 86 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/unittests.yml
@@ -27,9 +27,11 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install --no-install-recommends -y libpulse-dev libasound2-dev
          sudo apt-get install --no-install-recommends -y ffmpeg
      - name: Install Python dependencies
        run: |
          python -m pip install 'pocketsphinx<5'
          python -m pip install git+https://github.com/openai/whisper.git
          python -m pip install .
      - name: Test with unittest
        run: |
8 changes: 8 additions & 0 deletions README.rst
@@ -38,6 +38,7 @@ Speech recognition engine/API support:
* `Snowboy Hotword Detection <https://snowboy.kitt.ai/>`__ (works offline)
* `Tensorflow <https://www.tensorflow.org/>`__
* `Vosk API <https://github.com/alphacep/vosk-api/>`__ (works offline)
* `OpenAI whisper <https://github.com/openai/whisper>`__ (works offline)

**Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details.

@@ -93,6 +94,7 @@ To use all of the functionality of the library, you should have:
* **Google API Client Library for Python** (required only if you need to use the Google Cloud Speech API, ``recognizer_instance.recognize_google_cloud``)
* **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X)
* **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``)
* **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``)

The following requirements are optional, but can improve or extend functionality in some situations:

@@ -173,6 +175,12 @@ This is because monotonic time is necessary to handle cache expiry properly in t

To install, use `Pip <https://pip.readthedocs.org/>`__: execute ``pip install monotonic`` in a terminal.

Whisper (for Whisper users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Whisper is **required if and only if you want to use Whisper** (``recognizer_instance.recognize_whisper``).

You can install it with ``python3 -m pip install git+https://github.com/openai/whisper.git``.
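
As a rough usage sketch (not part of this commit; the file name is a placeholder), once Whisper is installed you can transcribe an audio file offline with something like::

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("test.wav") as source:  # placeholder audio file
        audio = r.record(source)
    print(r.recognize_whisper(audio, language="english"))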

Troubleshooting
---------------

8 changes: 8 additions & 0 deletions examples/microphone_recognition.py
@@ -84,3 +84,11 @@
print("IBM Speech to Text could not understand audio")
except sr.RequestError as e:
print("Could not request results from IBM Speech to Text service; {0}".format(e))

# recognize speech using whisper
try:
print("Whisper thinks you said " + r.recognize_whisper(audio, language="english"))
except sr.UnknownValueError:
print("Whisper could not understand audio")
except sr.RequestError as e:
print("Could not request results from Whisper")
14 changes: 14 additions & 0 deletions reference/library-reference.rst
@@ -300,6 +300,20 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot

Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.

``recognizer_instance.recognize_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, load_options: Dict[Any, Any]=None, language:Optional[str]=None, translate:bool=False, **transcribe_options):``
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.

The recognition language is determined by ``language``, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py

``model`` can be any of ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``tiny.en``, ``base.en``, ``small.en``, ``medium.en``. See https://github.com/openai/whisper for more details.

If ``show_dict`` is true, returns the full dict response from Whisper, including the detected language. Otherwise returns only the transcription.

You can translate the result to English with Whisper by passing ``translate=True``.

Other values are passed directly to Whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options.
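
For illustration only (this snippet is not part of the library's reference text; file names are placeholders), a hedged sketch of both calling styles::

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("french.aiff") as source:  # placeholder audio file
        audio = r.record(source)

    # plain transcription in the spoken language
    print(r.recognize_whisper(audio, language="french"))

    # full result dict, translated to English, with an extra option forwarded to Whisper's transcribe()
    result = r.recognize_whisper(audio, model="small", show_dict=True, translate=True, temperature=0)
    print(result["language"], result["text"])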

``AudioSource``
---------------

39 changes: 39 additions & 0 deletions speech_recognition/__init__.py
@@ -4,6 +4,7 @@

import io
import os
import tempfile
import sys
import subprocess
import wave
@@ -1666,6 +1667,44 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac
            for node_id in top_k:
                human_string = self.tflabels[node_id]
                return human_string

    def recognize_whisper(self, audio_data, model="base", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
        """
        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
        The recognition language is determined by ``language``, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
        model can be any of tiny, base, small, medium, large, tiny.en, base.en, small.en, medium.en. See https://github.com/openai/whisper for more details.
        If show_dict is true, returns the full dict response from Whisper, including the detected language. Otherwise returns only the transcription.
        You can translate the result to english with Whisper by passing translate=True
        Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options
        """

        assert isinstance(audio_data, AudioData), "Data must be audio data"
        import whisper

        if load_options or not hasattr(self, "whisper_model") or self.whisper_model.get(model) is None:
            self.whisper_model = getattr(self, "whisper_model", {})
            self.whisper_model[model] = whisper.load_model(model, **load_options or {})

        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
            f.write(audio_data.get_wav_data())
            f.flush()
            result = self.whisper_model[model].transcribe(
                f.name,
                language=language,
                task="translate" if translate else None,
                **transcribe_options
            )

        if show_dict:
            return result
        else:
            return result["text"]


    def recognize_vosk(self, audio_data, language='en'):
        from vosk import Model, KaldiRecognizer
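
A brief, hedged illustration (not part of the commit; the file name is a placeholder) of the caching behaviour added above: each ``Recognizer`` keeps loaded Whisper models in ``self.whisper_model``, keyed by model name, so repeated calls do not reload them, while a truthy ``load_options`` forces a fresh load:

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("english.wav") as source:  # placeholder audio file
        audio = r.record(source)

    r.recognize_whisper(audio, model="base")  # first call loads and caches the "base" model
    r.recognize_whisper(audio, model="base")  # reuses the cached model, no reload
    r.recognize_whisper(audio, model="tiny")  # loads a second model; both stay cached on this Recognizer
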
15 changes: 15 additions & 0 deletions tests/test_recognition.py
@@ -12,6 +12,7 @@ def setUp(self):
        self.AUDIO_FILE_EN = os.path.join(os.path.dirname(os.path.realpath(__file__)), "english.wav")
        self.AUDIO_FILE_FR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "french.aiff")
        self.AUDIO_FILE_ZH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "chinese.flac")
        self.WHISPER_CONFIG = {"temperature": 0}

    def test_sphinx_english(self):
        r = sr.Recognizer()
@@ -81,6 +82,20 @@ def test_ibm_chinese(self):
        with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
        self.assertEqual(r.recognize_ibm(audio, username=os.environ["IBM_USERNAME"], password=os.environ["IBM_PASSWORD"], language="zh-CN"), u"砸 自己 的 脚 ")

    def test_whisper_english(self):
        r = sr.Recognizer()
        with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
        self.assertEqual(r.recognize_whisper(audio, language="english", **self.WHISPER_CONFIG), " 1, 2, 3")

    def test_whisper_french(self):
        r = sr.Recognizer()
        with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source)
        self.assertEqual(r.recognize_whisper(audio, language="french", **self.WHISPER_CONFIG), " et c'est la dictée numéro 1.")

    def test_whisper_chinese(self):
        r = sr.Recognizer()
        with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
        self.assertEqual(r.recognize_whisper(audio, model="small", language="chinese", **self.WHISPER_CONFIG), u"砸自己的腳")

if __name__ == "__main__":
    unittest.main()
