Merge pull request #625 from joy-void-joy/whisper_integration
[feat(whisper)] Add recognize_whisper
ftnext committed Nov 6, 2022
2 parents cafe8e9 + b3665f4 commit 7461563
Showing 6 changed files with 86 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/unittests.yml
@@ -27,9 +27,11 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install --no-install-recommends -y libpulse-dev libasound2-dev
          sudo apt-get install --no-install-recommends -y ffmpeg
      - name: Install Python dependencies
        run: |
          python -m pip install 'pocketsphinx<5'
          python -m pip install git+https://github.com/openai/whisper.git
          python -m pip install .
      - name: Test with unittest
        run: |
8 changes: 8 additions & 0 deletions README.rst
@@ -38,6 +38,7 @@ Speech recognition engine/API support:
* `Snowboy Hotword Detection <https://snowboy.kitt.ai/>`__ (works offline)
* `Tensorflow <https://www.tensorflow.org/>`__
* `Vosk API <https://github.com/alphacep/vosk-api/>`__ (works offline)
* `OpenAI whisper <https://github.com/openai/whisper>`__ (works offline)

**Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details.

@@ -93,6 +94,7 @@ To use all of the functionality of the library, you should have:
* **Google API Client Library for Python** (required only if you need to use the Google Cloud Speech API, ``recognizer_instance.recognize_google_cloud``)
* **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X)
* **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``)
* **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``)

The following requirements are optional, but can improve or extend functionality in some situations:

@@ -173,6 +175,12 @@ This is because monotonic time is necessary to handle cache expiry properly in t

To install, use `Pip <https://pip.readthedocs.org/>`__: execute ``pip install monotonic`` in a terminal.

Whisper (for Whisper users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Whisper is **required if and only if you want to use Whisper** (``recognizer_instance.recognize_whisper``).

You can install it with ``python3 -m pip install git+https://github.com/openai/whisper.git``.
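
As a rough usage sketch (not part of this commit; the file name is a placeholder), once Whisper is installed you can transcribe an audio file offline with something like::

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("test.wav") as source:  # placeholder audio file
        audio = r.record(source)
    print(r.recognize_whisper(audio, language="english"))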

Troubleshooting
---------------

8 changes: 8 additions & 0 deletions examples/microphone_recognition.py
@@ -84,3 +84,11 @@
print("IBM Speech to Text could not understand audio")
except sr.RequestError as e:
print("Could not request results from IBM Speech to Text service; {0}".format(e))

# recognize speech using whisper
try:
print("Whisper thinks you said " + r.recognize_whisper(audio, language="english"))
except sr.UnknownValueError:
print("Whisper could not understand audio")
except sr.RequestError as e:
print("Could not request results from Whisper")
14 changes: 14 additions & 0 deletions reference/library-reference.rst
@@ -300,6 +300,20 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot

Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.

``recognizer_instance.recognize_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, load_options: Dict[Any, Any]=None, language:Optional[str]=None, translate:bool=False, **transcribe_options):``
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.

The recognition language is determined by ``language``, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py

``model`` can be any of ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``tiny.en``, ``base.en``, ``small.en``, ``medium.en``. See https://github.com/openai/whisper for more details.

If ``show_dict`` is true, returns the full dict response from Whisper, including the detected language. Otherwise returns only the transcription.

You can translate the result to English with Whisper by passing ``translate=True``.

Other values are passed directly to Whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options.
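
For illustration only (this snippet is not part of the library's reference text; file names are placeholders), a hedged sketch of both calling styles::

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("french.aiff") as source:  # placeholder audio file
        audio = r.record(source)

    # plain transcription in the spoken language
    print(r.recognize_whisper(audio, language="french"))

    # full result dict, translated to English, with an extra option forwarded to Whisper's transcribe()
    result = r.recognize_whisper(audio, model="small", show_dict=True, translate=True, temperature=0)
    print(result["language"], result["text"])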

``AudioSource``
---------------

39 changes: 39 additions & 0 deletions speech_recognition/__init__.py
@@ -4,6 +4,7 @@

import io
import os
import tempfile
import sys
import subprocess
import wave
@@ -1666,6 +1667,44 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac
            for node_id in top_k:
                human_string = self.tflabels[node_id]
                return human_string

    def recognize_whisper(self, audio_data, model="base", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
        """
        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
        The recognition language is determined by ``language``, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
        model can be any of tiny, base, small, medium, large, tiny.en, base.en, small.en, medium.en. See https://github.com/openai/whisper for more details.
        If show_dict is true, returns the full dict response from Whisper, including the detected language. Otherwise returns only the transcription.
        You can translate the result to english with Whisper by passing translate=True
        Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options
        """

        assert isinstance(audio_data, AudioData), "Data must be audio data"
        import whisper

        if load_options or not hasattr(self, "whisper_model") or self.whisper_model.get(model) is None:
            self.whisper_model = getattr(self, "whisper_model", {})
            self.whisper_model[model] = whisper.load_model(model, **load_options or {})

        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
            f.write(audio_data.get_wav_data())
            f.flush()
            result = self.whisper_model[model].transcribe(
                f.name,
                language=language,
                task="translate" if translate else None,
                **transcribe_options
            )

        if show_dict:
            return result
        else:
            return result["text"]


    def recognize_vosk(self, audio_data, language='en'):
        from vosk import Model, KaldiRecognizer
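
A brief, hedged illustration (not part of the commit; the file name is a placeholder) of the caching behaviour added above: each ``Recognizer`` keeps loaded Whisper models in ``self.whisper_model``, keyed by model name, so repeated calls do not reload them, while a truthy ``load_options`` forces a fresh load:

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("english.wav") as source:  # placeholder audio file
        audio = r.record(source)

    r.recognize_whisper(audio, model="base")  # first call loads and caches the "base" model
    r.recognize_whisper(audio, model="base")  # reuses the cached model, no reload
    r.recognize_whisper(audio, model="tiny")  # loads a second model; both stay cached on this Recognizer
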
15 changes: 15 additions & 0 deletions tests/test_recognition.py
@@ -12,6 +12,7 @@ def setUp(self):
        self.AUDIO_FILE_EN = os.path.join(os.path.dirname(os.path.realpath(__file__)), "english.wav")
        self.AUDIO_FILE_FR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "french.aiff")
        self.AUDIO_FILE_ZH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "chinese.flac")
        self.WHISPER_CONFIG = {"temperature": 0}

    def test_sphinx_english(self):
        r = sr.Recognizer()
@@ -81,6 +82,20 @@ def test_ibm_chinese(self):
        with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
        self.assertEqual(r.recognize_ibm(audio, username=os.environ["IBM_USERNAME"], password=os.environ["IBM_PASSWORD"], language="zh-CN"), u"砸 自己 的 脚 ")

    def test_whisper_english(self):
        r = sr.Recognizer()
        with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
        self.assertEqual(r.recognize_whisper(audio, language="english", **self.WHISPER_CONFIG), " 1, 2, 3")

    def test_whisper_french(self):
        r = sr.Recognizer()
        with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source)
        self.assertEqual(r.recognize_whisper(audio, language="french", **self.WHISPER_CONFIG), " et c'est la dictée numéro 1.")

    def test_whisper_chinese(self):
        r = sr.Recognizer()
        with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
        self.assertEqual(r.recognize_whisper(audio, model="small", language="chinese", **self.WHISPER_CONFIG), u"砸自己的腳")

if __name__ == "__main__":
    unittest.main()
