diff --git a/docs/source/_static/img/inference-graph-diagram.png b/docs/source/_static/img/inference-graph-diagram.png
new file mode 100644
index 0000000000..fe444a59d3
Binary files /dev/null and b/docs/source/_static/img/inference-graph-diagram.png differ
diff --git a/docs/source/_static/img/inference-graph-trace.png b/docs/source/_static/img/inference-graph-trace.png
new file mode 100644
index 0000000000..88f946eda9
Binary files /dev/null and b/docs/source/_static/img/inference-graph-trace.png differ
diff --git a/docs/source/guides/graph.rst b/docs/source/guides/graph.rst
new file mode 100644
index 0000000000..1a1080ec67
--- /dev/null
+++ b/docs/source/guides/graph.rst
@@ -0,0 +1,127 @@
+===============
+Inference Graph
+===============
+
+Many ML problems require an ensemble of models working together to produce a solution. The BentoML architecture natively supports any model inference graph in its
+:ref:`Service APIs ` definition. Users can define parallel and sequential inference graphs with any control flow
+by writing simple Python code. In this guide, we will build a text generation and classification service using a model inference graph. The project
+source code can be found in the BentoML `inference graph `_ example.
+
+.. image:: ../_static/img/inference-graph-diagram.png
+
+As illustrated in the diagram above, the service performs the following tasks.
+
+1. Accepts a text input.
+2. Passes the input to three text generation models in parallel and receives a list of three generated texts.
+3. Passes the list of generated texts to a text classification model iteratively and receives a list of three classification results.
+4. Returns the list of generated texts and classification results as a JSON output.
+
+.. code-block:: json
+   :caption: Sample JSON Output
+
+   [
+     {
+       "generated": "I have an idea! Please share with, like and subscribe and leave a comment below!\n\nIf you like this post, please consider becoming a patron of Reddit or becoming a patron of the author.",
+       "score": 0.5149753093719482
+     },
+     {
+       "generated": "I have an idea! One that won't break your heart but will leave you gasping in awe. A book about the history of magic. And because what's better than magic? Some. It's a celebration of our ancient, universal gift of awe.\"\n\nThe result was the \"Vox Populi: A Memoir of the Ancient World\" by E.V. Okello (Ace Books), published in 1999.\n\nIn the past 20 years, Okello, professor of history at Ohio State University and author of such titles as \"The American Imagination\" and \"Walking With Elephants",
+       "score": 0.502700924873352
+     },
+     {
+       "generated": "I have an idea! I've been wondering what the name of this thing is. What's the point?\" - The Simpsons\n\n\n\"It's bigger, bigger than she needs!\" - SpongeBob SquarePants\n\n\n\"That's a funny thing. It's like my brain is the most gigantic living thing. I just like thinking big.\" - Simpsons\n\n\n\"Ooookay! Here comes Barty-Icarus himself! (pause)\" - A Christmas Tale\n\n\nBackground information Edit\n\nFormal name: Homer's Brain.\n\nHomer's Brain. Special name: Brain.\n\nAppearances Edit",
+       "score": 0.536346971988678
+     }
+   ]
+
+
+Declare Runners
+###############
+
+Create :ref:`Runners ` for the three text generation models and the one text classification model using the ``to_runner`` function.
+
+.. code-block:: python
+
+    gpt2_generator = bentoml.transformers.get("gpt2-generation:latest").to_runner()
+    distilgpt2_generator = bentoml.transformers.get("distilgpt2-generation:latest").to_runner()
+    distilbegpt2_medium_generator = bentoml.transformers.get("gpt2-medium-generation:latest").to_runner()
+    bert_base_uncased_classifier = bentoml.transformers.get("bert-base-uncased-classification:latest").to_runner()
+
+
+Create Service
+##############
+
+Create a :ref:`Service ` named ``inference_graph`` and specify the runners created earlier in the ``runners`` argument.
+
+.. code-block:: python
+
+    svc = bentoml.Service(
+        "inference_graph",
+        runners=[
+            gpt2_generator,
+            distilgpt2_generator,
+            distilbegpt2_medium_generator,
+            bert_base_uncased_classifier,
+        ],
+    )
+
+Define API
+##########
+
+First, define an async :ref:`API ` named ``classify_generated_texts`` that accepts a :ref:`Text `
+input and returns :ref:`JSON ` output. Second, pass the input simultaneously to all three text generation
+runners through ``asyncio.gather`` and receive a list of three generated texts. Using ``asyncio.gather`` with the Runners' ``async_run`` method allows the inferences to run
+in parallel. Third, pass the list of generated texts to the text classification runner iteratively using a loop to get the classification score of each generated text.
+Finally, return the generated texts and their classification scores as a list of dictionaries.
+
+.. tip::
+
+    Using asynchronous Service and Runner APIs achieves better performance and throughput for IO-intensive workloads.
+    See :ref:`Sync vs Async APIs ` for more details.
+
+
+.. code-block:: python
+
+    @svc.api(input=Text(), output=JSON())
+    async def classify_generated_texts(original_sentence: str) -> list:
+        generated_sentences = [
+            result[0]["generated_text"]
+            for result in await asyncio.gather(
+                gpt2_generator.async_run(
+                    original_sentence,
+                    max_length=MAX_LENGTH,
+                    num_return_sequences=NUM_RETURN_SEQUENCE,
+                ),
+                distilgpt2_generator.async_run(
+                    original_sentence,
+                    max_length=MAX_LENGTH,
+                    num_return_sequences=NUM_RETURN_SEQUENCE,
+                ),
+                distilbegpt2_medium_generator.async_run(
+                    original_sentence,
+                    max_length=MAX_LENGTH,
+                    num_return_sequences=NUM_RETURN_SEQUENCE,
+                ),
+            )
+        ]
+
+        results = []
+        for sentence in generated_sentences:
+            score = (await bert_base_uncased_classifier.async_run(sentence))[0]["score"]
+            results.append(
+                {
+                    "generated": sentence,
+                    "score": score,
+                }
+            )
+
+        return results
+
+
+Inference Graph Trace
+#####################
+
+The following tracing waterfall graph demonstrates the execution flow of the inference graph. Note that the three calls to the text generation
+runners happen in parallel without blocking each other, while the calls to the text classification runner happen sequentially.
+
+.. image:: ../_static/img/inference-graph-trace.png
diff --git a/docs/source/guides/index.rst b/docs/source/guides/index.rst
index 4bb84e7481..eb92f44c32 100644
--- a/docs/source/guides/index.rst
+++ b/docs/source/guides/index.rst
@@ -17,6 +17,7 @@ into this part of the documentation.
    client
    server
    configuration
+   graph
    logging
    metrics
    performance
diff --git a/examples/inference_graph/README.md b/examples/inference_graph/README.md
index 3a79940bcd..25f87d5ae0 100644
--- a/examples/inference_graph/README.md
+++ b/examples/inference_graph/README.md
@@ -3,9 +3,10 @@
This is a sample project demonstrating model inference graph of [BentoML](https://github.com/bentoml)
with Huggingface Transformers.
-In this project, we will download three pretrained models, save them as three text classification
-Transformers pipelines, build a text classification service via an HTTP server, and containerize the
-service as a docker image for production deployment.
+In this project, we will download and save three pretrained text generation models and a pretrained text classification model
+to the model store. We will then build a service that accepts a text input, passes the input to the three text generation models,
+classifies each generated paragraph with the classification model, and returns all three generated paragraphs with their classification
+scores. The service will be served via HTTP and containerized as a docker image for production deployment.

### Install Dependencies

@@ -16,7 +17,7 @@
pip install -r ./requirements.txt
```

### Model Training

-First step, create and save three text classification pipelines from three different BERT models:
+Create and save three text generation models and one text classification model.

```bash
import bentoml
@@ -25,15 +26,34 @@ import transformers

if __name__ == "__main__":
    # Create Transformers pipelines from pretrained models
-    pipeline1 = transformers.pipeline(task="text-classification", model="bert-base-uncased", tokenizer="bert-base-uncased")
-    pipeline2 = transformers.pipeline(task="text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
-    pipeline3 = transformers.pipeline(task="text-classification", model="ProsusAI/finbert")
+    generation_pipeline_1 = transformers.pipeline(
+        task="text-generation",
+        model="gpt2",
+    )
+    generation_pipeline_2 = transformers.pipeline(
+        task="text-generation",
+        model="distilgpt2",
+    )
+    generation_pipeline_3 = transformers.pipeline(
+        task="text-generation",
+        model="gpt2-medium",
+    )

-    # Save models to BentoML local model store
-    bentoml.transformers.save_model("bert-base-uncased", pipeline1)
-    bentoml.transformers.save_model("distilbert-base-uncased-finetuned-sst-2-english", pipeline2)
-    bentoml.transformers.save_model("prosusai-finbert", pipeline3)
+    classification_pipeline = transformers.pipeline(
+        task="text-classification",
+        model="bert-base-uncased",
+        tokenizer="bert-base-uncased",
+    )

+    # Save models to BentoML local model store
+    m0 = bentoml.transformers.save_model("gpt2-generation", generation_pipeline_1)
+    m1 = bentoml.transformers.save_model("distilgpt2-generation", generation_pipeline_2)
+    m2 = bentoml.transformers.save_model(
+        "gpt2-medium-generation", generation_pipeline_3
+    )
+    m3 = bentoml.transformers.save_model(
+        "bert-base-uncased-classification", classification_pipeline
+    )
```

This will save the models in the BentoML local model store, new version tags are automatically
@@ -51,12 +71,12 @@ To verify that the saved model can be loaded correctly, run the following:

```python
import bentoml

-pipeline = bentoml.transformers.load_model("bert-base-uncased:latest")
+pipeline = bentoml.transformers.load_model("gpt2-generation:latest")

-pipeline("You look great today!")
+pipeline("I have an idea!")
```

-In BentoML, the recommended way of running ML model inference in serving is via Runner, which
+In BentoML, the recommended way of running ML model inference in serving is via Runners, which
gives BentoML more flexibility in scheduling the inference computation, batching inference requests, and
taking advantage of hardware resources available.
Saved models can be loaded as Runner instance as
shown below:

@@ -65,40 +85,93 @@ shown below:

```python
import bentoml

# Create a Runner instance:
-bert_runner = bentoml.transformers.get("bert-base-uncased:latest").to_runner()
+bert_runner = bentoml.transformers.get("gpt2-generation:latest").to_runner()

# Runner#init_local initializes the model in current process, this is meant for development and testing only:
bert_runner.init_local()

# This should yield the same result as the loaded model:
-bert_runner.run("You look great today!")
+bert_runner.run("I have an idea!")
```

### Serving the model

-A simple BentoML Service that serves the model saved above look like this:
+The service definition below implements the inference graph logic described above.
+
+First, we create three text generation runners and one text classification runner with the `to_runner` function
+from the models we previously saved. Second, we create a `bentoml.Service` named "inference_graph" and pass in
+the four runner instances. Lastly, we define an async API with `@svc.api` that accepts a `Text` input and returns a `JSON` output. The API
+passes the input simultaneously to all three text generation models through `asyncio.gather` and iteratively passes
+the generated paragraphs to the text classification model. The API returns all three generated paragraphs and their
+corresponding classification scores as a list of dictionaries.

```python
import asyncio
-import bentoml
-from bentoml.io import Text, JSON
-from statistics import median
-
-bert_runner = bentoml.transformers.get("bert-base-uncased:latest").to_runner()
-distilbert_runner = bentoml.transformers.get("distilbert-base-uncased-finetuned-sst-2-english:latest").to_runner()
-finbert_runner = bentoml.transformers.get("prosusai-finbert:latest").to_runner()
+import bentoml
+from bentoml.io import JSON
+from bentoml.io import Text
+
+gpt2_generator = bentoml.transformers.get("gpt2-generation:latest").to_runner()
+distilgpt2_generator = bentoml.transformers.get(
+    "distilgpt2-generation:latest"
+).to_runner()
+distilbegpt2_medium_generator = bentoml.transformers.get(
+    "gpt2-medium-generation:latest"
+).to_runner()
+bert_base_uncased_classifier = bentoml.transformers.get(
+    "bert-base-uncased-classification:latest"
+).to_runner()
+
+svc = bentoml.Service(
+    "inference_graph",
+    runners=[
+        gpt2_generator,
+        distilgpt2_generator,
+        distilbegpt2_medium_generator,
+        bert_base_uncased_classifier,
+    ],
+)
+
+
+MAX_LENGTH = 128
+NUM_RETURN_SEQUENCE = 1

-svc = bentoml.Service("inference_graph", runners=[bert_runner, distilbert_runner, finbert_runner])

@svc.api(input=Text(), output=JSON())
-async def classify(input_data: str) -> dict:
-    results = await asyncio.gather(
-        bert_runner.async_run(input_data),
-        distilbert_runner.async_run(input_data),
-        finbert_runner.async_run(input_data),
-    )
+async def classify_generated_texts(original_sentence: str) -> list:
+    generated_sentences = [
+        result[0]["generated_text"]
+        for result in await asyncio.gather(
+            gpt2_generator.async_run(
+                original_sentence,
+                max_length=MAX_LENGTH,
+                num_return_sequences=NUM_RETURN_SEQUENCE,
+            ),
+            distilgpt2_generator.async_run(
+                original_sentence,
+                max_length=MAX_LENGTH,
+                num_return_sequences=NUM_RETURN_SEQUENCE,
+            ),
+            distilbegpt2_medium_generator.async_run(
+                original_sentence,
+                max_length=MAX_LENGTH,
+                num_return_sequences=NUM_RETURN_SEQUENCE,
+            ),
+        )
+    ]
+
+    results = []
+    for sentence in generated_sentences:
+        score = (await bert_base_uncased_classifier.async_run(sentence))[0]["score"]
+        results.append(
+            {
"generated": sentence, + "score": score, + } + ) + return results ``` @@ -112,13 +185,15 @@ You may also send request with `curl` command or any HTTP client, e.g.: ```bash curl -X 'POST' \ - 'http://127.0.0.1:3000/classify' \ + 'http://0.0.0.0:3000/classify_generated_texts' \ -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d 'You look great today!' + -H 'Content-Type: text/plain' \ + -d 'I have an idea!' ``` + + ### Build Bento for deployment Bento is the distribution format in BentoML which captures all the source code, model files, config @@ -137,6 +212,7 @@ include: python: packages: - transformers + - torch ``` Next, run `bentoml build` from current directory to start the Bento build: diff --git a/examples/inference_graph/service.py b/examples/inference_graph/service.py index 6394aa9f8f..909dc1d396 100644 --- a/examples/inference_graph/service.py +++ b/examples/inference_graph/service.py @@ -4,20 +4,63 @@ from bentoml.io import JSON from bentoml.io import Text -bert_runner = bentoml.transformers.get("bert-base-uncased:latest").to_runner() -distilbert_runner = bentoml.transformers.get("distilbert:latest").to_runner() -finbert_runner = bentoml.transformers.get("prosusai-finbert:latest").to_runner() +gpt2_generator = bentoml.transformers.get("gpt2-generation:latest").to_runner() +distilgpt2_generator = bentoml.transformers.get( + "distilgpt2-generation:latest" +).to_runner() +distilbegpt2_medium_generator = bentoml.transformers.get( + "gpt2-medium-generation:latest" +).to_runner() +bert_base_uncased_classifier = bentoml.transformers.get( + "bert-base-uncased-classification:latest" +).to_runner() svc = bentoml.Service( - "inference_graph", runners=[bert_runner, distilbert_runner, finbert_runner] + "inference_graph", + runners=[ + gpt2_generator, + distilgpt2_generator, + distilbegpt2_medium_generator, + bert_base_uncased_classifier, + ], ) +MAX_LENGTH = 128 +NUM_RETURN_SEQUENCE = 1 + + @svc.api(input=Text(), output=JSON()) -async def classify(input_data: str) -> dict: - results = await asyncio.gather( - bert_runner.async_run(input_data), - distilbert_runner.async_run(input_data), - finbert_runner.async_run(input_data), - ) +async def classify_generated_texts(original_sentence: str) -> dict: + generated_sentences = [ + result[0]["generated_text"] + for result in await asyncio.gather( + gpt2_generator.async_run( + original_sentence, + max_length=MAX_LENGTH, + num_return_sequences=NUM_RETURN_SEQUENCE, + ), + distilgpt2_generator.async_run( + original_sentence, + max_length=MAX_LENGTH, + num_return_sequences=NUM_RETURN_SEQUENCE, + ), + distilbegpt2_medium_generator.async_run( + original_sentence, + max_length=MAX_LENGTH, + num_return_sequences=NUM_RETURN_SEQUENCE, + ), + ) + ] + + results = [] + for sentence in generated_sentences: + score = (await bert_base_uncased_classifier.async_run(sentence))[0]["score"] + results.append( + { + "generated": sentence, + "score": score, + } + ) + return results diff --git a/examples/inference_graph/train.py b/examples/inference_graph/train.py index 12516d40fe..569ea67810 100644 --- a/examples/inference_graph/train.py +++ b/examples/inference_graph/train.py @@ -9,22 +9,33 @@ if __name__ == "__main__": # Create Transformers pipelines from pretrained models - pipeline1 = transformers.pipeline( + generation_pipeline_1 = transformers.pipeline( + task="text-generation", + model="gpt2", + ) + generation_pipeline_2 = transformers.pipeline( + task="text-generation", + model="distilgpt2", + ) + generation_pipeline_2 = transformers.pipeline( 
+ task="text-generation", + model="gpt2-medium", + ) + + classification_pipeline = transformers.pipeline( task="text-classification", model="bert-base-uncased", tokenizer="bert-base-uncased", ) - pipeline2 = transformers.pipeline( - task="text-classification", - model="distilbert-base-uncased-finetuned-sst-2-english", - ) - pipeline3 = transformers.pipeline( - task="text-classification", model="ProsusAI/finbert" - ) # Save models to BentoML local model store - m1 = bentoml.transformers.save_model("bert-base-uncased", pipeline1) - m2 = bentoml.transformers.save_model("distilbert", pipeline2) - m3 = bentoml.transformers.save_model("prosusai-finbert", pipeline3) + m0 = bentoml.transformers.save_model("gpt2-generation", generation_pipeline_1) + m1 = bentoml.transformers.save_model("distilgpt2-generation", generation_pipeline_2) + m2 = bentoml.transformers.save_model( + "gpt2-medium-generation", generation_pipeline_2 + ) + m3 = bentoml.transformers.save_model( + "bert-base-uncased-classification", classification_pipeline + ) - print(f"Model saved: {m1}, {m2}, {m3}") + print(f"Model saved: {m0}, {m1}, {m2}, {m3}")