Merge branch 'main' into edgchen1/matmul_nbits_bias
edgchen1 committed May 15, 2024
2 parents b90c8ca + f5bfbd6 commit 92ce6eb
Showing 97 changed files with 3,217 additions and 2,897 deletions.
6 changes: 6 additions & 0 deletions cmake/onnxruntime_common.cmake
@@ -71,6 +71,12 @@ if(onnxruntime_target_platform STREQUAL "ARM64EC")
endif()
endif()

if(onnxruntime_target_platform STREQUAL "ARM64")
if (MSVC)
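# /bigobj lifts MSVC's per-object-file section limit (65,279 by default); presumably some ARM64 objects exceed it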
add_compile_options("/bigobj")
endif()
endif()

file(GLOB onnxruntime_common_src CONFIGURE_DEPENDS
${onnxruntime_common_src_patterns}
)
5 changes: 4 additions & 1 deletion cmake/onnxruntime_nodejs.cmake
@@ -73,6 +73,9 @@ endif()
if (onnxruntime_USE_COREML)
set(NODEJS_BINDING_USE_COREML "--use_coreml")
endif()
if (onnxruntime_USE_QNN)
set(NODEJS_BINDING_USE_QNN "--use_qnn")
endif()

if(NOT onnxruntime_ENABLE_STATIC_ANALYSIS)
# add custom target
@@ -90,7 +93,7 @@ add_custom_target(nodejs_binding_wrapper ALL
COMMAND ${NPM_CLI} ci
COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR}
--arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_TENSORRT}
${NODEJS_BINDING_USE_COREML}
${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
WORKING_DIRECTORY ${JS_NODE_ROOT}
COMMENT "Using cmake-js to build OnnxRuntime Node.js binding")

4 changes: 4 additions & 0 deletions csharp/OnnxRuntime.CSharp.proj
@@ -50,8 +50,12 @@ CMake creates a target to this project
<PropertyGroup>
<!-- If we create multiple nuget packages in one job, major package and dependent packages version should be the same-->
<!-- CurrentDate and CurrentTime are only used for dev packages-->
<CurrentDate Condition=" '$(BuildDate)'!='' ">$(BuildDate)</CurrentDate>
<CurrentTime Condition=" '$(BuildTime)'!='' ">$(BuildTime)</CurrentTime>
<CurrentDate Condition="'$(CurrentDate)'==''">$([System.DateTime]::UtcNow.ToString(yyyyMMdd))</CurrentDate>
<CurrentTime Condition="'$(CurrentTime)'==''">$([System.DateTime]::UtcNow.ToString(hhmm))</CurrentTime>


</PropertyGroup>

<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
46 changes: 33 additions & 13 deletions docs/ContribOperators.md
@@ -5555,11 +5555,29 @@ This version of the operator has been available since version 1 of the 'com.micr
When the number of sparse layouts is 1, all heads share the same sparse layout. Otherwise, layouts are assigned to heads cyclically.
For example, given 4 layouts (S0, S1, S2, S3), 8 heads get layouts (S0, S1, S2, S3, S0, S1, S2, S3).

The block_row_indices and block_col_indices inputs are the CSR representation of the block mask. When layouts have
different numbers of non-zero blocks, block_col_indices is padded on the right side so that all layouts share the same length.

When do_rotary is True, cos_cache and sin_cache are required.
An example block mask with 2 layouts, where each layout is 4 x 4 blocks:
[[[1, 0, 0, 0],
[1, 1, 0, 0],
[0, 1, 1, 0],
[0, 1, 1, 1]],

[[1, 0, 0, 0],
[1, 1, 0, 0],
[1, 1, 1, 0],
[1, 0, 1, 1]]]

The corresponding CSR format:
block_col_indices = [[0, 0, 1, 1, 2, 1, 2, 3, -1], [0, 0, 1, 0, 1, 2, 0, 2, 3]]
block_row_indices = [[0, 1, 3, 5, 8], [0, 1, 3, 6, 9]]
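As a sanity check, here is a minimal TypeScript sketch (a hypothetical helper, not part of ONNX Runtime) that derives this CSR form from dense block masks; running it on the two layouts above reproduces the block_row_indices and block_col_indices values exactly:

```typescript
// Hypothetical helper: converts dense block masks to the CSR form described above,
// padding block_col_indices on the right with -1 so all layouts share one length.
function blockMaskToCsr(masks: number[][][]): {rowIndices: number[][]; colIndices: number[][]} {
  const rowIndices: number[][] = [];
  const colIndices: number[][] = [];
  for (const mask of masks) {
    const rows = [0];
    const cols: number[] = [];
    for (const row of mask) {
      row.forEach((v, c) => {
        if (v) cols.push(c); // record the column of every non-zero block
      });
      rows.push(cols.length); // cumulative non-zero count after each row
    }
    rowIndices.push(rows);
    colIndices.push(cols);
  }
  const maxNnz = Math.max(...colIndices.map((c) => c.length));
  for (const cols of colIndices) {
    while (cols.length < maxNnz) cols.push(-1); // right-side padding
  }
  return {rowIndices, colIndices};
}
```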

When do_rotary is True, cos_cache and sin_cache are required. Note that the maximum sequence length supported by the cos
or sin cache can differ from the maximum sequence length used by the kv cache.

This operator only supports unidirectional attention, with the past key and value cached in linear buffers.

For performance, past_key and present_key share the same memory buffer, as do past_value and present_value.

#### Version
@@ -5583,7 +5601,7 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>Number of tokens per sparse block. Choices: 16, 32, 64, 128</dd>
</dl>

#### Inputs (8 - 10)
#### Inputs (9 - 11)

<dl>
<dt><tt>query</tt> : T</dt>
@@ -5592,20 +5610,22 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>Key with shape (batch_size, sequence_length, kv_num_heads * head_size)</dd>
<dt><tt>value</tt> (optional) : T</dt>
<dd>Value with shape (batch_size, sequence_length, kv_num_heads * head_size)</dd>
<dt><tt>past_key</tt> (optional) : T</dt>
<dd>Key cache with shape (batch_size, kv_num_heads, max_sequence_length, head_size)</dd>
<dt><tt>past_value</tt> (optional) : T</dt>
<dd>Value cache with shape (batch_size, kv_num_heads, max_sequence_length, head_size)</dd>
<dt><tt>block_mask</tt> : M</dt>
<dd>block mask. 1 indicates attention and 0 no attention. Its shape is (num_layout, max_blocks, max_blocks), where num_heads is divisible by num_layout, and max_blocks is max_sequence_length / sparse_block_size.</dd>
<dt><tt>past_key</tt> : T</dt>
<dd>Key cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size)</dd>
<dt><tt>past_value</tt> : T</dt>
<dd>Value cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size)</dd>
<dt><tt>block_row_indices</tt> : M</dt>
<dd>The row indices of the CSR format of the block mask, with shape (num_layout, max_blocks + 1). num_heads is divisible by num_layout, and max_blocks is max_sequence_length / sparse_block_size.</dd>
<dt><tt>block_col_indices</tt> : M</dt>
<dd>The column indices of the CSR format of the block mask, with shape (num_layout, max_nnz_blocks). max_nnz_blocks is the maximum number of non-zero blocks per layout in the block mask.</dd>
<dt><tt>total_sequence_length</tt> : M</dt>
<dd>Scalar tensor of maximum total sequence length (past_sequence_length + sequence_length) among keys.</dd>
<dt><tt>key_total_sequence_lengths</tt> : M</dt>
<dd>1D tensor with shape (batch_size) where each value is the total sequence length of the key excluding padding.</dd>
<dt><tt>cos_cache</tt> (optional) : T</dt>
<dd>Cos cache of rotary with shape (max_sequence_length, head_size / 2).</dd>
<dd>Cos cache of rotary with shape (max_rotary_sequence_length, head_size / 2).</dd>
<dt><tt>sin_cache</tt> (optional) : T</dt>
<dd>Sin cache of rotary with shape (max_sequence_length, head_size / 2).</dd>
<dd>Sin cache of rotary with shape (max_rotary_sequence_length, head_size / 2).</dd>
</dl>

#### Outputs
@@ -5614,9 +5634,9 @@
<dt><tt>output</tt> : T</dt>
<dd>3D output tensor with shape (batch_size, sequence_length, num_heads * head_size)</dd>
<dt><tt>present_key</tt> : T</dt>
<dd>Updated key cache with shape (batch_size, kv_num_heads, max_sequence_length, head_size).</dd>
<dd>Updated key cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size).</dd>
<dt><tt>present_value</tt> : T</dt>
<dd>Updated value cache with shape (batch_size, kv_num_heads, max_sequence_length, head_size).</dd>
<dd>Updated value cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size).</dd>
</dl>

#### Type Constraints
35 changes: 14 additions & 21 deletions docs/Model_Test.md
@@ -1,27 +1,20 @@
ONNX has a collection of standard tests. This document describes how to run these tests through a C++ program named 'onnx_test_runner' in this repo. You could also run these tests through the onnxruntime python binding, which is much easier to set up but a bit harder to debug.

# Get the test data
You should have:
1. onnx single node test data
2. onnx model zoo models

## Install onnx python package
You can get the onnx python package from [pypi](https://pypi.org/). However, if you are an onnxruntime developer, you may need to work on a cutting-edge ONNX version. In this case, you need to build and install ONNX from source code.

### Install ONNX from source code
1. (windows) set ONNX_ML=1
(linux) export ONNX_ML=1
2. Install protobuf and put protoc on your PATH. When you compile protobuf, it's better to enable only the static libraries.
3. Run "python setup.py bdist_wheel" and "pip install dist/*.whl"

## Generate node test data
$ python3 -m onnx.backend.test.cmd_tools generate-data -o <dest_folder>
e.g.
python3 -m onnx.backend.test.cmd_tools generate-data -o C:\testdata


## Get more models
Download https://onnxruntimetestdata.blob.core.windows.net/models/20190419.zip and unzip it.
```
git submodule update --init --recursive
pushd .
cd cmake/external/emsdk
./emsdk install latest
./emsdk activate latest
source ./emsdk_env.sh
popd
cd js
npm install
npm run prepare-node-tests
```

In addition, you can get more test models with their test data from https://github.com/onnx/models.


# Compile onnx_test_runner and run the tests
24 changes: 11 additions & 13 deletions docs/ORTModule_Training_Guidelines.md
@@ -208,19 +208,6 @@ debugging).
export ORTMODULE_ENABLE_COMPUTE_OPTIMIZER=0 # Disable
```

#### ORTMODULE_ENABLE_SPARSE_OPTIMIZER

- **Feature Area**: *ORTMODULE/Optimizations*
- **Description**: By default, this is enabled. This env var can be used for enabling or disabling the input data sparsity
based performance optimizations, including embedding sparsity and label sparsity.
This optimization is applicable when using optimum, which has an implementation of the ModuleWithLoss class that wraps the HuggingFace Trainer and allows loss computation inside ONNX Runtime (ORT).
If you're not using optimum but want to implement a similar wrapper in your codebase to compute the loss inside ONNX Runtime (ORT), you can refer to this [Link](ORTModule_ModuleWithLoss_Wrapper.md) for detailed steps and guidelines on how to achieve this.

```bash
export ORTMODULE_ENABLE_SPARSE_OPTIMIZER=1 # Enable
export ORTMODULE_ENABLE_SPARSE_OPTIMIZER=0 # Disable
```

#### ORTMODULE_PRINT_INPUT_DENSITY

- **Feature Area**: *ORTMODULE/RuntimeInspector*
@@ -254,6 +241,17 @@ data sparsity based performance optimizations.
export ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER=0 # Disable
```

#### ORTMODULE_ENABLE_LABEL_SPARSE_OPTIMIZER

- **Feature Area**: *ORTMODULE/Optimizations*
- **Description**: By default, this is enabled. This env var can be used for enabling or disabling the label input
data sparsity based performance optimizations.

```bash
export ORTMODULE_ENABLE_LABEL_SPARSE_OPTIMIZER=1 # Enable
export ORTMODULE_ENABLE_LABEL_SPARSE_OPTIMIZER=0 # Disable
```

#### ORTMODULE_CACHE_DIR

- **Feature Area**: *ORTMODULE/RuntimeOptions*
2 changes: 1 addition & 1 deletion docs/OperatorKernels.md
@@ -906,7 +906,7 @@ Do not modify directly.*
|SkipGroupNorm|*in* X:**T**<br> *in* gamma:**M**<br> *in* beta:**M**<br> *in* skip:**T**<br> *in* bias:**T**<br> *out* Y:**T**<br> *out* S:**T**|1+|**T** = tensor(float), tensor(float16)|
|SkipLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* beta:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**<br> *out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
|SkipSimplifiedLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**<br> *out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
|SparseAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *in* block_mask:**M**<br> *in* total_sequence_length:**M**<br> *in* key_total_sequence_lengths:**M**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**M** = tensor(int32)<br/> **T** = tensor(bfloat16), tensor(float16)|
|SparseAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *in* block_row_indices:**M**<br> *in* block_col_indices:**M**<br> *in* total_sequence_length:**M**<br> *in* key_total_sequence_lengths:**M**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**M** = tensor(int32)<br/> **T** = tensor(bfloat16), tensor(float16)|
|TransposeMatMul|*in* A:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|Trilu|*in* X:**T**<br> *in* k:**tensor(int64)**<br> *out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|UnfoldTensor|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
5 changes: 5 additions & 0 deletions js/common/lib/inference-session.ts
@@ -201,6 +201,7 @@ export declare namespace InferenceSession {
webgl: WebGLExecutionProviderOption;
webgpu: WebGpuExecutionProviderOption;
webnn: WebNNExecutionProviderOption;
qnn: QnnExecutionProviderOption;
xnnpack: XnnpackExecutionProviderOption;
}

@@ -247,6 +248,10 @@ export declare namespace InferenceSession {
numThreads?: number;
powerPreference?: 'default'|'low-power'|'high-performance';
}
export interface QnnExecutionProviderOption extends ExecutionProviderOption {
readonly name: 'qnn';
// TODO add flags
}
export interface CoreMLExecutionProviderOption extends ExecutionProviderOption {
readonly name: 'coreml';
/**
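A minimal usage sketch for the new option, assuming an onnxruntime-node build with QNN enabled and a local model.onnx (both assumptions; provider flags are still a TODO, so the native binding's defaults apply — see the session_options_helper.cc change below):

```typescript
// Hypothetical usage of the new 'qnn' execution provider option.
import * as ort from 'onnxruntime-node';

async function main() {
  // 'model.onnx' is a placeholder; any ONNX model file works here.
  const session = await ort.InferenceSession.create('model.onnx', {
    executionProviders: [{name: 'qnn'}],
  });
  console.log(session.inputNames, session.outputNames);
}

main().catch((e) => console.error(e));
```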
4 changes: 4 additions & 0 deletions js/node/CMakeLists.txt
@@ -37,6 +37,7 @@ option(USE_DML "Build with DirectML support" OFF)
option(USE_CUDA "Build with CUDA support" OFF)
option(USE_TENSORRT "Build with TensorRT support" OFF)
option(USE_COREML "Build with CoreML support" OFF)
option(USE_QNN "Build with QNN support" OFF)

if(USE_DML)
add_compile_definitions(USE_DML=1)
@@ -50,6 +51,9 @@ endif()
if(USE_COREML)
add_compile_definitions(USE_COREML=1)
endif()
if(USE_QNN)
add_compile_definitions(USE_QNN=1)
endif()

# source files
file(GLOB ORT_NODEJS_BINDING_SOURCE_FILES ${CMAKE_SOURCE_DIR}/src/*.cc)
5 changes: 5 additions & 0 deletions js/node/script/build.ts
@@ -35,6 +35,8 @@ const USE_CUDA = !!buildArgs.use_cuda;
const USE_TENSORRT = !!buildArgs.use_tensorrt;
// --use_coreml
const USE_COREML = !!buildArgs.use_coreml;
// --use_qnn
const USE_QNN = !!buildArgs.use_qnn;

// build path
const ROOT_FOLDER = path.join(__dirname, '..');
@@ -72,6 +74,9 @@ if (USE_TENSORRT) {
if (USE_COREML) {
args.push('--CDUSE_COREML=ON');
}
if (USE_QNN) {
args.push('--CDUSE_QNN=ON');
}
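// Note: the cmake target above passes --use_qnn through `npm run build --`, which lands
// here and forwards USE_QNN=ON to CMake via this cmake-js --CD define.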

// set CMAKE_OSX_ARCHITECTURES for macOS build
if (os.platform() === 'darwin') {
3 changes: 3 additions & 0 deletions js/node/src/inference_session_wrap.cc
@@ -252,6 +252,9 @@ Napi::Value InferenceSessionWrap::ListSupportedBackends(const Napi::CallbackInfo
#ifdef USE_COREML
result.Set(result.Length(), createObject("coreml", true));
#endif
#ifdef USE_QNN
result.Set(result.Length(), createObject("qnn", true));
#endif

return scope.Escape(result);
}
7 changes: 7 additions & 0 deletions js/node/src/session_options_helper.cc
@@ -80,6 +80,13 @@ void ParseExecutionProviders(const Napi::Array epList, Ort::SessionOptions &sess
#ifdef USE_COREML
} else if (name == "coreml") {
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreMlFlags));
#endif
#ifdef USE_QNN
} else if (name == "qnn") {
std::unordered_map<std::string, std::string> qnn_options;
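// Default to the QNN HTP (NPU) backend with fp16 precision enabled; these are not yet
// configurable from JS (see the "TODO add flags" note in inference-session.ts).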
qnn_options["backend_path"] = "QnnHtp.dll";
qnn_options["enable_htp_fp16_precision"] = "1";
sessionOptions.AppendExecutionProvider("QNN", qnn_options);
#endif
} else {
ORT_NAPI_THROW_ERROR(epList.Env(), "Invalid argument: sessionOptions.executionProviders[", i,
@@ -7,6 +7,7 @@
import ai.onnxruntime.OnnxValue;
import ai.onnxruntime.OrtEnvironment;
import ai.onnxruntime.OrtException;
import ai.onnxruntime.OrtLoggingLevel;
import ai.onnxruntime.OrtSession;
import ai.onnxruntime.OrtSession.Result;
import ai.onnxruntime.OrtSession.RunOptions;
@@ -421,7 +422,7 @@ private SessionOptions parseSessionOptions(ReadableMap options) throws OrtExcept

if (options.hasKey("logSeverityLevel")) {
int logSeverityLevel = options.getInt("logSeverityLevel");
sessionOptions.setSessionLogVerbosityLevel(logSeverityLevel);
sessionOptions.setSessionLogLevel(OrtLoggingLevel.mapFromInt(logSeverityLevel));
}

return sessionOptions;
@@ -432,7 +433,7 @@ private RunOptions parseRunOptions(ReadableMap options) throws OrtException {

if (options.hasKey("logSeverityLevel")) {
int logSeverityLevel = options.getInt("logSeverityLevel");
runOptions.setLogVerbosityLevel(logSeverityLevel);
runOptions.setLogLevel(OrtLoggingLevel.mapFromInt(logSeverityLevel));
}

if (options.hasKey("tag")) {
9 changes: 7 additions & 2 deletions js/react_native/ios/OnnxruntimeJSIHelper.mm
@@ -5,11 +5,16 @@

@implementation OnnxruntimeJSIHelper

@synthesize bridge = _bridge;

RCT_EXPORT_MODULE()

- (void)setBridge:(RCTBridge *)bridge {
_bridge = bridge;
}

RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(install) {
RCTBridge *bridge = [RCTBridge currentBridge];
RCTCxxBridge *cxxBridge = (RCTCxxBridge *)bridge;
RCTCxxBridge *cxxBridge = (RCTCxxBridge *)_bridge;
if (cxxBridge == nil) {
return @false;
}
1 change: 1 addition & 0 deletions js/web/docs/webgpu-operators.md
@@ -54,6 +54,7 @@ Do not modify directly.*
| GlobalMaxPool | ai.onnx(1+); com.ms.internal.nhwc(1+) | |
| Greater | ai.onnx(7-8,9-12,13+) | |
| GreaterOrEqual | ai.onnx(12-15,16+) | |
| GroupQueryAttention | com.microsoft(1+) | |
| HardSigmoid | ai.onnx(6+) | |
| If | ai.onnx(1-10,11-12,13-18,19+) | |
| InstanceNormalization | ai.onnx(6+); com.ms.internal.nhwc(6+) | |
2 changes: 2 additions & 0 deletions js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -18,6 +18,7 @@ import {fastGelu} from './ops/fast-gelu';
import {gather, parseGatherAttributes} from './ops/gather';
import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements';
import {gemm, parseGemmAttributes} from './ops/gemm';
import {groupQueryAttention, parseGroupQueryAttentionAttributes} from './ops/group-query-attention';
import {instanceNorm} from './ops/instance-norm';
import {layerNorm} from './ops/layer-norm';
import {matMul} from './ops/matmul';
@@ -88,6 +89,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
['GlobalMaxPool', [pool.globalMaxPool, pool.parseGlobalMaxPoolAttributes]],
['Greater', [binaryOps.greater]],
['GreaterOrEqual', [binaryOps.greaterOrEqual]],
['GroupQueryAttention', [groupQueryAttention, parseGroupQueryAttentionAttributes]],
['HardSigmoid', [unaryOps.hardSigmoid, unaryOps.parseHardSigmoidAttributes]],
['InstanceNormalization', [instanceNorm]],
['LayerNormalization', [layerNorm]],
