Merge branch 'main' into edgchen1/matmul_nbits_bias
edgchen1 committed May 15, 2024
2 parents b90c8ca + f5bfbd6 commit 92ce6eb
Showing 97 changed files with 3,217 additions and 2,897 deletions.
6 changes: 6 additions & 0 deletions cmake/onnxruntime_common.cmake
@@ -71,6 +71,12 @@ if(onnxruntime_target_platform STREQUAL "ARM64EC")
endif()
endif()

if(onnxruntime_target_platform STREQUAL "ARM64")
if (MSVC)
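# /bigobj lifts MSVC's per-object-file section limit (65,279 by default); presumably some ARM64 objects exceed it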
add_compile_options("/bigobj")
endif()
endif()

file(GLOB onnxruntime_common_src CONFIGURE_DEPENDS
${onnxruntime_common_src_patterns}
)
5 changes: 4 additions & 1 deletion cmake/onnxruntime_nodejs.cmake
@@ -73,6 +73,9 @@ endif()
if (onnxruntime_USE_COREML)
set(NODEJS_BINDING_USE_COREML "--use_coreml")
endif()
if (onnxruntime_USE_QNN)
set(NODEJS_BINDING_USE_QNN "--use_qnn")
endif()

if(NOT onnxruntime_ENABLE_STATIC_ANALYSIS)
# add custom target
@@ -90,7 +93,7 @@ add_custom_target(nodejs_binding_wrapper ALL
COMMAND ${NPM_CLI} ci
COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR}
--arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_TENSORRT}
${NODEJS_BINDING_USE_COREML}
${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
WORKING_DIRECTORY ${JS_NODE_ROOT}
COMMENT "Using cmake-js to build OnnxRuntime Node.js binding")

4 changes: 4 additions & 0 deletions csharp/OnnxRuntime.CSharp.proj
@@ -50,8 +50,12 @@ CMake creates a target to this project
<PropertyGroup>
<!-- If we create multiple nuget packages in one job, major package and dependent packages version should be the same-->
<!-- CurrentDate and CurrentTime are only used for dev packages-->
<CurrentDate Condition=" '$(BuildDate)'!='' ">$(BuildDate)</CurrentDate>
<CurrentTime Condition=" '$(BuildTime)'!='' ">$(BuildTime)</CurrentTime>
<CurrentDate Condition="'$(CurrentDate)'==''">$([System.DateTime]::UtcNow.ToString(yyyyMMdd))</CurrentDate>
<CurrentTime Condition="'$(CurrentTime)'==''">$([System.DateTime]::UtcNow.ToString(hhmm))</CurrentTime>


</PropertyGroup>

<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
46 changes: 33 additions & 13 deletions docs/ContribOperators.md
@@ -5555,11 +5555,29 @@ This version of the operator has been available since version 1 of the 'com.micr
When the number of sparse layouts is 1, all heads share the same sparse layout. Otherwise, layouts are assigned to heads cyclically.
For example, given 4 layouts (S0, S1, S2, S3), 8 heads get layouts (S0, S1, S2, S3, S0, S1, S2, S3).

The block_row_indices and block_col_indices inputs are the CSR representation of the block mask. When layouts have
different numbers of non-zero blocks, block_col_indices is padded on the right side so that all layouts share the same length.

When do_rotary is True, cos_cache and sin_cache are required.
An example block mask with 2 layouts, where each layout is 4 x 4 blocks:
[[[1, 0, 0, 0],
[1, 1, 0, 0],
[0, 1, 1, 0],
[0, 1, 1, 1]],

[[1, 0, 0, 0],
[1, 1, 0, 0],
[1, 1, 1, 0],
[1, 0, 1, 1]]]

The corresponding CSR format:
block_col_indices = [[0, 0, 1, 1, 2, 1, 2, 3, -1], [0, 0, 1, 0, 1, 2, 0, 2, 3]]
block_row_indices = [[0, 1, 3, 5, 8], [0, 1, 3, 6, 9]]
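As a sanity check, here is a minimal TypeScript sketch (a hypothetical helper, not part of ONNX Runtime) that derives this CSR form from dense block masks; running it on the two layouts above reproduces the block_row_indices and block_col_indices values exactly:

```typescript
// Hypothetical helper: converts dense block masks to the CSR form described above,
// padding block_col_indices on the right with -1 so all layouts share one length.
function blockMaskToCsr(masks: number[][][]): {rowIndices: number[][]; colIndices: number[][]} {
  const rowIndices: number[][] = [];
  const colIndices: number[][] = [];
  for (const mask of masks) {
    const rows = [0];
    const cols: number[] = [];
    for (const row of mask) {
      row.forEach((v, c) => {
        if (v) cols.push(c); // record the column of every non-zero block
      });
      rows.push(cols.length); // cumulative non-zero count after each row
    }
    rowIndices.push(rows);
    colIndices.push(cols);
  }
  const maxNnz = Math.max(...colIndices.map((c) => c.length));
  for (const cols of colIndices) {
    while (cols.length < maxNnz) cols.push(-1); // right-side padding
  }
  return {rowIndices, colIndices};
}
```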

When do_rotary is True, cos_cache and sin_cache are required. Note that the maximum sequence length supported by the cos
or sin cache can differ from the maximum sequence length used by the kv cache.

This operator only supports unidirectional attention, with the past key and value cached in linear buffers.

For performance, past_key and present_key share the same memory buffer, as do past_value and present_value.

#### Version
@@ -5583,7 +5601,7 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>Number of tokens per sparse block. Choices: 16, 32, 64, 128</dd>
</dl>

#### Inputs (8 - 10)
#### Inputs (9 - 11)

<dl>
<dt><tt>query</tt> : T</dt>
@@ -5592,20 +5610,22 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>Key with shape (batch_size, sequence_length, kv_num_heads * head_size)</dd>
<dt><tt>value</tt> (optional) : T</dt>
<dd>Value with shape (batch_size, sequence_length, kv_num_heads * head_size)</dd>
<dt><tt>past_key</tt> (optional) : T</dt>
<dd>Key cache with shape (batch_size, kv_num_heads, max_sequence_length, head_size)</dd>
<dt><tt>past_value</tt> (optional) : T</dt>
<dd>Value cache with shape (batch_size, kv_num_heads, max_sequence_length, head_size)</dd>
<dt><tt>block_mask</tt> : M</dt>
<dd>block mask. 1 indicates attention and 0 no attention. Its shape is (num_layout, max_blocks, max_blocks), where num_heads is divisible by num_layout, and max_blocks is max_sequence_length / sparse_block_size.</dd>
<dt><tt>past_key</tt> : T</dt>
<dd>Key cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size)</dd>
<dt><tt>past_value</tt> : T</dt>
<dd>Value cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size)</dd>
<dt><tt>block_row_indices</tt> : M</dt>
<dd>The row indices of the CSR format of the block mask, with shape (num_layout, max_blocks + 1). num_heads is divisible by num_layout, and max_blocks is max_sequence_length / sparse_block_size.</dd>
<dt><tt>block_col_indices</tt> : M</dt>
<dd>The column indices of the CSR format of the block mask, with shape (num_layout, max_nnz_blocks). max_nnz_blocks is the maximum number of non-zero blocks per layout in the block mask.</dd>
<dt><tt>total_sequence_length</tt> : M</dt>
<dd>Scalar tensor of maximum total sequence length (past_sequence_length + sequence_length) among keys.</dd>
<dt><tt>key_total_sequence_lengths</tt> : M</dt>
<dd>1D tensor with shape (batch_size) where each value is the total sequence length of the key excluding padding.</dd>
<dt><tt>cos_cache</tt> (optional) : T</dt>
<dd>Cos cache of rotary with shape (max_sequence_length, head_size / 2).</dd>
<dd>Cos cache of rotary with shape (max_rotary_sequence_length, head_size / 2).</dd>
<dt><tt>sin_cache</tt> (optional) : T</dt>
<dd>Sin cache of rotary with shape (max_sequence_length, head_size / 2).</dd>
<dd>Sin cache of rotary with shape (max_rotary_sequence_length, head_size / 2).</dd>
</dl>

#### Outputs
@@ -5614,9 +5634,9 @@
<dt><tt>output</tt> : T</dt>
<dd>3D output tensor with shape (batch_size, sequence_length, num_heads * head_size)</dd>
<dt><tt>present_key</tt> : T</dt>
<dd>Updated key cache with shape (batch_size, kv_num_heads, max_sequence_length, head_size).</dd>
<dd>Updated key cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size).</dd>
<dt><tt>present_value</tt> : T</dt>
<dd>Updated value cache with shape (batch_size, kv_num_heads, max_sequence_length, head_size).</dd>
<dd>Updated value cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size).</dd>
</dl>

#### Type Constraints
35 changes: 14 additions & 21 deletions docs/Model_Test.md
@@ -1,27 +1,20 @@
ONNX has a collection of standard tests. This document describes how to run these tests through a C++ program named 'onnx_test_runner' in this repo. You could also run these tests through the onnxruntime python binding, which is much easier to set up but a bit harder to debug.

# Get the test data
You should have:
1. onnx single node test data
2. onnx model zoo models

## Install onnx python package
You can get the onnx python package from [pypi](https://pypi.org/). However, if you are an onnxruntime developer, you may need to work on a cutting-edge ONNX version. In this case, you need to build and install ONNX from source code.

### Install ONNX from source code
1. (windows) set ONNX_ML=1
(linux) export ONNX_ML=1
2. Install protobuf and put protoc on your PATH. When you compile protobuf, it's better to enable only the static libraries.
3. Run "python setup.py bdist_wheel" and "pip install dist/*.whl"

## Generate node test data
$ python3 -m onnx.backend.test.cmd_tools generate-data -o <dest_folder>
e.g.
python3 -m onnx.backend.test.cmd_tools generate-data -o C:\testdata


## Get more models
Download https://onnxruntimetestdata.blob.core.windows.net/models/20190419.zip and unzip it.
```
git submodule update --init --recursive
pushd .
cd cmake/external/emsdk
./emsdk install latest
./emsdk activate latest
source ./emsdk_env.sh
popd
cd js
npm install
npm run prepare-node-tests
```

In addition, you can get more test models with their test data from https://github.com/onnx/models.


# Compile onnx_test_runner and run the tests
24 changes: 11 additions & 13 deletions docs/ORTModule_Training_Guidelines.md
@@ -208,19 +208,6 @@ debugging).
export ORTMODULE_ENABLE_COMPUTE_OPTIMIZER=0 # Disable
```

#### ORTMODULE_ENABLE_SPARSE_OPTIMIZER

- **Feature Area**: *ORTMODULE/Optimizations*
- **Description**: By default, this is enabled. This env var can be used for enabling or disabling the input data sparsity
based performance optimizations, including embedding sparsity and label sparsity.
This optimization is applicable when using optimum, which has an implementation of the ModuleWithLoss class that wraps the HuggingFace Trainer and allows loss computation inside ONNX Runtime (ORT).
If you're not using optimum but want to implement a similar wrapper in your codebase to compute the loss inside ONNX Runtime (ORT), you can refer to this [Link](ORTModule_ModuleWithLoss_Wrapper.md) for detailed steps and guidelines on how to achieve this.

```bash
export ORTMODULE_ENABLE_SPARSE_OPTIMIZER=1 # Enable
export ORTMODULE_ENABLE_SPARSE_OPTIMIZER=0 # Disable
```

#### ORTMODULE_PRINT_INPUT_DENSITY

- **Feature Area**: *ORTMODULE/RuntimeInspector*
@@ -254,6 +241,17 @@ data sparsity based performance optimizations.
export ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER=0 # Disable
```

#### ORTMODULE_ENABLE_LABEL_SPARSE_OPTIMIZER

- **Feature Area**: *ORTMODULE/Optimizations*
- **Description**: By default, this is enabled. This env var can be used for enabling or disabling the label input
data sparsity based performance optimizations.

```bash
export ORTMODULE_ENABLE_LABEL_SPARSE_OPTIMIZER=1 # Enable
export ORTMODULE_ENABLE_LABEL_SPARSE_OPTIMIZER=0 # Disable
```

#### ORTMODULE_CACHE_DIR

- **Feature Area**: *ORTMODULE/RuntimeOptions*
2 changes: 1 addition & 1 deletion docs/OperatorKernels.md
@@ -906,7 +906,7 @@ Do not modify directly.*
|SkipGroupNorm|*in* X:**T**<br> *in* gamma:**M**<br> *in* beta:**M**<br> *in* skip:**T**<br> *in* bias:**T**<br> *out* Y:**T**<br> *out* S:**T**|1+|**T** = tensor(float), tensor(float16)|
|SkipLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* beta:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**<br> *out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
|SkipSimplifiedLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**<br> *out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
|SparseAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *in* block_mask:**M**<br> *in* total_sequence_length:**M**<br> *in* key_total_sequence_lengths:**M**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**M** = tensor(int32)<br/> **T** = tensor(bfloat16), tensor(float16)|
|SparseAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *in* block_row_indices:**M**<br> *in* block_col_indices:**M**<br> *in* total_sequence_length:**M**<br> *in* key_total_sequence_lengths:**M**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**M** = tensor(int32)<br/> **T** = tensor(bfloat16), tensor(float16)|
|TransposeMatMul|*in* A:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|Trilu|*in* X:**T**<br> *in* k:**tensor(int64)**<br> *out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|UnfoldTensor|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
5 changes: 5 additions & 0 deletions js/common/lib/inference-session.ts
@@ -201,6 +201,7 @@ export declare namespace InferenceSession {
webgl: WebGLExecutionProviderOption;
webgpu: WebGpuExecutionProviderOption;
webnn: WebNNExecutionProviderOption;
qnn: QnnExecutionProviderOption;
xnnpack: XnnpackExecutionProviderOption;
}

@@ -247,6 +248,10 @@ export declare namespace InferenceSession {
numThreads?: number;
powerPreference?: 'default'|'low-power'|'high-performance';
}
export interface QnnExecutionProviderOption extends ExecutionProviderOption {
readonly name: 'qnn';
// TODO add flags
}
export interface CoreMLExecutionProviderOption extends ExecutionProviderOption {
readonly name: 'coreml';
/**
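A minimal usage sketch for the new option, assuming an onnxruntime-node build with QNN enabled and a local model.onnx (both assumptions; provider flags are still a TODO, so the native binding's defaults apply — see the session_options_helper.cc change below):

```typescript
// Hypothetical usage of the new 'qnn' execution provider option.
import * as ort from 'onnxruntime-node';

async function main() {
  // 'model.onnx' is a placeholder; any ONNX model file works here.
  const session = await ort.InferenceSession.create('model.onnx', {
    executionProviders: [{name: 'qnn'}],
  });
  console.log(session.inputNames, session.outputNames);
}

main().catch((e) => console.error(e));
```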
4 changes: 4 additions & 0 deletions js/node/CMakeLists.txt
@@ -37,6 +37,7 @@ option(USE_DML "Build with DirectML support" OFF)
option(USE_CUDA "Build with CUDA support" OFF)
option(USE_TENSORRT "Build with TensorRT support" OFF)
option(USE_COREML "Build with CoreML support" OFF)
option(USE_QNN "Build with QNN support" OFF)

if(USE_DML)
add_compile_definitions(USE_DML=1)
@@ -50,6 +51,9 @@ endif()
if(USE_COREML)
add_compile_definitions(USE_COREML=1)
endif()
if(USE_QNN)
add_compile_definitions(USE_QNN=1)
endif()

# source files
file(GLOB ORT_NODEJS_BINDING_SOURCE_FILES ${CMAKE_SOURCE_DIR}/src/*.cc)
5 changes: 5 additions & 0 deletions js/node/script/build.ts
@@ -35,6 +35,8 @@ const USE_CUDA = !!buildArgs.use_cuda;
const USE_TENSORRT = !!buildArgs.use_tensorrt;
// --use_coreml
const USE_COREML = !!buildArgs.use_coreml;
// --use_qnn
const USE_QNN = !!buildArgs.use_qnn;

// build path
const ROOT_FOLDER = path.join(__dirname, '..');
@@ -72,6 +74,9 @@ if (USE_TENSORRT) {
if (USE_COREML) {
args.push('--CDUSE_COREML=ON');
}
if (USE_QNN) {
args.push('--CDUSE_QNN=ON');
}
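// Note: the cmake target above passes --use_qnn through `npm run build --`, which lands
// here and forwards USE_QNN=ON to CMake via this cmake-js --CD define.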

// set CMAKE_OSX_ARCHITECTURES for macOS build
if (os.platform() === 'darwin') {
3 changes: 3 additions & 0 deletions js/node/src/inference_session_wrap.cc
@@ -252,6 +252,9 @@ Napi::Value InferenceSessionWrap::ListSupportedBackends(const Napi::CallbackInfo
#ifdef USE_COREML
result.Set(result.Length(), createObject("coreml", true));
#endif
#ifdef USE_QNN
result.Set(result.Length(), createObject("qnn", true));
#endif

return scope.Escape(result);
}
7 changes: 7 additions & 0 deletions js/node/src/session_options_helper.cc
@@ -80,6 +80,13 @@ void ParseExecutionProviders(const Napi::Array epList, Ort::SessionOptions &sess
#ifdef USE_COREML
} else if (name == "coreml") {
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreMlFlags));
#endif
#ifdef USE_QNN
} else if (name == "qnn") {
std::unordered_map<std::string, std::string> qnn_options;
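// Default to the QNN HTP (NPU) backend with fp16 precision enabled; these are not yet
// configurable from JS (see the "TODO add flags" note in inference-session.ts).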
qnn_options["backend_path"] = "QnnHtp.dll";
qnn_options["enable_htp_fp16_precision"] = "1";
sessionOptions.AppendExecutionProvider("QNN", qnn_options);
#endif
} else {
ORT_NAPI_THROW_ERROR(epList.Env(), "Invalid argument: sessionOptions.executionProviders[", i,
@@ -7,6 +7,7 @@
import ai.onnxruntime.OnnxValue;
import ai.onnxruntime.OrtEnvironment;
import ai.onnxruntime.OrtException;
import ai.onnxruntime.OrtLoggingLevel;
import ai.onnxruntime.OrtSession;
import ai.onnxruntime.OrtSession.Result;
import ai.onnxruntime.OrtSession.RunOptions;
@@ -421,7 +422,7 @@ private SessionOptions parseSessionOptions(ReadableMap options) throws OrtExcept

if (options.hasKey("logSeverityLevel")) {
int logSeverityLevel = options.getInt("logSeverityLevel");
sessionOptions.setSessionLogVerbosityLevel(logSeverityLevel);
sessionOptions.setSessionLogLevel(OrtLoggingLevel.mapFromInt(logSeverityLevel));
}

return sessionOptions;
@@ -432,7 +433,7 @@ private RunOptions parseRunOptions(ReadableMap options) throws OrtException {

if (options.hasKey("logSeverityLevel")) {
int logSeverityLevel = options.getInt("logSeverityLevel");
runOptions.setLogVerbosityLevel(logSeverityLevel);
runOptions.setLogLevel(OrtLoggingLevel.mapFromInt(logSeverityLevel));
}

if (options.hasKey("tag")) {
9 changes: 7 additions & 2 deletions js/react_native/ios/OnnxruntimeJSIHelper.mm
@@ -5,11 +5,16 @@

@implementation OnnxruntimeJSIHelper

@synthesize bridge = _bridge;

RCT_EXPORT_MODULE()

- (void)setBridge:(RCTBridge *)bridge {
_bridge = bridge;
}

RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(install) {
RCTBridge *bridge = [RCTBridge currentBridge];
RCTCxxBridge *cxxBridge = (RCTCxxBridge *)bridge;
RCTCxxBridge *cxxBridge = (RCTCxxBridge *)_bridge;
if (cxxBridge == nil) {
return @false;
}
1 change: 1 addition & 0 deletions js/web/docs/webgpu-operators.md
@@ -54,6 +54,7 @@ Do not modify directly.*
| GlobalMaxPool | ai.onnx(1+); com.ms.internal.nhwc(1+) | |
| Greater | ai.onnx(7-8,9-12,13+) | |
| GreaterOrEqual | ai.onnx(12-15,16+) | |
| GroupQueryAttention | com.microsoft(1+) | |
| HardSigmoid | ai.onnx(6+) | |
| If | ai.onnx(1-10,11-12,13-18,19+) | |
| InstanceNormalization | ai.onnx(6+); com.ms.internal.nhwc(6+) | |
2 changes: 2 additions & 0 deletions js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -18,6 +18,7 @@ import {fastGelu} from './ops/fast-gelu';
import {gather, parseGatherAttributes} from './ops/gather';
import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements';
import {gemm, parseGemmAttributes} from './ops/gemm';
import {groupQueryAttention, parseGroupQueryAttentionAttributes} from './ops/group-query-attention';
import {instanceNorm} from './ops/instance-norm';
import {layerNorm} from './ops/layer-norm';
import {matMul} from './ops/matmul';
@@ -88,6 +89,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
['GlobalMaxPool', [pool.globalMaxPool, pool.parseGlobalMaxPoolAttributes]],
['Greater', [binaryOps.greater]],
['GreaterOrEqual', [binaryOps.greaterOrEqual]],
['GroupQueryAttention', [groupQueryAttention, parseGroupQueryAttentionAttributes]],
['HardSigmoid', [unaryOps.hardSigmoid, unaryOps.parseHardSigmoidAttributes]],
['InstanceNormalization', [instanceNorm]],
['LayerNormalization', [layerNorm]],
