Support zeroCopy in WebGPUData
axinging committed Nov 22, 2022
1 parent 54ed5e9 commit 8a7a39c
Showing 5 changed files with 63 additions and 77 deletions.
28 changes: 13 additions & 15 deletions tfjs-backend-webgpu/src/backend_webgpu.ts
@@ -51,12 +51,9 @@ type TensorData = {
shape: number[],
refCount: number,
resourceInfo?: BufferInfo|TextureInfo,
// zeroCopy is used for creating tensor from GPUBuffer. When zeroCopy is false
// or undefined (default), this GPUBuffer will be copied to the tensor's
// resource buffer. When zeroCopy is true, tensor will use this GPUBUffer as
// tensor's resource buffer, user should not destroy this GPUBuffer until all
// access are done.
zeroCopy?: boolean,
// When external is true, the tensor uses the resource provided by the user
// directly (without a copy), so the user is responsible for releasing it.
external?: boolean,
// For complex numbers, the real and imaginary parts are stored as their own
// individual tensors, with a parent joining the two with the
// complexTensorInfos field.
@@ -248,9 +245,8 @@ export class WebGPUBackend extends KernelBackend {
if (!tensorData || !tensorData.resourceInfo) {
return;
}
// If tensor's resource buffer is from a zero copy GPUBuffer, do not
// release.
if (tensorData.zeroCopy) {
// If the tensor's resource is external, do not release it.
if (tensorData.external) {
tensorData.resourceInfo = null;
return;
}
@@ -472,23 +468,25 @@
throw new Error(`Cannot write to a complex64 dtype. `);
}
const dataId = {id: this.nextDataId()};
const zeroCopy = env().getBool('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY');
this.tensorMap.set(
dataId, {dtype, shape, values: null, refCount: 1, zeroCopy});
dataId,
{dtype, shape, values: null, refCount: 1, external: values.zeroCopy});
const tensorData = this.tensorMap.get(dataId);
const size = webgpu_util.GPUBytesPerElement(tensorData.dtype) *
util.sizeFromShape(tensorData.shape);
if (values.buffer.size < size) {
throw new Error(`GPUBuffer size(${
values.buffer.size}) is smaller than tensor size(${size})!`);
} else if (
(values.buffer.usage & GPUBufferUsage.STORAGE) !==
GPUBufferUsage.STORAGE) {
throw new Error('GPUBuffer.usage should include GPUBufferUsage.STORAGE!');
(values.buffer.usage &
(GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC)) !==
(GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC)) {
throw new Error(
'GPUBuffer.usage should include GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC!');
}

// Do buffer copy by default.
if (zeroCopy === false) {
if (values.zeroCopy !== true) {
buffer = this.copyBuffer(buffer, size, buffer.usage);
}
tensorData.resourceInfo = {size: buffer.size, usage: buffer.usage, buffer};
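To illustrate the write() path changed above, here is a minimal sketch (not part of this commit) of creating a tensor from a GPUBuffer in the default copy mode. It assumes the WebGPU backend is already active; the helper name tensorFromGPUBuffer is hypothetical. The buffer usage must include GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC and its size must cover the tensor, otherwise write() throws; since zeroCopy is left unset, the backend copies the buffer, so it can be destroyed as soon as the tensor is created.

import * as tf from '@tensorflow/tfjs-core';
import {WebGPUBackend} from '@tensorflow/tfjs-backend-webgpu';

// Hypothetical helper: build a float32 tensor from a freshly written GPUBuffer.
function tensorFromGPUBuffer(data: number[]): tf.Tensor {
  const device = (tf.backend() as WebGPUBackend).device;
  const sizeInBytes = data.length * 4;  // 4 bytes per float32 element
  // Usage must include STORAGE | COPY_SRC; COPY_DST mirrors the default usage
  // of the test helper below.
  const buffer = device.createBuffer({
    mappedAtCreation: true,
    size: sizeInBytes,
    usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC |
        GPUBufferUsage.COPY_DST
  });
  new Float32Array(buffer.getMappedRange()).set(data);
  buffer.unmap();
  // zeroCopy is undefined, so the backend copies the buffer; it is safe to
  // destroy it once the tensor exists.
  const t = tf.tensor({buffer}, [data.length], 'float32');
  buffer.destroy();
  return t;
}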
78 changes: 28 additions & 50 deletions tfjs-backend-webgpu/src/backend_webgpu_test.ts
@@ -367,10 +367,8 @@ describeWebGPU('keeping data on gpu ', () => {
});
});

function createReadonlyGPUBufferFromData(
device: GPUDevice, data: number[], dtype: tf.DataType,
bufferUsage = GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE |
GPUBufferUsage.COPY_SRC) {
function createStagingGPUBufferFromData(
device: GPUDevice, data: number[], dtype: tf.DataType) {
const bytesPerElement = 4;
const sizeInBytes = data.length * bytesPerElement;

@@ -390,7 +388,17 @@ function createReadonlyGPUBufferFromData(
`'float32'|'int32' dtype, while the dtype is ${dtype}.`);
}
gpuWriteBuffer.unmap();
return gpuWriteBuffer;
}

function createGPUBufferFromData(
device: GPUDevice, data: number[], dtype: tf.DataType,
bufferUsage = GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE |
GPUBufferUsage.COPY_SRC) {
const bytesPerElement = 4;
const sizeInBytes = data.length * bytesPerElement;

const gpuWriteBuffer = createStagingGPUBufferFromData(device, data, dtype);
const gpuReadBuffer = device.createBuffer(
{mappedAtCreation: false, size: sizeInBytes, usage: bufferUsage});

@@ -403,43 +411,20 @@ function createReadonlyGPUBufferFromData(
return gpuReadBuffer;
}

function createStagingGPUBufferFromData(
device: GPUDevice, data: number[], dtype: tf.DataType) {
const bytesPerElement = 4;
const sizeInBytes = data.length * bytesPerElement;

const gpuWriteBuffer = device.createBuffer({
mappedAtCreation: true,
size: sizeInBytes,
usage: GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC
});
const arrayBuffer = gpuWriteBuffer.getMappedRange();
if (dtype === 'float32') {
new Float32Array(arrayBuffer).set(data);
} else if (dtype === 'int32') {
new Int32Array(arrayBuffer).set(data);
} else {
throw new Error(
`Creating tensor from GPUBuffer only supports` +
`'float32'|'int32' dtype, while the dtype is ${dtype}.`);
}
gpuWriteBuffer.unmap();
return gpuWriteBuffer;
}

async function testCreateTensorFromGPUBuffer(
dtype: tf.DataType, useDefaultShapeAndType = false, zeroCopy = false) {
const webGPUBackend = tf.backend() as WebGPUBackend;
const device = webGPUBackend.device;
const aData = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
const bData = [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4];
const expected = [2, 4, 6, 8, 6, 8, 10, 12, 10, 12, 14, 16, 14, 16, 18, 20];
const aBuffer = createReadonlyGPUBufferFromData(device, aData, dtype);
const aBuffer = createGPUBufferFromData(device, aData, dtype);
const shape: number[] = [aData.length];
const startNumBytes = tf.memory().numBytes;
const startNumTensors = tf.memory().numTensors;
const a = useDefaultShapeAndType ? tf.tensor({buffer: aBuffer}) :
tf.tensor({buffer: aBuffer}, shape, dtype);
const webGPUData = {buffer: aBuffer, zeroCopy};
const a = useDefaultShapeAndType ? tf.tensor(webGPUData) :
tf.tensor(webGPUData, shape, dtype);
if (zeroCopy !== true) {
aBuffer.destroy();
}
@@ -476,9 +461,9 @@ function createTensorFromGPUTest(zeroCopy = false) {
const device = webGPUBackend.device;
const aData = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
const dtype = 'float32';
const aBuffer = createReadonlyGPUBufferFromData(device, aData, dtype);
const aBuffer = createGPUBufferFromData(device, aData, dtype);
const shape: number[] = [aData.length];
const a = tf.tensor({buffer: aBuffer}, shape, dtype);
const a = tf.tensor({buffer: aBuffer, zeroCopy}, shape, dtype);
if (zeroCopy !== true) {
aBuffer.destroy();
}
@@ -493,12 +478,13 @@ function createTensorFromGPUTest(zeroCopy = false) {
const device = webGPUBackend.device;
const aData = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
const dtype = 'float32';
const aBuffer = createReadonlyGPUBufferFromData(device, aData, dtype);
const aBuffer = createGPUBufferFromData(device, aData, dtype);
const startNumBytes = tf.memory().numBytes;
const startNumTensors = tf.memory().numTensors;
const shape: number[] = [aData.length];
const a = tf.tensor({buffer: aBuffer}, shape, dtype);
const b = tf.tensor({buffer: aBuffer}, shape, dtype);
const webGPUData = {buffer: aBuffer, zeroCopy};
const a = tf.tensor(webGPUData, shape, dtype);
const b = tf.tensor(webGPUData, shape, dtype);
if (zeroCopy !== true) {
aBuffer.destroy();
}
@@ -523,13 +509,14 @@ function createTensorFromGPUTest(zeroCopy = false) {
const device = webGPUBackend.device;
const aData = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
const dtype = 'float32';
const aBuffer = createReadonlyGPUBufferFromData(device, aData, dtype);
const aBuffer = createGPUBufferFromData(device, aData, dtype);
const startNumBytes = tf.memory().numBytes;
const startNumTensors = tf.memory().numTensors;
// GPUBuffer.size is bigger than shape size
const shape: number[] = [aData.length - 1];
const a = tf.tensor({buffer: aBuffer}, shape, dtype);
const b = tf.tensor({buffer: aBuffer}, shape, dtype);
const webGPUData = {buffer: aBuffer, zeroCopy};
const a = tf.tensor(webGPUData, shape, dtype);
const b = tf.tensor(webGPUData, shape, dtype);
if (zeroCopy !== true) {
aBuffer.destroy();
}
@@ -553,7 +540,7 @@ function createTensorFromGPUTest(zeroCopy = false) {
const device = webGPUBackend.device;
const aData = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
const dtype = 'float32';
const aBuffer = createReadonlyGPUBufferFromData(device, aData, dtype);
const aBuffer = createGPUBufferFromData(device, aData, dtype);
// Throw when GPUBuffer.size is smaller than shape size
const shape: number[] = [aData.length + 1];
const a = () => tf.tensor({buffer: aBuffer}, shape, dtype);
@@ -570,7 +557,7 @@ function createTensorFromGPUTest(zeroCopy = false) {
const aBuffer = createStagingGPUBufferFromData(device, aData, dtype);
// Throw when GPUBuffer usage is not correct.
const shape: number[] = [aData.length];
const a = () => tf.tensor({buffer: aBuffer}, shape, dtype);
const a = () => tf.tensor({buffer: aBuffer, zeroCopy}, shape, dtype);
expect(a).toThrowError();
aBuffer.destroy();
});
@@ -581,14 +568,5 @@ describeWebGPU('create tensor from GPUBuffer', () => {
});

describeWebGPU('create tensor from GPUBuffer with zero copy', () => {
let savedZeroCopyFlag = false;
beforeAll(() => {
savedZeroCopyFlag =
tf.env().get('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY') as boolean;
tf.env().set('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY', true);
});
afterAll(() => {
tf.env().set('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY', savedZeroCopyFlag);
});
createTensorFromGPUTest(true);
});
5 changes: 0 additions & 5 deletions tfjs-backend-webgpu/src/flags_webgpu.ts
@@ -82,8 +82,3 @@ ENV.registerFlag('WEBGPU_THRESHOLD_TO_INCREASE_WORKGROUPS_FOR_MATMUL', () => 0);
* Whether we will run im2col as a separate shader for convolution.
*/
ENV.registerFlag('WEBGPU_CONV_SEPARATE_IM2COL_SHADER', () => false);

/**
* Whether use zero copy when create tensor from GPUBuffer.
*/
ENV.registerFlag('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY', () => false);
18 changes: 14 additions & 4 deletions tfjs-core/src/ops/tensor.ts
@@ -99,10 +99,14 @@ import {makeTensor} from './tensor_ops_util';
* // This makes it possible for TF.js applications to avoid GPU / CPU sync.
* // For example, if your application includes a preprocessing step on the GPU,
* // you could upload the GPU output directly to TF.js, rather than first
* // downloading the values.
* // downloading the values. Unlike WebGL, this optionally supports zero copy
* // via WebGPUData.zeroCopy. When zeroCopy is false or undefined (default),
* // the passed GPUBuffer can be destroyed after the tensor is created. When
* // zeroCopy is true, the GPUBuffer is bound directly by the tensor, so do not
* // destroy the GPUBuffer until all access is done.
*
* // Example for WebGPU:
* function createReadonlyGPUBufferFromData(device, data, dtype) {
* function createGPUBufferFromData(device, data, dtype) {
* const bytesPerElement = 4;
* const sizeInBytes = data.length * bytesPerElement;
*
@@ -144,8 +148,10 @@ import {makeTensor} from './tensor_ops_util';
* const aData = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
* const bData = [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4];
* const expected = [2, 4, 6, 8, 6, 8, 10, 12, 10, 12, 14, 16, 14, 16, 18, 20];
* const aBuffer = createReadonlyGPUBufferFromData(device, aData, dtype);
* const aBuffer = createGPUBufferFromData(device, aData, dtype);
* const shape = [aData.length];
* // To use zeroCopy, pass {buffer: aBuffer, zeroCopy: true} instead, and do not
* // destroy aBuffer until all access is done.
* const a = tf.tensor({buffer: aBuffer}, shape, dtype);
* const b = tf.tensor(bData, shape, dtype);
* const result = tf.add(a, b);
@@ -172,7 +178,11 @@ import {makeTensor} from './tensor_ops_util';
* have: buffer, a `GPUBuffer`. The buffer must: 1. share the same `GPUDevice`
* with TFJS's WebGPU backend; 2. buffer.usage should at least support
* GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC; 3. buffer.size should not
* be smaller than the byte size of tensor shape.
* be smaller than the byte size of tensor shape. WebGPUData optionally supports
* zero copy via the zeroCopy flag. When zeroCopy is false or undefined
* (default), the passed GPUBuffer can be destroyed after the tensor is created.
* When zeroCopy is true, the GPUBuffer is bound directly by the tensor, so do
* not destroy the GPUBuffer until all access is done.
* @param shape The shape of the tensor. Optional. If not provided,
* it is inferred from `values`.
* @param dtype The data type.
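As a hedged sketch of the zeroCopy variant the doc comment above points to (reusing aBuffer, shape, dtype, bData and expected from that example, and assuming a context where await is available): with zeroCopy set to true the tensor binds aBuffer directly, so the buffer must outlive every read.

// Zero-copy variant of the doc example: the tensor binds aBuffer directly.
const a = tf.tensor({buffer: aBuffer, zeroCopy: true}, shape, dtype);
const b = tf.tensor(bData, shape, dtype);
const result = tf.add(a, b);
const values = await result.data();  // aBuffer must still be alive for this read
console.log(values, expected);
a.dispose();
b.dispose();
result.dispose();
aBuffer.destroy();  // safe now: all access to the zero-copy tensor is done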
11 changes: 8 additions & 3 deletions tfjs-core/src/types.ts
@@ -184,10 +184,15 @@ export interface WebGLData {
}

/**
* Type for representing a buffer data to create a tensor. Use default usage
* GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC. If not specified at
* creating a tensor, tensor type is float32.
* Type for representing buffer data used to create a tensor. The buffer usage
* should at least support GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC.
* When zeroCopy is false or undefined (default), this GPUBuffer will be copied
* to the tensor's resource buffer. When zeroCopy is true, the tensor will use
* this GPUBuffer as its resource buffer, and the user should not destroy this
* GPUBuffer until all access is done. If dtype is not specified when creating
* the tensor, it defaults to float32.
*/
export interface WebGPUData {
buffer: GPUBuffer;
zeroCopy?: boolean;
}
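A rough illustration of the two modes this interface enables, assuming WebGPUData is exported from tfjs-core and that buf is a live GPUBuffer with at least STORAGE | COPY_SRC usage; the helper name describeModes is hypothetical.

import {WebGPUData} from '@tensorflow/tfjs-core';

// Hypothetical helper contrasting the two WebGPUData modes.
function describeModes(buf: GPUBuffer): {copied: WebGPUData, shared: WebGPUData} {
  // Copy mode (default): the backend copies buf, so it may be destroyed once
  // the tensor has been created.
  const copied: WebGPUData = {buffer: buf};
  // Zero-copy mode: the tensor binds buf directly; keep buf alive until every
  // read of the tensor has completed.
  const shared: WebGPUData = {buffer: buf, zeroCopy: true};
  return {copied, shared};
}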
