PaddlePaddle · luotao1 · Aug 25, 2022 · Aug 16, 2022 · Aug 16, 2022 · Aug 16, 2022
diff --git a/paddle/fluid/operators/triu_indices_op.cc b/paddle/fluid/operators/triu_indices_op.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/nullary.h"
+
+namespace paddle {
+namespace operators {
+
+class TriuIndicesOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Builds the kernel type from the "dtype" attribute and the current place.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto dtype = framework::proto::VarType::Type(ctx.Attr<int>("dtype"));
+    return framework::OpKernelType(dtype, ctx.GetPlace());
+  }
+};
+
+// Declares the output, attributes and documentation of the triu_indices op.
+class TriuIndicesOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddOutput("out",
+              "Tensor, the output tensor, with the shape (2,x), x bounded by "
+              "[0,rows*cols]");
+    AddAttr<int>("rows",
+                 "int number, the input of triu_indices op "
+                 "which describes the number of row of the matrix")
+        .SetDefault(0);
+    AddAttr<int>("cols",
+                 "int number, the input of triu_indices op "
+                 "which describes the number of col of the matrix")
+        .SetDefault(0);
+    AddAttr<int>("offset",
+                 "int number, the input of triu_indices op bounded by "
+                 "[1-rows,cols-1] which describes the diagonal line index of "
+                 "the upper triangular part of the matrix")
+        .SetDefault(0);
+    AddAttr<int>("dtype", "data type, the input of triu_indices op")
+        .SetDefault(framework::proto::VarType::INT64);
+
+    AddComment(R"DOC(
+  TriuIndices Operator.
+  The triu_indices operator returns the indices of the upper triangular part of the matrix
+  whose rows and cols are known. It is a 2-by-x tensor, where the first row contains row coordinates
+  of all indices and the second row contains column coordinates. Indices are ordered based on
+  rows and then columns. The upper triangular part of the matrix is defined as the elements on
+  and above the diagonal.
+  The argument offset controls which diagonal to consider, default value is 0.
+  A positive value excludes just as many diagonals above the main diagonal,
+  and similarly a negative value includes just as many diagonals below the main diagonal.
+  )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // short alias used by the macros below
+DECLARE_INFER_SHAPE_FUNCTOR(triu_indices,  // functor for the triu_indices op
+                            TriuIndicesInferShapeFunctor,
+                            PD_INFER_META(phi::TriuIndicesInferMeta));
+
+REGISTER_OPERATOR(
+    triu_indices,  // op type; both grad-op makers below are EmptyGradOpMaker
+    ops::TriuIndicesOp,
+    ops::TriuIndicesOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    TriuIndicesInferShapeFunctor);  // the functor declared above
diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml
@@ -2697,6 +2697,18 @@
     func : tril_triu
   backward : tril_triu_grad
 
+- api : triu_indices  # python API: paddle.triu_indices
+  args : (int rows, int cols, int offset, DataType dtype, Place place={})
+  output : Tensor(out)
+  infer_meta :
+    func : TriuIndicesInferMeta
+    param : [rows, cols, offset, dtype]
+  kernel :
+    func : triu_indices
+    param : [rows, cols, offset, dtype]
+    data_type : dtype  # kernel data type comes from the dtype argument
+    backend : place  # kernel backend comes from the place argument
+
 # python API: paddle.nn.initializer.TruncatedNormal
 - api : truncated_gaussian_random
   args : (int[] shape, float mean, float std, int seed, DataType dtype=DataType::FLOAT32, Place place={})

diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc
@@ -152,4 +152,29 @@ void TrilIndicesInferMeta(
   out->set_dims(out_dims);
   out->set_dtype(dtype);
 }
+
+void TriuIndicesInferMeta(
+    int rows, int cols, int offset, DataType dtype, MetaTensor* out) {
+  // triu(offset) complements tril(offset - 1) in a rows x cols matrix, so
+  // count the tril elements and subtract them from rows * cols.
+  offset = offset - 1;
+  // first tril row: min(cols, 1 + offset), or 0/1 (the bool) when offset <= 0
+  const int64_t n_first_row =
+      offset > 0 ? std::min<int64_t>(cols, 1 + offset) : (rows + offset > 0);
+  // number of elements in the last row of the tril, bounded by [0, cols]
+  const int64_t n_last_row =
+      std::max<int64_t>(0, std::min<int64_t>(cols, rows + offset));
+  // number of rows, bounded by [0, rows]
+  auto n_row_all = std::max<int64_t>(0, std::min<int64_t>(rows, rows + offset));
+  const int64_t n_row_trapezoid = n_last_row - n_first_row + 1;
+  // elements in the top trapezoid plus the full-width rows below it, if any
+  int64_t tril_size = (n_first_row + n_last_row) * n_row_trapezoid >> 1;
+  tril_size += std::max<int64_t>(0, n_row_all - n_row_trapezoid) * cols;
+  // an empty matrix has no tril elements; without this the size can go negative
+  if (rows == 0 || cols == 0) tril_size = 0;
+  // widen before multiplying so rows * cols cannot overflow int
+  std::vector<int64_t> tmp = {2, static_cast<int64_t>(rows) * cols - tril_size};
+  out->set_dims(phi::make_ddim(tmp));
+  out->set_dtype(dtype);
+}
 }  // namespace phi
diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h
@@ -74,4 +74,7 @@ void UniformRandomInferMeta(const IntArray& shape,
 
 void TrilIndicesInferMeta(
     int rows, int cols, int offset, DataType dtype, MetaTensor* out);
+
+void TriuIndicesInferMeta(  // sets out to shape (2, N) with the given dtype
+    int rows, int cols, int offset, DataType dtype, MetaTensor* out);
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/triu_indices_kernel.cc b/paddle/phi/kernels/cpu/triu_indices_kernel.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/triu_indices_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+template <typename T, typename Context>
+void TriuIndicesKernel(const Context& dev_ctx,
+                       int rows,
+                       int cols,
+                       int offset,
+                       DataType dtype,
+                       DenseTensor* out) {
+  // out is 2 x n: row indices occupy [0, n), column indices occupy [n, 2n).
+  T* row_ptr = dev_ctx.template Alloc<T>(out);
+  const int64_t n = out->dims()[1];
+  T* col_ptr = row_ptr + n;
+
+  // Walk the upper triangle in row-major order from (0, max(0, offset)).
+  T cur_row = 0;
+  T cur_col = std::max<int64_t>(0, offset);
+  for (int64_t idx = 0; idx < n; ++idx) {
+    row_ptr[idx] = cur_row;
+    col_ptr[idx] = cur_col;
+
+    // Step right; past the last column, wrap to the first triu column of the
+    // next row. No bound check on the row is needed: idx < n ends the loop.
+    if (++cur_col >= cols) {
+      ++cur_row;
+      cur_col = std::max<int64_t>(0, cur_row + offset);
+    }
+  }
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(  // CPU kernel, registered for int and int64_t
+    triu_indices, CPU, ALL_LAYOUT, phi::TriuIndicesKernel, int, int64_t) {}
diff --git a/paddle/phi/kernels/gpu/triu_indices_kernel.cu b/paddle/phi/kernels/gpu/triu_indices_kernel.cu
@@ -0,0 +1,133 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/triu_indices_kernel.h"
+
+#include <algorithm>
+#include <tuple>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+// Returns floor((-b + sign * sqrt(b * b - cX4)) / 2), falling back to a binary
+// search when the double sqrt is inexact. 64-bit math keeps b * b from wrapping.
+template <typename T>
+__device__ inline int resolve_root_int(
+    int64_t b, int64_t cX4, int64_t x, int32_t sign) {
+  const int64_t bXb_cX4 = b * b - cX4;
+  const double sr = ::sqrt(static_cast<double>(bXb_cX4));
+  T res = ::__double2ll_rd((-b + sign * sr) / 2);
+  if (bXb_cX4 != static_cast<int64_t>(sr * sr)) {
+    const int64_t llsr = ::__double2ll_rd(sr);
+    const int64_t diff = ::__double2ll_ru(
+        ::sqrt(::fabs(static_cast<double>(bXb_cX4 - llsr * llsr))));
+    int64_t l = res > diff ? res - diff : 0;  // lower search bound
+    int64_t r = res + diff + 1;               // upper search bound (exclusive)
+    x <<= 1;  // every comparison below is against 2x
+    while (l + 1 < r) {
+      const int64_t m = (l + r) >> 1;
+      if (sign * (b + m) * m > x) r = m;
+      else l = m;
+    }
+    res = static_cast<T>(l);
+  }
+  return res;
+}
+
+template <typename T>
+__device__ inline void get_coordinate_in_triu_trapezoid(
+    int f, int x, T* row, T* col) {
+  // f: size of the trapezoid's first row, x: linear index inside it. 64-bit
+  // intermediates keep 8x and (2f - row + 1) * row from overflowing int.
+  const int64_t f2 = static_cast<int64_t>(f) << 1;  // only 2f is ever used
+  const int64_t cX4 = static_cast<int64_t>(x) << 3;  // 4 * c = 4 * (2x) = 8x
+  const int64_t r = resolve_root_int<T>(-1 - f2, cX4, x, -1);
+  *row = r;
+  *col = x - ((f2 - r + 1) * r >> 1) + r;
+}
+
+template <typename T>
+__global__ void triu_indices_kernel(T* out_data,
+                                    int col_offset,
+                                    int m_first_row,
+                                    int col,
+                                    int rectangle_size,
+                                    int triu_size) {
+  // Each thread writes at most one (row, column) pair.
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid >= triu_size) return;
+
+  T row_idx, col_idx;
+  if (tid >= rectangle_size) {
+    // Past the rectangle: locate within the trapezoid, then shift the row.
+    get_coordinate_in_triu_trapezoid<T>(
+        m_first_row, tid - rectangle_size, &row_idx, &col_idx);
+    row_idx += rectangle_size / col;
+  } else {
+    // Inside the top rectangle every row holds `col` entries.
+    row_idx = tid / col;
+    col_idx = tid % col;
+  }
+
+  // Row indices fill the first half of out_data, column indices the second.
+  out_data[tid] = row_idx;
+  out_data[tid + triu_size] = col_idx + col_offset;
+}
+
+template <typename T, typename Context>
+void TriuIndicesKernel(const Context& dev_ctx,
+                       int rows,
+                       int cols,
+                       int offset,
+                       DataType dtype,
+                       DenseTensor* out) {
+  T* out_data = dev_ctx.template Alloc<T>(out);
+  // Dim 1 of the output holds the number of (row, column) pairs to produce.
+  const int triu_size = out->dims()[1];
+  if (triu_size <= 0) {
+    return;  // nothing to compute, so no kernel launch
+  }
+
+  // Number of triu elements in the first row: cols when offset <= 0,
+  // otherwise cols - offset clamped at zero.
+  int m_first_row = cols;
+  if (offset > 0) {
+    m_first_row = std::max<int>(cols - offset, 0);
+  }
+
+  // A negative offset puts min(rows, -offset) full-width rows on top.
+  int rectangle_size = 0;
+  if (offset < 0) {
+    rectangle_size = std::min<int>(rows, -offset) * cols;
+  }
+
+  // Added to every column index computed by the device kernel.
+  const int col_offset = std::max<int>(0, offset);
+
+  // Let the launch helper pick the 1-D grid and block sizes.
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, triu_size);
+
+  triu_indices_kernel<T><<<config.block_per_grid.x,
+                           config.thread_per_block.x,
+                           0,
+                           dev_ctx.stream()>>>(
+      out_data, col_offset, m_first_row, cols, rectangle_size, triu_size);
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(  // GPU kernel, registered for int and int64_t
+    triu_indices, GPU, ALL_LAYOUT, phi::TriuIndicesKernel, int, int64_t) {}
diff --git a/paddle/phi/kernels/triu_indices_kernel.h b/paddle/phi/kernels/triu_indices_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TriuIndicesKernel(const Context& dev_ctx,  // used to allocate out
+                       int rows,                // number of matrix rows
+                       int cols,                // number of matrix columns
+                       int offset,              // diagonal offset
+                       DataType dtype,    // not read by the kernels in this patch
+                       DenseTensor* out);  // row indices, then column indices
+
+}  // namespace phi
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
@@ -109,6 +109,7 @@
 from .tensor.creation import complex  # noqa: F401
 from .tensor.creation import clone  # noqa: F401
 from .tensor.creation import tril_indices  #noqa: F401
+from .tensor.creation import triu_indices  #noqa: F401
 from .tensor.linalg import matmul  # noqa: F401
 from .tensor.linalg import dot  # noqa: F401
 from .tensor.linalg import norm  # noqa: F401
@@ -652,4 +653,5 @@
     'heaviside',
     'tril_indices',
     'sgn',
+    'triu_indices',
 ]