nvrtc: error: invalid value for --gpu-architecture (-arch) #87595

Closed

HeaoYe opened this issue Oct 24, 2022 · 28 comments
Labels: high priority, module: cuda, module: jiterator, triage review

Comments

@HeaoYe

HeaoYe commented Oct 24, 2022

🐛 Describe the bug

This problem only occurs when I use an RTX 4090.

import torch
a = torch.tensor([2, 2, 3]).cuda(0)
print(a.prod())
Traceback (most recent call last):
  File "xxxxxxx.py", line 3, in <module>
    print(a.prod())
RuntimeError: 
  #define POS_INFINITY __int_as_float(0x7f800000)
  #define INFINITY POS_INFINITY
  #define NEG_INFINITY __int_as_float(0xff800000)
  #define NAN __int_as_float(0x7fffffff)

  typedef long long int int64_t;
  typedef unsigned int uint32_t;
  typedef signed char int8_t;
  typedef unsigned char uint8_t;  // NOTE: this MUST be "unsigned char"! "char" is equivalent to "signed char"
  typedef short int16_t;
  static_assert(sizeof(int64_t) == 8, "expected size does not match");
  static_assert(sizeof(uint32_t) == 4, "expected size does not match");
  static_assert(sizeof(int8_t) == 1, "expected size does not match");
  constexpr int num_threads = 128;
  constexpr int thread_work_size = 4; // TODO: make template substitution once we decide where those vars live
  constexpr int block_work_size = thread_work_size * num_threads;
  //TODO use _assert_fail, because assert is disabled in non-debug builds
  #define ERROR_UNSUPPORTED_CAST assert(false);

  
  


  namespace std {
  
  using ::signbit;
  using ::isfinite;
  using ::isinf;
  using ::isnan;
  
  using ::abs;
  
  using ::acos;
  using ::acosf;
  using ::asin;
  using ::asinf;
  using ::atan;
  using ::atanf;
  using ::atan2;
  using ::atan2f;
  using ::ceil;
  using ::ceilf;
  using ::cos;
  using ::cosf;
  using ::cosh;
  using ::coshf;
  
  using ::exp;
  using ::expf;
  
  using ::fabs;
  using ::fabsf;
  using ::floor;
  using ::floorf;
  
  using ::fmod;
  using ::fmodf;
  
  using ::frexp;
  using ::frexpf;
  using ::ldexp;
  using ::ldexpf;
  
  using ::log;
  using ::logf;
  
  using ::log10;
  using ::log10f;
  using ::modf;
  using ::modff;
  
  using ::pow;
  using ::powf;
  
  using ::sin;
  using ::sinf;
  using ::sinh;
  using ::sinhf;
  
  using ::sqrt;
  using ::sqrtf;
  using ::tan;
  using ::tanf;
  
  using ::tanh;
  using ::tanhf;
  
  using ::acosh;
  using ::acoshf;
  using ::asinh;
  using ::asinhf;
  using ::atanh;
  using ::atanhf;
  using ::cbrt;
  using ::cbrtf;
  
  using ::copysign;
  using ::copysignf;
  
  using ::erf;
  using ::erff;
  using ::erfc;
  using ::erfcf;
  using ::exp2;
  using ::exp2f;
  using ::expm1;
  using ::expm1f;
  using ::fdim;
  using ::fdimf;
  using ::fmaf;
  using ::fma;
  using ::fmax;
  using ::fmaxf;
  using ::fmin;
  using ::fminf;
  using ::hypot;
  using ::hypotf;
  using ::ilogb;
  using ::ilogbf;
  using ::lgamma;
  using ::lgammaf;
  using ::llrint;
  using ::llrintf;
  using ::llround;
  using ::llroundf;
  using ::log1p;
  using ::log1pf;
  using ::log2;
  using ::log2f;
  using ::logb;
  using ::logbf;
  using ::lrint;
  using ::lrintf;
  using ::lround;
  using ::lroundf;
  
  using ::nan;
  using ::nanf;
  
  using ::nearbyint;
  using ::nearbyintf;
  using ::nextafter;
  using ::nextafterf;
  using ::remainder;
  using ::remainderf;
  using ::remquo;
  using ::remquof;
  using ::rint;
  using ::rintf;
  using ::round;
  using ::roundf;
  using ::scalbln;
  using ::scalblnf;
  using ::scalbn;
  using ::scalbnf;
  using ::tgamma;
  using ::tgammaf;
  using ::trunc;
  using ::truncf;
  
  } // namespace std
  
  

  // NB: Order matters for this macro; it is relied upon in
  // _promoteTypesLookup and the serialization format.
  // Note, some types have ctype as void because we don't support them in codegen
  #define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
  _(uint8_t, Byte) /* 0 */                               \
  _(int8_t, Char) /* 1 */                                \
  _(int16_t, Short) /* 2 */                              \
  _(int, Int) /* 3 */                                    \
  _(int64_t, Long) /* 4 */                               \
  _(at::Half, Half) /* 5 */                                  \
  _(float, Float) /* 6 */                                \
  _(double, Double) /* 7 */                              \
  _(std::complex<at::Half>, ComplexHalf) /* 8 */        \
  _(std::complex<float>, ComplexFloat) /* 9 */                          \
  _(std::complex<double>, ComplexDouble) /* 10 */                         \
  _(bool, Bool) /* 11 */                                 \
  _(void, QInt8) /* 12 */                          \
  _(void, QUInt8) /* 13 */                        \
  _(void, QInt32) /* 14 */                        \
  _(at::BFloat16, BFloat16) /* 15 */                             \

  #define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(_)       \
  _(uint8_t, Byte)                                                 \
  _(int8_t, Char)                                                  \
  _(int16_t, Short)                                                \
  _(int, Int)                                                      \
  _(int64_t, Long)                                                 \
  _(at::Half, Half)                                                \
  _(float, Float)                                                  \
  _(double, Double)                                                \
  _(std::complex<at::Half>, ComplexHalf)                           \
  _(std::complex<float>, ComplexFloat)                             \
  _(std::complex<double>, ComplexDouble)                           \
  _(bool, Bool)                                                    \
  _(at::BFloat16, BFloat16)


  enum class ScalarType : int8_t {
  #define DEFINE_ENUM(_1, n) n,
  AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ENUM)
  #undef DEFINE_ENUM
      Undefined,
  NumOptions
  };

  template <typename T, int size>
  struct Array {
  T data[size];

  __device__ T operator[](int i) const {
      return data[i];
  }
  __device__ T& operator[](int i) {
      return data[i];
  }
  Array() = default;
  Array(const Array&) = default;
  Array& operator=(const Array&) = default;
  __device__ Array(T x) {
    for (int i = 0; i < size; i++) {
      data[i] = x;
    }
  }
  };

  
  
  
  
  



  template <typename T>
  struct DivMod {
  T div;
  T mod;

  __device__ DivMod(T _div, T _mod) {
      div = _div;
      mod = _mod;
  }
  };

  //<unsigned int>
  struct IntDivider {
  IntDivider() = default;

  __device__ inline unsigned int div(unsigned int n) const {
  unsigned int t = __umulhi(n, m1);
  return (t + n) >> shift;
  }

  __device__ inline unsigned int mod(unsigned int n) const {
  return n - div(n) * divisor;
  }

  __device__ inline DivMod<unsigned int> divmod(unsigned int n) const {
  unsigned int q = div(n);
  return DivMod<unsigned int>(q, n - q * divisor);
  }

  unsigned int divisor;  // d above.
  unsigned int m1;  // Magic number: m' above.
  unsigned int shift;  // Shift amounts.
  };

  template <int NARGS>
  struct TrivialOffsetCalculator {
    // The offset for each argument. Wrapper around fixed-size array.
    // The offsets are in # of elements, not in bytes.
    Array<unsigned int, NARGS> get(unsigned int linear_idx) const {
      Array<unsigned int, NARGS> offsets;
      #pragma unroll
      for (int arg = 0; arg < NARGS; arg++) {
        offsets[arg] = linear_idx;
      }
      return offsets;
    }
  };

  template<int NARGS>
  struct OffsetCalculator {
  OffsetCalculator() = default;
  __device__ __forceinline__ Array<unsigned int, NARGS> get(unsigned int linear_idx) const {
      Array<unsigned int, NARGS> offsets;
      #pragma unroll
      for (int arg = 0; arg < NARGS; ++arg) {
      offsets[arg] = 0;
      }

      #pragma unroll
      for (int dim = 0; dim < 25; ++dim) {
      if (dim == dims) {
          break;
      }

      auto divmod = sizes_[dim].divmod(linear_idx);
      linear_idx = divmod.div;

      #pragma unroll
      for (int arg = 0; arg < NARGS; ++arg) {
          offsets[arg] += divmod.mod * strides_[dim][arg];
      }
      //printf("offset calc thread dim size stride offset %d %d %d %d %d %d %d %d\n",
      //threadIdx.x, dim, sizes_[dim].divisor, strides_[dim][0], offsets[0], linear_idx, divmod.div, divmod.mod);
      }
      return offsets;
  }

    int dims;
    IntDivider sizes_[25];
    // NOTE: this approach will not support nInputs == 0
    unsigned int strides_[25][NARGS];
  };



  #define C10_HOST_DEVICE __host__ __device__
  #define C10_DEVICE __device__

  template <typename T>
  __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
  {
    return __shfl_down_sync(mask, value, delta, width);
  }


  #if 0
  template <typename T>
  __device__ __forceinline__ std::complex<T> WARP_SHFL_DOWN(std::complex<T> value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
  {
    return std::complex<T>(
        __shfl_down_sync(mask, value.real(), delta, width),
        __shfl_down_sync(mask, value.imag(), delta, width));
  }
  #endif

  // aligned vector generates vectorized load/store on CUDA
  template<typename scalar_t, int vec_size>
  struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
    scalar_t val[vec_size];
  };


  C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) {
    // get GCD of num and denom using Euclid's algorithm.
    // Can replace this with std::gcd if we ever support c++17.
    size_t a = denominator;
    size_t b = numerator;
    while (b != 0) {
        a %= b;
        // swap(a,b)
        size_t tmp = a;
        a = b;
        b = tmp;
    }

    // a is now the GCD
    numerator /= a;
    denominator /= a;
  }




  struct ReduceConfig {
  //has to match host-side ReduceConfig in the eager code
  static constexpr int BLOCK_X = 0;
  static constexpr int BLOCK_Y = 1;
  static constexpr int CTA = 2;

  static constexpr int input_vec_size = 4;
  int element_size_bytes;
  int num_inputs;
  int num_outputs;
  int step_input = 1;
  int step_output = 1;
  int ctas_per_output = 1;
  int input_mult[3] = {0, 0, 0};
  int output_mult[2] = {0, 0};

  int block_width;
  int block_height;
  int num_threads;

  bool vectorize_input = false;
  int output_vec_size = 1;

  C10_HOST_DEVICE bool should_block_x_reduce() const {
    return input_mult[BLOCK_X] != 0;
  }

  C10_HOST_DEVICE bool should_block_y_reduce() const {
    return input_mult[BLOCK_Y] != 0;
  }

  C10_HOST_DEVICE bool should_global_reduce() const {
    return input_mult[CTA] != 0;
  }

  C10_DEVICE bool should_store(int output_idx) const {
    return output_idx < num_outputs &&
      (!should_block_x_reduce() || threadIdx.x == 0) &&
      (!should_block_y_reduce() || threadIdx.y == 0);
  }

  C10_DEVICE bool should_reduce_tail() const {
    return (!should_block_y_reduce() || threadIdx.y == 0) &&
      (!should_global_reduce() || blockIdx.y == 0);
  }

  C10_HOST_DEVICE int input_idx() const {
    int lane = threadIdx.x;
    int warp = threadIdx.y;
    int cta2 = blockIdx.y;
    return (lane * input_mult[BLOCK_X] +
            warp * input_mult[BLOCK_Y] +
            cta2 * input_mult[CTA]);
  }

  template <int output_vec_size>
  C10_HOST_DEVICE int output_idx() const {
    int lane = threadIdx.x;
    int warp = threadIdx.y;
    int cta1 = blockIdx.x;
    return (lane * output_mult[BLOCK_X] +
            warp * output_mult[BLOCK_Y] +
            cta1 * step_output) * output_vec_size;
  }

  C10_DEVICE int shared_memory_offset(int offset) const {
    return threadIdx.x + (threadIdx.y + offset) * blockDim.x;
  }

  C10_DEVICE int staging_memory_offset(int cta2) const {
    int offset = cta2 + blockIdx.x * gridDim.y;
    if (!should_block_x_reduce()) {
      offset = threadIdx.x + offset * blockDim.x;
    }
    return offset;
  }


  };


//TODO this will need to be different for more generic reduction functions
namespace reducer {

  using scalar_t = int64_t;
  using arg_t = int64_t;
  using out_scalar_t = int64_t;


  inline __device__ arg_t combine(arg_t a, arg_t b) { return a * b; }

  inline __device__ out_scalar_t project(arg_t arg) {
    return (out_scalar_t) arg;
  }

  inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) {
    return WARP_SHFL_DOWN(arg, offset);
  }

  inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) {
    return acc;
  }

  // wrap a normal reduction that ignores the index
  inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) {
     return combine(acc, val);
  }
}


struct ReduceJitOp {
  using scalar_t = int64_t;
  using arg_t = int64_t;
  using out_scalar_t = int64_t;

  using InputCalculator = OffsetCalculator<1>;
  using OutputCalculator = OffsetCalculator<2>;

//   static constexpr bool can_accumulate_in_output =
//     std::is_convertible<arg_t, out_scalar_t>::value
//     && std::is_convertible<out_scalar_t, arg_t>::value;

  static constexpr int input_vec_size = ReduceConfig::input_vec_size;

  arg_t ident;
  ReduceConfig config;
  InputCalculator input_calc;
  OutputCalculator output_calc;
  const void* src;
  const char* dst[2]; //it accepts at most two destinations
  // acc_buf used for accumulation among sub Tensor Iterator when accumulation on
  // output is not permissible
  void* acc_buf;
  // cta_buf used for accumulation between blocks during global reduction
  void* cta_buf;
  int* semaphores;
  int64_t base_idx;
  bool accumulate;
  bool final_output;
  int noutputs;


  C10_DEVICE void run() const {
    extern __shared__ char shared_memory[];
    uint32_t output_idx = config.output_idx<1>();
    uint32_t input_idx = config.input_idx();
    auto base_offsets1 = output_calc.get(output_idx)[1];

    using arg_vec_t = Array<arg_t, 1>;
    arg_vec_t value;

    if (output_idx < config.num_outputs && input_idx < config.num_inputs) {
      const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1);

      value = thread_reduce<1>(input_slice);
    }

    if (config.should_block_y_reduce()) {
      value = block_y_reduce<1>(value, shared_memory);
    }
    if (config.should_block_x_reduce()) {
      value = block_x_reduce<1>(value, shared_memory);
    }

    using out_ptr_vec_t = Array<out_scalar_t*, 1>;
    using offset_vec_t = Array<uint32_t, 1>;
    offset_vec_t base_offsets;
    out_ptr_vec_t out;

    #pragma unroll
    for (int i = 0; i < 1; i++) {
      base_offsets[i] = output_calc.get(output_idx + i)[0];
      out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
    }

    arg_vec_t* acc = nullptr;
    if (acc_buf != nullptr) {
      size_t numerator = sizeof(arg_t);
      size_t denominator = sizeof(out_scalar_t);
      reduce_fraction(numerator, denominator);
      acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator));
    }

    if (config.should_global_reduce()) {
      value = global_reduce<1>(value, acc, shared_memory);
    } else if (config.should_store(output_idx)) {
      if (accumulate) {
        #pragma unroll
        for (int i = 0; i < 1; i++) {
          value[i] = reducer::translate_idx(value[i], base_idx);
        }
      }

      if (acc == nullptr) {
        if (accumulate) {
          value = accumulate_in_output<1>(out, value);
        }
        if (final_output) {
          set_results_to_output<1>(value, base_offsets);
        } else {
          #pragma unroll
          for (int i = 0; i < 1; i++) {
            *(out[i]) = get_accumulated_output(out[i], value[i]);
          }
        }
      } else {
        if (accumulate) {
          #pragma unroll
          for (int i = 0; i < 1; i++) {
            value[i] = reducer::combine((*acc)[i], value[i]);
          }
        }
        if (final_output) {
          set_results_to_output<1>(value, base_offsets);
        } else {
          *acc = value;
        }
      }
    }
  }

  template <int output_vec_size>
  C10_DEVICE Array<arg_t, output_vec_size> thread_reduce(const scalar_t* data) const {
    if (config.vectorize_input) {
      assert(output_vec_size == 1);
      // reduce at the header of input_slice where memory is not aligned,
      // so that thread_reduce will have an aligned memory to work on.
      return {input_vectorized_thread_reduce_impl(data)};
    } else {
      uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t);
      bool is_contiguous = (input_calc.dims == 1 && element_stride == 1);
      if (is_contiguous) {
        return thread_reduce_impl<output_vec_size>(data, [](uint32_t idx) { return idx; });
      } else if (input_calc.dims == 1) {
        return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return idx * element_stride; });
      } else {
        return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); });
      }
    }
  }

  C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const {
    uint32_t end = config.num_inputs;

    // Handle the head of input slice where data is not aligned
    arg_t value = ident;
    constexpr int align_bytes = alignof(aligned_vector<scalar_t, input_vec_size>);
    constexpr int align_elements = align_bytes / sizeof(scalar_t);
    int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t);
    if (shift > 0) {
      data -= shift;
      end += shift;
      if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){
        value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift);
      }
      end -= align_elements;
      data += align_elements;
      shift = align_elements - shift;
    }

    // Do the vectorized reduction
    using load_t = aligned_vector<scalar_t, input_vec_size>;

    uint32_t idx = config.input_idx();
    const uint32_t stride = config.step_input;

    // Multiple accumulators to remove dependency between unrolled loops.
    arg_t value_list[input_vec_size];
    value_list[0] = value;

    #pragma unroll
    for (int i = 1; i < input_vec_size; i++) {
      value_list[i] = ident;
    }

    scalar_t values[input_vec_size];

    load_t *values_vector = reinterpret_cast<load_t*>(&values[0]);

    while (idx * input_vec_size + input_vec_size - 1 < end) {
      *values_vector = reinterpret_cast<const load_t*>(data)[idx];
      #pragma unroll
      for (uint32_t i = 0; i < input_vec_size; i++) {
        value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i);
      }
      idx += stride;
    }

    // tail
    uint32_t tail_start = end - end % input_vec_size;
    if (config.should_reduce_tail()) {
      int idx = tail_start + threadIdx.x;
      if (idx < end) {
        value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift);
      }
    }

    // combine accumulators
    #pragma unroll
    for (int i = 1; i < input_vec_size; i++) {
      value_list[0] = reducer::combine(value_list[0], value_list[i]);
    }
    return value_list[0];
  }

  template <int output_vec_size, typename offset_calc_t>
  C10_DEVICE Array<arg_t, output_vec_size> thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const {
    uint32_t idx = config.input_idx();
    const uint32_t end = config.num_inputs;
    const uint32_t stride = config.step_input;
    const int vt0=4;

    using arg_vec_t = Array<arg_t, output_vec_size>;
    using load_t = aligned_vector<scalar_t, output_vec_size>;
    const load_t* data = reinterpret_cast<const load_t*>(data_);

    // Multiple accumulators to remove dependency between unrolled loops.
    arg_vec_t value_list[vt0];

    #pragma unroll
    for (int i = 0; i < vt0; i++) {
      #pragma unroll
      for (int j = 0; j < output_vec_size; j++) {
        value_list[i][j] = ident;
      }
    }

    load_t values[vt0];

    while (idx + (vt0 - 1) * stride < end) {
      #pragma unroll
      for (uint32_t i = 0; i < vt0; i++) {
        values[i] = data[calc(idx + i * stride) / output_vec_size];
      }
      #pragma unroll
      for (uint32_t i = 0; i < vt0; i++) {
        #pragma unroll
        for (uint32_t j = 0; j < output_vec_size; j++) {
          value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride);
        }
      }
      idx += stride * vt0;
    }

    // tail
    int idx_ = idx;
    #pragma unroll
    for (uint32_t i = 0; i < vt0; i++) {
      if (idx >= end) {
        break;
      }
      values[i] = data[calc(idx) / output_vec_size];
      idx += stride;
    }
    idx = idx_;
    #pragma unroll
    for (uint32_t i = 0; i < vt0; i++) {
      if (idx >= end) {
        break;
      }
      #pragma unroll
      for (uint32_t j = 0; j < output_vec_size; j++) {
        value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx);
      }
      idx += stride;
    }

    // combine accumulators
    #pragma unroll
    for (int i = 1; i < vt0; i++) {
      #pragma unroll
      for (uint32_t j = 0; j < output_vec_size; j++) {
        value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]);
      }
    }
    return value_list[0];
  }
  template <int output_vec_size>
  C10_DEVICE Array<arg_t, output_vec_size> block_x_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
    using args_vec_t = Array<arg_t, output_vec_size>;
    int dim_x = blockDim.x;
    args_vec_t* shared = (args_vec_t*)shared_memory;
    if (dim_x > warpSize) {
      int address_base = threadIdx.x + threadIdx.y*blockDim.x;
      shared[address_base] = value;
      for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) {
        __syncthreads();
        if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) {
          args_vec_t other = shared[address_base + offset];
          #pragma unroll
          for (int i = 0; i < output_vec_size; i++) {
            value[i] = reducer::combine(value[i], other[i]);
          }
          shared[address_base] = value;
        }
      }
      dim_x = warpSize;
    }

    __syncthreads();

    for (int offset = 1; offset < dim_x; offset <<= 1) {
      #pragma unroll
      for (int i = 0; i < output_vec_size; i++) {
        arg_t other = reducer::warp_shfl_down(value[i], offset);
        value[i] = reducer::combine(value[i], other);
      }
    }
    return value;
  }

  template <int output_vec_size>
  C10_DEVICE Array<arg_t, output_vec_size> block_y_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
    using args_vec_t = Array<arg_t, output_vec_size>;
    args_vec_t* shared = (args_vec_t*)shared_memory;
    shared[config.shared_memory_offset(0)] = value;
    for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
      __syncthreads();
      if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
        args_vec_t other = shared[config.shared_memory_offset(offset)];
        #pragma unroll
        for (int i = 0; i < output_vec_size; i++) {
          value[i] = reducer::combine(value[i], other[i]);
        }
        shared[config.shared_memory_offset(0)] = value;
      }
    }
    return value;
  }
  

  C10_DEVICE bool mark_block_finished() const {
    __shared__ bool is_last_block_done_shared;

    __syncthreads();
    if (threadIdx.x == 0 && threadIdx.y == 0) {
      int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1);
      is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1);
    }

    __syncthreads();

    return is_last_block_done_shared;
  }

  template <int output_vec_size>
  C10_DEVICE Array<arg_t, output_vec_size> accumulate_in_output(
    Array<out_scalar_t*, output_vec_size> out,
    Array<arg_t, output_vec_size> value
  ) const {
    Array<arg_t, output_vec_size> ret;
    #pragma unroll
    for (int i = 0; i < output_vec_size; i++) {
      ret[i] = reducer::combine(*(out[i]), value[i]);
    }
    return ret;
  }


  C10_DEVICE out_scalar_t get_accumulated_output(
    out_scalar_t* out, arg_t value
  ) const {
    assert(!final_output);
    return (out_scalar_t)value;
  }

  template<class T>
  C10_DEVICE void set_results(const T x, const uint32_t base_offset) const {
    assert(noutputs == 1);
    auto res = (out_scalar_t*)((char*)dst[0] + base_offset);
    *res = x;
  }

//TODO - multi-output reduction - we won't be able to use thrust::pair
//just explicitly specify typed output reads/writes
//Currently implemented for max of two outputs
//   template<class T1, class T2>
//   C10_DEVICE void set_results(const thrust::pair<T1, T2> x, const index_t base_offset) const {
//     if (noutputs >= 1) {
//       auto res0 = (T1*)((char*)dst[0] + base_offset);
//       *res0 = x.first;
//     }
//     if (noutputs >= 2) {
//       // base offset is computed assuming element size being sizeof(T1), so we need to make a
//       // correction to obtain the correct base offset
//       auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2));
//       *res1 = x.second;
//     }
//   }

  template <int output_vec_size>
  C10_DEVICE void set_results_to_output(Array<arg_t, output_vec_size> value, Array<uint32_t, output_vec_size> base_offset) const {
    assert(final_output);
    #pragma unroll
    for (int i = 0; i < output_vec_size; i++) {
      set_results(reducer::project(value[i]), base_offset[i]);
    }
  }

  template <int output_vec_size>
  C10_DEVICE Array<arg_t, output_vec_size> global_reduce(Array<arg_t, output_vec_size> value, Array<arg_t, output_vec_size> *acc, char* shared_memory) const {
    using arg_vec_t = Array<arg_t, output_vec_size>;
    using out_ptr_vec_t = Array<out_scalar_t*, output_vec_size>;
    using offset_vec_t = Array<uint32_t, output_vec_size>;

    arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf;
    uint32_t output_idx = config.output_idx<output_vec_size>();
    offset_vec_t base_offsets;
    out_ptr_vec_t out;

    #pragma unroll
    for (int i = 0; i < output_vec_size; i++) {
      base_offsets[i] = output_calc.get(output_idx + i)[0];
      out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
    }

    bool should_store = config.should_store(output_idx);
    if (should_store) {
      uint32_t offset = config.staging_memory_offset(blockIdx.y);
      reduce_buffer[offset] = value;
    }

    __threadfence(); // make sure writes are globally visible
    __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
    bool is_last_block_done = mark_block_finished();

    if (is_last_block_done) {
      value = ident;
      if (config.should_block_x_reduce()) {
        uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x;
        uint32_t step = blockDim.x * blockDim.y;
        for (; input_offset < config.ctas_per_output; input_offset += step) {
          uint32_t idx = config.staging_memory_offset(input_offset);
          arg_vec_t next = reduce_buffer[idx];
          #pragma unroll
          for (int i = 0; i < output_vec_size; i++) {
            value[i] = reducer::combine(value[i], next[i]);
          }
        }
      } else {
        uint32_t input_offset = threadIdx.y;
        uint32_t step = blockDim.y;
        for (; input_offset < config.ctas_per_output; input_offset += step) {
          uint32_t idx = config.staging_memory_offset(input_offset);
          arg_vec_t next = reduce_buffer[idx];
          #pragma unroll
          for (int i = 0; i < output_vec_size; i++) {
            value[i] = reducer::combine(value[i], next[i]);
          }
        }
      }
      value = block_y_reduce(value, shared_memory);
      if (config.should_block_x_reduce()) {
        value = block_x_reduce<output_vec_size>(value, shared_memory);
      }
      if (should_store) {
        if (accumulate) {
          #pragma unroll
          for (int i = 0; i < output_vec_size; i++) {
            value[i] = reducer::translate_idx(value[i], base_idx);
          }
        }

        if (acc == nullptr) {
          if (accumulate) {
            value = accumulate_in_output<output_vec_size>(out, value);
          }
          if (final_output) {
            set_results_to_output<output_vec_size>(value, base_offsets);
          } else {
            #pragma unroll
            for (int i = 0; i < output_vec_size; i++) {
              *(out[i]) = get_accumulated_output(out[i], value[i]);
            }
          }
        } else {
          if (accumulate) {
            #pragma unroll
            for (int i = 0; i < output_vec_size; i++) {
              value[i] = reducer::combine((*acc)[i], value[i]);
            }
          }
          if (final_output) {
            set_results_to_output<output_vec_size>(value, base_offsets);
          } else {
            *acc = value;
          }
        }
      }
    }

    return value;
  }
};

extern "C"
__launch_bounds__(512, 4)
__global__ void reduction_prod_kernel(ReduceJitOp r){
  r.run();
}
nvrtc: error: invalid value for --gpu-architecture (-arch)

When I use an RTX 3090, the problem doesn't occur.

import torch
a = torch.tensor([2, 2, 3]).cuda(1)
print(a.prod())
tensor(12, device='cuda:1')

Versions

RTX 4090
pytorch 1.14+cu117 nightly
cuda 11.8
cudnn 8.6.0
ubuntu 20.04

cc @ezyang @gchanan @zou3519 @ngimel @mruberry

@malfet added the module: cuda, triaged, module: jiterator, and triage review labels and removed the triaged label on Oct 24, 2022
@malfet
Contributor

malfet commented Oct 24, 2022

Hmm, can you please run python -c "import torch;print(torch.cuda.get_device_capability(0))" and share the output here?
(I suspect that the 4090 is based on the Lovelace/Hopper GPU arch, isn't it?)

@ngimel
Collaborator

ngimel commented Oct 24, 2022

The 4090 is sm_89, which is explicitly supported only by CUDA 11.8, and jiterator probably doesn't handle this correctly; we'd need to set some earlier sm version and hope that it's binary compatible.
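
For reference, a minimal standalone check of which -arch values the local NVRTC accepts. It assumes an NVRTC from CUDA 11.2 or newer (which, as far as I know, is when nvrtcGetSupportedArchs was added) and a build command along the lines of nvcc check_archs.cpp -lnvrtc -o check_archs:

#include <cstdio>
#include <vector>
#include <nvrtc.h>

int main() {
  // Ask NVRTC which compute capabilities it can generate code for.
  int num = 0;
  if (nvrtcGetNumSupportedArchs(&num) != NVRTC_SUCCESS) {
    std::printf("nvrtcGetNumSupportedArchs failed (NVRTC older than 11.2?)\n");
    return 1;
  }
  std::vector<int> archs(num);
  nvrtcGetSupportedArchs(archs.data());
  for (int a : archs) {
    std::printf("sm_%d\n", a);  // 89 only shows up for CUDA 11.8+ toolkits
  }
  return 0;
}

If 89 is missing from that list, any -arch value targeting sm_89 is rejected with exactly the error shown in the log above.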

@malfet
Contributor

malfet commented Oct 24, 2022

The 4090 is sm_89, which is explicitly supported only by CUDA 11.8, and jiterator probably doesn't handle this correctly; we'd need to set some earlier sm version and hope that it's binary compatible.

Yeah, but sm_86 should be compatible with sm_89, so if we set the max version to sm_86, jiterator should just work, shouldn't it?

@malfet
Contributor

malfet commented Oct 24, 2022

As @ngimel suggested, probably adding a few more entries to

CUDAVersion max_dev_version;
if (nvrtc_major <= 7) { // 7 supports 2-5.x
  max_dev_version = CUDAVersion(5, 0);
} else if (nvrtc_major <= 8) { // 8 supports 2-6.x
  max_dev_version = CUDAVersion(6, 0);
} else if (nvrtc_major <= 9) { // 9 supports 3-7.2
  max_dev_version = CUDAVersion(7, 2);
} else if (nvrtc_major <= 10) { // 10 supports 3-7.5
  max_dev_version = CUDAVersion(7, 5);
} else if (nvrtc_version == CUDAVersion(11, 0)) { // 11.0 supports 3-8.0
  max_dev_version = CUDAVersion(8, 0);
} else {
  // If the driver version is unknown (i.e. newer than this code)
  // assume the driver supports this device
  max_dev_version = dev_version;
}
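
For illustration, a rough sketch of how that mapping could be extended. CUDAVersion is approximated here as a std::pair<int, int> stand-in, and the "11.1-11.7 top out at sm_86" cutoff is an assumption about binary compatibility, not necessarily what the actual fix does:

#include <utility>

// Stand-in for the real CUDAVersion type used in the snippet above.
using CUDAVersion = std::pair<int, int>;  // (major, minor)

// Hypothetical sketch: clamp the capability handed to NVRTC to the newest one
// this NVRTC release is known to accept, so an sm_89 device on CUDA 11.x < 11.8
// falls back to sm_86 instead of failing with
// "invalid value for --gpu-architecture".
CUDAVersion clamp_dev_version(CUDAVersion nvrtc_version, CUDAVersion dev_version) {
  const int nvrtc_major = nvrtc_version.first;
  CUDAVersion max_dev_version;
  if (nvrtc_major <= 7) {                            // 7 supports 2.x-5.x
    max_dev_version = CUDAVersion(5, 0);
  } else if (nvrtc_major <= 8) {                     // 8 supports 2.x-6.x
    max_dev_version = CUDAVersion(6, 0);
  } else if (nvrtc_major <= 9) {                     // 9 supports 3.x-7.2
    max_dev_version = CUDAVersion(7, 2);
  } else if (nvrtc_major <= 10) {                    // 10 supports 3.x-7.5
    max_dev_version = CUDAVersion(7, 5);
  } else if (nvrtc_version == CUDAVersion(11, 0)) {  // 11.0 supports 3.x-8.0
    max_dev_version = CUDAVersion(8, 0);
  } else if (nvrtc_major == 11 && nvrtc_version.second < 8) {
    max_dev_version = CUDAVersion(8, 6);             // assume 11.1-11.7 top out at sm_86
  } else {
    // Newer than this code: assume the device's own capability is supported.
    max_dev_version = dev_version;
  }
  return max_dev_version < dev_version ? max_dev_version : dev_version;
}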

@dagitses
Collaborator

Remaining high-pri. Natalia has an attempted fix in #87611, waiting for Nvidia to test.

atalman pushed a commit to atalman/pytorch that referenced this issue Oct 24, 2022
@malfet malfet added this to the 1.13.0 milestone Oct 24, 2022
atalman added a commit that referenced this issue Oct 24, 2022
Fixes #87595 (maybe?)

Pull Request resolved: #87611
Approved by: https://github.com/malfet, https://github.com/atalman

Co-authored-by: Natalia Gimelshein <ngimel@fb.com>
@atalman
Contributor

atalman commented Oct 25, 2022

@1697427528 We have deployed the fix; it should be available in nightlies soon.

It should be available on pytorch-test and can be installed using:

pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/test/cu117/torch_test.html

Can you please validate and confirm whether it's working for you?

@eqy
Collaborator

eqy commented Oct 25, 2022

Works for me on an 11.7 container with an AD102 GPU that repro'd the original bug.

sgrigory pushed a commit to sgrigory/pytorch that referenced this issue Oct 28, 2022
kulinseth pushed a commit to kulinseth/pytorch that referenced this issue Nov 5, 2022
@people-RNGUP

Have you solved this problem? I'm also having this issue.

@eqy
Collaborator

eqy commented Nov 14, 2022

This should be fixed in the nightly builds by now: check the nightly tab on https://pytorch.org/

@vince62s

vince62s commented Dec 5, 2022

Is there an easy fix to use torch.compile with sm_89? cf. #90170

kulinseth pushed a commit to kulinseth/pytorch that referenced this issue Dec 10, 2022
@tonyyuan1

tonyyuan1 commented Jan 26, 2023

I'm having the same problem with an RTX 4070 Ti: "nvrtc: error: invalid value for --gpu-architecture (-arch)".

Versions
RTX 4070ti
pytorch 1.13.1+cu117 nightly
cuda 11.7

The pytorch 1.13.1 nightly from https://pytorch.org/ does not work for me.

@eqy
Collaborator

eqy commented Jan 26, 2023

I'm not aware of a 1.31 build, but if you are referring to 1.13, that would be an older release and not a current nightly. The current nightlies should be 2.0.0.dev.

@tonyyuan1

I'm not aware of a 1.31 build, but if you are referring to 1.13, that would be an older release and not a current nightly. The current nightlies should be 2.0.0.dev.

Sorry, I am referring to 1.13.1 on https://pytorch.org/. I saw stable and preview (nightly) tabs for cuda 11.7. Do they include the fix for this bug? I installed preview (nightly) but this bug still occurred with my 4070ti.

@eqy
Collaborator

eqy commented Jan 26, 2023

You would need a nightly recent enough to include the fix; many of the 1.13.x nightly builds are too old to include it. See https://download.pytorch.org/whl/nightly/torch/ for listings of example builds; if you installed a fresh nightly, your torch.__version__ should start with 2.0.0.

@vince62s

The answer is at the end of this thread: #90170, which means you'll also need to install Triton from the OpenAI repo.

@tonyyuan1

tonyyuan1 commented Jan 27, 2023

I am not able to install Triton on my Windows 10 machine (Python 3.8).

C:\Users\yuanb> pip.exe install triton
ERROR: Could not find a version that satisfies the requirement triton (from versions: none)
ERROR: No matching distribution found for triton

I also tried installing from source, but it gave an error too:
C:\test\triton\python> pip install -e .


  File "C:\test\triton\python\setup.py", line 126, in run
    self.build_extension(ext)
  File "C:\test\triton\python\setup.py", line 132, in build_extension
    thirdparty_cmake_args = get_thirdparty_packages(triton_cache_path)
  File "C:\test\triton\python\setup.py", line 67, in get_thirdparty_packages
    packages = [get_pybind11_package_info(), get_llvm_package_info()]
  File "C:\test\triton\python\setup.py", line 55, in get_llvm_package_info
    system_suffix = {"Linux": "linux-gnu-ubuntu-18.04", "Darwin": "apple-darwin"}[system]
KeyError: 'Windows'
[end of output]

@nikolaydyankov

nikolaydyankov commented May 2, 2023

Is there a way to make this work with pytorch 1.10 and cuda 11.3? Currently I can't train OneFormer on RTX 4090.

@slavkovsky77

Is there a way to make this work with pytorch 1.10 and cuda 11.3? Currently I can't train OneFormer on RTX 4090.

Yeah I have the same issue

@ffdown

ffdown commented Oct 27, 2023

Hmm, can you please run python -c "import torch;print(torch.cuda.get_device_capability(0))" and share the output here? (I suspect that the 4090 is based on the Lovelace/Hopper GPU arch, isn't it?)

(8,9)

@pompomO

pompomO commented Nov 24, 2023

Hi. I'm hitting the same problem; here are my versions:
ntnu-arl/aerial_gym_simulator#1 (comment)

@Alkrick

Alkrick commented Nov 29, 2023

Is there a way to make this work with pytorch 1.10 and cuda 11.3? Currently I can't train OneFormer on RTX 4090.

I am facing the same problem on an RTX 4070 Ti; have you managed to find a solution to this issue?

@GraceKafuu

I met the same problem when training YOLOv5.


torch 1.12.1+cu113
torchvision 0.13.1+cu113


@kongdebug

I met the same problem.
Versions
RTX 4090
pytorch 1.12.1+cu113
cuda 11.3
cudnn 8.6.0
ubuntu 20.04

@eqy
Collaborator

eqy commented Dec 13, 2023

I met the same problem. Versions RTX 4090 pytorch 1.12.1+cu113 cuda 11.3 cudnn 8.6.0 ubuntu 20.04

As the RTX 4090 is sm_89, it is not supported by CUDA 11.3 (a minimum of CUDA 11.8 is required).
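
A minimal sketch of how to see that mismatch locally, assuming the CUDA toolkit is installed (built with something like nvcc check_versions.cu -o check_versions):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop{};
  cudaGetDeviceProperties(&prop, 0);  // compute capability of GPU 0
  int runtime = 0;
  cudaRuntimeGetVersion(&runtime);    // e.g. 11030 for 11.3, 11080 for 11.8
  std::printf("device: sm_%d%d, runtime: %d.%d\n",
              prop.major, prop.minor, runtime / 1000, (runtime % 1000) / 10);
  // An sm_89 device paired with a runtime older than 11.8 is the combination
  // that makes NVRTC reject the --gpu-architecture value.
  return 0;
}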

@kongdebug

I met the same problem. Versions RTX 4090 pytorch 1.12.1+cu113 cuda 11.3 cudnn 8.6.0 ubuntu 20.04

As the RTX 4090 is sm_89, it is not supported by CUDA 11.3 (a minimum of CUDA 11.8 is required).

Thanks for your reply :)
I solved these issues by using Docker.

@GraceKafuu

I met the same problem when training YOLOv5.

torch 1.12.1+cu113

torchvision 0.13.1+cu113

Solved by updating to:
torch 2.1.2+cu121
torchvision 0.16.2+cu121

@Twenty3hree

Solved by updating.

@heiyuxiaokai

Solved by updating to:

torch 1.13.0+cu116
torchaudio 0.13.0+cu116
torchvision 0.14.0+cu116
