Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CUDA - Inline literal in (...) statements #9528

Open
c200chromebook opened this issue Apr 12, 2024 · 2 comments
Open

CUDA - Inline literal in (...) statements #9528

c200chromebook opened this issue Apr 12, 2024 · 2 comments
Labels
CUDA CUDA related issue/PR performance - run time Performance issue occurring at run time.

Comments

@c200chromebook
Copy link
Contributor

c200chromebook commented Apr 12, 2024

Feature request

We are seeing (tiny) differences in performance between `x in (1, 2)` and `x == 1 or x == 2`, favoring the latter. A bit of asm inspection shows that, in the first case, a subfunction is created and not inlined on the CUDA target. Is it possible to inline this automatically, or to add support for this case to `literal_unroll`? The code is a bit prettier and more Pythonic if you can use the `in` operator.

import numba as nb
from numba import cuda

# Module-level integer constants compared against by the kernels below.
# THREE is defined but not referenced anywhere in this snippet.
ONE = 1
TWO = 2
THREE = 3


# Variant A of the comparison: the membership test is written with the `in`
# operator against a tuple of the module-level constants ONE and TWO.
# An explicit signature (void, 1-D float64 array) is passed to cuda.jit; the
# inspection code further down reads `fn_a.overloads` without ever launching
# the kernel.
@cuda.jit(nb.void(nb.float64[:]))
def fn_a(arg):
    # Tuple-membership form under test; keep this spelling as-is, it is the
    # construct whose generated code is being examined.
    if arg[0] in (ONE, TWO):
        # Increment the first element in place when it matched.
        arg[0] += 1


# Variant B of the comparison: the same check as fn_a, but spelled out as two
# explicit equality tests joined with `or`. Same explicit signature as fn_a
# (void, 1-D float64 array).
@cuda.jit(nb.void(nb.float64[:]))
def fn_b(arg):
    # Hand-expanded form; this is the baseline the `in` variant is compared to.
    if arg[0] == ONE or arg[0] == TWO:
        # Increment the first element in place when it matched.
        arg[0] += 1


# For each kernel, print the LLVM IR and then the assembly of its first
# compiled overload. A row of asterisks precedes each kernel's IR, a row of
# hashes separates IR from assembly, and a final row of asterisks closes the
# dump.
star_banner = "*" * 25
hash_banner = "#" * 25
for kernel in (fn_a, fn_b):
    print(star_banner)
    # `overloads` maps signatures to compiled kernels; take the first one.
    first_overload = next(iter(kernel.overloads.values()))
    print(first_overload.inspect_llvm())
    print(hash_banner)
    print(first_overload.inspect_asm(cc=None))
print(star_banner)

Yields

; ModuleID = "cuda.kernel.wrapper"
target triple = "nvptx64-nvidia-cuda"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

declare i32 @"_ZN8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE"(i8** %".1", i8* %".2", i8* %".3", i64 %".4", i64 %".5", double* %".6", i64 %".7", i64 %".8")

define void @"_ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE"(i8* %".1", i8* %".2", i64 %".3", i64 %".4", double* %".5", i64 %".6", i64 %".7")
{
.9:
  %"inserted.meminfo" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} undef, i8* %".1", 0
  %"inserted.parent" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.meminfo", i8* %".2", 1
  %"inserted.nitems" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.parent", i64 %".3", 2
  %"inserted.itemsize" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.nitems", i64 %".4", 3
  %"inserted.data" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.itemsize", double* %".5", 4
  %".10" = insertvalue [1 x i64] undef, i64 %".6", 0
  %"inserted.shape" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.data", [1 x i64] %".10", 5
  %".11" = insertvalue [1 x i64] undef, i64 %".7", 0
  %"inserted.strides" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.shape", [1 x i64] %".11", 6
  %".12" = alloca i8*
  store i8* null, i8** %".12"
  store i8* null, i8** %".12"
  %"extracted.meminfo" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 0
  %"extracted.parent" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 1
  %"extracted.nitems" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 2
  %"extracted.itemsize" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 3
  %"extracted.data" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 4
  %"extracted.shape" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 5
  %".15" = extractvalue [1 x i64] %"extracted.shape", 0
  %"extracted.strides" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 6
  %".16" = extractvalue [1 x i64] %"extracted.strides", 0
  %".17" = call i32 @"_ZN8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE"(i8** %".12", i8* %"extracted.meminfo", i8* %"extracted.parent", i64 %"extracted.nitems", i64 %"extracted.itemsize", double* %"extracted.data", i64 %".15", i64 %".16")
  %".18" = icmp eq i32 %".17", 0
  %".19" = icmp eq i32 %".17", -2
  %".20" = or i1 %".18", %".19"
  %".21" = xor i1 %".20", -1
  %".22" = icmp eq i32 %".17", -1
  %".23" = icmp eq i32 %".17", -3
  %".24" = icmp sge i32 %".17", 1
  %".25" = load i8*, i8** %".12"
  ret void
}

@"_ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__errcode__" = global i32 0
@"_ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__tidx__" = global i32 0
@"_ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__ctaidx__" = global i32 0
@"_ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__tidy__" = global i32 0
@"_ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__ctaidy__" = global i32 0
@"_ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__tidz__" = global i32 0
@"_ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__ctaidz__" = global i32 0
@"llvm.used" = appending global [1 x i8*] [i8* bitcast (void (i8*, i8*, i64, i64, double*, i64, i64)* @"_ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE" to i8*)], section "llvm.metadata"
!nvvmir.version = !{ !0 }
!nvvm.annotations = !{ !1 }
!0 = !{ i32 1, i32 9, i32 3, i32 1 }
!1 = !{ void (i8*, i8*, i64, i64, double*, i64, i64)* @"_ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE", !"kernel", i32 1 }

; ModuleID = "fn_a$1"
target triple = "nvptx64-nvidia-cuda"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

@"_ZN08NumbaEnv8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE" = common global i8* null
define linkonce_odr i32 @"_ZN8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE"(i8** %".ret", i8* %"arg.arg.0", i8* %"arg.arg.1", i64 %"arg.arg.2", i64 %"arg.arg.3", double* %"arg.arg.4", i64 %"arg.arg.5.0", i64 %"arg.arg.6.0")
{
entry:
  %"inserted.meminfo" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} undef, i8* %"arg.arg.0", 0
  %"inserted.parent" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.meminfo", i8* %"arg.arg.1", 1
  %"inserted.nitems" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.parent", i64 %"arg.arg.2", 2
  %"inserted.itemsize" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.nitems", i64 %"arg.arg.3", 3
  %"inserted.data" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.itemsize", double* %"arg.arg.4", 4
  %".10" = insertvalue [1 x i64] undef, i64 %"arg.arg.5.0", 0
  %"inserted.shape" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.data", [1 x i64] %".10", 5
  %".11" = insertvalue [1 x i64] undef, i64 %"arg.arg.6.0", 0
  %"inserted.strides" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.shape", [1 x i64] %".11", 6
  %"arg" = alloca {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  %".15" = alloca {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15"
  %".51" = alloca i8
  store i8 0, i8* %".51"
  %".65" = alloca i1
  store i1 0, i1* %".65"
  %".78" = alloca {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".78"
  %".111" = alloca {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".111"
  br label %"B0"
B0:
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  %".14" = load {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %".14", {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15"
  %".19" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15", i32 0, i32 5
  %".20" = getelementptr inbounds [1 x i64], [1 x i64]* %".19", i32 0, i32 0
  %".21" = load i64, i64* %".20", !range !1
  %".22" = insertvalue [1 x i64] undef, i64 %".21", 0
  %".23" = extractvalue [1 x i64] %".22", 0
  %".24" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15", i32 0, i32 6
  %".25" = load [1 x i64], [1 x i64]* %".24"
  %".26" = extractvalue [1 x i64] %".25", 0
  %".27" = icmp slt i64 0, 0
  %".28" = add i64 0, %".23"
  %".29" = select  i1 %".27", i64 %".28", i64 0
  %".30" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15", i32 0, i32 5
  %".31" = getelementptr inbounds [1 x i64], [1 x i64]* %".30", i32 0, i32 0
  %".32" = load i64, i64* %".31", !range !1
  %".33" = insertvalue [1 x i64] undef, i64 %".32", 0
  %".34" = extractvalue [1 x i64] %".33", 0
  %".35" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15", i32 0, i32 6
  %".36" = load [1 x i64], [1 x i64]* %".35"
  %".37" = extractvalue [1 x i64] %".36", 0
  %".38" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15", i32 0, i32 4
  %".39" = load double*, double** %".38"
  %".40" = mul i64 %".37", %".29"
  %".41" = ptrtoint double* %".39" to i64
  %".42" = add i64 %".41", %".40"
  %".43" = inttoptr i64 %".42" to double*
  %".44" = load double, double* %".43"
  %".45" = insertvalue {i64, i64} undef, i64 1, 0
  %".46" = insertvalue {i64, i64} %".45", i64 2, 1
  %".47" = extractvalue {i64, i64} %".46", 0
  %".48" = extractvalue {i64, i64} %".46", 1
  %".49" = insertvalue [2 x i64] undef, i64 1, 0
  %".50" = insertvalue [2 x i64] %".49", i64 2, 1
  store i8 0, i8* %".51"
  %".54" = extractvalue [2 x i64] %".50", 0
  %".55" = extractvalue [2 x i64] %".50", 1
  %".56" = call i32 @"_ZN5numba7cpython7listobj6in_seq12_3clocals_3e17seq_contains_implB2v2B40c8tJTC_2fWQI8IW1CiAAYKPM6RBFDjESyhCQA_3dE8UniTupleIxLi2EEd"(i8* %".51", i64 %".54", i64 %".55", double %".44")
  %".57" = icmp eq i32 %".56", 0
  %".58" = icmp eq i32 %".56", -2
  %".59" = or i1 %".57", %".58"
  %".60" = xor i1 %".59", -1
  %".61" = icmp eq i32 %".56", -1
  %".62" = icmp eq i32 %".56", -3
  %".63" = icmp sge i32 %".56", 1
  %".64" = load i8, i8* %".51"
  %".67" = icmp eq i8 %".64", 0
  br i1 %".67", label %"B0.if", label %"B0.else"
B18:
  %".77" = load {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".78"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %".77", {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".78"
  %".82" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".78", i32 0, i32 5
  %".83" = getelementptr inbounds [1 x i64], [1 x i64]* %".82", i32 0, i32 0
  %".84" = load i64, i64* %".83", !range !1
  %".85" = insertvalue [1 x i64] undef, i64 %".84", 0
  %".86" = extractvalue [1 x i64] %".85", 0
  %".87" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".78", i32 0, i32 6
  %".88" = load [1 x i64], [1 x i64]* %".87"
  %".89" = extractvalue [1 x i64] %".88", 0
  %".90" = icmp slt i64 0, 0
  %".91" = add i64 0, %".86"
  %".92" = select  i1 %".90", i64 %".91", i64 0
  %".93" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".78", i32 0, i32 5
  %".94" = getelementptr inbounds [1 x i64], [1 x i64]* %".93", i32 0, i32 0
  %".95" = load i64, i64* %".94", !range !1
  %".96" = insertvalue [1 x i64] undef, i64 %".95", 0
  %".97" = extractvalue [1 x i64] %".96", 0
  %".98" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".78", i32 0, i32 6
  %".99" = load [1 x i64], [1 x i64]* %".98"
  %".100" = extractvalue [1 x i64] %".99", 0
  %".101" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".78", i32 0, i32 4
  %".102" = load double*, double** %".101"
  %".103" = mul i64 %".100", %".92"
  %".104" = ptrtoint double* %".102" to i64
  %".105" = add i64 %".104", %".103"
  %".106" = inttoptr i64 %".105" to double*
  %".107" = load double, double* %".106"
  %".108" = sitofp i64 1 to double
  %".109" = fadd double %".107", %".108"
  %".110" = load {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".111"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %".110", {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".111"
  %".115" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".111", i32 0, i32 5
  %".116" = getelementptr inbounds [1 x i64], [1 x i64]* %".115", i32 0, i32 0
  %".117" = load i64, i64* %".116", !range !1
  %".118" = insertvalue [1 x i64] undef, i64 %".117", 0
  %".119" = extractvalue [1 x i64] %".118", 0
  %".120" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".111", i32 0, i32 6
  %".121" = load [1 x i64], [1 x i64]* %".120"
  %".122" = extractvalue [1 x i64] %".121", 0
  %".123" = icmp slt i64 0, 0
  %".124" = add i64 0, %".119"
  %".125" = select  i1 %".123", i64 %".124", i64 0
  %".126" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".111", i32 0, i32 5
  %".127" = getelementptr inbounds [1 x i64], [1 x i64]* %".126", i32 0, i32 0
  %".128" = load i64, i64* %".127", !range !1
  %".129" = insertvalue [1 x i64] undef, i64 %".128", 0
  %".130" = extractvalue [1 x i64] %".129", 0
  %".131" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".111", i32 0, i32 6
  %".132" = load [1 x i64], [1 x i64]* %".131"
  %".133" = extractvalue [1 x i64] %".132", 0
  %".134" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".111", i32 0, i32 4
  %".135" = load double*, double** %".134"
  %".136" = mul i64 %".133", %".125"
  %".137" = ptrtoint double* %".135" to i64
  %".138" = add i64 %".137", %".136"
  %".139" = inttoptr i64 %".138" to double*
  store double %".109", double* %".139"
  %".141" = load {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  br label %"B34"
B34:
  %".144" = load {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store i8* null, i8** %".ret"
  ret i32 0
B0.if:
  store i1 0, i1* %".65"
  br label %"B0.endif"
B0.else:
  store i1 1, i1* %".65"
  br label %"B0.endif"
B0.endif:
  %".73" = load i1, i1* %".65"
  br i1 %".60", label %"B0.endif.if", label %"B0.endif.endif", !prof !2
B0.endif.if:
  ret i32 %".56"
B0.endif.endif:
  br i1 %".73", label %"B18", label %"B34"
}

declare i32 @"_ZN5numba7cpython7listobj6in_seq12_3clocals_3e17seq_contains_implB2v2B40c8tJTC_2fWQI8IW1CiAAYKPM6RBFDjESyhCQA_3dE8UniTupleIxLi2EEd"(i8* %".ret", i64 %"arg.lst.0", i64 %"arg.lst.1", double %"arg.value")

!nvvmir.version = !{ !0 }
!0 = !{ i32 1, i32 9, i32 3, i32 1 }
!1 = !{ i64 0, i64 9223372036854775807 }
!2 = !{ !"branch_weights", i32 1, i32 99 }

; ModuleID = "in_seq.<locals>.seq_contains_impl$2"
target triple = "nvptx64-nvidia-cuda"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

@"_ZN08NumbaEnv5numba7cpython7listobj6in_seq12_3clocals_3e17seq_contains_implB2v2B40c8tJTC_2fWQI8IW1CiAAYKPM6RBFDjESyhCQA_3dE8UniTupleIxLi2EEd" = common global i8* null
define linkonce_odr i32 @"_ZN5numba7cpython7listobj6in_seq12_3clocals_3e17seq_contains_implB2v2B40c8tJTC_2fWQI8IW1CiAAYKPM6RBFDjESyhCQA_3dE8UniTupleIxLi2EEd"(i8* %".ret", i64 %"arg.lst.0", i64 %"arg.lst.1", double %"arg.value")
{
entry:
  %".6" = insertvalue [2 x i64] undef, i64 %"arg.lst.0", 0
  %".7" = insertvalue [2 x i64] %".6", i64 %"arg.lst.1", 1
  %"value" = alloca double
  store double 0.0, double* %"value"
  %".10" = alloca {i64*, [2 x i64]}
  store {i64*, [2 x i64]} zeroinitializer, {i64*, [2 x i64]}* %".10"
  %".13" = alloca i64
  store i64 0, i64* %".13"
  %"$phi6.0" = alloca {i64*, [2 x i64]}
  store {i64*, [2 x i64]} zeroinitializer, {i64*, [2 x i64]}* %"$phi6.0"
  %"$16pred" = alloca i1
  store i1 0, i1* %"$16pred"
  %".29" = alloca {i64, i1}
  store {i64, i1} zeroinitializer, {i64, i1}* %".29"
  %".32" = alloca {i64*, [2 x i64]}
  store {i64*, [2 x i64]} zeroinitializer, {i64*, [2 x i64]}* %".32"
  %".60" = alloca {i64, i1}
  store {i64, i1} zeroinitializer, {i64, i1}* %".60"
  %".66" = alloca {i64, i1}
  store {i64, i1} zeroinitializer, {i64, i1}* %".66"
  %"$phi8.1" = alloca i64
  store i64 0, i64* %"$phi8.1"
  br label %"B0"
B0:
  store double %"arg.value", double* %"value"
  store {i64*, [2 x i64]} zeroinitializer, {i64*, [2 x i64]}* %".10"
  store i64 0, i64* %".13"
  %".16" = getelementptr inbounds {i64*, [2 x i64]}, {i64*, [2 x i64]}* %".10", i32 0, i32 0
  store i64* %".13", i64** %".16"
  %".18" = getelementptr inbounds {i64*, [2 x i64]}, {i64*, [2 x i64]}* %".10", i32 0, i32 1
  store [2 x i64] %".7", [2 x i64]* %".18"
  %".20" = load {i64*, [2 x i64]}, {i64*, [2 x i64]}* %".10"
  %".22" = load {i64*, [2 x i64]}, {i64*, [2 x i64]}* %"$phi6.0"
  store {i64*, [2 x i64]} %".20", {i64*, [2 x i64]}* %"$phi6.0"
  br label %"B6"
B6:
  %".26" = load i1, i1* %"$16pred"
  store i1 0, i1* %"$16pred"
  %".28" = load {i64*, [2 x i64]}, {i64*, [2 x i64]}* %"$phi6.0"
  store {i64, i1} zeroinitializer, {i64, i1}* %".29"
  store {i64*, [2 x i64]} zeroinitializer, {i64*, [2 x i64]}* %".32"
  store {i64*, [2 x i64]} %".28", {i64*, [2 x i64]}* %".32"
  %".36" = getelementptr inbounds {i64*, [2 x i64]}, {i64*, [2 x i64]}* %".32", i32 0, i32 1
  %".37" = load [2 x i64], [2 x i64]* %".36"
  %".38" = getelementptr inbounds {i64*, [2 x i64]}, {i64*, [2 x i64]}* %".32", i32 0, i32 0
  %".39" = load i64*, i64** %".38"
  %".40" = load i64, i64* %".39"
  %".41" = icmp slt i64 %".40", 2
  %".42" = getelementptr inbounds {i64, i1}, {i64, i1}* %".29", i32 0, i32 1
  store i1 %".41", i1* %".42"
  br i1 %".41", label %"B6.if", label %"B6.endif"
B8:
  %".76" = load i64, i64* %"$phi8.1"
  %".77" = load i64, i64* %"$phi8.1"
  store i64 0, i64* %"$phi8.1"
  %".79" = load double, double* %"value"
  %".80" = sitofp i64 %".76" to double
  %".81" = fcmp oeq double %".80", %".79"
  %".82" = load i1, i1* %"$16pred"
  store i1 %".81", i1* %"$16pred"
  %".84" = load i1, i1* %"$16pred"
  br i1 %".84", label %"B18", label %"B6"
B18:
  %".86" = load double, double* %"value"
  store double 0.0, double* %"value"
  %".88" = load {i64*, [2 x i64]}, {i64*, [2 x i64]}* %"$phi6.0"
  store {i64*, [2 x i64]} zeroinitializer, {i64*, [2 x i64]}* %"$phi6.0"
  %".90" = load i1, i1* %"$16pred"
  store i1 0, i1* %"$16pred"
  %".92" = zext i1 true to i8
  store i8 %".92", i8* %".ret"
  ret i32 0
B26:
  %".95" = load double, double* %"value"
  store double 0.0, double* %"value"
  %".97" = load i64, i64* %"$phi8.1"
  store i64 0, i64* %"$phi8.1"
  %".99" = load {i64*, [2 x i64]}, {i64*, [2 x i64]}* %"$phi6.0"
  store {i64*, [2 x i64]} zeroinitializer, {i64*, [2 x i64]}* %"$phi6.0"
  %".101" = zext i1 false to i8
  store i8 %".101", i8* %".ret"
  ret i32 0
B6.if:
  switch i64 %".40", label %"switch.else" [i64 0, label %"switch.0" i64 -2, label %"switch.0" i64 1, label %"switch.1" i64 -1, label %"switch.1"]
B6.endif:
  %".59" = load {i64, i1}, {i64, i1}* %".29"
  store {i64, i1} zeroinitializer, {i64, i1}* %".60"
  store {i64, i1} %".59", {i64, i1}* %".60"
  %".64" = getelementptr inbounds {i64, i1}, {i64, i1}* %".60", i32 0, i32 0
  %".65" = load i64, i64* %".64"
  store {i64, i1} zeroinitializer, {i64, i1}* %".66"
  store {i64, i1} %".59", {i64, i1}* %".66"
  %".70" = getelementptr inbounds {i64, i1}, {i64, i1}* %".66", i32 0, i32 1
  %".71" = load i1, i1* %".70"
  %".73" = load i64, i64* %"$phi8.1"
  store i64 %".65", i64* %"$phi8.1"
  br i1 %".71", label %"B8", label %"B26"
switch.else:
  ret i32 1
switch.end:
  %".47" = phi  i64 [%".48", %"switch.0"], [%".50", %"switch.1"]
  %".52" = getelementptr inbounds {i64, i1}, {i64, i1}* %".29", i32 0, i32 0
  store i64 %".47", i64* %".52"
  %".54" = add i64 %".40", 1
  %".55" = getelementptr inbounds {i64*, [2 x i64]}, {i64*, [2 x i64]}* %".32", i32 0, i32 0
  %".56" = load i64*, i64** %".55"
  store i64 %".54", i64* %".56"
  br label %"B6.endif"
switch.0:
  %".48" = extractvalue [2 x i64] %".37", 0
  br label %"switch.end"
switch.1:
  %".50" = extractvalue [2 x i64] %".37", 1
  br label %"switch.end"
}

!nvvmir.version = !{ !0 }
!0 = !{ i32 1, i32 9, i32 3, i32 1 }
#########################
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31294372
// Cuda compilation tools, release 11.7, V11.7.64
// Based on NVVM 7.0.1
//

.version 7.7
.target sm_75
.address_size 64

	// .globl	_ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__errcode__;
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__tidx__;
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__ctaidx__;
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__tidy__;
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__ctaidy__;
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__tidz__;
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__ctaidz__;
.common .global .align 8 .u64 _ZN08NumbaEnv8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE;
.common .global .align 8 .u64 _ZN08NumbaEnv5numba7cpython7listobj6in_seq12_3clocals_3e17seq_contains_implB2v2B40c8tJTC_2fWQI8IW1CiAAYKPM6RBFDjESyhCQA_3dE8UniTupleIxLi2EEd;

.visible .entry _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE(
	.param .u64 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_0,
	.param .u64 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_1,
	.param .u64 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_2,
	.param .u64 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_3,
	.param .u64 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_4,
	.param .u64 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_5,
	.param .u64 _ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_6
)
{
	.reg .pred 	%p<20>;
	.reg .f64 	%fd<4>;
	.reg .b64 	%rd<16>;


	ld.param.u64 	%rd8, [_ZN6cudapy8__main__4fn_aB2v1B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_4];
	cvta.to.global.u64 	%rd1, %rd8;
	ld.global.f64 	%fd1, [%rd1];
	mov.u64 	%rd7, 0;
	mov.u64 	%rd10, 1;
	mov.pred 	%p5, -1;
	mov.pred 	%p12, 0;
	mov.u64 	%rd14, %rd7;

$L__BB0_1:
	setp.gt.s64 	%p3, %rd14, 1;
	mov.u64 	%rd13, %rd7;
	@%p3 bra 	$L__BB0_9;

	setp.gt.s64 	%p6, %rd14, -1;
	@%p6 bra 	$L__BB0_5;

	setp.eq.s64 	%p9, %rd14, -2;
	mov.u64 	%rd13, %rd10;
	@%p9 bra 	$L__BB0_8;

	setp.eq.s64 	%p10, %rd14, -1;
	mov.pred 	%p18, %p5;
	mov.pred 	%p19, %p5;
	@%p10 bra 	$L__BB0_7;
	bra.uni 	$L__BB0_11;

$L__BB0_5:
	setp.eq.s64 	%p7, %rd14, 0;
	mov.u64 	%rd13, %rd10;
	@%p7 bra 	$L__BB0_8;

	setp.ne.s64 	%p8, %rd14, 1;
	mov.pred 	%p18, %p5;
	mov.pred 	%p19, %p5;
	@%p8 bra 	$L__BB0_11;

$L__BB0_7:
	mov.u64 	%rd13, 2;

$L__BB0_8:
	add.s64 	%rd14, %rd14, 1;

$L__BB0_9:
	mov.pred 	%p18, %p5;
	mov.pred 	%p19, %p12;
	@%p3 bra 	$L__BB0_11;

	cvt.rn.f64.s64 	%fd2, %rd13;
	setp.neu.f64 	%p16, %fd1, %fd2;
	mov.pred 	%p18, %p12;
	mov.pred 	%p19, %p12;
	@%p16 bra 	$L__BB0_1;

$L__BB0_11:
	or.pred  	%p17, %p18, %p19;
	@%p17 bra 	$L__BB0_13;

	add.f64 	%fd3, %fd1, 0d3FF0000000000000;
	st.global.f64 	[%rd1], %fd3;

$L__BB0_13:
	ret;

}
*************************
; ModuleID = "cuda.kernel.wrapper"
target triple = "nvptx64-nvidia-cuda"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

declare i32 @"_ZN8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE"(i8** %".1", i8* %".2", i8* %".3", i64 %".4", i64 %".5", double* %".6", i64 %".7", i64 %".8")

define void @"_ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE"(i8* %".1", i8* %".2", i64 %".3", i64 %".4", double* %".5", i64 %".6", i64 %".7")
{
.9:
  %"inserted.meminfo" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} undef, i8* %".1", 0
  %"inserted.parent" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.meminfo", i8* %".2", 1
  %"inserted.nitems" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.parent", i64 %".3", 2
  %"inserted.itemsize" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.nitems", i64 %".4", 3
  %"inserted.data" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.itemsize", double* %".5", 4
  %".10" = insertvalue [1 x i64] undef, i64 %".6", 0
  %"inserted.shape" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.data", [1 x i64] %".10", 5
  %".11" = insertvalue [1 x i64] undef, i64 %".7", 0
  %"inserted.strides" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.shape", [1 x i64] %".11", 6
  %".12" = alloca i8*
  store i8* null, i8** %".12"
  store i8* null, i8** %".12"
  %"extracted.meminfo" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 0
  %"extracted.parent" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 1
  %"extracted.nitems" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 2
  %"extracted.itemsize" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 3
  %"extracted.data" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 4
  %"extracted.shape" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 5
  %".15" = extractvalue [1 x i64] %"extracted.shape", 0
  %"extracted.strides" = extractvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", 6
  %".16" = extractvalue [1 x i64] %"extracted.strides", 0
  %".17" = call i32 @"_ZN8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE"(i8** %".12", i8* %"extracted.meminfo", i8* %"extracted.parent", i64 %"extracted.nitems", i64 %"extracted.itemsize", double* %"extracted.data", i64 %".15", i64 %".16")
  %".18" = icmp eq i32 %".17", 0
  %".19" = icmp eq i32 %".17", -2
  %".20" = or i1 %".18", %".19"
  %".21" = xor i1 %".20", -1
  %".22" = icmp eq i32 %".17", -1
  %".23" = icmp eq i32 %".17", -3
  %".24" = icmp sge i32 %".17", 1
  %".25" = load i8*, i8** %".12"
  ret void
}

@"_ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__errcode__" = global i32 0
@"_ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__tidx__" = global i32 0
@"_ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__ctaidx__" = global i32 0
@"_ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__tidy__" = global i32 0
@"_ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__ctaidy__" = global i32 0
@"_ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__tidz__" = global i32 0
@"_ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__ctaidz__" = global i32 0
@"llvm.used" = appending global [1 x i8*] [i8* bitcast (void (i8*, i8*, i64, i64, double*, i64, i64)* @"_ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE" to i8*)], section "llvm.metadata"
!nvvmir.version = !{ !0 }
!nvvm.annotations = !{ !1 }
!0 = !{ i32 1, i32 9, i32 3, i32 1 }
!1 = !{ void (i8*, i8*, i64, i64, double*, i64, i64)* @"_ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE", !"kernel", i32 1 }

; ModuleID = "fn_b$3"
target triple = "nvptx64-nvidia-cuda"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

@"_ZN08NumbaEnv8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE" = common global i8* null
define linkonce_odr i32 @"_ZN8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE"(i8** %".ret", i8* %"arg.arg.0", i8* %"arg.arg.1", i64 %"arg.arg.2", i64 %"arg.arg.3", double* %"arg.arg.4", i64 %"arg.arg.5.0", i64 %"arg.arg.6.0")
{
entry:
  %"inserted.meminfo" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} undef, i8* %"arg.arg.0", 0
  %"inserted.parent" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.meminfo", i8* %"arg.arg.1", 1
  %"inserted.nitems" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.parent", i64 %"arg.arg.2", 2
  %"inserted.itemsize" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.nitems", i64 %"arg.arg.3", 3
  %"inserted.data" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.itemsize", double* %"arg.arg.4", 4
  %".10" = insertvalue [1 x i64] undef, i64 %"arg.arg.5.0", 0
  %"inserted.shape" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.data", [1 x i64] %".10", 5
  %".11" = insertvalue [1 x i64] undef, i64 %"arg.arg.6.0", 0
  %"inserted.strides" = insertvalue {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.shape", [1 x i64] %".11", 6
  %"arg" = alloca {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  %".15" = alloca {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15"
  %".49" = alloca {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".49"
  %".83" = alloca {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".83"
  %".116" = alloca {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".116"
  br label %"B0"
B0:
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %"inserted.strides", {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  %".14" = load {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %".14", {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15"
  %".19" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15", i32 0, i32 5
  %".20" = getelementptr inbounds [1 x i64], [1 x i64]* %".19", i32 0, i32 0
  %".21" = load i64, i64* %".20", !range !1
  %".22" = insertvalue [1 x i64] undef, i64 %".21", 0
  %".23" = extractvalue [1 x i64] %".22", 0
  %".24" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15", i32 0, i32 6
  %".25" = load [1 x i64], [1 x i64]* %".24"
  %".26" = extractvalue [1 x i64] %".25", 0
  %".27" = icmp slt i64 0, 0
  %".28" = add i64 0, %".23"
  %".29" = select  i1 %".27", i64 %".28", i64 0
  %".30" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15", i32 0, i32 5
  %".31" = getelementptr inbounds [1 x i64], [1 x i64]* %".30", i32 0, i32 0
  %".32" = load i64, i64* %".31", !range !1
  %".33" = insertvalue [1 x i64] undef, i64 %".32", 0
  %".34" = extractvalue [1 x i64] %".33", 0
  %".35" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15", i32 0, i32 6
  %".36" = load [1 x i64], [1 x i64]* %".35"
  %".37" = extractvalue [1 x i64] %".36", 0
  %".38" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".15", i32 0, i32 4
  %".39" = load double*, double** %".38"
  %".40" = mul i64 %".37", %".29"
  %".41" = ptrtoint double* %".39" to i64
  %".42" = add i64 %".41", %".40"
  %".43" = inttoptr i64 %".42" to double*
  %".44" = load double, double* %".43"
  %".45" = sitofp i64 1 to double
  %".46" = fcmp oeq double %".44", %".45"
  br i1 %".46", label %"B26", label %"B14"
B14:
  %".48" = load {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".49"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %".48", {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".49"
  %".53" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".49", i32 0, i32 5
  %".54" = getelementptr inbounds [1 x i64], [1 x i64]* %".53", i32 0, i32 0
  %".55" = load i64, i64* %".54", !range !1
  %".56" = insertvalue [1 x i64] undef, i64 %".55", 0
  %".57" = extractvalue [1 x i64] %".56", 0
  %".58" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".49", i32 0, i32 6
  %".59" = load [1 x i64], [1 x i64]* %".58"
  %".60" = extractvalue [1 x i64] %".59", 0
  %".61" = icmp slt i64 0, 0
  %".62" = add i64 0, %".57"
  %".63" = select  i1 %".61", i64 %".62", i64 0
  %".64" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".49", i32 0, i32 5
  %".65" = getelementptr inbounds [1 x i64], [1 x i64]* %".64", i32 0, i32 0
  %".66" = load i64, i64* %".65", !range !1
  %".67" = insertvalue [1 x i64] undef, i64 %".66", 0
  %".68" = extractvalue [1 x i64] %".67", 0
  %".69" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".49", i32 0, i32 6
  %".70" = load [1 x i64], [1 x i64]* %".69"
  %".71" = extractvalue [1 x i64] %".70", 0
  %".72" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".49", i32 0, i32 4
  %".73" = load double*, double** %".72"
  %".74" = mul i64 %".71", %".63"
  %".75" = ptrtoint double* %".73" to i64
  %".76" = add i64 %".75", %".74"
  %".77" = inttoptr i64 %".76" to double*
  %".78" = load double, double* %".77"
  %".79" = sitofp i64 2 to double
  %".80" = fcmp oeq double %".78", %".79"
  br i1 %".80", label %"B26", label %"B42"
B26:
  %".82" = load {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".83"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %".82", {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".83"
  %".87" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".83", i32 0, i32 5
  %".88" = getelementptr inbounds [1 x i64], [1 x i64]* %".87", i32 0, i32 0
  %".89" = load i64, i64* %".88", !range !1
  %".90" = insertvalue [1 x i64] undef, i64 %".89", 0
  %".91" = extractvalue [1 x i64] %".90", 0
  %".92" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".83", i32 0, i32 6
  %".93" = load [1 x i64], [1 x i64]* %".92"
  %".94" = extractvalue [1 x i64] %".93", 0
  %".95" = icmp slt i64 0, 0
  %".96" = add i64 0, %".91"
  %".97" = select  i1 %".95", i64 %".96", i64 0
  %".98" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".83", i32 0, i32 5
  %".99" = getelementptr inbounds [1 x i64], [1 x i64]* %".98", i32 0, i32 0
  %".100" = load i64, i64* %".99", !range !1
  %".101" = insertvalue [1 x i64] undef, i64 %".100", 0
  %".102" = extractvalue [1 x i64] %".101", 0
  %".103" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".83", i32 0, i32 6
  %".104" = load [1 x i64], [1 x i64]* %".103"
  %".105" = extractvalue [1 x i64] %".104", 0
  %".106" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".83", i32 0, i32 4
  %".107" = load double*, double** %".106"
  %".108" = mul i64 %".105", %".97"
  %".109" = ptrtoint double* %".107" to i64
  %".110" = add i64 %".109", %".108"
  %".111" = inttoptr i64 %".110" to double*
  %".112" = load double, double* %".111"
  %".113" = sitofp i64 1 to double
  %".114" = fadd double %".112", %".113"
  %".115" = load {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".116"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} %".115", {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".116"
  %".120" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".116", i32 0, i32 5
  %".121" = getelementptr inbounds [1 x i64], [1 x i64]* %".120", i32 0, i32 0
  %".122" = load i64, i64* %".121", !range !1
  %".123" = insertvalue [1 x i64] undef, i64 %".122", 0
  %".124" = extractvalue [1 x i64] %".123", 0
  %".125" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".116", i32 0, i32 6
  %".126" = load [1 x i64], [1 x i64]* %".125"
  %".127" = extractvalue [1 x i64] %".126", 0
  %".128" = icmp slt i64 0, 0
  %".129" = add i64 0, %".124"
  %".130" = select  i1 %".128", i64 %".129", i64 0
  %".131" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".116", i32 0, i32 5
  %".132" = getelementptr inbounds [1 x i64], [1 x i64]* %".131", i32 0, i32 0
  %".133" = load i64, i64* %".132", !range !1
  %".134" = insertvalue [1 x i64] undef, i64 %".133", 0
  %".135" = extractvalue [1 x i64] %".134", 0
  %".136" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".116", i32 0, i32 6
  %".137" = load [1 x i64], [1 x i64]* %".136"
  %".138" = extractvalue [1 x i64] %".137", 0
  %".139" = getelementptr inbounds {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %".116", i32 0, i32 4
  %".140" = load double*, double** %".139"
  %".141" = mul i64 %".138", %".130"
  %".142" = ptrtoint double* %".140" to i64
  %".143" = add i64 %".142", %".141"
  %".144" = inttoptr i64 %".143" to double*
  store double %".114", double* %".144"
  %".146" = load {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  br label %"B42"
B42:
  %".149" = load {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]} zeroinitializer, {i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}* %"arg"
  store i8* null, i8** %".ret"
  ret i32 0
}

!nvvmir.version = !{ !0 }
!0 = !{ i32 1, i32 9, i32 3, i32 1 }
!1 = !{ i64 0, i64 9223372036854775807 }
#########################
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31294372
// Cuda compilation tools, release 11.7, V11.7.64
// Based on NVVM 7.0.1
//

.version 7.7
.target sm_75
.address_size 64

	// .globl	_ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__errcode__;
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__tidx__;
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__ctaidx__;
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__tidy__;
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__ctaidy__;
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__tidz__;
.visible .global .align 4 .u32 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE__ctaidz__;
.common .global .align 8 .u64 _ZN08NumbaEnv8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE;

.visible .entry _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE(
	.param .u64 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_0,
	.param .u64 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_1,
	.param .u64 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_2,
	.param .u64 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_3,
	.param .u64 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_4,
	.param .u64 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_5,
	.param .u64 _ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_6
)
{
	.reg .pred 	%p<4>;
	.reg .f64 	%fd<3>;
	.reg .b64 	%rd<3>;


	ld.param.u64 	%rd2, [_ZN6cudapy8__main__4fn_bB2v3B94cw51cXTLSUwv1sCUt9Uw11Ew1dRRQPKzLTg4gaGKFsG2oMQGEYakJSQB1PQBk0Bynm21OiwU1a0UoLGhDpQE8oxrNQE_3dE5ArrayIdLi1E1A7mutable7alignedE_param_4];
	cvta.to.global.u64 	%rd1, %rd2;
	ld.global.f64 	%fd1, [%rd1];
	setp.neu.f64 	%p1, %fd1, 0d3FF0000000000000;
	setp.neu.f64 	%p2, %fd1, 0d4000000000000000;
	and.pred  	%p3, %p1, %p2;
	@%p3 bra 	$L__BB0_2;

	add.f64 	%fd2, %fd1, 0d3FF0000000000000;
	st.global.f64 	[%rd1], %fd2;

$L__BB0_2:
	ret;

}
*************************
@c200chromebook
Copy link
Contributor Author

c200chromebook commented Apr 12, 2024

The CPU target seems to do better than the CUDA target here (same two functions, compiled with `nb.jit` instead of `cuda.jit`), though I'm not as experienced in reading these outputs.

import numba as nb

# Module-level integer constants compared against by fn_a and fn_b below.
ONE = 1
TWO = 2
THREE = 3


# Variant A of the comparison: tuple-membership spelling.
# The decorator is given an explicit signature (void return, one
# ``float64[:]`` argument); the script below reads ``fn_a.overloads``
# without ever calling the function.
@nb.jit(nb.void(nb.float64[:]))
def fn_a(arg):
    """Add 1 to ``arg[0]`` in place when it equals ``ONE`` or ``TWO``.

    The check is written as ``in (ONE, TWO)``; ``fn_b`` performs the same
    check with ``==``/``or`` so the generated code can be compared.
    """
    # Keep the ``in`` form exactly as written: it is the construct whose
    # generated code this reproducer is meant to expose.
    if arg[0] in (ONE, TWO):
        arg[0] += 1


# Variant B of the comparison: explicit equality tests joined with ``or``.
# Same decorator and signature as fn_a so only the condition differs.
@nb.jit(nb.void(nb.float64[:]))
def fn_b(arg):
    """Add 1 to ``arg[0]`` in place when it equals ``ONE`` or ``TWO``.

    The check is written as two ``==`` comparisons joined with ``or``;
    ``fn_a`` performs the same check with ``in (ONE, TWO)``.
    """
    # Keep the ``==``/``or`` form exactly as written: it is the baseline
    # the ``in`` spelling is compared against.
    if arg[0] == ONE or arg[0] == TWO:
        arg[0] += 1


# For fn_a and then fn_b, print the LLVM IR followed by the assembly of the
# first entry in the function's ``overloads`` mapping. A row of '*' opens each
# function's section, a row of '#' separates IR from assembly, and a final
# row of '*' closes the dump.
for jitted in (fn_a, fn_b):
    print("*" * 25)
    print(next(iter(jitted.overloads.values())).library.get_llvm_str())
    print("#" * 25)
    print(next(iter(jitted.overloads.values())).library.get_asm_str())
print("*" * 25)

@c200chromebook c200chromebook changed the title Inline literal in (...) statements CUDA - Inline literal in (...) statements Apr 12, 2024
@c200chromebook
Copy link
Contributor Author

@gmarkall This one is probably yours

@sklam sklam added CUDA CUDA related issue/PR performance - run time Performance issue occurring at run time. labels Apr 15, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
CUDA CUDA related issue/PR performance - run time Performance issue occurring at run time.
Projects
None yet
Development

No branches or pull requests

2 participants