Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce normal register loads #263

Merged
merged 3 commits into from
Mar 5, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
64 changes: 26 additions & 38 deletions larq_compute_engine/core/bgemm_kernels_arm64.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -294,18 +294,18 @@ void BinaryKernelNeonOutOfOrder32BP4x4(
// Load multiplication bias
"ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_POST_ACTIVATION_MULTIPLIER) "]\n"
// Offset these base pointers as needed given the current row, col.
"add x5, x1, %x[row], lsl #2\n"
"add x2, x1, %x[row], lsl #2\n"
"tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
"csel x1, x1, x5, eq\n"
"csel x1, x1, x2, eq\n"
// Load 4 bias-multiplication values.
"ld1 {v14.4s}, [x1], #16\n"

// Load addition bias
"ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_POST_ACTIVATION_BIAS) "]\n"
// Offset these base pointers as needed given the current row, col.
"add x5, x1, %x[row], lsl #2\n"
"add x2, x1, %x[row], lsl #2\n"
"tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
"csel x1, x1, x5, eq\n"
"csel x1, x1, x2, eq\n"
// Load 4 bias-addition values.
"ld1 {v15.4s}, [x1], #16\n"

Expand Down Expand Up @@ -405,32 +405,26 @@ void BinaryKernelNeonOutOfOrder32BP4x4(
// it to copy into the destination matrix the part that fits.
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"mov w14, #0\n"
"50:\n"
"mov w5, #0\n"
"mov w15, #0\n"
"51:\n"
"ldr w7, [x3, x5, lsl #2]\n"
"str w7, [x4, x5, lsl #2]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"ldr w13, [x3, x15, lsl #2]\n"
"str w13, [x4, x15, lsl #2]\n"
"add w15, w15, #1\n"
"cmp w15, w1\n"
"blt 51b\n"
"add w6, w6, #1\n"
"add w14, w14, #1\n"
"add x3, x3, #16\n"
"add x4, x4, x11\n"
"cmp w6, w2\n"
"cmp w14, w2\n"
"blt 50b\n"
"41:\n"
"add %[dst_ptr], %[dst_ptr], #16\n"

// At this point we have completely finished writing values to the
// destination matrix for the current block.

// Reload some params --- we had used x5 -- x7 for a few other things
// since the last time we had loaded them.
"ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
"ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
"ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"

// Move to the next block of the destination matrix, for the next iter
// of the main loop. Notice that lhs_col_ptr, rhs_col_ptr have already
// been updated earlier.
Expand Down Expand Up @@ -464,7 +458,7 @@ void BinaryKernelNeonOutOfOrder32BP4x4(
[dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
: [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
[dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf)
: "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
: "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "cc",
"memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
"v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
Expand Down Expand Up @@ -736,18 +730,18 @@ void BinaryKernelNeonOutOfOrder64BP4x4(
// Load multiplication bias
"ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_POST_ACTIVATION_MULTIPLIER) "]\n"
// Offset these base pointers as needed given the current row, col.
"add x5, x1, %x[row], lsl #2\n"
"add x2, x1, %x[row], lsl #2\n"
"tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
"csel x1, x1, x5, eq\n"
"csel x1, x1, x2, eq\n"
// Load 4 bias-multiplication values.
"ld1 {v14.4s}, [x1], #16\n"

// Load addition bias
"ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_POST_ACTIVATION_BIAS) "]\n"
// Offset these base pointers as needed given the current row, col.
"add x5, x1, %x[row], lsl #2\n"
"add x2, x1, %x[row], lsl #2\n"
"tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
"csel x1, x1, x5, eq\n"
"csel x1, x1, x2, eq\n"
// Load 4 bias-addition values.
"ld1 {v15.4s}, [x1], #16\n"

Expand Down Expand Up @@ -847,32 +841,26 @@ void BinaryKernelNeonOutOfOrder64BP4x4(
// it to copy into the destination matrix the part that fits.
"mov x3, %[dst_tmp_buf]\n"
"mov x4, %[dst_ptr]\n"
"mov w6, #0\n"
"mov w14, #0\n"
"50:\n"
"mov w5, #0\n"
"mov w15, #0\n"
"51:\n"
"ldr w7, [x3, x5, lsl #2]\n"
"str w7, [x4, x5, lsl #2]\n"
"add w5, w5, #1\n"
"cmp w5, w1\n"
"ldr w13, [x3, x15, lsl #2]\n"
"str w13, [x4, x15, lsl #2]\n"
"add w15, w15, #1\n"
"cmp w15, w1\n"
"blt 51b\n"
"add w6, w6, #1\n"
"add w14, w14, #1\n"
"add x3, x3, #16\n"
"add x4, x4, x11\n"
"cmp w6, w2\n"
"cmp w14, w2\n"
"blt 50b\n"
"41:\n"
"add %[dst_ptr], %[dst_ptr], #16\n"

// At this point we have completely finished writing values to the
// destination matrix for the current block.

// Reload some params --- we had used x5 -- x7 for a few other things
// since the last time we had loaded them.
"ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
"ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
"ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"

// Move to the next block of the destination matrix, for the next iter
// of the main loop. Notice that lhs_col_ptr, rhs_col_ptr have already
// been updated earlier.
Expand Down Expand Up @@ -906,7 +894,7 @@ void BinaryKernelNeonOutOfOrder64BP4x4(
[dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
: [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
[dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf)
: "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
: "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "cc",
"memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
"v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
Expand Down