Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cmultithreading #343

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions c/.gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
blake3
example
example-mmap
build/
*.o
1 change: 1 addition & 0 deletions c/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ add_library(blake3
blake3.c
blake3_dispatch.c
blake3_portable.c
blake3_thread.c
)
add_library(BLAKE3::blake3 ALIAS blake3)

Expand Down
9 changes: 6 additions & 3 deletions c/Makefile.testing
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ ifdef BLAKE3_NO_NEON
EXTRAFLAGS += -DBLAKE3_USE_NEON=0
endif

all: blake3.c blake3_dispatch.c blake3_portable.c main.c $(TARGETS)
all: blake3.c blake3_dispatch.c blake3_portable.c main.c blake3_thread.c $(TARGETS)
$(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS)

blake3_sse2.o: blake3_sse2.c
Expand All @@ -68,14 +68,17 @@ test: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined
test: all
./test.py

asm: blake3.c blake3_dispatch.c blake3_portable.c main.c $(ASM_TARGETS)
asm: blake3.c blake3_dispatch.c blake3_portable.c main.c blake3_thread.c $(ASM_TARGETS)
$(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS)

test_asm: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined
test_asm: asm
./test.py

example: example.c blake3.c blake3_dispatch.c blake3_portable.c $(ASM_TARGETS)
example: example.c blake3.c blake3_dispatch.c blake3_portable.c blake3_thread.c $(ASM_TARGETS)
$(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $@ $(LDFLAGS)

example-mmap: example-mmap.c blake3.c blake3_dispatch.c blake3_portable.c blake3_thread.c $(ASM_TARGETS)
$(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $@ $(LDFLAGS)

clean:
Expand Down
79 changes: 69 additions & 10 deletions c/blake3.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@

#include "blake3.h"
#include "blake3_impl.h"
#include "blake3_thread.h"

const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }

INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
uint8_t flags) {
memcpy(self->cv, key, BLAKE3_KEY_LEN);
self->chunk_counter = 0;
memset(self->buf, 0, BLAKE3_BLOCK_LEN);
self->buf_len = 0;
self->blocks_compressed = 0;
self->flags = flags;
Expand All @@ -22,7 +22,6 @@ INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8],
memcpy(self->cv, key, BLAKE3_KEY_LEN);
self->chunk_counter = chunk_counter;
self->blocks_compressed = 0;
memset(self->buf, 0, BLAKE3_BLOCK_LEN);
self->buf_len = 0;
}

Expand Down Expand Up @@ -65,7 +64,9 @@ INLINE output_t make_output(const uint32_t input_cv[8],
uint8_t flags) {
output_t ret;
memcpy(ret.input_cv, input_cv, 32);
memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
// copy out what's there and fill the rest with zeroes
memcpy(ret.block, block, block_len);
memset(ret.block + block_len, 0, BLAKE3_BLOCK_LEN - block_len);
ret.block_len = block_len;
ret.counter = counter;
ret.flags = flags;
Expand Down Expand Up @@ -121,7 +122,6 @@ INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
self->flags | chunk_state_maybe_start_flag(self));
self->blocks_compressed += 1;
self->buf_len = 0;
memset(self->buf, 0, BLAKE3_BLOCK_LEN);
}
}

Expand Down Expand Up @@ -244,6 +244,44 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
}
}


// Argument bundle for a single blake3_compress_subtree_wide() call. Packing
// the arguments and the result into one struct lets the call be dispatched
// through a lone `void *` (see blake3_compress_subtree_wide_thread).
typedef struct {
// inputs: forwarded as the same-named arguments of
// blake3_compress_subtree_wide()
const uint8_t *input;
size_t input_len;
const uint32_t *key;
uint64_t chunk_counter;
uint8_t flags;
// outputs
// destination buffer handed to blake3_compress_subtree_wide() as `out`
uint8_t *out;
// value returned by blake3_compress_subtree_wide() (length of the chaining
// value array written to `out`)
size_t n;
} blake3_compress_subtree_state;

// Forward declaration: blake3_compress_subtree_wide_thread() calls this
// function before its definition appears further down in the file.
static size_t blake3_compress_subtree_wide(const uint8_t *input,
size_t input_len,
const uint32_t key[8],
uint64_t chunk_counter,
uint8_t flags, uint8_t *out);

// Work-check callback passed to blake3_thread_arg_array_join() together with
// blake3_compress_subtree_wide_thread().
//
// `arg` points at a blake3_compress_subtree_state. Returns true when that job
// carries enough input to be worth off-loading to a thread (at least 8
// chunks), false otherwise.
static bool blake3_compress_subtree_wide_work_check(const void *arg) {
  const blake3_compress_subtree_state *job = arg;
  const size_t min_offload_len = 8 * BLAKE3_CHUNK_LEN;

  if (job->input_len < min_offload_len) {
    return false;
  }
  return true;
}

// Work callback passed to blake3_thread_arg_array_join().
//
// `arg` points at a blake3_compress_subtree_state. The input fields are
// unpacked into a blake3_compress_subtree_wide() call and the returned count
// is stored back into the state's `n` field for the submitter to read.
static void blake3_compress_subtree_wide_thread(void *arg) {
  blake3_compress_subtree_state *job = arg;

  const size_t cv_count =
      blake3_compress_subtree_wide(job->input, job->input_len, job->key,
                                   job->chunk_counter, job->flags, job->out);
  job->n = cv_count;
}

// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
Expand Down Expand Up @@ -299,12 +337,32 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
}
uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];

// Recurse! If this implementation adds multi-threading support in the
// future, this is where it will go.
size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
chunk_counter, flags, cv_array);
size_t right_n = blake3_compress_subtree_wide(
right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
// Recurse! this is the multi-threaded implementation
blake3_compress_subtree_state states[2];

/* common */
states[0].key = states[1].key = key;
states[0].flags = states[1].flags = flags;

/* left */
states[0].input = input;
states[0].input_len = left_input_len;
states[0].chunk_counter = chunk_counter;
states[0].out = cv_array;

/* right */
states[1].input = right_input;
states[1].input_len = right_input_len;
states[1].chunk_counter = right_chunk_counter;
states[1].out = right_cvs;

blake3_thread_arg_array_join(blake3_get_thread_pool(),
blake3_compress_subtree_wide_thread,
blake3_compress_subtree_wide_work_check,
states, sizeof(states[0]), 2);

size_t left_n = states[0].n;
size_t right_n = states[1].n;

// The special case again. If simd_degree=1, then we'll have left_n=1 and
// right_n=1. Rather than compressing them into a single output, return
Expand Down Expand Up @@ -361,6 +419,7 @@ INLINE void compress_subtree_to_parent_node(

INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
uint8_t flags) {
memset(self, 0, sizeof(*self));
memcpy(self->key, key, BLAKE3_KEY_LEN);
chunk_state_init(&self->chunk, key, flags);
self->cv_stack_len = 0;
Expand Down
20 changes: 20 additions & 0 deletions c/blake3_dispatch.c
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdatomic.h>

#include "blake3_impl.h"
#include "blake3_thread.h"

#if defined(IS_X86)
#if defined(_MSC_VER)
Expand Down Expand Up @@ -303,3 +305,21 @@ size_t blake3_simd_degree(void) {
#endif
return 1;
}

// Returns the shared thread pool, creating it on first use.
//
// The pool pointer lives in a function-local static atomic. Several threads
// may race through the first call: each racer creates its own pool, exactly
// one of them publishes it with a compare-exchange, and the losers destroy
// their pool and return the published one instead.
//
// Returns NULL only if pool creation failed and no other thread has published
// a pool (reachable only when assert() is compiled out with NDEBUG).
blake3_thread_pool *blake3_get_thread_pool(void) {
  static _Atomic(blake3_thread_pool *) g_thread_pool;

  // Fast path: the pool has already been published.
  blake3_thread_pool *tp = atomic_load(&g_thread_pool);
  if (tp != NULL) {
    return tp;
  }

  tp = blake3_thread_pool_create(0); /* let the pool implementation choose */
  assert(tp);
  if (tp == NULL) {
    // assert() vanishes under NDEBUG, so guard explicitly: never hand a NULL
    // pool to blake3_thread_pool_destroy(), and still prefer a pool that
    // another thread may have published in the meantime.
    return atomic_load(&g_thread_pool);
  }

  // Publish our pool. If the exchange fails, some other thread won the race;
  // `expected` then holds the winner's pool, so drop ours and use theirs.
  blake3_thread_pool *expected = NULL;
  if (!atomic_compare_exchange_strong(&g_thread_pool, &expected, tp)) {
    blake3_thread_pool_destroy(tp);
    return expected;
  }

  return tp;
}
2 changes: 2 additions & 0 deletions c/blake3_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <string.h>

#include "blake3.h"
#include "blake3_thread.h"

// internal flags
enum blake3_flags {
Expand Down Expand Up @@ -277,5 +278,6 @@ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
uint8_t flags_end, uint8_t *out);
#endif

blake3_thread_pool *blake3_get_thread_pool(void);

#endif /* BLAKE3_IMPL_H */