Skip to content

Commit

Permalink
Add a SIMD (AVX2) optimised vector distance function for int7 on x64 (#…
Browse files Browse the repository at this point in the history
…108088)

* Adding support for x64 to native vec library
* Fix: aarch64 sqr7u dims
* Fix: add symbol stripping (deb lintian)
---------
Co-authored-by: Chris Hegarty <62058229+ChrisHegarty@users.noreply.github.com>
Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
  • Loading branch information
ldematte committed May 10, 2024
1 parent 2541ce9 commit 2e0f8d0
Show file tree
Hide file tree
Showing 11 changed files with 254 additions and 22 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/108088.yaml
@@ -0,0 +1,5 @@
pr: 108088
summary: Add a SIMD (AVX2) optimised vector distance function for int7 on x64
area: "Search"
type: enhancement
issues: []
2 changes: 1 addition & 1 deletion libs/native/libraries/build.gradle
Expand Up @@ -18,7 +18,7 @@ configurations {
}

var zstdVersion = "1.5.5"
var vecVersion = "1.0.6"
var vecVersion = "1.0.8"

repositories {
exclusiveContent {
Expand Down
Expand Up @@ -45,7 +45,15 @@ public Optional<VectorSimilarityFunctions> getVectorSimilarityFunctions() {
}

static boolean isNativeVectorLibSupported() {
return Runtime.version().feature() >= 21 && isMacOrLinuxAarch64() && checkEnableSystemProperty();
return Runtime.version().feature() >= 21 && (isMacOrLinuxAarch64() || isLinuxAmd64()) && checkEnableSystemProperty();
}

/**
 * Tells whether the JVM is running on Linux on an x64 (amd64) CPU, i.e. the only x64 platform
 * the native library is currently supported on.
 *
 * @return {@code true} when {@code os.name} starts with "Linux" and {@code os.arch} is "amd64"
 */
static boolean isLinuxAmd64() {
    if (System.getProperty("os.name").startsWith("Linux") == false) {
        return false;
    }
    return System.getProperty("os.arch").equals("amd64");
}

/** Returns true iff the OS is Mac or Linux, and the architecture is aarch64. */
Expand Down
Expand Up @@ -37,7 +37,9 @@ public boolean supported() {
var arch = System.getProperty("os.arch");
var osName = System.getProperty("os.name");

if (jdkVersion >= 21 && arch.equals("aarch64") && (osName.startsWith("Mac") || osName.equals("Linux"))) {
if (jdkVersion >= 21
&& ((arch.equals("aarch64") && (osName.startsWith("Mac") || osName.equals("Linux")))
|| (arch.equals("amd64") && osName.equals("Linux")))) {
assertThat(vectorSimilarityFunctions, isPresent());
return true;
} else {
Expand Down
5 changes: 3 additions & 2 deletions libs/vec/native/Dockerfile
Expand Up @@ -4,6 +4,7 @@ RUN apt update
RUN apt install -y gcc g++ openjdk-17-jdk
COPY . /workspace
WORKDIR /workspace
RUN ./gradlew --quiet --console=plain clean vecSharedLibrary
RUN ./gradlew --quiet --console=plain clean buildSharedLibrary
RUN strip --strip-unneeded build/output/libvec.so

CMD cat build/libs/vec/shared/libvec.so
CMD cat build/output/libvec.so
76 changes: 67 additions & 9 deletions libs/vec/native/build.gradle
Expand Up @@ -12,9 +12,10 @@ var os = org.gradle.internal.os.OperatingSystem.current()
// To update this library run publish_vec_binaries.sh ( or ./gradlew vecSharedLibrary )
// Or
// For local development, build the docker image with:
// docker build --platform linux/arm64 --progress=plain .
// docker build --platform linux/arm64 --progress=plain . (for aarch64)
// docker build --platform linux/amd64 --progress=plain . (for x64)
// Grab the image id from the console output, then, e.g.
// docker run 9c9f36564c148b275aeecc42749e7b4580ded79dcf51ff6ccc008c8861e7a979 > build/libs/vec/shared/libvec.so
// docker run 9c9f36564c148b275aeecc42749e7b4580ded79dcf51ff6ccc008c8861e7a979 > build/libs/vec/shared/$arch/libvec.so
//
// To run tests and benchmarks on a locally built libvec,
// 1. Temporarily comment out the download in libs/native/library/build.gradle
Expand All @@ -30,26 +31,83 @@ var os = org.gradle.internal.os.OperatingSystem.current()

group = 'org.elasticsearch'

def platformName = System.getProperty("os.arch");

model {
platforms {
aarch64 {
architecture "aarch64"
}
amd64 {
architecture "x86-64"
}
}
toolChains {
gcc(Gcc) {
target("aarch64") {
cCompiler.executable = "/usr/bin/gcc"
cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c99", "-march=armv8-a"]) }
}
target("amd64") {
cCompiler.executable = "/usr/bin/gcc"
cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c99", "-march=core-avx2", "-Wno-incompatible-pointer-types"]) }
}
}
clang(Clang)
}
platforms {
aarch64 {
architecture "aarch64"
cl(VisualCpp) {
eachPlatform { toolchain ->
def platform = toolchain.getPlatform()
if (platform.name == "x64") {
cCompiler.withArguments { args -> args.addAll(["/O2", "/LD", "-march=core-avx2"]) }
}
}
}
clang(Clang) {
target("amd64") {
cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c99", "-march=core-avx2"]) }
}
}
}
components {
vec(NativeLibrarySpec) {
targetPlatform "aarch64"
binaries.withType(SharedLibraryBinarySpec) {
cCompiler.args "-O3", "-std=c99", "-march=armv8-a"
targetPlatform "amd64"

sources {
c {
source {
srcDir "src/vec/c/${platformName}/"
include "*.c"
}
exportedHeaders {
srcDir "src/vec/headers/"
}
}
}
}
}
}

// Convenience task: builds the shared library for the architecture reported by the "os.arch" system
// property (platformName) and copies the linked binary into <buildDir>/output, so that callers can
// pick it up from one fixed location regardless of the architecture that was built.
tasks.register('buildSharedLibrary') {
description = 'Assembles native shared library for the host architecture'
if (platformName.equals("aarch64")) {
dependsOn tasks.vecAarch64SharedLibrary
doLast {
// Copy the files produced by the aarch64 link task into the common output directory
copy {
from tasks.linkVecAarch64SharedLibrary.outputs.files.files
into layout.buildDirectory.dir('output');
duplicatesStrategy = 'INCLUDE'
}
}
} else if (platformName.equals("amd64")) {
dependsOn tasks.vecAmd64SharedLibrary
doLast {
// Copy the files produced by the amd64 link task into the common output directory
copy {
from tasks.linkVecAmd64SharedLibrary.outputs.files.files
into layout.buildDirectory.dir('output');
duplicatesStrategy = 'INCLUDE'
}
}
} else {
// Only aarch64 and amd64 are handled; this check runs inside the task's configuration block,
// not inside doLast
throw new GradleException("Unsupported platform: " + platformName)
}
}
16 changes: 11 additions & 5 deletions libs/vec/native/publish_vec_binaries.sh
Expand Up @@ -19,7 +19,7 @@ if [ -z "$ARTIFACTORY_API_KEY" ]; then
exit 1;
fi

VERSION="1.0.6"
VERSION="1.0.8"
ARTIFACTORY_REPOSITORY="${ARTIFACTORY_REPOSITORY:-https://artifactory.elastic.dev/artifactory/elasticsearch-native/}"
TEMP=$(mktemp -d)

Expand All @@ -29,16 +29,22 @@ if curl -sS -I --fail --location "${ARTIFACTORY_REPOSITORY}/org/elasticsearch/ve
fi

echo 'Building Darwin binary...'
./gradlew --quiet --console=plain vecSharedLibrary
./gradlew --quiet --console=plain vecAarch64SharedLibrary

echo 'Building Linux binary...'
DOCKER_IMAGE=$(docker build --platform linux/arm64 --quiet .)
docker run $DOCKER_IMAGE > build/libs/vec/shared/libvec.so
docker run $DOCKER_IMAGE > build/libs/vec/shared/aarch64/libvec.so

echo 'Building Linux x64 binary...'
DOCKER_IMAGE=$(docker build --platform linux/amd64 --quiet .)
docker run --platform linux/amd64 $DOCKER_IMAGE > build/libs/vec/shared/amd64/libvec.so

mkdir -p $TEMP/darwin-aarch64
mkdir -p $TEMP/linux-aarch64
cp build/libs/vec/shared/libvec.dylib $TEMP/darwin-aarch64/
cp build/libs/vec/shared/libvec.so $TEMP/linux-aarch64/
mkdir -p $TEMP/linux-x64
cp build/libs/vec/shared/aarch64/libvec.dylib $TEMP/darwin-aarch64/
cp build/libs/vec/shared/aarch64/libvec.so $TEMP/linux-aarch64/
cp build/libs/vec/shared/amd64/libvec.so $TEMP/linux-x64/

echo 'Uploading to Artifactory...'
(cd $TEMP && zip -rq - .) | curl -sS -X PUT -H "X-JFrog-Art-Api: ${ARTIFACTORY_API_KEY}" --data-binary @- --location "${ARTIFACTORY_REPOSITORY}/org/elasticsearch/vec/${VERSION}/vec-${VERSION}.zip"
Expand Down
Expand Up @@ -121,7 +121,7 @@ static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, size_t dims) {
EXPORT int32_t sqr7u(int8_t* a, int8_t* b, size_t dims) {
int32_t res = 0;
int i = 0;
if (i > SQR7U_STRIDE_BYTES_LEN) {
if (dims > SQR7U_STRIDE_BYTES_LEN) {
i += dims & ~(SQR7U_STRIDE_BYTES_LEN - 1);
res = sqr7u_inner(a, b, i);
}
Expand Down
150 changes: 150 additions & 0 deletions libs/vec/native/src/vec/c/amd64/vec.c
@@ -0,0 +1,150 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

#include <stddef.h>
#include <stdint.h>
#include "vec.h"

#include <emmintrin.h>
#include <immintrin.h>

#ifndef DOT7U_STRIDE_BYTES_LEN
#define DOT7U_STRIDE_BYTES_LEN 32 // Must be a power of 2
#endif

#ifndef SQR7U_STRIDE_BYTES_LEN
#define SQR7U_STRIDE_BYTES_LEN 32 // Must be a power of 2
#endif

#ifdef _MSC_VER
#include <intrin.h>
#elif __GNUC__
#include <x86intrin.h>
#elif __clang__
#include <x86intrin.h>
#endif

// Portable wrapper around the x86 CPUID instruction. "functionNumber" is the leaf loaded into the eax
// register; the subleaf (ecx) is always 0. The four result registers are written to "output" in the
// order eax, ebx, ecx, edx (output[0] .. output[3]).
static inline void cpuid(int output[4], int functionNumber) {
#if defined(__GNUC__) || defined(__clang__)
    // GNU/AT&T inline assembly: each result register is bound directly to its slot in "output"
    __asm("cpuid"
          : "=a"(output[0]), "=b"(output[1]), "=c"(output[2]), "=d"(output[3])
          : "a"(functionNumber), "c"(0));
#elif defined (_MSC_VER)
    // MSVC has no inline assembly on x64; use the compiler intrinsic with subleaf 0
    __cpuidex(output, functionNumber, 0);
#else
    #error Unsupported compiler
#endif
}

// Utility function to horizontally add 8 32-bit integers
static inline int hsum_i32_8(const __m256i a) {
const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
const __m128i sum64 = _mm_add_epi32(hi64, sum128);
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
}

// Reports whether the CPU advertises AVX2: returns 1 when CPUID leaf 7 has bit 5 of EBX set,
// and 0 otherwise (including when leaf 7 is not available at all).
EXPORT int vec_caps() {
    int regs[4] = {-1};

    // Leaf 0 returns, in eax, the highest leaf number this CPU supports
    cpuid(regs, 0);
    const int maxLeaf = regs[0];
    if (maxLeaf < 7) {
        return 0;
    }

    // Leaf 7 (subleaf 0): the AVX2 feature flag is bit 5 of ebx.
    // We assume that all processors that have AVX2 also have FMA3
    cpuid(regs, 7);
    const int avx2Mask = 1 << 5;
    return (regs[1] & avx2Mask) != 0 ? 1 : 0;
}

// AVX2 kernel for the dot product of the first "dims" bytes of a and b, consuming
// DOT7U_STRIDE_BYTES_LEN (32) bytes of each vector per iteration.
// The loop always loads a full stride, so "dims" is expected to be a multiple of
// DOT7U_STRIDE_BYTES_LEN; the caller (dot7u) rounds down and handles the tail itself.
// NOTE(review): presumably all elements are 7-bit values (0..127), as the "7u" name suggests. With
// such inputs each pairwise 16-bit sum is at most 2 * 127 * 127 = 32258, so the saturating add in
// _mm256_maddubs_epi16 never saturates.
static inline int32_t dot7u_inner(int8_t* a, int8_t* b, size_t dims) {
    const __m256i ones = _mm256_set1_epi16(1);

    // Init accumulator(s) with 0
    __m256i acc1 = _mm256_setzero_si256();

    // The index is a size_t, like the bound it is compared against
#pragma GCC unroll 4
    for (size_t i = 0; i < dims; i += DOT7U_STRIDE_BYTES_LEN) {
        // Unaligned load of 32 packed 8-bit integers. The intrinsic is declared to take a
        // const __m256i*, so cast explicitly instead of relying on -Wno-incompatible-pointer-types
        const __m256i va1 = _mm256_loadu_si256((const __m256i*)(a + i));
        const __m256i vb1 = _mm256_loadu_si256((const __m256i*)(b + i));

        // Multiply each unsigned 8-bit integer from va1 with the corresponding signed 8-bit integer
        // from vb1 and add adjacent pairs of products, producing 16 signed 16-bit integers
        const __m256i vab = _mm256_maddubs_epi16(va1, vb1);
        // Multiplying by 1 and adding adjacent pairs widens the 16-bit sums to 8 32-bit integers,
        // which are then added to the accumulator
        acc1 = _mm256_add_epi32(_mm256_madd_epi16(ones, vab), acc1);
    }

    // reduce (horizontally add all)
    return hsum_i32_8(acc1);
}

// Dot product of two vectors of "dims" 8-bit elements.
// The largest prefix whose length is a multiple of DOT7U_STRIDE_BYTES_LEN is handed to the AVX2
// kernel (dot7u_inner); the remaining tail elements are accumulated with a scalar loop.
EXPORT int32_t dot7u(int8_t* a, int8_t* b, size_t dims) {
    int32_t res = 0;
    // size_t, like dims: an int index would be truncated for very large inputs
    size_t i = 0;
    // ">=" so that an input of exactly one stride also takes the vectorized path
    if (dims >= DOT7U_STRIDE_BYTES_LEN) {
        // Round dims down to a multiple of the stride (the stride is a power of 2)
        i = dims & ~((size_t)DOT7U_STRIDE_BYTES_LEN - 1);
        res = dot7u_inner(a, b, i);
    }
    // Scalar tail: the elements the vectorized kernel did not cover
    for (; i < dims; i++) {
        res += a[i] * b[i];
    }
    return res;
}

// AVX2 kernel for the squared Euclidean distance between the first "dims" bytes of a and b,
// consuming SQR7U_STRIDE_BYTES_LEN (32) bytes of each vector per iteration.
// The loop always loads a full stride, so "dims" is expected to be a multiple of
// SQR7U_STRIDE_BYTES_LEN; the caller (sqr7u) rounds down and handles the tail itself.
// NOTE(review): presumably all elements are 7-bit values (0..127), as the "7u" name suggests. Then
// every difference fits in a signed 8-bit lane and each pairwise 16-bit sum of squares is at most
// 2 * 127 * 127 = 32258, so _mm256_maddubs_epi16 never saturates.
static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, size_t dims) {
    // Init accumulator(s) with 0
    __m256i acc1 = _mm256_setzero_si256();

    const __m256i ones = _mm256_set1_epi16(1);

    // The index is a size_t, like the bound it is compared against
#pragma GCC unroll 4
    for (size_t i = 0; i < dims; i += SQR7U_STRIDE_BYTES_LEN) {
        // Unaligned load of 32 packed 8-bit integers. The intrinsic is declared to take a
        // const __m256i*, so cast explicitly instead of relying on -Wno-incompatible-pointer-types
        const __m256i va1 = _mm256_loadu_si256((const __m256i*)(a + i));
        const __m256i vb1 = _mm256_loadu_si256((const __m256i*)(b + i));

        // Per-element difference, then its absolute value (negating each lane that is negative)
        const __m256i dist1 = _mm256_sub_epi8(va1, vb1);
        const __m256i abs_dist1 = _mm256_sign_epi8(dist1, dist1);
        // Square each absolute difference and add adjacent pairs into 16 signed 16-bit integers
        const __m256i sqr1 = _mm256_maddubs_epi16(abs_dist1, abs_dist1);

        // Widen the 16-bit sums to 8 32-bit integers and add them to the accumulator
        acc1 = _mm256_add_epi32(_mm256_madd_epi16(ones, sqr1), acc1);
    }

    // reduce (accumulate all)
    return hsum_i32_8(acc1);
}

// Squared Euclidean distance between two vectors of "dims" 8-bit elements.
// The largest prefix whose length is a multiple of SQR7U_STRIDE_BYTES_LEN is handed to the AVX2
// kernel (sqr7u_inner); the remaining tail elements are accumulated with a scalar loop.
EXPORT int32_t sqr7u(int8_t* a, int8_t* b, size_t dims) {
    int32_t res = 0;
    // size_t, like dims: an int index would be truncated for very large inputs
    size_t i = 0;
    // ">=" so that an input of exactly one stride also takes the vectorized path
    if (dims >= SQR7U_STRIDE_BYTES_LEN) {
        // Round dims down to a multiple of the stride (the stride is a power of 2)
        i = dims & ~((size_t)SQR7U_STRIDE_BYTES_LEN - 1);
        res = sqr7u_inner(a, b, i);
    }
    // Scalar tail: the elements the vectorized kernel did not cover
    for (; i < dims; i++) {
        int32_t dist = a[i] - b[i];
        res += dist * dist;
    }
    return res;
}

2 changes: 1 addition & 1 deletion libs/vec/native/src/vec/headers/vec.h
Expand Up @@ -7,7 +7,7 @@
*/

#ifdef _MSC_VER
#define EXPORT extern "C" __declspec(dllexport)
#define EXPORT __declspec(dllexport)
#elif defined(__GNUC__) && !defined(__clang__)
#define EXPORT __attribute__((externally_visible,visibility("default")))
#elif __clang__
Expand Down
Expand Up @@ -39,7 +39,9 @@ public static boolean supported() {
var arch = System.getProperty("os.arch");
var osName = System.getProperty("os.name");

if (jdkVersion >= 21 && arch.equals("aarch64") && (osName.startsWith("Mac") || osName.equals("Linux"))) {
if (jdkVersion >= 21
&& (arch.equals("aarch64") && (osName.startsWith("Mac") || osName.equals("Linux"))
|| arch.equals("amd64") && osName.equals("Linux"))) {
assertThat(factory, isPresent());
return true;
} else {
Expand Down

0 comments on commit 2e0f8d0

Please sign in to comment.