Skip to content

Commit

Permalink
Fix AVX2 build on Windows (facebookresearch#3238)
Browse files Browse the repository at this point in the history
Summary:
Tested with both MSVC (with /openmp:llvm) and clang-cl (no particular extra flags needed). This PR is split into two commits (three, after I found out that lines must be 80 characters or fewer):

1. Changes needed for clang-cl (and probably stock clang too)
2. Changes needed for MSVC

FAISS can therefore either require LLVM on Windows (not a hard thing to do these days, since it is fully supported inside Visual Studio) and discard the second commit, or take all of the commits and document the need to use /openmp:llvm when building with MSVC.

Closes facebookresearch#3193

Pull Request resolved: facebookresearch#3238

Reviewed By: mdouze

Differential Revision: D53479325

Pulled By: algoriddle

fbshipit-source-id: e8628f44626b6f49c5d9d7f259a9e3061cfe5568
  • Loading branch information
borrrden authored and facebook-github-bot committed Feb 15, 2024
1 parent c577f43 commit 87d43b9
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 15 deletions.
4 changes: 3 additions & 1 deletion faiss/impl/LocalSearchQuantizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,9 @@ void LocalSearchQuantizer::icm_encode_step(
{
size_t binary_idx = (other_m + 1) * M * K * K +
m * K * K + code2 * K + code;
_mm_prefetch(binaries + binary_idx, _MM_HINT_T0);
_mm_prefetch(
(const char*)(binaries + binary_idx),
_MM_HINT_T0);
}
}
#endif
Expand Down
9 changes: 9 additions & 0 deletions faiss/impl/platform_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,13 @@

#include <intrin.h>

#ifndef __clang__
// Stand-in for the GCC/Clang builtin of the same name, implemented with the
// _BitScanForward64 intrinsic from <intrin.h>. Returns, as an int, the bit
// index that the intrinsic stores for the 64-bit input x.
// NOTE(review): the status returned by _BitScanForward64 is ignored, so if
// the intrinsic leaves `ret` unwritten for x == 0 the returned value is
// indeterminate -- confirm callers never pass 0.
inline int __builtin_ctzll(uint64_t x) {
// Output slot filled in by the intrinsic; intentionally not pre-initialized.
unsigned long ret;
_BitScanForward64(&ret, x);
return (int)ret;
}
#endif

// cudatoolkit provides __builtin_ctz for NVCC >= 11.0
#if !defined(__CUDACC__) || __CUDACC_VER_MAJOR__ < 11
Expand All @@ -55,13 +57,20 @@ inline int __builtin_ctz(unsigned long x) {
}
#endif

#ifndef __clang__
// Stand-in for the GCC/Clang builtin of the same name: forwards the 64-bit
// input x to the __lzcnt64 intrinsic and returns its result cast to int.
// NOTE(review): __lzcnt64 presumably requires LZCNT hardware support to give
// a meaningful count -- confirm against the targeted CPUs.
inline int __builtin_clzll(uint64_t x) {
return (int)__lzcnt64(x);
}
#endif

// Map the GCC-style popcount builtin names onto the __popcnt / __popcnt64
// intrinsics so code written against the builtins compiles unchanged.
#define __builtin_popcount __popcnt
#define __builtin_popcountl __popcnt64

// When not compiling with clang, alias the __m128i_u / __m256i_u type names
// to the plain __m128i / __m256i vector types.
// NOTE(review): presumably clang's own headers already provide the *_u
// types, which is why the aliases are skipped there -- confirm.
#ifndef __clang__
#define __m128i_u __m128i
#define __m256i_u __m256i
#endif

// MSVC does not define __SSEx__, and _M_IX86_FP is only defined on 32-bit
// processors cf.
// https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
Expand Down
8 changes: 4 additions & 4 deletions faiss/utils/distances.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -417,8 +417,8 @@ void exhaustive_L2sqr_blas_cmax_avx2(
for (int64_t i = i0; i < i1; i++) {
float* ip_line = ip_block.get() + (i - i0) * (j1 - j0);

_mm_prefetch(ip_line, _MM_HINT_NTA);
_mm_prefetch(ip_line + 16, _MM_HINT_NTA);
_mm_prefetch((const char*)ip_line, _MM_HINT_NTA);
_mm_prefetch((const char*)(ip_line + 16), _MM_HINT_NTA);

// constant
const __m256 mul_minus2 = _mm256_set1_ps(-2);
Expand All @@ -445,8 +445,8 @@ void exhaustive_L2sqr_blas_cmax_avx2(

// process 16 elements per loop
for (; idx_j < (count / 16) * 16; idx_j += 16, ip_line += 16) {
_mm_prefetch(ip_line + 32, _MM_HINT_NTA);
_mm_prefetch(ip_line + 48, _MM_HINT_NTA);
_mm_prefetch((const char*)(ip_line + 32), _MM_HINT_NTA);
_mm_prefetch((const char*)(ip_line + 48), _MM_HINT_NTA);

// load values for norms
const __m256 y_norm_0 =
Expand Down
2 changes: 1 addition & 1 deletion faiss/utils/distances_fused/simdlib_based.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ void kernel(

// prefetch the next point
#if defined(__AVX2__)
_mm_prefetch(xd_0 + DIM * sizeof(float), _MM_HINT_NTA);
_mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
#endif

// load a single point from x
Expand Down
18 changes: 9 additions & 9 deletions faiss/utils/distances_simd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,14 +439,14 @@ void fvec_op_ny_D2<ElementOpIP>(

if (ny8 > 0) {
// process 8 D2-vectors per loop.
_mm_prefetch(y, _MM_HINT_T0);
_mm_prefetch(y + 16, _MM_HINT_T0);
_mm_prefetch((const char*)y, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 16), _MM_HINT_T0);

const __m256 m0 = _mm256_set1_ps(x[0]);
const __m256 m1 = _mm256_set1_ps(x[1]);

for (i = 0; i < ny8 * 8; i += 8) {
_mm_prefetch(y + 32, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 32), _MM_HINT_T0);

// load 8x2 matrix and transpose it in registers.
// the typical bottleneck is memory access, so
Expand Down Expand Up @@ -496,14 +496,14 @@ void fvec_op_ny_D2<ElementOpL2>(

if (ny8 > 0) {
// process 8 D2-vectors per loop.
_mm_prefetch(y, _MM_HINT_T0);
_mm_prefetch(y + 16, _MM_HINT_T0);
_mm_prefetch((const char*)y, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 16), _MM_HINT_T0);

const __m256 m0 = _mm256_set1_ps(x[0]);
const __m256 m1 = _mm256_set1_ps(x[1]);

for (i = 0; i < ny8 * 8; i += 8) {
_mm_prefetch(y + 32, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 32), _MM_HINT_T0);

// load 8x2 matrix and transpose it in registers.
// the typical bottleneck is memory access, so
Expand Down Expand Up @@ -1084,8 +1084,8 @@ size_t fvec_L2sqr_ny_nearest_D2(
// process 8 D2-vectors per loop.
const size_t ny8 = ny / 8;
if (ny8 > 0) {
_mm_prefetch(y, _MM_HINT_T0);
_mm_prefetch(y + 16, _MM_HINT_T0);
_mm_prefetch((const char*)y, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 16), _MM_HINT_T0);

// track min distance and the closest vector independently
// for each of 8 AVX2 components.
Expand All @@ -1100,7 +1100,7 @@ size_t fvec_L2sqr_ny_nearest_D2(
const __m256 m1 = _mm256_set1_ps(x[1]);

for (; i < ny8 * 8; i += 8) {
_mm_prefetch(y + 32, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 32), _MM_HINT_T0);

__m256 v0;
__m256 v1;
Expand Down

0 comments on commit 87d43b9

Please sign in to comment.