Fix Windows inductor definition issue by xuhancn · Pull Request #128686 · pytorch/pytorch
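Summary of the change: MSVC's cl.exe does not recognize #pragma unroll, the Clang/GCC-style loop-unrolling hint used throughout ATen's vectorization headers, so compiling them with cl.exe (as Windows inductor builds do) triggers unknown-pragma warnings or errors. The diff adds a __msvc_cl__ macro to aten/src/ATen/cpu/vec/vec_base.h, defined whenever _MSC_FULL_VER is present, and wraps every #pragma unroll site in #ifndef __msvc_cl__ so the hint is emitted only for compilers that accept it.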
8 changes: 8 additions & 0 deletions aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
@@ -794,12 +794,16 @@ Vectorized<BFloat16> inline clamp_min(const Vectorized<BFloat16>& a, const Vecto
template <>
inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<BFloat16>::size()); i += Vectorized<BFloat16>::size()) {
auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i)));
_mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc);
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
@@ -992,12 +996,16 @@ Vectorized<Half> inline clamp_min(const Vectorized<Half>& a, const Vectorized<Ha
template <>
inline void convert(const Half* src, Half* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<Half>::size()); i += Vectorized<Half>::size()) {
auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i)));
_mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc);
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
4 changes: 4 additions & 0 deletions aten/src/ATen/cpu/vec/vec256/vec256_double.h
@@ -416,11 +416,15 @@ inline Vectorized<double> Vectorized<double>::le(const Vectorized<double>& other
template <>
inline void convert(const double* src, double* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<double>::size()); i += Vectorized<double>::size()) {
_mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
4 changes: 4 additions & 0 deletions aten/src/ATen/cpu/vec/vec256/vec256_float.h
@@ -512,11 +512,15 @@ inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) c
template <>
inline void convert(const float* src, float* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
_mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
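The same two-loop shape recurs in every convert specialization in this PR: a vectorized main loop that moves one full register per iteration, then a scalar tail for the leftover elements, with the unroll hint suppressed under MSVC. A minimal standalone sketch of the float/AVX2 variant follows; copy_floats and kLanes are names invented here for illustration, not PyTorch API.

#include <immintrin.h>
#include <cstdint>

inline void copy_floats(const float* src, float* dst, int64_t n) {
  constexpr int64_t kLanes = 8;  // one __m256 register holds 8 floats
  int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
  // main loop: one unaligned 256-bit load and store per iteration
  for (i = 0; i <= n - kLanes; i += kLanes) {
    _mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
  }
#ifndef __msvc_cl__
#pragma unroll
#endif
  // scalar tail: copies the last n % kLanes elements
  for (; i < n; i++) {
    dst[i] = src[i];
  }
}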
8 changes: 8 additions & 0 deletions aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
@@ -823,12 +823,16 @@ inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) c
template <>
inline void convert(const float* src, int32_t* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i)));
vst1q_s32(dst + i + 4, vcvtq_s32_f32(vld1q_f32(src + i + 4)));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<int32_t>(src[i]);
}
@@ -837,12 +841,16 @@ inline void convert(const float* src, int32_t* dst, int64_t n) {
template <>
inline void convert(const int32_t* src, float* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i)));
vst1q_f32(dst + i + 4, vcvtq_f32_s32(vld1q_s32(src + i + 4)));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<float>(src[i]);
}
8 changes: 8 additions & 0 deletions aten/src/ATen/cpu/vec/vec256/vec256_half_neon.h
@@ -765,13 +765,17 @@ inline Vectorized<c10::Half> Vectorized<c10::Half>::le(
template <>
inline void convert(const float16_t* src, int16_t* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<c10::Half>::size());
i += Vectorized<c10::Half>::size()) {
vst1q_s16(dst + i, vcvtq_s16_f16(vld1q_f16(src + i)));
vst1q_s16(dst + i + 8, vcvtq_s16_f16(vld1q_f16(src + i + 8)));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<int16_t>(src[i]);
}
@@ -780,13 +784,17 @@ inline void convert(const float16_t* src, int16_t* dst, int64_t n) {
template <>
inline void convert(const int16_t* src, float16_t* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<c10::Half>::size());
i += Vectorized<c10::Half>::size()) {
vst1q_f16(dst + i, vcvtq_f16_s16(vld1q_s16(src + i)));
vst1q_f16(dst + i + 8, vcvtq_f16_s16(vld1q_s16(src + i + 8)));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<float16_t>(src[i]);
}
40 changes: 40 additions & 0 deletions aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
@@ -914,12 +914,16 @@ Vectorized<BFloat16> inline clamp_min(const Vectorized<BFloat16>& a, const Vecto
template <>
inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<BFloat16>::size()); i += Vectorized<BFloat16>::size()) {
auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i)));
_mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc);
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
@@ -986,7 +990,9 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) {
// j0-j15 n0-n15
// k0-k15 o0-o15
// l0-l15 p0-p15
#ifndef __msvc_cl__
#pragma unroll(4)
#endif
for (int i = 0; i < 4; i++) {
r[i] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i]), t[i + 4], 0x01);
r[i + 4] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i + 8]), t[i + 12], 0x01);
@@ -998,7 +1004,9 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) {
// u3: c4c5 d4b5 c6c7 d6b7 c12c13 d12d13 c14c15 d14d15 g4g5 h4h5 g6g7 h6h7 g12g13 h12h13 g14g15 h14h15
// i j m n
// k l o p
#ifndef __msvc_cl__
#pragma unroll(4)
#endif
for (int i = 0; i < 8; i += 2) {
u[i] = _mm512_unpacklo_epi32(r[i], r[i + 1]);
u[i + 1] = _mm512_unpackhi_epi32(r[i], r[i + 1]);
@@ -1061,7 +1069,9 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) {
// 12-- 13--
// 6-- 7--
// 14-- 15--
#ifndef __msvc_cl__
#pragma unroll(4)
#endif
for (int i = 0; i < 4; i++) {
u[i] = _mm512_permutex2var_epi16(r[i], const1, r[i + 4]);
u[i + 4] = _mm512_permutex2var_epi16(r[i], const2, r[i + 4]);
@@ -1095,15 +1105,19 @@ inline void transpose_mxn<BFloat16, 16, 16>(
// n: n0 n1 n2 n3 n4 n5 n6 n7 n8 n9 n10 n11 n12 n13 n14 n15
// o: o0 o1 o2 o3 o4 o5 o6 o7 o8 o9 o10 o11 o12 o13 o14 o15
// p: p0 p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15
#ifndef __msvc_cl__
#pragma unroll(16)
#endif
for (int i = 0; i < 16; i++) {
t[i] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i * ld_src));
}

__m512i u[8];
_transpose_mxn_half_16_16(t, u);

#ifndef __msvc_cl__
#pragma unroll(8)
#endif
for (int i = 0; i < 8; i++) {
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst),
@@ -1125,15 +1139,19 @@ inline void transpose_mxn<Half, 16, 16>(
__m256i t[16];
// load from src to registers
// Same matrix indices as above transpose_mxn<BFloat16, 16, 16>
#ifndef __msvc_cl__
#pragma unroll(16)
#endif
for (int i = 0; i < 16; i++) {
t[i] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i * ld_src));
}

__m512i u[8];
_transpose_mxn_half_16_16(t, u);

#ifndef __msvc_cl__
#pragma unroll(8)
#endif
for (int i = 0; i < 8; i++) {
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst),
@@ -1164,7 +1182,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
// t[16]: 512 544 513 545 514 546 515 547 520 552 521 553 522 554 523 555 528 ... 571
// ...
// t[31]: 964 996 965 997 966 998 967 999 972 1004 973 1005 974 1006 975 1007 980 ... 1023
#ifndef __msvc_cl__
#pragma unroll(16)
#endif
for (int i = 0; i < 16; ++i) {
d[i * 2] = _mm512_unpacklo_epi16(r[i * 2], r[i * 2 + 1]);
d[i * 2 + 1] = _mm512_unpackhi_epi16(r[i * 2], r[i * 2 + 1]);
@@ -1189,7 +1209,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
// t[16]: 512 544 576 608 513 545 577 609 520 552 584 616 521 553 585 617 528 ... 633
// ...
// t[31]: 902 934 966 998 903 935 967 999 910 942 974 1006 911 943 975 1007 918 ... 1023
#ifndef __msvc_cl__
#pragma unroll(8)
#endif
for (int i = 0; i < 8; ++i) {
r[i * 4] = _mm512_unpacklo_epi32(d[i * 4], d[i * 4 + 2]);
r[i * 4 + 1] = _mm512_unpackhi_epi32(d[i * 4], d[i * 4 + 2]);
@@ -1216,7 +1238,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
// t[16]: 512 544 576 608 640 672 704 736 520 552 584 616 648 680 712 744 528 ... 760
// ...
// t[31]: 775 807 839 871 903 935 967 999 783 815 847 879 911 943 975 1007 791 ... 1023
#ifndef __msvc_cl__
#pragma unroll(4)
#endif
for (int i = 0; i < 4; ++i) {
d[i * 8] = _mm512_unpacklo_epi64(r[i * 8], r[i * 8 + 4]);
d[i * 8 + 1] = _mm512_unpackhi_epi64(r[i * 8], r[i * 8 + 4]);
@@ -1265,7 +1289,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
0x000000000000000a,
0x0000000000000003,
0x0000000000000002);
#ifndef __msvc_cl__
#pragma unroll(8)
#endif
for (int i = 0; i < 8; ++i) {
r[i] = _mm512_permutex2var_epi64(d[i], /*idx*/const1, d[i + 8]);
r[i + 8] = _mm512_permutex2var_epi64(d[i], /*idx*/const2, d[i + 8]);
@@ -1310,7 +1336,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
0x0000000000000006,
0x0000000000000005,
0x0000000000000004);
#ifndef __msvc_cl__
#pragma unroll(16)
#endif
for (int i = 0; i < 16; ++i) {
d[i] = _mm512_permutex2var_epi64(r[i], /*idx*/const3, r[i + 16]);
d[i + 16] = _mm512_permutex2var_epi64(r[i], /*idx*/const4, r[i + 16]);
@@ -1327,7 +1355,9 @@ inline void transpose_mxn<BFloat16, 32, 32>(
int64_t ld_dst) {
// Load from memory
__m512i r[32];
#ifndef __msvc_cl__
#pragma unroll(32)
#endif
for (int i = 0; i < 32; ++i) {
r[i] = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + i* ld_src));
}
@@ -1336,7 +1366,9 @@ inline void transpose_mxn<BFloat16, 32, 32>(
_transpose_mxn_half_32_32(r, d);

// Store to dst
#ifndef __msvc_cl__
#pragma unroll(32)
#endif
for (int i = 0; i < 32; ++i) {
_mm512_storeu_si512(dst + i* ld_dst, d[i]);
}
@@ -1350,7 +1382,9 @@ inline void transpose_mxn<Half, 32, 32>(
int64_t ld_dst) {
// Load from memory
__m512i r[32];
#ifndef __msvc_cl__
#pragma unroll(32)
#endif
for (int i = 0; i < 32; ++i) {
r[i] = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + i* ld_src));
}
@@ -1359,7 +1393,9 @@ inline void transpose_mxn<Half, 32, 32>(
_transpose_mxn_half_32_32(r, d);

// Store to dst
#ifndef __msvc_cl__
#pragma unroll(32)
#endif
for (int i = 0; i < 32; ++i) {
_mm512_storeu_si512(dst + i* ld_dst, d[i]);
}
@@ -1514,12 +1550,16 @@ Vectorized<Half> inline clamp_min(const Vectorized<Half>& a, const Vectorized<Ha
template <>
inline void convert(const Half* src, Half* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<Half>::size()); i += Vectorized<Half>::size()) {
auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i)));
_mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc);
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
4 changes: 4 additions & 0 deletions aten/src/ATen/cpu/vec/vec512/vec512_double.h
@@ -443,11 +443,15 @@ inline Vectorized<double> Vectorized<double>::le(const Vectorized<double>& other
template <>
inline void convert(const double* src, double* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<double>::size()); i += Vectorized<double>::size()) {
_mm512_storeu_pd(dst + i, _mm512_loadu_pd(src + i));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
4 changes: 4 additions & 0 deletions aten/src/ATen/cpu/vec/vec512/vec512_float.h
@@ -552,11 +552,15 @@ inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) c
template <>
inline void convert(const float* src, float* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
_mm512_storeu_ps(dst + i, _mm512_loadu_ps(src + i));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
9 changes: 9 additions & 0 deletions aten/src/ATen/cpu/vec/vec_base.h
@@ -42,6 +42,15 @@
#define __FORCE_INLINE __forceinline
#endif

#if defined(_MSC_FULL_VER)
/*
https://learn.microsoft.com/en-us/cpp/overview/compiler-versions?view=msvc-170
Use _MSC_FULL_VER to identify whether the current compiler is MSVC;
LLVM on Windows does not have this definition.
*/
#define __msvc_cl__
#endif

// These macros helped us unify vec_base.h
#ifdef CPU_CAPABILITY_AVX512
#if defined(__GNUC__)
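Because the #ifndef/#endif pair is repeated verbatim at every loop, one possible follow-up (sketched here, not part of this PR) is to centralize the guard with the standard _Pragma operator so each call site stays a single line. C10_PRAGMA_UNROLL, C10_PRAGMA_UNROLL_N, and C10_STRINGIZE_IMPL are names invented for this sketch, not existing PyTorch macros.

#define C10_STRINGIZE_IMPL(x) #x
#ifndef __msvc_cl__
#define C10_PRAGMA_UNROLL _Pragma("unroll")
#define C10_PRAGMA_UNROLL_N(n) _Pragma(C10_STRINGIZE_IMPL(unroll(n)))
#else
// MSVC cl.exe has no unroll pragma; the hint simply disappears.
#define C10_PRAGMA_UNROLL
#define C10_PRAGMA_UNROLL_N(n)
#endif

// hypothetical usage at a loop site:
//   C10_PRAGMA_UNROLL
//   for (; i < n; i++) { dst[i] = src[i]; }
//   C10_PRAGMA_UNROLL_N(16)
//   for (int i = 0; i < 16; i++) { /* ... */ }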
2 changes: 2 additions & 0 deletions aten/src/ATen/cpu/vec/vec_mask.h
@@ -127,7 +127,9 @@ class VecMask {
static VecMask<T, N> from(U* b) {
using int_t = int_same_size_t<T>;
__at_align__ T mask[size()];
#ifndef __msvc_cl__
#pragma unroll
#endif
for (int i = 0; i < size(); i++) {
*(int_t*)(mask + i) = b[i] ? ~(int_t)0 : (int_t)0;
}
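For reference, the guarded loop in VecMask::from writes an all-ones or all-zeros bit pattern into each lane of the mask. A standalone sketch of that lane-fill logic in plain C++ follows; fill_lane_mask is a name invented here, and it uses std::memcpy as a type-safe stand-in for the original's pointer cast.

#include <cstdint>
#include <cstring>

// Fill each lane of `mask` with ~0 where b[i] is true, 0 otherwise.
// Int must be an integer type with the same width as the lane type T.
template <typename T, typename Int>
void fill_lane_mask(const bool* b, T* mask, int lanes) {
  static_assert(sizeof(T) == sizeof(Int), "lane width mismatch");
  for (int i = 0; i < lanes; i++) {
    Int bits = b[i] ? ~Int(0) : Int(0);
    std::memcpy(mask + i, &bits, sizeof(T));  // type-safe lane write
  }
}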