Skip to content

Commit 601edd2

Browse files
committed
AVX-512 optimization of clamp functions
1 parent 8a0fead commit 601edd2

File tree

13 files changed

+587
-22
lines changed

13 files changed

+587
-22
lines changed

include/private/dsp/arch/x86/avx512/pmath.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */
2828

2929
#include <private/dsp/arch/x86/avx512/pmath/abs_vv.h>
30+
#include <private/dsp/arch/x86/avx512/pmath/clamp.h>
3031
#include <private/dsp/arch/x86/avx512/pmath/cos.h>
3132
#include <private/dsp/arch/x86/avx512/pmath/exp.h>
3233
#include <private/dsp/arch/x86/avx512/pmath/fmop_kx.h>
@@ -38,6 +39,7 @@
3839
#include <private/dsp/arch/x86/avx512/pmath/normalize.h>
3940
#include <private/dsp/arch/x86/avx512/pmath/op_kx.h>
4041
#include <private/dsp/arch/x86/avx512/pmath/op_vv.h>
42+
#include <private/dsp/arch/x86/avx512/pmath/pmix.h>
4143
#include <private/dsp/arch/x86/avx512/pmath/sin.h>
4244
#include <private/dsp/arch/x86/avx512/pmath/sqr.h>
4345
#include <private/dsp/arch/x86/avx512/pmath/ssqrt.h>

include/private/dsp/arch/x86/avx512/pmath/clamp.h

Lines changed: 518 additions & 0 deletions
Large diffs are not rendered by default.
include/private/dsp/arch/x86/avx512/pmath/pmix.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
* Copyright (C) 2025 Linux Studio Plugins Project <https://lsp-plug.in/>
3+
* (C) 2025 Vladimir Sadovnikov <[email protected]>
4+
*
5+
* This file is part of lsp-dsp-lib
6+
* Created on: 27 Nov 2025
7+
*
8+
* lsp-dsp-lib is free software: you can redistribute it and/or modify
9+
* it under the terms of the GNU Lesser General Public License as published by
10+
* the Free Software Foundation, either version 3 of the License, or
11+
* any later version.
12+
*
13+
* lsp-dsp-lib is distributed in the hope that it will be useful,
14+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16+
* GNU Lesser General Public License for more details.
17+
*
18+
* You should have received a copy of the GNU Lesser General Public License
19+
* along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
20+
*/
21+
22+
#ifndef PRIVATE_DSP_ARCH_X86_AVX512_PMATH_PMIX_H_
23+
#define PRIVATE_DSP_ARCH_X86_AVX512_PMATH_PMIX_H_
24+
25+
#ifndef PRIVATE_DSP_ARCH_X86_AVX512_IMPL
26+
#error "This header should not be included directly"
27+
#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */
28+
29+
namespace lsp
30+
{
31+
namespace avx512
32+
{
33+
34+
} /* namespace avx512 */
35+
} /* namespace lsp */
36+
37+
38+
39+
40+
#endif /* PRIVATE_DSP_ARCH_X86_AVX512_PMATH_PMIX_H_ */

include/private/dsp/arch/x86/sse/pmath/clamp.h

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -225,10 +225,10 @@ namespace lsp
225225
ARCH_X86_ASM
226226
(
227227
__ASM_EMIT("xor %[off], %[off]")
228-
__ASM_EMIT("shufps $0x00, %[min], %[min]")
229-
__ASM_EMIT("shufps $0x00, %[max], %[max]")
230-
__ASM_EMIT("movaps %[min], %%xmm6")
231-
__ASM_EMIT("movaps %[max], %%xmm7")
228+
__ASM_EMIT("movss %[min], %%xmm6")
229+
__ASM_EMIT("movss %[max], %%xmm7")
230+
__ASM_EMIT("shufps $0x00, %%xmm6, %%xmm6")
231+
__ASM_EMIT("shufps $0x00, %%xmm7, %%xmm7")
232232
// 16x blocks
233233
__ASM_EMIT("sub $16, %[count]")
234234
__ASM_EMIT("jb 2f")
@@ -290,9 +290,9 @@ namespace lsp
290290
__ASM_EMIT("jge 7b")
291291
// End
292292
__ASM_EMIT("8:")
293-
: [off] "=&r" (off), [count] "+r" (count),
294-
[min] "+x" (min), [max] "+x" (max)
295-
: [dst] "r" (dst)
293+
: [off] "=&r" (off), [count] "+r" (count)
294+
: [dst] "r" (dst),
295+
[min] "m" (min), [max] "m" (max)
296296
: "cc", "memory",
297297
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
298298
"%xmm6", "%xmm7"
@@ -305,10 +305,10 @@ namespace lsp
305305
ARCH_X86_ASM
306306
(
307307
__ASM_EMIT("xor %[off], %[off]")
308-
__ASM_EMIT("shufps $0x00, %[min], %[min]")
309-
__ASM_EMIT("shufps $0x00, %[max], %[max]")
310-
__ASM_EMIT("movaps %[min], %%xmm6")
311-
__ASM_EMIT("movaps %[max], %%xmm7")
308+
__ASM_EMIT("movss %[min], %%xmm6")
309+
__ASM_EMIT("movss %[max], %%xmm7")
310+
__ASM_EMIT("shufps $0x00, %%xmm6, %%xmm6")
311+
__ASM_EMIT("shufps $0x00, %%xmm7, %%xmm7")
312312
// 16x blocks
313313
__ASM_EMIT("sub $16, %[count]")
314314
__ASM_EMIT("jb 2f")
@@ -370,9 +370,9 @@ namespace lsp
370370
__ASM_EMIT("jge 7b")
371371
// End
372372
__ASM_EMIT("8:")
373-
: [off] "=&r" (off), [count] "+r" (count),
374-
[min] "+x" (min), [max] "+x" (max)
375-
: [dst] "r" (dst), [src] "r" (src)
373+
: [off] "=&r" (off), [count] "+r" (count)
374+
: [dst] "r" (dst), [src] "r" (src),
375+
[min] "m" (min), [max] "m" (max)
376376
: "cc", "memory",
377377
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
378378
"%xmm6", "%xmm7"

src/main/x86/avx512.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,11 @@
355355
CEXPORT1(vl, reverse_fft);
356356
CEXPORT1(vl, normalize_fft2);
357357
CEXPORT1(vl, normalize_fft3);
358+
359+
CEXPORT1(vl, clamp_vv1);
360+
CEXPORT1(vl, clamp_vv2);
361+
CEXPORT1(vl, clamp_kk1);
362+
CEXPORT1(vl, clamp_kk2);
358363
}
359364
} /* namespace avx512 */
360365
} /* namespace lsp */

src/test/ptest/pmath/clamp_kk1.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ PTEST_BEGIN("dsp.pmath", clamp_kk1, 5, 1000)
108108
CALL(generic::clamp_kk1);
109109
IF_ARCH_X86(CALL(sse::clamp_kk1));
110110
IF_ARCH_X86(CALL(avx::clamp_kk1));
111-
// IF_ARCH_X86(CALL(avx512::clamp_kk1));
111+
IF_ARCH_X86(CALL(avx512::clamp_kk1));
112112
// IF_ARCH_ARM(CALL(neon_d32::clamp_kk1));
113113
// IF_ARCH_AARCH64(CALL(asimd::clamp_kk1));
114114
PTEST_SEPARATOR;

src/test/ptest/pmath/clamp_kk2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ PTEST_BEGIN("dsp.pmath", clamp_kk2, 5, 1000)
108108
CALL(generic::clamp_kk2);
109109
IF_ARCH_X86(CALL(sse::clamp_kk2));
110110
IF_ARCH_X86(CALL(avx::clamp_kk2));
111-
// IF_ARCH_X86(CALL(avx512::clamp_kk2));
111+
IF_ARCH_X86(CALL(avx512::clamp_kk2));
112112
// IF_ARCH_ARM(CALL(neon_d32::clamp_kk2));
113113
// IF_ARCH_AARCH64(CALL(asimd::clamp_kk2));
114114
PTEST_SEPARATOR;

src/test/ptest/pmath/clamp_vv1.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ PTEST_BEGIN("dsp.pmath", clamp_vv2, 5, 1000)
112112
CALL(generic::clamp_vv2);
113113
IF_ARCH_X86(CALL(sse::clamp_vv2));
114114
IF_ARCH_X86(CALL(avx::clamp_vv2));
115-
// IF_ARCH_X86(CALL(avx512::clamp_vv2));
115+
IF_ARCH_X86(CALL(avx512::clamp_vv2));
116116
// IF_ARCH_ARM(CALL(neon_d32::clamp_vv2));
117117
// IF_ARCH_AARCH64(CALL(asimd::clamp_vv2));
118118
PTEST_SEPARATOR;

src/test/ptest/pmath/clamp_vv2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ PTEST_BEGIN("dsp.pmath", clamp_vv1, 5, 1000)
112112
CALL(generic::clamp_vv1);
113113
IF_ARCH_X86(CALL(sse::clamp_vv1));
114114
IF_ARCH_X86(CALL(avx::clamp_vv1));
115-
// IF_ARCH_X86(CALL(avx512::clamp_vv1));
115+
IF_ARCH_X86(CALL(avx512::clamp_vv1));
116116
// IF_ARCH_ARM(CALL(neon_d32::clamp_vv1));
117117
// IF_ARCH_AARCH64(CALL(asimd::clamp_vv1));
118118
PTEST_SEPARATOR;

src/test/utest/pmath/clamp_kk1.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ UTEST_BEGIN("dsp.pmath", clamp_kk1)
117117

118118
IF_ARCH_X86(CALL(generic::clamp_kk1, sse::clamp_kk1, 16));
119119
IF_ARCH_X86(CALL(generic::clamp_kk1, avx::clamp_kk1, 32));
120-
// IF_ARCH_X86(CALL(generic::clamp_kk1, avx512::clamp_kk1, 32));
120+
IF_ARCH_X86(CALL(generic::clamp_kk1, avx512::clamp_kk1, 32));
121121
// IF_ARCH_ARM(CALL(generic::clamp_kk1, neon_d32::clamp_kk1, 16));
122122
// IF_ARCH_AARCH64(CALL(generic::clamp_kk1, asimd::clamp_kk1, 16));
123123
}

0 commit comments

Comments
 (0)