Skip to content

Commit 10e0e57

Browse files
authored
Intel(R) oneAPI Collective Communications Library (oneCCL) 2021.15.1 (#167)
* Intel(R) oneAPI Collective Communications Library (oneCCL) 2021.15.1
1 parent 0b49b8f commit 10e0e57

File tree

14 files changed

+160
-110
lines changed

14 files changed

+160
-110
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ endif()
335335

336336
set(CCL_MAJOR_VERSION "2021")
337337
set(CCL_MINOR_VERSION "15")
338-
set(CCL_UPDATE_VERSION "0")
338+
set(CCL_UPDATE_VERSION "1")
339339
set(CCL_PRODUCT_STATUS "Gold")
340340
string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ")
341341
get_vcs_properties("git")

include/oneapi/ccl/config.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@
3232

3333
#define CCL_MAJOR_VERSION 2021
3434
#define CCL_MINOR_VERSION 15
35-
#define CCL_UPDATE_VERSION 0
35+
#define CCL_UPDATE_VERSION 1
3636
#define CCL_PRODUCT_STATUS "Gold"
37-
#define CCL_PRODUCT_BUILD_DATE "2025-03-24T 03:29:36Z"
38-
#define CCL_PRODUCT_FULL "Gold-2021.15.0 2025-03-24T 03:29:36Z (master/b382bf7)"
37+
#define CCL_PRODUCT_BUILD_DATE "2025-05-05T 03:34:33Z"
38+
#define CCL_PRODUCT_FULL "Gold-2021.15.1 2025-05-05T 03:34:33Z (master/0b49b8f)"
3939

4040
#if defined(SYCL_LANGUAGE_VERSION) && defined (__INTEL_LLVM_COMPILER)
4141
#define CCL_ENABLE_SYCL

man/OneCCL.md

Lines changed: 31 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -150,31 +150,6 @@ Arguments<br />
150150

151151

152152

153-
By-default: &quot;1&quot;
154-
155-
156-
## CCL_ENABLE_OMP_ALLREDUCE
157-
158-
159-
Set this environment variable to use openmp threads for synchronous collectives with direct algorithms on host buffers.
160-
161-
162-
163-
Syntax <br />
164-
CCL_ENABLE_OMP_ALLREDUCE=&quot;&lt;value&gt;&quot;<br />
165-
<br />
166-
Arguments<br />
167-
&quot;&lt;value&gt;&quot; Description<br />
168-
169-
- 0 Does not use openmp threads for allreduce.<br />
170-
171-
172-
- 1 Use openmp threads for allreduce (default).<br />
173-
<br />
174-
175-
176-
177-
178153
By-default: &quot;1&quot;
179154

180155

@@ -1246,14 +1221,24 @@ Specify the maximum threshold for the Allreduce Sycl scale-out algorithm.
12461221
Set the threshold in bytes to specify the Sycl scaleout algorithm in the allreduce collective. Default value is 1048576. &quot;&lt;value&gt;&quot; : &quot;&gt;=0&quot;
12471222

12481223

1249-
## CCL_SYCL_ALLREDUCE_SCALEOUT_DIRECT_THRESHOLD
1224+
## CCL_SYCL_ALLREDUCE_SCALEOUT
12501225

12511226

1252-
Specify the maximum threshold for the Allreduce Sycl scale-out direct algorithm.
1227+
Specify allreduce SYCL scale-out algorithm.
12531228

12541229

12551230

1256-
Set the threshold in bytes to specify the Sycl scaleout direct algorithm (call MPI_allreduce directly) in the allreduce collective. Default value is 1048576. &quot;&lt;value&gt;&quot;&quot; : &quot;&gt;=0&quot;
1231+
Set the algorithm string from a list of available algorithms to set a specific algorithm for scale-out phase. ALLREDUCE algorithms
1232+
- auto Automatic selection. Default value.
1233+
1234+
- direct Based on MPI_Iallreduce
1235+
1236+
- rabenseifner Rabenseifner&#8217;s algorithm
1237+
1238+
- ring Reduce_scatter + allgather ring
1239+
1240+
1241+
12571242
12581243

12591244
## CCL_SYCL_REDUCE_SCATTER_TMP_BUF
@@ -1298,6 +1283,24 @@ Specify the threshold for the Sycl scaleout algorithm in reduce-scatter.
12981283
Set the threshold in bytes to specify the Sycl scaleout algorithm in the reduce-scatter collective. Default value is 4294967296. &quot;&lt;value&gt;&quot; : &quot;&gt;=0&quot;
12991284

13001285

1286+
## CCL_SYCL_REDUCE_SCATTER_SCALEOUT
1287+
1288+
1289+
Specify reduce-scatter SYCL scale-out algorithm.
1290+
1291+
1292+
1293+
Set the algorithm string from a list of available algorithms to set a specific algorithm for scale-out phase. REDUCE_SCATTER algorithms
1294+
- auto Automatic selection. Default value.
1295+
1296+
- direct Based on MPI_Ireduce_scatter
1297+
1298+
- ring Ring algorithm
1299+
1300+
1301+
1302+
1303+
13011304

13021305
Experimental OneCCL Environment Variables Functionality of these variables has not been (fully) tested and, therefore, cannot be supported nor guaranteed.
13031306

man/doxconfig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
PROJECT_NAME = "Intel® oneAPI Collective Communications Library"
2-
PROJECT_NUMBER = "2021.15.0"
2+
PROJECT_NUMBER = "2021.15.1"
33

44
INPUT = ../src/common/env/vars.hpp ../src/common/env/vars_experimental.hpp
55

man/man3/OneCCL.3

Lines changed: 32 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.TH "OneCCLvars" 3 "Version 2021.15.0" "Intel® oneAPI Collective Communications Library" \" -*- nroff -*-
1+
.TH "OneCCLvars" 3 "Tue Mar 18 2025" "Version 2021.15.1" "Intel® oneAPI Collective Communications Library" \" -*- nroff -*-
22
.ad l
33
.nh
44
.SH NAME
@@ -54,10 +54,6 @@ OneCCLvars \- OneCCL Environment Variables
5454
.br
5555
.RI "Set this environment variable to enable cache model automatically for synchronous collectives with direct algorithms\&. "
5656
.ti -1c
57-
.RI "\fBCCL_ENABLE_OMP_ALLREDUCE\fP"
58-
.br
59-
.RI "Set this environment variable to use openmp threads for synchronous collectives with direct algorithms on host buffers\&. "
60-
.ti -1c
6157
.RI "\fBCCL_ALLGATHER\fP"
6258
.br
6359
.RI "Set allgather algorithm\&. "
@@ -603,32 +599,6 @@ Arguments
603599

604600
.br
605601

606-
.PP
607-
.PP
608-
By-default: '1'
609-
.SS "CCL_ENABLE_OMP_ALLREDUCE"
610-
611-
.PP
612-
Set this environment variable to use openmp threads for synchronous collectives with direct algorithms on host buffers\&. Syntax
613-
.br
614-
CCL_ENABLE_OMP_ALLREDUCE='<value>'
615-
.br
616-
617-
.br
618-
Arguments
619-
.br
620-
'<value>' Description
621-
.br
622-
.IP "\(bu" 2
623-
0 Does not use openmp threads for allreduce\&.
624-
.br
625-
626-
.IP "\(bu" 2
627-
1 Use openmp threads for allreduce (default)\&.
628-
.br
629-
630-
.br
631-
632602
.PP
633603
.PP
634604
By-default: '1'
@@ -1081,7 +1051,7 @@ By-default: '536870912'
10811051
.SH "Author"
10821052
.PP
10831053
Generated automatically by Doxygen for Intel® oneAPI Collective Communications Library from the source code\&.
1084-
.TH "ExpOneCCLvars" 3 "Version 2021.15.0" "Intel® oneAPI Collective Communications Library" \" -*- nroff -*-
1054+
.TH "ExpOneCCLvars" 3 "Tue Mar 18 2025" "Version 2021.15.1" "Intel® oneAPI Collective Communications Library" \" -*- nroff -*-
10851055
.ad l
10861056
.nh
10871057
.SH NAME
@@ -1164,9 +1134,9 @@ ExpOneCCLvars \- Experimental OneCCL Environment Variables
11641134
.br
11651135
.RI "Specify the maximum threshold for the Allreduce Sycl scale-out algorithm\&. "
11661136
.ti -1c
1167-
.RI "\fBCCL_SYCL_ALLREDUCE_SCALEOUT_DIRECT_THRESHOLD\fP"
1137+
.RI "\fBCCL_SYCL_ALLREDUCE_SCALEOUT\fP"
11681138
.br
1169-
.RI "Specify the maximum threshold for the Allreduce Sycl scale-out direct algorithm\&. "
1139+
.RI "Specify allreduce SYCL scale-out algorithm\&. "
11701140
.ti -1c
11711141
.RI "\fBCCL_SYCL_REDUCE_SCATTER_TMP_BUF\fP"
11721142
.br
@@ -1183,6 +1153,10 @@ ExpOneCCLvars \- Experimental OneCCL Environment Variables
11831153
.RI "\fBCCL_SYCL_REDUCE_SCATTER_SCALEOUT_THRESHOLD\fP"
11841154
.br
11851155
.RI "Specify the threshold for the Sycl scaleout algorithm in reduce-scatter\&. "
1156+
.ti -1c
1157+
.RI "\fBCCL_SYCL_REDUCE_SCATTER_SCALEOUT\fP"
1158+
.br
1159+
.RI "Specify reduce-scatter SYCL scale-out algorithm\&. "
11861160
.in -1c
11871161
.SH "Detailed Description"
11881162
.PP
@@ -1245,10 +1219,20 @@ By-default: '0 (disabled)'
12451219

12461220
.PP
12471221
Specify the threshold for the medium size algorithm in allreduce\&. Set the threshold in bytes to specify the medium size algorithm in the allreduce collective\&. Default value is 16777216\&. '<value>' : '>=0'
1248-
.SS "CCL_SYCL_ALLREDUCE_SCALEOUT_DIRECT_THRESHOLD"
1222+
.SS "CCL_SYCL_ALLREDUCE_SCALEOUT"
12491223

12501224
.PP
1251-
Specify the maximum threshold for the Allreduce Sycl scale-out direct algorithm\&. Set the threshold in bytes to specify the Sycl scaleout direct algorithm (call MPI_allreduce directly) in the allreduce collective\&. Default value is 1048576\&. '<value>'' : '>=0"
1225+
Specify allreduce SYCL scale-out algorithm\&. Set the algorithm string from a list of available algorithms to set a specific algorithm for scale-out phase\&. ALLREDUCE algorithms
1226+
.IP "\(bu" 2
1227+
auto Automatic selection\&. Default value\&.
1228+
.IP "\(bu" 2
1229+
direct Based on MPI_Iallreduce
1230+
.IP "\(bu" 2
1231+
rabenseifner Rabenseifner’s algorithm
1232+
.IP "\(bu" 2
1233+
ring Reduce_scatter + allgather ring
1234+
.PP
1235+
12521236
.SS "CCL_SYCL_ALLREDUCE_SCALEOUT_THRESHOLD"
12531237

12541238
.PP
@@ -1269,6 +1253,18 @@ By-default: '0 (disabled)'
12691253

12701254
.PP
12711255
Specify the threshold for the medium size algorithm in reduce_scatter\&. Set the threshold in bytes to specify the medium size algorithm in the reduce_scatter collective\&. Default value is 67108864\&. '<value>' : '>=0'
1256+
.SS "CCL_SYCL_REDUCE_SCATTER_SCALEOUT"
1257+
1258+
.PP
1259+
Specify reduce-scatter SYCL scale-out algorithm\&. Set the algorithm string from a list of available algorithms to set a specific algorithm for scale-out phase\&. REDUCE_SCATTER algorithms
1260+
.IP "\(bu" 2
1261+
auto Automatic selection\&. Default value\&.
1262+
.IP "\(bu" 2
1263+
direct Based on MPI_Ireduce_scatter
1264+
.IP "\(bu" 2
1265+
ring Ring algorithm
1266+
.PP
1267+
12721268
.SS "CCL_SYCL_REDUCE_SCATTER_SCALEOUT_THRESHOLD"
12731269

12741270
.PP

src/atl/mpi/atl_mpi_ctx.cpp

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -677,17 +677,11 @@ atl_status_t atl_mpi_ctx::check_impi_env(const atl_attr_t& attr) {
677677

678678
if (!getenv("ONEAPI_ROOT") && !getenv("I_MPI_ROOT")) {
679679
atl_mpi_lib_type_t type = ATL_MPI_LIB_IMPI;
680-
LOG_ERROR("CCL/MPI uses ",
681-
mpi_lib_infos[type].version_prefix_1,
682-
" but neither I_MPI_ROOT nor ONEAPI_ROOT is set. ",
683-
"Please source ",
684-
mpi_lib_infos[type].kind_value,
685-
" version of ",
686-
mpi_lib_infos[type].version_prefix_1,
687-
" (",
688-
mpi_lib_infos[type].min_version_value,
689-
" or higher version).");
690-
return ATL_STATUS_FAILURE;
680+
LOG_INFO("oneCCL MPI network transport layer is using ",
681+
mpi_lib_infos[type].version_prefix_1,
682+
" but $I_MPI_ROOT is not set.",
683+
" Transport variables will be initialized automatically.",
684+
" To override them run `source $I_MPI_ROOT/env/vars.sh`");
691685
}
692686

693687
return ATL_STATUS_SUCCESS;

src/coll/algorithms/utils/sycl_coll_base.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -506,8 +506,10 @@ size_t get_tmp_buf_size_per_rank() {
506506

507507
std::vector<sycl::event> get_sycl_events(const ccl::vector_class<ccl::event> &deps) {
508508
std::vector<sycl::event> ret;
509-
for (auto &dep : deps) {
510-
ret.push_back(dep.get_native());
509+
if (!group_impl::is_group_active) {
510+
for (auto &dep : deps) {
511+
ret.push_back(dep.get_native());
512+
}
511513
}
512514
return ret;
513515
}

src/coll/coll_util.cpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -781,16 +781,16 @@ bool is_reduce_scatter_inplace(const void* send_buf,
781781

782782
// this is needed to enable the direct algo fallback for pt2pt when used inside the Group API
783783
void enable_direct_fallback_for_pt2pt() {
784-
CCL_THROW_IF_NOT(ccl::global_data::env().fallback_recv,
785-
"CCL_RECV global fallback table is null");
786-
CCL_THROW_IF_NOT(ccl::global_data::env().fallback_send,
787-
"CCL_SEND global fallback table is null");
788-
ccl::global_data::env().store_fallback_recv =
789-
std::make_shared<ccl_selection_table_t<ccl_coll_recv_algo>>(
790-
*(ccl::global_data::env().fallback_recv));
791-
ccl::global_data::env().store_fallback_send =
792-
std::make_shared<ccl_selection_table_t<ccl_coll_send_algo>>(
793-
*(ccl::global_data::env().fallback_send));
784+
if (!ccl::global_data::env().fallback_recv) {
785+
ccl::global_data::env().fallback_recv =
786+
std::make_shared<ccl_selection_table_t<ccl_coll_recv_algo>>();
787+
}
788+
789+
if (!ccl::global_data::env().fallback_send) {
790+
ccl::global_data::env().fallback_send =
791+
std::make_shared<ccl_selection_table_t<ccl_coll_send_algo>>();
792+
}
793+
794794
ccl_algorithm_selector_base<ccl_coll_recv_algo>::insert(
795795
*(ccl::global_data::env().fallback_recv),
796796
0,
@@ -801,6 +801,14 @@ void enable_direct_fallback_for_pt2pt() {
801801
0,
802802
CCL_SELECTION_MAX_COLL_SIZE,
803803
ccl_coll_send_direct);
804+
805+
ccl::global_data::env().store_fallback_recv =
806+
std::make_shared<ccl_selection_table_t<ccl_coll_recv_algo>>(
807+
*(ccl::global_data::env().fallback_recv));
808+
809+
ccl::global_data::env().store_fallback_send =
810+
std::make_shared<ccl_selection_table_t<ccl_coll_send_algo>>(
811+
*(ccl::global_data::env().fallback_send));
804812
}
805813

806814
// this is needed to preserve general fallback table for pt2pt when used outside the Group API

src/common/api_wrapper/api_wrapper.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,9 @@ void print_error(int error, lib_info_t& info) {
156156
" - path contains invalid characters");
157157
}
158158
else if (error == CCL_LOAD_LB_DLOPEN_ERROR) {
159-
LOG_WARN("could not open the library: ", info.path.c_str(), " - ", dlerror());
159+
// Log as `info`, because in some cases we have to test multiple paths
160+
// and we do not want to write excess information to the user's screen
161+
LOG_INFO("could not open the library: ", info.path.c_str(), " - ", dlerror());
160162
}
161163
}
162164

0 commit comments

Comments
 (0)