summaryrefslogtreecommitdiff
path: root/dev-libs/rccl
diff options
context:
space:
mode:
Diffstat (limited to 'dev-libs/rccl')
-rw-r--r--dev-libs/rccl/Manifest4
-rw-r--r--dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch12
-rw-r--r--dev-libs/rccl/files/rccl-6.3.0-same-rank-sendrecv.patch250
-rw-r--r--dev-libs/rccl/rccl-6.3.0.ebuild75
4 files changed, 341 insertions, 0 deletions
diff --git a/dev-libs/rccl/Manifest b/dev-libs/rccl/Manifest
index 340cf2d171cd..cb020c19922c 100644
--- a/dev-libs/rccl/Manifest
+++ b/dev-libs/rccl/Manifest
@@ -1,7 +1,11 @@
AUX rccl-5.7.1-remove-chrpath.patch 592 BLAKE2B 8510698259154be9138c136c87c3650276140ac500a235049cdd1f8ea9dcc969ac6cd16b13ce628c448f1d5aa8c49818c455e761e96c24a0abc3aa0d3d431320 SHA512 7df69e5293edb3021f57caf9fa6ec12fd3c6a2841f62de4d6358092ec8f5c11f18d2f6434704e64c225327b3a55a4e64710cb07a82ebee103e4faa3865baa72d
AUX rccl-6.0.2-fix-version-check.patch 552 BLAKE2B fb1230b262a959c783e6f796e83621c3b32f69cc81ab94c07a9596a8fa37bf2d496aebfe4bec97adf2873d3dc946e690d64f51b4d578528470436b9949cfa432 SHA512 efd6343ffd9b5dee9152787f993621e7bbd5a6d46d4b33d1c0c47ded4fd1a5da9dc753c956c646f48bde5dd6b61876a633d89810ba33861734ec2b2c0040d0ec
+AUX rccl-6.3.0-headers-fix.patch 400 BLAKE2B 09bab334806f737e4a2675b435d8b386c1d671b8fdad8f458d1501cfdd9848d797d9cb6b385fe1b118ddfd991eb370763b936c693b77d232895653d56f610428 SHA512 e20da2a824e2669d160d6724b4efc362787e79dcfa547153ab2531731f1d239cf44394cf248e2e5abfd0feb5c7906e710acda05e08656584c529e4fa9a44f11f
+AUX rccl-6.3.0-same-rank-sendrecv.patch 13180 BLAKE2B 28d2d7d904ce2cb3008fd4a7472a93336cb9f1e3efd3d15b18d8142eb8b34ca9860907a23f64f818ff8c5611b94d11351a8c8b9dfdd103f58e8206f9b9330838 SHA512 c8fc176f0b7ec560eeef312fc60a53efa7e5c0e73f06b21f279d4c0c86715464eb698749dbe7953b3feb55c0a6e85e95ea5ef68c8a669ce648d9c02dfccc3398
DIST rccl-5.7.1.tar.gz 1425561 BLAKE2B 852c111ad806d5c99f48b3c65c8cf37315c68b969f9544bfa14c1faf1d5557edcc57cdc21705ced6ded4a0288d42b1076e65fb67b3f89b4fa78cfba9d317b23e SHA512 5913b8ff67fa787714713b7d5b571374898be740d56c77db9f04fe7a3e6ca74023fa930a3494d8a6f984ac9e68ee318343835e110049d08700fe773376618af4
DIST rccl-6.1.1.tar.gz 1679144 BLAKE2B 371d64691dc74f875c49e14df8f3f2d8b9c607376e6c5a889bd2bdb50607e88715d6d75ffed4ba3184a5b9b241cb37b8501e927a5f495632212909e410102490 SHA512 6c6376dd822182bcf28f573c0f3b5c7e52f94f4b670ee7c88519232f51b443d52cd37cbe6c41b5b6e9cb0b93c1124246a989f6e6a2ae74935134135585118002
+DIST rccl-6.3.0.tar.gz 1828647 BLAKE2B 8c312fc51e7d600bb62fa059e1af53e153955b79b2ba2e8a6b6b52228b9217b7df6dc815c3a48c0800aaa9387f645070e079d04e99c0e8ebdfe41d5ebe0bda06 SHA512 a068b4a21786176638d108c8c85d5e5a8b0413335b555c2602f2a2e0b9f291f6872dbf68fbb5a17a6a0af9d9b5a90b1b37cce63b655a867b68fc9e20d49931ea
EBUILD rccl-5.7.1.ebuild 1836 BLAKE2B 3286a92c9d08f9e0baac3ad3fbf0a9782109788b999bab8ac4864fa0ab47a6fcd53a73eee2d34a7cb5400998e60f246ec64df6f4a3f8bb07c38405e7f0b4417b SHA512 dd4dfeecdd908eeddba9d0450eba831ccc4778accbcab6023d4d47bac218d2e5d92a967744796b7c8854a579c5df16d8253795dd294183b2054dcf725a0372ae
EBUILD rccl-6.1.1.ebuild 1612 BLAKE2B e175a46484a37e31f0fc0ab3db662a2faaa1ff72cd21f6cbf4540245bc7be012baa9c6c0dc40bdfde39674a0f08ea898b33673db395de3288879ebd778a94ff0 SHA512 2ae7ea089fbac169fe09aed8d82dadb0bd343bee2e525470965987068ea364999e29022298468dfc91d9c625bcfd06e0ea695550275604a4d211d3e30cd322fe
+EBUILD rccl-6.3.0.ebuild 1705 BLAKE2B a732614eb178cb84b53441b83387d5b212c9386c3a48ac3ef39061038da4a60b48bb690aab6af3066aa92d350135913c30dedc054e7e1a87de458042cff91de5 SHA512 1e675f02d76ce4ae18b30cf308ac160db3c100dfbf55d16b94164806880f3f4ae9a970032d881debc19cee1591953220acdad4a120cb05c724b9377870d55fd2
MISC metadata.xml 695 BLAKE2B 7d52b2606665aebfade0d15c339f04fe9ac743d1ef402437adefa6a1ab710ae8e0367172cfa3bae8876609d40d26356cc9a93f555fd28f887cff957e38192416 SHA512 6b1d2a88fb7f88bc2bd1fd7126ba33a7d63b6e323cf43072a6c56fad3a5b8cde4262bc7ed7c9485a650544d562377b146d52088e2ecdd8c0e65d0b91addea57c
diff --git a/dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch b/dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch
new file mode 100644
index 000000000000..297627819f2c
--- /dev/null
+++ b/dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch
@@ -0,0 +1,12 @@
+gtest 1.14 headers included <iomanip>; gtest 1.15 headers no longer do, so include it explicitly.
+Upstream bug: https://github.com/ROCm/rccl/issues/1455
+--- a/test/common/TestBed.cpp
++++ b/test/common/TestBed.cpp
+@@ -4,6 +4,7 @@
+ * See LICENSE.txt for license information
+ ************************************************************************/
+ #include <unistd.h>
++#include <iomanip>
+ #include "TestBed.hpp"
+ #include <rccl/rccl.h>
+
diff --git a/dev-libs/rccl/files/rccl-6.3.0-same-rank-sendrecv.patch b/dev-libs/rccl/files/rccl-6.3.0-same-rank-sendrecv.patch
new file mode 100644
index 000000000000..435d6ac57b0f
--- /dev/null
+++ b/dev-libs/rccl/files/rccl-6.3.0-same-rank-sendrecv.patch
@@ -0,0 +1,250 @@
+Enable the send/recv unit tests (UT) to send to and receive from the same rank. Fixes a test failure.
+Backports commit: https://github.com/ROCm/rccl/commit/fd9924cfe7afbb94b1f157972ba001865481480a
+--- a/test/SendRecvTests.cpp
++++ b/test/SendRecvTests.cpp
+@@ -16,7 +16,6 @@ namespace RcclUnitTesting
+ std::vector<int> const numElements = {1048576, 53327, 1024, 0};
+ bool const inPlace = false;
+ bool const useManagedMem = false;
+- int const groupCallId = 0;
+
+ OptionalColArgs options;
+ bool isCorrect = true;
+@@ -28,7 +27,10 @@ namespace RcclUnitTesting
+ int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
+ int totalRanks = numGpus * ranksPerGpu;
+ int const numProcesses = isMultiProcess ? numGpus : 1;
+- testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
++ testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
++ {1,2}, //two group, second group sendrecv to self, has 2 coll
++ testBed.GetNumStreamsPerGroup(1,2),
++ 2);
+
+ for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
+ for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
+@@ -37,6 +39,8 @@ namespace RcclUnitTesting
+ for (int recvRank = 0; recvRank < totalRanks; ++recvRank)
+ {
+ options.root = recvRank;
++ int groupCallId = sendRank == recvRank; //self sendrecv group has two coll
++ int recvId = sendRank == recvRank; //where recv will be second coll
+ testBed.SetCollectiveArgs(ncclCollSend,
+ dataTypes[dataIdx],
+ numElements[numIdx],
+@@ -47,36 +51,46 @@ namespace RcclUnitTesting
+ sendRank);
+ if (recvRank == 0)
+ {
+- testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, sendRank);
+- testBed.PrepareData(groupCallId, 0, sendRank);
+- }
+- if (recvRank != sendRank)
+- {
+- if (testBed.ev.showNames) // Show test names
+- INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
+- isMultiProcess ? "MP" : "SP",
+- ncclDataTypeNames[dataTypes[dataIdx]],
+- sendRank,
+- recvRank,
+- numElements[numIdx]);
+-
+- options.root = sendRank;
+- testBed.SetCollectiveArgs(ncclCollRecv,
++ //set up the collArg slot to make sure AllocateMem is called once and correctly
++ testBed.SetCollectiveArgs(ncclCollSend,
+ dataTypes[dataIdx],
+ numElements[numIdx],
+ numElements[numIdx],
+ options,
+ 0,
+- groupCallId,
+- recvRank);
+- testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, recvRank);
+- testBed.PrepareData(groupCallId, 0, recvRank);
+- testBed.ExecuteCollectives({sendRank, recvRank});
+- testBed.ValidateResults(isCorrect, groupCallId, 0, recvRank);
+- testBed.DeallocateMem(groupCallId, 0, recvRank);
++ !groupCallId,
++ sendRank);
++ testBed.AllocateMem(inPlace, useManagedMem, 0, 0, sendRank);
++ testBed.PrepareData(0, 0, sendRank);
++ testBed.AllocateMem(inPlace, useManagedMem, 1, 0, sendRank);
++ testBed.PrepareData(1, 0, sendRank);
+ }
++
++ if (testBed.ev.showNames) // Show test names
++ INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
++ isMultiProcess ? "MP" : "SP",
++ ncclDataTypeNames[dataTypes[dataIdx]],
++ sendRank,
++ recvRank,
++ numElements[numIdx]);
++ options.root = sendRank;
++
++ testBed.SetCollectiveArgs(ncclCollRecv,
++ dataTypes[dataIdx],
++ numElements[numIdx],
++ numElements[numIdx],
++ options,
++ recvId,
++ groupCallId,
++ recvRank);
++ testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank);
++ testBed.PrepareData(groupCallId, recvId, recvRank);
++ testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId);
++ testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank);
++ testBed.DeallocateMem(groupCallId, recvId, recvRank);
+ }
+- testBed.DeallocateMem(groupCallId, 0, sendRank);
++ testBed.DeallocateMem(0, 0, sendRank);
++ testBed.DeallocateMem(1, 0, sendRank);
+ }
+ testBed.DestroyComms();
+ }
+@@ -94,7 +108,6 @@ namespace RcclUnitTesting
+ bool const inPlace = false;
+ bool const useManagedMem = false;
+ bool const userRegistered = true;
+- int const groupCallId = 0;
+
+ OptionalColArgs options;
+ bool isCorrect = true;
+@@ -106,7 +119,10 @@ namespace RcclUnitTesting
+ int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
+ int totalRanks = numGpus * ranksPerGpu;
+ int const numProcesses = isMultiProcess ? numGpus : 1;
+- testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
++ testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
++ {1,2}, //two group, second group sendrecv to self, has 2 coll
++ testBed.GetNumStreamsPerGroup(1,2),
++ 2);
+
+ for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
+ for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
+@@ -115,6 +131,8 @@ namespace RcclUnitTesting
+ for (int recvRank = 0; recvRank < totalRanks; ++recvRank)
+ {
+ options.root = recvRank;
++ int groupCallId = sendRank == recvRank;
++ int recvId = sendRank == recvRank;
+ testBed.SetCollectiveArgs(ncclCollSend,
+ dataTypes[dataIdx],
+ numElements[numIdx],
+@@ -125,36 +143,45 @@ namespace RcclUnitTesting
+ sendRank);
+ if (recvRank == 0)
+ {
+- testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, sendRank, userRegistered);
+- testBed.PrepareData(groupCallId, 0, sendRank);
+- }
+- if (recvRank != sendRank)
+- {
+- if (testBed.ev.showNames) // Show test names
+- INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
+- isMultiProcess ? "MP" : "SP",
+- ncclDataTypeNames[dataTypes[dataIdx]],
+- sendRank,
+- recvRank,
+- numElements[numIdx]);
+-
+- options.root = sendRank;
+- testBed.SetCollectiveArgs(ncclCollRecv,
++ testBed.SetCollectiveArgs(ncclCollSend,
+ dataTypes[dataIdx],
+ numElements[numIdx],
+ numElements[numIdx],
+ options,
+ 0,
+- groupCallId,
+- recvRank);
+- testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, recvRank, userRegistered);
+- testBed.PrepareData(groupCallId, 0, recvRank);
+- testBed.ExecuteCollectives({sendRank, recvRank});
+- testBed.ValidateResults(isCorrect, groupCallId, 0, recvRank);
+- testBed.DeallocateMem(groupCallId, 0, recvRank);
++ !groupCallId,
++ sendRank);
++ testBed.AllocateMem(inPlace, useManagedMem, 0, 0, sendRank, userRegistered);
++ testBed.PrepareData(0, 0, sendRank);
++ testBed.AllocateMem(inPlace, useManagedMem, 1, 0, sendRank, userRegistered);
++ testBed.PrepareData(1, 0, sendRank);
+ }
++
++ if (testBed.ev.showNames) // Show test names
++ INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
++ isMultiProcess ? "MP" : "SP",
++ ncclDataTypeNames[dataTypes[dataIdx]],
++ sendRank,
++ recvRank,
++ numElements[numIdx]);
++
++ options.root = sendRank;
++ testBed.SetCollectiveArgs(ncclCollRecv,
++ dataTypes[dataIdx],
++ numElements[numIdx],
++ numElements[numIdx],
++ options,
++ recvId,
++ groupCallId,
++ recvRank);
++ testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank, userRegistered);
++ testBed.PrepareData(groupCallId, recvId, recvRank);
++ testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId);
++ testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank);
++ testBed.DeallocateMem(groupCallId, recvId, recvRank);
+ }
+- testBed.DeallocateMem(groupCallId, 0, sendRank);
++ testBed.DeallocateMem(0, 0, sendRank);
++ testBed.DeallocateMem(1, 0, sendRank);
+ }
+ testBed.DestroyComms();
+ }
+--- a/test/common/TestBedChild.cpp
++++ b/test/common/TestBedChild.cpp
+@@ -395,6 +395,8 @@ namespace RcclUnitTesting
+ {
+ CollectiveArgs& collArg = this->collArgs[groupId][localRank][collIdx];
+ CHECK_CALL(collArg.AllocateMem(inPlace, useManagedMem, userRegistered));
++ if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv))
++ CHILD_NCCL_CALL(ncclCommRegister(this->comms[localRank], collArg.inputGpu.ptr, collArg.numInputBytesAllocated, &(collArg.commRegHandle)),"ncclCommRegister");
+ if (this->verbose) INFO("Rank %d on child %d allocates memory for collective %d in group %d on device %d (%s,%s,%s) Input: %p Output %p\n",
+ globalRank, this->childId, collIdx, groupId, this->deviceIds[localRank],
+ inPlace ? "in-place" : "out-of-place",
+@@ -646,8 +648,6 @@ namespace RcclUnitTesting
+ "ncclAllToAllv");
+ break;
+ case ncclCollSend:
+- if (collArg.userRegistered)
+- CHILD_NCCL_CALL_RANK(errCode, ncclCommRegister(this->comms[localRank], collArg.inputGpu.ptr, collArg.numInputBytesAllocated, &(collArg.commRegHandle)),"ncclCommRegister");
+ CHILD_NCCL_CALL_RANK(errCode, ncclSend(
+ collArg.inputGpu.ptr,
+ collArg.numInputElements,
+@@ -658,8 +658,6 @@ namespace RcclUnitTesting
+ "ncclSend");
+ break;
+ case ncclCollRecv:
+- if (collArg.userRegistered)
+- CHILD_NCCL_CALL_RANK(errCode, ncclCommRegister(this->comms[localRank], collArg.outputGpu.ptr, collArg.numOutputBytesAllocated, &(collArg.commRegHandle)), "ncclCommRegister");
+ CHILD_NCCL_CALL_RANK(errCode, ncclRecv(
+ collArg.outputGpu.ptr,
+ collArg.numOutputElements,
+@@ -891,8 +889,6 @@ namespace RcclUnitTesting
+ for (int collIdx = 0; collIdx < collArgs[groupId][localRank].size(); ++collIdx)
+ {
+ CollectiveArgs& collArg = this->collArgs[groupId][localRank][collIdx];
+- if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv))
+- CHILD_NCCL_CALL(ncclCommDeregister(this->comms[localRank], collArg.commRegHandle), "ncclCommDeregister");
+ if (collId == -1 || collId == collIdx)
+ {
+ if (this->verbose)
+@@ -900,6 +896,10 @@ namespace RcclUnitTesting
+ INFO("Child %d release memory for collective %d in group %d (Input: %p Output %p\n",
+ this->childId, collIdx, groupId, collArg.inputGpu.ptr, collArg.outputGpu.ptr);
+ }
++ if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv))
++ {
++ CHILD_NCCL_CALL(ncclCommDeregister(this->comms[localRank], collArg.commRegHandle), "ncclCommDeregister");
++ }
+
+ CHECK_CALL(collArg.DeallocateMem());
+ }
diff --git a/dev-libs/rccl/rccl-6.3.0.ebuild b/dev-libs/rccl/rccl-6.3.0.ebuild
new file mode 100644
index 000000000000..d610f7eb139c
--- /dev/null
+++ b/dev-libs/rccl/rccl-6.3.0.ebuild
@@ -0,0 +1,75 @@
+# Copyright 1999-2024 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=8
+
+ROCM_VERSION=${PV}
+
+inherit cmake edo rocm flag-o-matic
+
+DESCRIPTION="ROCm Communication Collectives Library (RCCL)"
+HOMEPAGE="https://github.com/ROCm/rccl"
+SRC_URI="https://github.com/ROCm/rccl/archive/rocm-${PV}.tar.gz -> rccl-${PV}.tar.gz"
+S="${WORKDIR}/rccl-rocm-${PV}"
+
+LICENSE="BSD"
+SLOT="0/$(ver_cut 1-2)"
+KEYWORDS="~amd64"
+IUSE="test"
+
+RDEPEND="
+ dev-util/hip:${SLOT}
+ dev-util/rocm-smi:${SLOT}"
+DEPEND="${RDEPEND}
+ sys-libs/binutils-libs"
+BDEPEND="
+ >=dev-build/cmake-3.22
+ >=dev-build/rocm-cmake-5.7.1
+ dev-util/hipify-clang:${SLOT}
+ test? ( dev-cpp/gtest )"
+
+RESTRICT="!test? ( test )"
+
+PATCHES=(
+ "${FILESDIR}/${PN}-6.0.2-fix-version-check.patch"
+ "${FILESDIR}/${PN}-6.3.0-same-rank-sendrecv.patch"
+ "${FILESDIR}/${PN}-6.3.0-headers-fix.patch"
+)
+
+src_prepare() {
+ cmake_src_prepare
+
+ # https://reviews.llvm.org/D69582 - clang does not support parallel jobs
+ sed '/parallel-jobs/d' -i CMakeLists.txt || die
+
+ # complete fix-version-check patch
+ sed "s/@rocm_version@/${PV}/" -i CMakeLists.txt || die
+
+ # don't install tests
+ sed "/rocm_install(TARGETS rccl-UnitTests/d" -i test/CMakeLists.txt || die
+}
+
+src_configure() {
+ rocm_use_hipcc
+
+ # lto flags make compilation fail with "undefined hidden symbol"
+ filter-lto
+
+ local mycmakeargs=(
+ -DCMAKE_SKIP_RPATH=ON
+ -DAMDGPU_TARGETS="$(get_amdgpu_flags)"
+ -DBUILD_TESTS=$(usex test ON OFF)
+ -DROCM_SYMLINK_LIBS=OFF
+ -DROCM_PATH="${EPREFIX}/usr"
+ -DRCCL_ROCPROFILER_REGISTER=OFF
+ -Wno-dev
+ )
+
+ cmake_src_configure
+}
+
+src_test() {
+ check_amdgpu
+ cd "${BUILD_DIR}" || die
+ LD_LIBRARY_PATH="${BUILD_DIR}" edob test/rccl-UnitTests
+}