diff options
Diffstat (limited to 'dev-libs/rccl')
-rw-r--r-- | dev-libs/rccl/Manifest | 4 | ||||
-rw-r--r-- | dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch | 12 | ||||
-rw-r--r-- | dev-libs/rccl/files/rccl-6.3.0-same-rank-sendrecv.patch | 250 | ||||
-rw-r--r-- | dev-libs/rccl/rccl-6.3.0.ebuild | 75 |
4 files changed, 341 insertions, 0 deletions
diff --git a/dev-libs/rccl/Manifest b/dev-libs/rccl/Manifest index 340cf2d171cd..cb020c19922c 100644 --- a/dev-libs/rccl/Manifest +++ b/dev-libs/rccl/Manifest @@ -1,7 +1,11 @@ AUX rccl-5.7.1-remove-chrpath.patch 592 BLAKE2B 8510698259154be9138c136c87c3650276140ac500a235049cdd1f8ea9dcc969ac6cd16b13ce628c448f1d5aa8c49818c455e761e96c24a0abc3aa0d3d431320 SHA512 7df69e5293edb3021f57caf9fa6ec12fd3c6a2841f62de4d6358092ec8f5c11f18d2f6434704e64c225327b3a55a4e64710cb07a82ebee103e4faa3865baa72d AUX rccl-6.0.2-fix-version-check.patch 552 BLAKE2B fb1230b262a959c783e6f796e83621c3b32f69cc81ab94c07a9596a8fa37bf2d496aebfe4bec97adf2873d3dc946e690d64f51b4d578528470436b9949cfa432 SHA512 efd6343ffd9b5dee9152787f993621e7bbd5a6d46d4b33d1c0c47ded4fd1a5da9dc753c956c646f48bde5dd6b61876a633d89810ba33861734ec2b2c0040d0ec +AUX rccl-6.3.0-headers-fix.patch 400 BLAKE2B 09bab334806f737e4a2675b435d8b386c1d671b8fdad8f458d1501cfdd9848d797d9cb6b385fe1b118ddfd991eb370763b936c693b77d232895653d56f610428 SHA512 e20da2a824e2669d160d6724b4efc362787e79dcfa547153ab2531731f1d239cf44394cf248e2e5abfd0feb5c7906e710acda05e08656584c529e4fa9a44f11f +AUX rccl-6.3.0-same-rank-sendrecv.patch 13180 BLAKE2B 28d2d7d904ce2cb3008fd4a7472a93336cb9f1e3efd3d15b18d8142eb8b34ca9860907a23f64f818ff8c5611b94d11351a8c8b9dfdd103f58e8206f9b9330838 SHA512 c8fc176f0b7ec560eeef312fc60a53efa7e5c0e73f06b21f279d4c0c86715464eb698749dbe7953b3feb55c0a6e85e95ea5ef68c8a669ce648d9c02dfccc3398 DIST rccl-5.7.1.tar.gz 1425561 BLAKE2B 852c111ad806d5c99f48b3c65c8cf37315c68b969f9544bfa14c1faf1d5557edcc57cdc21705ced6ded4a0288d42b1076e65fb67b3f89b4fa78cfba9d317b23e SHA512 5913b8ff67fa787714713b7d5b571374898be740d56c77db9f04fe7a3e6ca74023fa930a3494d8a6f984ac9e68ee318343835e110049d08700fe773376618af4 DIST rccl-6.1.1.tar.gz 1679144 BLAKE2B 371d64691dc74f875c49e14df8f3f2d8b9c607376e6c5a889bd2bdb50607e88715d6d75ffed4ba3184a5b9b241cb37b8501e927a5f495632212909e410102490 SHA512 6c6376dd822182bcf28f573c0f3b5c7e52f94f4b670ee7c88519232f51b443d52cd37cbe6c41b5b6e9cb0b93c1124246a989f6e6a2ae74935134135585118002 +DIST rccl-6.3.0.tar.gz 1828647 BLAKE2B 8c312fc51e7d600bb62fa059e1af53e153955b79b2ba2e8a6b6b52228b9217b7df6dc815c3a48c0800aaa9387f645070e079d04e99c0e8ebdfe41d5ebe0bda06 SHA512 a068b4a21786176638d108c8c85d5e5a8b0413335b555c2602f2a2e0b9f291f6872dbf68fbb5a17a6a0af9d9b5a90b1b37cce63b655a867b68fc9e20d49931ea EBUILD rccl-5.7.1.ebuild 1836 BLAKE2B 3286a92c9d08f9e0baac3ad3fbf0a9782109788b999bab8ac4864fa0ab47a6fcd53a73eee2d34a7cb5400998e60f246ec64df6f4a3f8bb07c38405e7f0b4417b SHA512 dd4dfeecdd908eeddba9d0450eba831ccc4778accbcab6023d4d47bac218d2e5d92a967744796b7c8854a579c5df16d8253795dd294183b2054dcf725a0372ae EBUILD rccl-6.1.1.ebuild 1612 BLAKE2B e175a46484a37e31f0fc0ab3db662a2faaa1ff72cd21f6cbf4540245bc7be012baa9c6c0dc40bdfde39674a0f08ea898b33673db395de3288879ebd778a94ff0 SHA512 2ae7ea089fbac169fe09aed8d82dadb0bd343bee2e525470965987068ea364999e29022298468dfc91d9c625bcfd06e0ea695550275604a4d211d3e30cd322fe +EBUILD rccl-6.3.0.ebuild 1705 BLAKE2B a732614eb178cb84b53441b83387d5b212c9386c3a48ac3ef39061038da4a60b48bb690aab6af3066aa92d350135913c30dedc054e7e1a87de458042cff91de5 SHA512 1e675f02d76ce4ae18b30cf308ac160db3c100dfbf55d16b94164806880f3f4ae9a970032d881debc19cee1591953220acdad4a120cb05c724b9377870d55fd2 MISC metadata.xml 695 BLAKE2B 7d52b2606665aebfade0d15c339f04fe9ac743d1ef402437adefa6a1ab710ae8e0367172cfa3bae8876609d40d26356cc9a93f555fd28f887cff957e38192416 SHA512 6b1d2a88fb7f88bc2bd1fd7126ba33a7d63b6e323cf43072a6c56fad3a5b8cde4262bc7ed7c9485a650544d562377b146d52088e2ecdd8c0e65d0b91addea57c diff --git a/dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch b/dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch new file mode 100644 index 000000000000..297627819f2c --- /dev/null +++ b/dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch @@ -0,0 +1,12 @@ +gtest 1.14 included iomanip, gtest 1.15 does not anymore. +Upstream bug: https://github.com/ROCm/rccl/issues/1455 +--- a/test/common/TestBed.cpp ++++ b/test/common/TestBed.cpp +@@ -4,6 +4,7 @@ + * See LICENSE.txt for license information + ************************************************************************/ + #include <unistd.h> ++#include <iomanip> + #include "TestBed.hpp" + #include <rccl/rccl.h> + diff --git a/dev-libs/rccl/files/rccl-6.3.0-same-rank-sendrecv.patch b/dev-libs/rccl/files/rccl-6.3.0-same-rank-sendrecv.patch new file mode 100644 index 000000000000..435d6ac57b0f --- /dev/null +++ b/dev-libs/rccl/files/rccl-6.3.0-same-rank-sendrecv.patch @@ -0,0 +1,250 @@ +Enable UT sendrecv to same rank. Fixes test failure. +Backports commit: https://github.com/ROCm/rccl/commit/fd9924cfe7afbb94b1f157972ba001865481480a +--- a/test/SendRecvTests.cpp ++++ b/test/SendRecvTests.cpp +@@ -16,7 +16,6 @@ namespace RcclUnitTesting + std::vector<int> const numElements = {1048576, 53327, 1024, 0}; + bool const inPlace = false; + bool const useManagedMem = false; +- int const groupCallId = 0; + + OptionalColArgs options; + bool isCorrect = true; +@@ -28,7 +27,10 @@ namespace RcclUnitTesting + int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu; + int totalRanks = numGpus * ranksPerGpu; + int const numProcesses = isMultiProcess ? numGpus : 1; +- testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1); ++ testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), ++ {1,2}, //two group, second group sendrecv to self, has 2 coll ++ testBed.GetNumStreamsPerGroup(1,2), ++ 2); + + for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) + for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx) +@@ -37,6 +39,8 @@ namespace RcclUnitTesting + for (int recvRank = 0; recvRank < totalRanks; ++recvRank) + { + options.root = recvRank; ++ int groupCallId = sendRank == recvRank; //self sendrecv group has two coll ++ int recvId = sendRank == recvRank; //where recv will be second coll + testBed.SetCollectiveArgs(ncclCollSend, + dataTypes[dataIdx], + numElements[numIdx], +@@ -47,36 +51,46 @@ namespace RcclUnitTesting + sendRank); + if (recvRank == 0) + { +- testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, sendRank); +- testBed.PrepareData(groupCallId, 0, sendRank); +- } +- if (recvRank != sendRank) +- { +- if (testBed.ev.showNames) // Show test names +- INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n", +- isMultiProcess ? "MP" : "SP", +- ncclDataTypeNames[dataTypes[dataIdx]], +- sendRank, +- recvRank, +- numElements[numIdx]); +- +- options.root = sendRank; +- testBed.SetCollectiveArgs(ncclCollRecv, ++ //set up the collArg slot to make sure AllocateMem is called once and correctly ++ testBed.SetCollectiveArgs(ncclCollSend, + dataTypes[dataIdx], + numElements[numIdx], + numElements[numIdx], + options, + 0, +- groupCallId, +- recvRank); +- testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, recvRank); +- testBed.PrepareData(groupCallId, 0, recvRank); +- testBed.ExecuteCollectives({sendRank, recvRank}); +- testBed.ValidateResults(isCorrect, groupCallId, 0, recvRank); +- testBed.DeallocateMem(groupCallId, 0, recvRank); ++ !groupCallId, ++ sendRank); ++ testBed.AllocateMem(inPlace, useManagedMem, 0, 0, sendRank); ++ testBed.PrepareData(0, 0, sendRank); ++ testBed.AllocateMem(inPlace, useManagedMem, 1, 0, sendRank); ++ testBed.PrepareData(1, 0, sendRank); + } ++ ++ if (testBed.ev.showNames) // Show test names ++ INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n", ++ isMultiProcess ? "MP" : "SP", ++ ncclDataTypeNames[dataTypes[dataIdx]], ++ sendRank, ++ recvRank, ++ numElements[numIdx]); ++ options.root = sendRank; ++ ++ testBed.SetCollectiveArgs(ncclCollRecv, ++ dataTypes[dataIdx], ++ numElements[numIdx], ++ numElements[numIdx], ++ options, ++ recvId, ++ groupCallId, ++ recvRank); ++ testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank); ++ testBed.PrepareData(groupCallId, recvId, recvRank); ++ testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId); ++ testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank); ++ testBed.DeallocateMem(groupCallId, recvId, recvRank); + } +- testBed.DeallocateMem(groupCallId, 0, sendRank); ++ testBed.DeallocateMem(0, 0, sendRank); ++ testBed.DeallocateMem(1, 0, sendRank); + } + testBed.DestroyComms(); + } +@@ -94,7 +108,6 @@ namespace RcclUnitTesting + bool const inPlace = false; + bool const useManagedMem = false; + bool const userRegistered = true; +- int const groupCallId = 0; + + OptionalColArgs options; + bool isCorrect = true; +@@ -106,7 +119,10 @@ namespace RcclUnitTesting + int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu; + int totalRanks = numGpus * ranksPerGpu; + int const numProcesses = isMultiProcess ? numGpus : 1; +- testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1); ++ testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), ++ {1,2}, //two group, second group sendrecv to self, has 2 coll ++ testBed.GetNumStreamsPerGroup(1,2), ++ 2); + + for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) + for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx) +@@ -115,6 +131,8 @@ namespace RcclUnitTesting + for (int recvRank = 0; recvRank < totalRanks; ++recvRank) + { + options.root = recvRank; ++ int groupCallId = sendRank == recvRank; ++ int recvId = sendRank == recvRank; + testBed.SetCollectiveArgs(ncclCollSend, + dataTypes[dataIdx], + numElements[numIdx], +@@ -125,36 +143,45 @@ namespace RcclUnitTesting + sendRank); + if (recvRank == 0) + { +- testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, sendRank, userRegistered); +- testBed.PrepareData(groupCallId, 0, sendRank); +- } +- if (recvRank != sendRank) +- { +- if (testBed.ev.showNames) // Show test names +- INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n", +- isMultiProcess ? "MP" : "SP", +- ncclDataTypeNames[dataTypes[dataIdx]], +- sendRank, +- recvRank, +- numElements[numIdx]); +- +- options.root = sendRank; +- testBed.SetCollectiveArgs(ncclCollRecv, ++ testBed.SetCollectiveArgs(ncclCollSend, + dataTypes[dataIdx], + numElements[numIdx], + numElements[numIdx], + options, + 0, +- groupCallId, +- recvRank); +- testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, recvRank, userRegistered); +- testBed.PrepareData(groupCallId, 0, recvRank); +- testBed.ExecuteCollectives({sendRank, recvRank}); +- testBed.ValidateResults(isCorrect, groupCallId, 0, recvRank); +- testBed.DeallocateMem(groupCallId, 0, recvRank); ++ !groupCallId, ++ sendRank); ++ testBed.AllocateMem(inPlace, useManagedMem, 0, 0, sendRank, userRegistered); ++ testBed.PrepareData(0, 0, sendRank); ++ testBed.AllocateMem(inPlace, useManagedMem, 1, 0, sendRank, userRegistered); ++ testBed.PrepareData(1, 0, sendRank); + } ++ ++ if (testBed.ev.showNames) // Show test names ++ INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n", ++ isMultiProcess ? "MP" : "SP", ++ ncclDataTypeNames[dataTypes[dataIdx]], ++ sendRank, ++ recvRank, ++ numElements[numIdx]); ++ ++ options.root = sendRank; ++ testBed.SetCollectiveArgs(ncclCollRecv, ++ dataTypes[dataIdx], ++ numElements[numIdx], ++ numElements[numIdx], ++ options, ++ recvId, ++ groupCallId, ++ recvRank); ++ testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank, userRegistered); ++ testBed.PrepareData(groupCallId, recvId, recvRank); ++ testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId); ++ testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank); ++ testBed.DeallocateMem(groupCallId, recvId, recvRank); + } +- testBed.DeallocateMem(groupCallId, 0, sendRank); ++ testBed.DeallocateMem(0, 0, sendRank); ++ testBed.DeallocateMem(1, 0, sendRank); + } + testBed.DestroyComms(); + } +--- a/test/common/TestBedChild.cpp ++++ b/test/common/TestBedChild.cpp +@@ -395,6 +395,8 @@ namespace RcclUnitTesting + { + CollectiveArgs& collArg = this->collArgs[groupId][localRank][collIdx]; + CHECK_CALL(collArg.AllocateMem(inPlace, useManagedMem, userRegistered)); ++ if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv)) ++ CHILD_NCCL_CALL(ncclCommRegister(this->comms[localRank], collArg.inputGpu.ptr, collArg.numInputBytesAllocated, &(collArg.commRegHandle)),"ncclCommRegister"); + if (this->verbose) INFO("Rank %d on child %d allocates memory for collective %d in group %d on device %d (%s,%s,%s) Input: %p Output %p\n", + globalRank, this->childId, collIdx, groupId, this->deviceIds[localRank], + inPlace ? "in-place" : "out-of-place", +@@ -646,8 +648,6 @@ namespace RcclUnitTesting + "ncclAllToAllv"); + break; + case ncclCollSend: +- if (collArg.userRegistered) +- CHILD_NCCL_CALL_RANK(errCode, ncclCommRegister(this->comms[localRank], collArg.inputGpu.ptr, collArg.numInputBytesAllocated, &(collArg.commRegHandle)),"ncclCommRegister"); + CHILD_NCCL_CALL_RANK(errCode, ncclSend( + collArg.inputGpu.ptr, + collArg.numInputElements, +@@ -658,8 +658,6 @@ namespace RcclUnitTesting + "ncclSend"); + break; + case ncclCollRecv: +- if (collArg.userRegistered) +- CHILD_NCCL_CALL_RANK(errCode, ncclCommRegister(this->comms[localRank], collArg.outputGpu.ptr, collArg.numOutputBytesAllocated, &(collArg.commRegHandle)), "ncclCommRegister"); + CHILD_NCCL_CALL_RANK(errCode, ncclRecv( + collArg.outputGpu.ptr, + collArg.numOutputElements, +@@ -891,8 +889,6 @@ namespace RcclUnitTesting + for (int collIdx = 0; collIdx < collArgs[groupId][localRank].size(); ++collIdx) + { + CollectiveArgs& collArg = this->collArgs[groupId][localRank][collIdx]; +- if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv)) +- CHILD_NCCL_CALL(ncclCommDeregister(this->comms[localRank], collArg.commRegHandle), "ncclCommDeregister"); + if (collId == -1 || collId == collIdx) + { + if (this->verbose) +@@ -900,6 +896,10 @@ namespace RcclUnitTesting + INFO("Child %d release memory for collective %d in group %d (Input: %p Output %p\n", + this->childId, collIdx, groupId, collArg.inputGpu.ptr, collArg.outputGpu.ptr); + } ++ if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv)) ++ { ++ CHILD_NCCL_CALL(ncclCommDeregister(this->comms[localRank], collArg.commRegHandle), "ncclCommDeregister"); ++ } + + CHECK_CALL(collArg.DeallocateMem()); + } diff --git a/dev-libs/rccl/rccl-6.3.0.ebuild b/dev-libs/rccl/rccl-6.3.0.ebuild new file mode 100644 index 000000000000..d610f7eb139c --- /dev/null +++ b/dev-libs/rccl/rccl-6.3.0.ebuild @@ -0,0 +1,75 @@ +# Copyright 1999-2024 Gentoo Authors +# Distributed under the terms of the GNU General Public License v2 + +EAPI=8 + +ROCM_VERSION=${PV} + +inherit cmake edo rocm flag-o-matic + +DESCRIPTION="ROCm Communication Collectives Library (RCCL)" +HOMEPAGE="https://github.com/ROCm/rccl" +SRC_URI="https://github.com/ROCm/rccl/archive/rocm-${PV}.tar.gz -> rccl-${PV}.tar.gz" +S="${WORKDIR}/rccl-rocm-${PV}" + +LICENSE="BSD" +SLOT="0/$(ver_cut 1-2)" +KEYWORDS="~amd64" +IUSE="test" + +RDEPEND=" + dev-util/hip:${SLOT} + dev-util/rocm-smi:${SLOT}" +DEPEND="${RDEPEND} + sys-libs/binutils-libs" +BDEPEND=" + >=dev-build/cmake-3.22 + >=dev-build/rocm-cmake-5.7.1 + dev-util/hipify-clang:${SLOT} + test? ( dev-cpp/gtest )" + +RESTRICT="!test? ( test )" + +PATCHES=( + "${FILESDIR}/${PN}-6.0.2-fix-version-check.patch" + "${FILESDIR}/${PN}-6.3.0-same-rank-sendrecv.patch" + "${FILESDIR}/${PN}-6.3.0-headers-fix.patch" +) + +src_prepare() { + cmake_src_prepare + + # https://reviews.llvm.org/D69582 - clang does not support parallel jobs + sed '/parallel-jobs/d' -i CMakeLists.txt || die + + # complete fix-version-check patch + sed "s/@rocm_version@/${PV}/" -i CMakeLists.txt || die + + # don't install tests + sed "/rocm_install(TARGETS rccl-UnitTests/d" -i test/CMakeLists.txt || die +} + +src_configure() { + rocm_use_hipcc + + # lto flags make compilation fail with "undefined hidden symbol" + filter-lto + + local mycmakeargs=( + -DCMAKE_SKIP_RPATH=ON + -DAMDGPU_TARGETS="$(get_amdgpu_flags)" + -DBUILD_TESTS=$(usex test ON OFF) + -DROCM_SYMLINK_LIBS=OFF + -DROCM_PATH="${EPREFIX}/usr" + -DRCCL_ROCPROFILER_REGISTER=OFF + -Wno-dev + ) + + cmake_src_configure +} + +src_test() { + check_amdgpu + cd "${BUILD_DIR}" || die + LD_LIBRARY_PATH="${BUILD_DIR}" edob test/rccl-UnitTests +} |